码迷,mamicode.com
首页 > 编程语言 > 详细

算法之智能搜索(上)

时间:2018-11-25 01:26:20      阅读:232      评论:0      收藏:0      [点我收藏+]

标签:vat   url   roc   private   lag   learn   stringbu   type   enc   

笔者并不了解,各大搜索网站是怎么实现智能搜索的。以下只是笔者一时的想法,笔者觉得这个方法可以实现智能匹配搜索内容。

一、首先我们获取细胞词库内容

①建表语句:

-- Rebuilds the `sougou_ciku` table from scratch: an existing table with this
-- name (and all of its rows) is dropped before the new one is created.
DROP TABLE IF EXISTS `sougou_ciku`;
-- Column names match the fields of the SouGouCiKu entity class (id, text, below, remark).
-- NOTE(review): the entity class describes `id` as the primary key, but no
-- PRIMARY KEY / UNIQUE constraint is declared here - confirm whether that is intended.
CREATE TABLE `sougou_ciku` (
    -- Row identifier; required.
    `id` varchar(50) NOT NULL,
    -- Content of the entry; required.
    `text` varchar(100) NOT NULL,
    -- What the entry belongs to; optional, defaults to NULL.
    `below` varchar(50) default NULL,
    -- Free-form remark; optional, defaults to NULL.
    `remark` varchar(100) default NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

②创建映射实体类:

package com.css.java.learning.model;
/**
 * Plain mutable entity with the four fields {@code id}, {@code text},
 * {@code below} and {@code remark}. All fields start out as {@code null}.
 */
public class SouGouCiKu {

    // ---- id: primary key ----
    private String id;

    /** @return the primary key, or {@code null} if it has not been set */
    public String getId() {
        return this.id;
    }

    /** @param id the primary key to store */
    public void setId(String id) {
        this.id = id;
    }

    // ---- text: content ----
    private String text;

    /** @return the content, or {@code null} if it has not been set */
    public String getText() {
        return this.text;
    }

    /** @param text the content to store */
    public void setText(String text) {
        this.text = text;
    }

    // ---- below: what the entry belongs to ----
    private String below;

    /** @return what the entry belongs to, or {@code null} if it has not been set */
    public String getBelow() {
        return this.below;
    }

    /** @param below what the entry belongs to */
    public void setBelow(String below) {
        this.below = below;
    }

    // ---- remark: free-form note ----
    private String remark;

    /** @return the remark, or {@code null} if it has not been set */
    public String getRemark() {
        return this.remark;
    }

    /** @param remark the remark to store */
    public void setRemark(String remark) {
        this.remark = remark;
    }
}

③创建搜狗scel文件阅读器:

package com.css.java.learning.massbag;
import java.util.List;
import java.util.Map;
/**
 * Mutable holder for the result of reading one .scel file: four descriptive
 * strings (name, type, description, sample) plus the word map.
 * Every property is {@code null} until its setter is called.
 */
public class SougouScelMdel {

    private String name;
    private String type;
    private String description;
    private String sample;
    private Map<String, List<String>> wordMap;

    /** @return the name, or {@code null} if unset */
    public String getName() {
        return this.name;
    }

    /** @param name the name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the type, or {@code null} if unset */
    public String getType() {
        return this.type;
    }

    /** @param type the type to store */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the description, or {@code null} if unset */
    public String getDescription() {
        return this.description;
    }

    /** @param description the description to store */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the sample text, or {@code null} if unset */
    public String getSample() {
        return this.sample;
    }

    /** @param sample the sample text to store */
    public void setSample(String sample) {
        this.sample = sample;
    }

    /**
     * @return the word map (the same instance that was stored, not a copy),
     *         or {@code null} if unset
     */
    public Map<String, List<String>> getWordMap() {
        return this.wordMap;
    }

    /**
     * Stores the word map. Deliberately package-private, unlike the other setters.
     *
     * @param wordMap the map to store; kept by reference
     */
    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }
}

package com.css.java.learning.massbag;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Reads a Sogou ".scel" lexicon from a file, URL or stream into a
 * {@link SougouScelMdel}.
 *
 * <p>Truncated input is reported as an {@link EOFException}. Structural sanity
 * checks are expressed with {@code assert} and therefore only run when
 * assertions are enabled ({@code java -ea}).
 *
 * <p>An instance reuses one internal scratch buffer ({@link #output}) across
 * calls, so concurrent reads on the same instance would interfere with each other.
 */
public class SougouScelReader {

    /** Charset name used to decode every string read from the stream. */
    protected static String encoding = "UTF-16LE";

    /** Scratch buffer reused by {@link #readString}; reset at the start of each call. */
    protected ByteArrayOutputStream output = new ByteArrayOutputStream();

    /**
     * Reads the lexicon stored in {@code file}.
     *
     * @param file the .scel file to open
     * @return the parsed model
     * @throws IOException if the file cannot be opened or read, or ends prematurely
     */
    public SougouScelMdel read(File file) throws IOException {
        return read(new FileInputStream(file));
    }

    /**
     * Reads the lexicon available at {@code url}.
     *
     * @param url the location to open
     * @return the parsed model
     * @throws IOException if the stream cannot be opened or read, or ends prematurely
     */
    public SougouScelMdel read(URL url) throws IOException {
        return read(url.openStream());
    }

    /**
     * Seeks forward to absolute offset {@code pos} and reads a string made of
     * two-byte units, stopping at the first unit whose two bytes are both zero.
     *
     * @param input the stream to read from
     * @param pos   absolute offset at which the string starts
     * @param reads one-element array holding the current absolute offset in
     *              {@code reads[0]}; updated to the offset just past the terminator
     * @return the bytes before the terminator, decoded with {@link #encoding}
     * @throws EOFException if the stream ends before {@code pos} or before the terminator
     * @throws IOException  if reading fails
     */
    protected String readString(DataInputStream input, int pos, int[] reads) throws IOException {
        skipFully(input, pos - reads[0]);
        int read = pos;
        output.reset();
        while (true) {
            int c1 = input.read();
            int c2 = input.read();
            // Without this check a truncated stream would never produce the 0/0
            // terminator and the loop would append -1 bytes forever.
            if (c1 < 0 || c2 < 0) {
                throw new EOFException("Unexpected end of stream inside string at offset " + pos);
            }
            read += 2;
            if (c1 == 0 && c2 == 0) {
                break;
            }
            output.write(c1);
            output.write(c2);
        }
        reads[0] = read;
        return new String(output.toByteArray(), encoding);
    }

    /**
     * Parses a complete lexicon from {@code in}. The stream is always closed
     * before this method returns, whether it succeeds or throws.
     *
     * @param in the stream positioned at the first byte of the .scel data
     * @return the model populated with name, type, description, sample and word map
     * @throws EOFException if the data ends before a required field has been read
     * @throws IOException  if reading fails
     */
    public SougouScelMdel read(InputStream in) throws IOException {
        try {
            SougouScelMdel model = new SougouScelMdel();
            DataInputStream input = new DataInputStream(in);

            // Two 4-byte header words; the first byte of the second one selects
            // where the word section starts (see the flag1 branches below).
            byte[] bytes = new byte[4];
            input.readFully(bytes);
            assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
            input.readFully(bytes);
            int flag1 = bytes[0];
            assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);

            // Four zero-terminated strings at fixed absolute offsets; reads[0]
            // tracks how many bytes have been consumed so far (8 after the header).
            int[] reads = new int[] {8};
            model.setName(readString(input, 0x130, reads));
            model.setType(readString(input, 0x338, reads));
            model.setDescription(readString(input, 0x540, reads));
            model.setSample(readString(input, 0xd40, reads));

            // The pinyin table starts at 0x1540 with a 4-byte marker.
            skipFully(input, 0x1540 - reads[0]);
            int read = 0x1540;
            input.readFully(bytes);
            read += 4;
            assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);

            // Entry layout: 2-byte index, 1-byte length, 1 ignored byte, then the text.
            // The length is a single unsigned byte, so 256 bytes always suffice.
            byte[] pyBytes = new byte[256];
            Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
            while (true) {
                int mark = readRequiredShort(input);
                int size = input.readUnsignedByte();
                skipFully(input, 1);
                read += 4;
                assert (size > 0 && (size % 2) == 0);
                input.readFully(pyBytes, 0, size);
                read += size;
                String py = new String(pyBytes, 0, size, encoding);
                pyMap.put(mark, py);
                // The table is treated as finished once the entry "zuo" has been read.
                if ("zuo".equals(py)) {
                    break;
                }
            }

            // Seek to the start of the word section, whose offset depends on flag1.
            // For any other flag value no seek is performed.
            if (flag1 == 0x44) {
                skipFully(input, 0x2628 - read);
            } else if (flag1 == 0x45) {
                skipFully(input, 0x26C4 - read);
            }

            model.setWordMap(readWords(input, pyMap));
            return model;
        } finally {
            in.close();
        }
    }

    /**
     * Reads word records until the stream ends and groups the words by their
     * pinyin key, preserving the order in which keys first appear.
     */
    private Map<String, List<String>> readWords(DataInputStream input, Map<Integer, String> pyMap)
            throws IOException {
        Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
        StringBuilder buffer = new StringBuilder();
        byte[] bytes = new byte[128];
        while (true) {
            // Number of words in this record; end of stream here is the normal exit.
            int size = readUnsignedShort(input);
            if (size < 0) {
                break;
            }
            // Byte length of the pinyin index list (two bytes per index).
            int count = readRequiredShort(input);
            int len = count / 2;
            assert (len * 2 == count);
            buffer.setLength(0);
            for (int i = 0; i < len; i++) {
                int key = readRequiredShort(input);
                // NOTE(review): the separator is U+2018 (left single quotation mark);
                // possibly an ASCII apostrophe was intended - confirm before changing.
                buffer.append(pyMap.get(key)).append("‘");
            }
            // Drop the trailing separator; an empty index list leaves the key empty.
            if (buffer.length() > 0) {
                buffer.setLength(buffer.length() - 1);
            }
            String py = buffer.toString();
            List<String> list = wordMap.get(py);
            if (list == null) {
                list = new ArrayList<String>();
                wordMap.put(py, list);
            }
            for (int i = 0; i < size; i++) {
                count = readRequiredShort(input);
                if (count > bytes.length) {
                    bytes = new byte[count];
                }
                input.readFully(bytes, 0, count);
                String word = new String(bytes, 0, count, encoding);
                // Each word is followed by 12 bytes that are not used. A short
                // trailer after the final word is tolerated; the next record
                // read then sees end of stream and the loop ends.
                skipUpTo(input, 12);
                list.add(word);
            }
        }
        return wordMap;
    }

    /**
     * Reads two bytes as a little-endian unsigned 16-bit value.
     *
     * @param in the stream to read from
     * @return the value in the range 0..65535, or {@link Integer#MIN_VALUE} if
     *         the stream ended before both bytes could be read
     * @throws IOException if reading fails
     */
    protected final int readUnsignedShort(InputStream in) throws IOException {
        int ch1 = in.read();
        int ch2 = in.read();
        if ((ch1 | ch2) < 0) {
            return Integer.MIN_VALUE;
        }
        return (ch2 << 8) + (ch1 << 0);
    }

    /** Like {@link #readUnsignedShort} but treats end of stream as an error. */
    private int readRequiredShort(InputStream in) throws IOException {
        int value = readUnsignedShort(in);
        if (value < 0) {
            throw new EOFException("Unexpected end of stream while reading a 16-bit value");
        }
        return value;
    }

    /**
     * Skips up to {@code count} bytes, retrying short skips, and stops early
     * only at end of stream. A non-positive {@code count} skips nothing.
     *
     * @return the number of bytes actually skipped
     */
    private static long skipUpTo(InputStream in, long count) throws IOException {
        long done = 0;
        while (done < count) {
            long step = in.skip(count - done);
            if (step > 0) {
                done += step;
            } else if (in.read() >= 0) {
                // skip() may legitimately return 0 before end of stream;
                // a single read tells the two cases apart.
                done++;
            } else {
                break;
            }
        }
        return done;
    }

    /** Skips exactly {@code count} bytes or throws {@link EOFException}. */
    private static void skipFully(InputStream in, long count) throws IOException {
        if (skipUpTo(in, count) < count) {
            throw new EOFException("Unexpected end of stream while skipping " + count + " bytes");
        }
    }
}

④在搜狗官网下载细胞词库 .scel 文件
略!

⑤读取细胞词库文件.scel插入数据库

/**
 * Reads the .scel lexicon at {@code path}, passes every word it contains
 * through {@code jugeWord}/{@code insert}, and prints each word to standard output.
 *
 * @param path location of the .scel file
 * @throws IOException if the file cannot be read
 */
private static void sogou(String path) throws IOException {
    SougouScelMdel model = new SougouScelReader().read(new File(path));
    // Word map: pinyin key -> words sharing that pinyin.
    Map<String, List<String>> words = model.getWordMap();
    for (List<String> list : words.values()) {
        for (String word : list) {
            // Checks whether the word already appears in the database: add it
            // when absent, do nothing when present (helper not shown here).
            // NOTE(review): the flag name and the sentence above suggest the
            // insert should happen when the word is absent, yet it runs when
            // jugeWord returns true - confirm jugeWord's return convention.
            boolean is_exit = jugeWord(word);
            if (is_exit) {
                // Stores the word in the database for later use (helper not shown here).
                insert(word);
            }
            System.out.println(word);
        }
    }
}

⑥执行搜狗细胞词库插入数据库

笔者以下面的文件为例:

技术分享图片

得到如下数据:

技术分享图片

下篇将讲解笔者自创的简单算法:拆分输入语句、匹配词库,从而完成搜索过程。

算法之智能搜索(上)

标签:vat   url   roc   private   lag   learn   stringbu   type   enc   

原文地址:http://blog.51cto.com/13479739/2321588

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!