网页内容相似度之SimHash算法

时间：2014-08-17 23:59:23 阅读：709 评论：0 收藏：0 [点我收藏+]

标签：Lucene style blog http color java os io

　　抓取的网页内容中，有大部分会是相似的，抓取时就要过滤掉，开始考虑用VSM算法，后来发现不对，要比较太多东西了，然后就发现了simHash算法，这个算法的解释我就懒得copy了，simhash算法对于短数据的支持不好，但是，我本来就是很长的数据，用上！

　　源码实现网上也有不少，但是貌似都是同样的，里面写得不清不楚的，虽然效果基本能达到，但是不清楚的东西，我用来做啥？

　　仔细研究simhash算法的说明后，把里面字符串的hash算法换成的fvn-1算法，这个在http://www.isthe.com/chongo/tech/comp/fnv/里面有说明了，具体的那些固定数值，网站上都写了。原先代码里面有些处理，和算法不符的，也换掉了。

　　首先搞起IKAnalyzer，切词并计算每个词的频率：

package com.cnblogs.zxub.lucene.similarity;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class WordsSpliter {

    public static Map<String, Integer> getSplitedWords(String str)
            throws IOException {
        // str = str.replaceAll("[0-9a-zA-Z]", "");
        Analyzer analyzer = new IKAnalyzer();
        Reader r = new StringReader(str);
        TokenStream ts = analyzer.tokenStream("searchValue", r);
        ts.addAttribute(CharTermAttribute.class);

        Map<String, Integer> result = new HashMap<String, Integer>();
        while (ts.incrementToken()) {
            CharTermAttribute ta = ts.getAttribute(CharTermAttribute.class);
            String word = ta.toString();
            if (!result.containsKey(word)) {
                result.put(word, 0);
            }
            result.put(word, result.get(word) + 1);
        }

        return result;
    }
}

　　然后把SimHash的算法搞上：

package com.cnblogs.zxub.lucene.similarity;

import java.io.IOException;
import java.math.BigInteger;
import java.util.Map;
import java.util.Set;

public class SimHash {

    private static final int HASH_BITS = 64;
    private static final BigInteger FNV_64_INIT = new BigInteger(
            "14695981039346656037");
    private static final BigInteger FNV_64_PRIME = new BigInteger(
            "1099511628211");
    private static final BigInteger MASK_64 = BigInteger.ONE.shiftLeft(
            HASH_BITS).subtract(BigInteger.ONE);

    private String hash;
    private BigInteger signature;

    public SimHash(String content) throws IOException {
        super();
        this.setFingerPrint(WordsSpliter.getSplitedWords(content));
    }

    public String getHash() {
        return this.hash;
    }

    public BigInteger getSignature() {
        return this.signature;
    }

    private void setFingerPrint(Map<String, Integer> wordInfos) {
        int[] featureVector = new int[SimHash.HASH_BITS];
        Set<String> words = wordInfos.keySet();
        for (String word : words) {
            BigInteger wordhash = this.fnv1_64_hash(word);
            for (int i = 0; i < SimHash.HASH_BITS; i++) {
                BigInteger bitmask = BigInteger.ONE.shiftLeft(SimHash.HASH_BITS
                        - i - 1);
                if (wordhash.and(bitmask).signum() != 0) {
                    featureVector[i] += wordInfos.get(word);
                } else {
                    featureVector[i] -= wordInfos.get(word);
                }
            }
        }

        BigInteger signature = BigInteger.ZERO;
        StringBuffer hashBuffer = new StringBuffer();
        for (int i = 0; i < SimHash.HASH_BITS; i++) {
            if (featureVector[i] >= 0) {
                signature = signature.add(BigInteger.ONE
                        .shiftLeft(SimHash.HASH_BITS - i - 1));
                hashBuffer.append("1");
            } else {
                hashBuffer.append("0");
            }
        }
        this.hash = hashBuffer.toString();
        this.signature = signature;
    }

    // fnv-1 hash算法，将字符串转换为64位hash值
    private BigInteger fnv1_64_hash(String str) {
        BigInteger hash = FNV_64_INIT;
        int len = str.length();
        for (int i = 0; i < len; i++) {
            hash = hash.multiply(FNV_64_PRIME);
            hash = hash.xor(BigInteger.valueOf(str.charAt(i)));
        }
        hash = hash.and(MASK_64);
        return hash;
    }

    public int getHammingDistance(BigInteger targetSignature) {
        BigInteger x = this.getSignature().xor(targetSignature);
        String s = x.toString(2);
        return s.replaceAll("0", "").length();
    }

    public int getHashDistance(String targetHash) {
        int distance;
        if (this.getHash().length() != targetHash.length()) {
            distance = -1;
        } else {
            distance = 0;
            for (int i = 0; i < this.getHash().length(); i++) {
                if (this.getHash().charAt(i) != targetHash.charAt(i)) {
                    distance++;
                }
            }
        }
        return distance;
    }
}

　　数据库里面存个签名就好了，至于距离运算，本打算全部拉出来计算，后来发现oracle的bitand函数，就用它了！异或之后，转二进制字符串，把0去掉，取长度，再count一下长度小于4的，得到的结果就是很相似的内容数目了。以后再把计算改成用缓存的去，先偷个懒。

　　收摊！

网页内容相似度之SimHash算法,布布扣,bubuko.com

网页内容相似度之SimHash算法

标签：Lucene style blog http color java os io

原文地址：http://www.cnblogs.com/zxub/p/3918422.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行