标签:
工作中需要对纯数字的字符串做分词,而 CJK 二元分词器似乎不会对这类纯数字内容做切分,所以自己写了一个分词器,达到以下效果:对字符串"哈哈1234呵呵456"分词后得到:1 12 123 1234 4 45 456
Analyzer:
package org.apache.lucene.analysis.core;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;

/**
 * Analyzer that wraps {@link MyNumberTokenizer} so that runs of digits (and
 * ASCII letters) in mixed CJK/number text are emitted as tokens.
 */
public final class MyNumberAnalyzer extends Analyzer {

  public MyNumberAnalyzer() {}

  /** @deprecated use {@link #MyNumberAnalyzer()} and {@link #setVersion}. */
  @Deprecated
  public MyNumberAnalyzer(Version matchVersion) {
    setVersion(matchVersion);
  }

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    return new TokenStreamComponents(new MyNumberTokenizer(getVersion(), reader));
  }

  /**
   * Demo driver.
   *
   * <p>Fixes vs. the original: the TokenStream and Analyzer are now closed via
   * try-with-resources (resource leak), {@code end()} is called to finish the
   * stream per the TokenStream consumption contract, and the
   * {@link CharTermAttribute} is obtained once before {@code reset()} instead
   * of being re-looked-up on every loop iteration.</p>
   */
  public static void main(String[] args) throws Exception {
    try (Analyzer analyzer = new MyNumberAnalyzer();
         TokenStream tokenStream =
             analyzer.tokenStream("content", new StringReader("1234哈哈"))) {
      // Attribute instances are reused across incrementToken() calls,
      // so one lookup up front is enough.
      CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        System.out.println(termAttribute.toString());
      }
      tokenStream.end();
    }
  }
}
Tokenizer:
package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.util.MyCharTokenizer;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.Version;

/**
 * Tokenizer whose {@link #isTokenChar(int)} ignores the character and simply
 * alternates between {@code true} and {@code false} on successive calls,
 * driven by an internal parity counter.
 *
 * <p>NOTE(review): the companion {@code MyCharTokenizer.incrementToken()} in
 * this project tests {@code Character.isDigit}/letters directly and never
 * calls {@code isTokenChar}, so the alternating logic below appears to be
 * dead code — verify before relying on it.</p>
 */
public final class MyNumberTokenizer extends MyCharTokenizer {

  // Parity counter: odd call to get() -> true, even call -> false.
  private long count = 0;

  public MyNumberTokenizer(Reader in) {
    super(in);
  }

  /** @deprecated use {@link #MyNumberTokenizer(Reader)}. */
  @Deprecated
  public MyNumberTokenizer(Version matchVersion, Reader in) {
    super(matchVersion, in);
  }

  public MyNumberTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  /** @deprecated use {@link #MyNumberTokenizer(AttributeFactory, Reader)}. */
  @Deprecated
  public MyNumberTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
    super(matchVersion, factory, in);
  }

  @Override
  protected boolean isTokenChar(int c) {
    return get();
  }

  /**
   * Flips the internal parity counter; returns {@code true} on odd calls and
   * {@code false} on even calls (resetting the counter each even call).
   */
  public synchronized boolean get() {
    count++;
    if (count % 2 == 0) {
      count = 0L;
      return false;
    }
    return true;
  }

  /**
   * Bug fix: Lucene reuses Tokenizer instances across streams; without
   * clearing {@code count}, the parity left over from the previous stream
   * would make the next stream start on the wrong phase.
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    count = 0;
  }
}
Tokenizer 基类(MyCharTokenizer):
package org.apache.lucene.analysis.util;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.Version;

/**
 * Variant of Lucene's {@link CharTokenizer} that scans runs of digits and
 * ASCII letters and emits them as growing prefix tokens: a run shorter than
 * 3 chars is extended on the next loop pass before a token is returned
 * (see the {@code length <= 2 -> continue} branch below).
 *
 * <p>NOTE(review): {@code start = offset} indexes directly into {@code ioBuffer},
 * so this presumably only works while the whole input fits in one
 * {@code fill()} of IO_BUFFER_SIZE (4096) chars — tokens spanning a buffer
 * refill look mishandled; TODO confirm with input &gt; 4096 chars.</p>
 */
public abstract class MyCharTokenizer extends Tokenizer {

  public MyCharTokenizer(Reader input) {
    super(input);
    charUtils = CharacterUtils.getInstance();
  }

  /** @deprecated version-specific CharacterUtils; kept for back-compat. */
  @Deprecated
  public MyCharTokenizer(Version matchVersion, Reader input) {
    super(input);
    charUtils = CharacterUtils.getInstance(matchVersion);
  }

  public MyCharTokenizer(AttributeFactory factory, Reader input) {
    super(factory, input);
    charUtils = CharacterUtils.getInstance();
  }

  /** @deprecated version-specific CharacterUtils; kept for back-compat. */
  @Deprecated
  public MyCharTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
    super(factory, input);
    charUtils = CharacterUtils.getInstance(matchVersion);
  }

  // offset: absolute start offset of the current candidate token;
  // bufferIndex: scan position inside ioBuffer; dataLen: chars in ioBuffer;
  // finalOffset: corrected end offset reported by end().
  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
  // NOTE(review): MAX_WORD_LEN is never used in this class.
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 4096;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final CharacterUtils charUtils;
  private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);

  // NOTE(review): declared for subclasses but never invoked by
  // incrementToken() below, which tests digits/letters directly.
  protected abstract boolean isTokenChar(int c);

  /** Hook for subclasses to normalize code points; identity by default. */
  protected int normalize(int c) {
    return c;
  }

  public final boolean incrementToken() throws IOException {
    clearAttributes();
    int length = 0;
    int start = -1; // this variable is always initialized
    int end = -1;
    char[] buffer = termAtt.buffer();
    while (true) {
      // on first entry, bufferIndex == dataLen == 0
      if (bufferIndex >= dataLen) {
        offset += dataLen; // offset == 0 on first entry
        // copy the next chunk of input into ioBuffer
        charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
        if (ioBuffer.getLength() == 0) {
          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0) {
            break;
          } else {
            finalOffset = correctOffset(offset);
            // return false: tokenization is finished
            return false;
          }
        }
        // record the new data length
        dataLen = ioBuffer.getLength();
        // restart scanning at the beginning of the buffer
        bufferIndex = 0;
      }
      // scan index has passed the end of the buffered input
      if (bufferIndex >= ioBuffer.getLength()) {
        break;
      }
      // code point at position bufferIndex in ioBuffer
      final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
      // number of chars (1 or 2) this code point occupies
      final int charCount = Character.charCount(c);
      // advance the scan position past this code point
      bufferIndex += charCount;
      // is this character numeric? e.g. for input: 哈哈2345呵呵3456我是
      boolean isDigit = Character.isDigit(c);
      // neither a digit nor an ASCII letter: skip it and move the token start
      // NOTE(review): the (char) cast truncates supplementary code points —
      // presumably fine for BMP-only input; TODO confirm.
      if (!isDigit && !String.valueOf((char) c).toLowerCase().matches("[a-z]")) {
        // net effect of the next two statements: offset = bufferIndex
        offset = bufferIndex - 1;
        offset++;
        continue;
      }
      // first time through, e.g. the index of '2' in "哈哈2345..."
      start = offset; // start = 0;
      end = bufferIndex;
      length = bufferIndex - start;
      if (length >= buffer.length) {
        buffer = termAtt.resizeBuffer(2 + length);
      }
      // copy (normalized) code points of the run [start, bufferIndex) into the term buffer
      for (int i = start; i < bufferIndex; i++) {
        int cc = charUtils.codePointAt(ioBuffer.getBuffer(), i, ioBuffer.getLength());
        Character.toChars(normalize(cc), buffer, i - start);
      }
      // runs of 1-2 chars are not emitted yet: loop again to extend the run
      if (length <= 2) {
        continue;
      }
      break;
    }
    termAtt.setLength(length);
    assert start != -1;
    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
    return true;
  }

  @Override
  public final void end() throws IOException {
    super.end();
    // set final offset
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
    finalOffset = 0;
    ioBuffer.reset(); // make sure to reset the IO buffer!!
  }
}
TokenizerFactory:
package org.apache.lucene.analysis.core;

import java.io.Reader;
import java.util.Map;

import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

/**
 * Factory for {@link MyNumberTokenizer}. Accepts no configuration
 * arguments; any leftover args are rejected.
 */
public class MyNumberTokenizerFactory extends TokenizerFactory {

  /** Creates a new factory; {@code args} must be empty after super() consumes its keys. */
  public MyNumberTokenizerFactory(Map<String, String> args) {
    super(args);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public MyNumberTokenizer create(AttributeFactory factory, Reader input) {
    // Pick the version-aware constructor only when a match version was configured.
    return (luceneMatchVersion == null)
        ? new MyNumberTokenizer(factory, input)
        : new MyNumberTokenizer(luceneMatchVersion, factory, input);
  }
}
标签:
原文地址:http://my.oschina.net/sniperLi/blog/497944