At work I needed to tokenize strings consisting purely of digits. The CJK bigram tokenizer apparently does not split such numeric strings, so I wrote my own tokenizer. It behaves as follows: the string 1234567 is tokenized into: 12 34 56 7
Analyzer:
package org.apache.lucene.analysis.core;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public final class MyNumberAnalyzer extends Analyzer {

    public MyNumberAnalyzer() {}

    @Deprecated
    public MyNumberAnalyzer(Version matchVersion) {
        setVersion(matchVersion);
    }

    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
        return new TokenStreamComponents(new MyNumberTokenizer(getVersion(), reader));
    }

    public static void main(String[] args) throws Exception {
        MyNumberAnalyzer analyzer = new MyNumberAnalyzer();
        //WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader("123467899988"));
        // fetch the term attribute once, before consuming the stream
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            System.out.println(termAttribute.toString());
        }
        tokenStream.end();   // record the final offset
        tokenStream.close(); // release the underlying reader
    }
}
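For reference, running the main method above tokenizes 123467899988 into six terms, printed one per line: 12, 34, 67, 89, 99, 88.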
Tokenizer:
package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.util.MyCharTokenizer;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.Version;

public final class MyNumberTokenizer extends MyCharTokenizer {

    // counts characters consumed; every second character ends the current token
    private long count = 0;

    public MyNumberTokenizer(Reader in) {
        super(in);
    }

    @Deprecated
    public MyNumberTokenizer(Version matchVersion, Reader in) {
        super(matchVersion, in);
    }

    public MyNumberTokenizer(AttributeFactory factory, Reader in) {
        super(factory, in);
    }

    @Deprecated
    public MyNumberTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
        super(matchVersion, factory, in);
    }

    @Override
    protected boolean isTokenChar(int c) {
        return get();
    }

    // returns true for the first character of each pair and false for the second,
    // so MyCharTokenizer closes a token after every two characters
    public synchronized boolean get() {
        count++;
        if (count % 2 == 0) {
            count = 0L;
            return false;
        }
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        count = 0L; // reset the pair counter so the tokenizer can be reused
    }
}
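The pair size 2 is hardcoded in get(). As a sketch (not part of the original post), the same idea generalizes to chunks of any width by parameterizing the counter; the class name FixedWidthNumberTokenizer and the width parameter below are hypothetical, and width must be at least 2 (with width 1 every character would be treated as a delimiter and skipped):

package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.util.MyCharTokenizer;

// Hypothetical generalization: emit fixed-width chunks of any size.
// width = 2 reproduces MyNumberTokenizer's behavior.
public final class FixedWidthNumberTokenizer extends MyCharTokenizer {

    private final int width; // chunk size (assumption: >= 2)
    private long count = 0;

    public FixedWidthNumberTokenizer(Reader in, int width) {
        super(in);
        this.width = width;
    }

    @Override
    protected boolean isTokenChar(int c) {
        count++;
        if (count % width == 0) { // every width-th char closes the current token
            count = 0L;
            return false; // MyCharTokenizer appends this char and emits the token
        }
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        count = 0L; // start counting afresh for the next stream
    }
}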
Base Tokenizer (a modified copy of Lucene's CharTokenizer):
package org.apache.lucene.analysis.util;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.Version;

public abstract class MyCharTokenizer extends Tokenizer {

    public MyCharTokenizer(Reader input) {
        super(input);
        charUtils = CharacterUtils.getInstance();
    }

    @Deprecated
    public MyCharTokenizer(Version matchVersion, Reader input) {
        super(input);
        charUtils = CharacterUtils.getInstance(matchVersion);
    }

    public MyCharTokenizer(AttributeFactory factory, Reader input) {
        super(factory, input);
        charUtils = CharacterUtils.getInstance();
    }

    @Deprecated
    public MyCharTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
        super(factory, input);
        charUtils = CharacterUtils.getInstance(matchVersion);
    }

    private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
    private static final int MAX_WORD_LEN = 255;
    private static final int IO_BUFFER_SIZE = 4096;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    private final CharacterUtils charUtils;
    private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);

    protected abstract boolean isTokenChar(int c);

    protected int normalize(int c) {
        return c;
    }

    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();
        int length = 0;
        int start = -1; // this variable is always initialized
        int end = -1;
        char[] buffer = termAtt.buffer();
        while (true) {
            // on the first pass, bufferIndex == dataLen == 0
            if (bufferIndex >= dataLen) {
                offset += dataLen; // 0 on the first pass
                // copy the next chunk of input into ioBuffer
                charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
                if (ioBuffer.getLength() == 0) {
                    dataLen = 0; // so next offset += dataLen won't decrement offset
                    if (length > 0) {
                        break;
                    } else {
                        finalOffset = correctOffset(offset);
                        // return false: tokenization is finished
                        return false;
                    }
                }
                // record the new chunk length and restart reading from position 0
                dataLen = ioBuffer.getLength();
                bufferIndex = 0;
            }
            // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
            // read the code point at bufferIndex and advance past it
            final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
            final int charCount = Character.charCount(c);
            bufferIndex += charCount;

            if (isTokenChar(c)) { // if it's a token char
                if (length == 0) { // start of token
                    assert start == -1;
                    start = offset + bufferIndex - charCount;
                    end = start;
                } else if (length >= buffer.length - 1) { // check if a supplementary could run out of bounds
                    buffer = termAtt.resizeBuffer(2 + length); // make sure a supplementary fits in the buffer
                }
                end += charCount;
                length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
                if (length >= MAX_WORD_LEN) { // buffer overflow! make sure to check for >= surrogate pair could break == test
                    break;
                }
            } else if (length > 0) { // at non-token char with buffered chars
                // modification vs. Lucene's CharTokenizer: instead of dropping the
                // delimiting char, append it to the current term, then emit the token
                if (length >= buffer.length - 1) {
                    buffer = termAtt.resizeBuffer(2 + length);
                }
                end += charCount;
                length += Character.toChars(normalize(c), buffer, length);
                break; // return 'em
            }
        }
        termAtt.setLength(length);
        assert start != -1;
        offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
        return true;
    }

    @Override
    public final void end() throws IOException {
        super.end();
        // set final offset
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        bufferIndex = 0;
        offset = 0;
        dataLen = 0;
        finalOffset = 0;
        ioBuffer.reset(); // make sure to reset the IO buffer!!
    }
}
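The only substantive change from Lucene's stock CharTokenizer is the else-if branch of incrementToken(): the stock implementation treats a non-token char purely as a delimiter and discards it, whereas this copy appends that char to the current term before emitting the token. Combined with MyNumberTokenizer.get() returning false on every second character, this is what turns 1234567 into 12 34 56 7 rather than 1 3 5 7.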
TokenizerFactory:
package org.apache.lucene.analysis.core;

import java.io.Reader;
import java.util.Map;

import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

public class MyNumberTokenizerFactory extends TokenizerFactory {

    public MyNumberTokenizerFactory(Map<String, String> args) {
        super(args);
        if (!args.isEmpty()) {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }
    }

    @Override
    public MyNumberTokenizer create(AttributeFactory factory, Reader input) {
        if (luceneMatchVersion == null) {
            return new MyNumberTokenizer(factory, input);
        }
        return new MyNumberTokenizer(luceneMatchVersion, factory, input);
    }
}
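A minimal usage sketch for the factory, assuming the Lucene 4.x-style TokenizerFactory API with the create(AttributeFactory, Reader) signature used above; the FactoryDemo class is hypothetical, and in practice the factory would more typically be referenced by class name from a Solr schema:

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.MyNumberTokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

public class FactoryDemo {
    public static void main(String[] args) throws Exception {
        Map<String, String> params = new HashMap<>(); // the factory accepts no parameters
        MyNumberTokenizerFactory factory = new MyNumberTokenizerFactory(params);
        Tokenizer tokenizer = factory.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                new StringReader("1234567"));
        // tokenizer can now be reset() and consumed like any other TokenStream
    }
}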
Original article: http://my.oschina.net/sniperLi/blog/489528