diff --git a/src/main/java/com/hankcs/lucene/HanLPTokenizer.java b/src/main/java/com/hankcs/lucene/HanLPTokenizer.java index cd66ae8..cb27ade 100644 --- a/src/main/java/com/hankcs/lucene/HanLPTokenizer.java +++ b/src/main/java/com/hankcs/lucene/HanLPTokenizer.java @@ -5,6 +5,7 @@ import com.hankcs.hanlp.corpus.tag.Nature; import com.hankcs.hanlp.seg.Segment; import com.hankcs.hanlp.seg.common.Term; +import com.hankcs.hanlp.utility.TextUtility; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -40,9 +41,8 @@ public class HanLPTokenizer extends Tokenizer private int totalOffset = 0; /** - * - * @param segment HanLP中的某个分词器 - * @param filter 停用词 + * @param segment HanLP中的某个分词器 + * @param filter 停用词 * @param enablePorterStemming 英文原型转换 */ public HanLPTokenizer(Segment segment, Set filter, boolean enablePorterStemming) @@ -74,6 +74,10 @@ final public boolean incrementToken() throws IOException { break; } + if (TextUtility.isBlank(term.word)) // 过滤掉空白符,提高索引效率 + { + continue; + } if (enablePorterStemming && term.nature == Nature.nx) { term.word = stemmer.stem(term.word); @@ -96,7 +100,7 @@ final public boolean incrementToken() throws IOException positionAttr.setPositionIncrement(position); termAtt.setEmpty().append(term.word); offsetAtt.setOffset(correctOffset(totalOffset + term.offset), - correctOffset(totalOffset + term.offset + term.word.length())); + correctOffset(totalOffset + term.offset + term.word.length())); typeAtt.setType(term.nature == null ? "null" : term.nature.toString()); return true; } diff --git a/src/test/java/com/hankcs/lucene/HighLighterTest.java b/src/test/java/com/hankcs/lucene/HighLighterTest.java index 976288c..d6af866 100644 --- a/src/test/java/com/hankcs/lucene/HighLighterTest.java +++ b/src/test/java/com/hankcs/lucene/HighLighterTest.java @@ -61,7 +61,7 @@ public void testHightlight() throws Exception // 加入一个文档 Document doc = new Document(); doc.add(new TextField(fieldName, "\n返回值\r\n返回", Field.Store.YES)); - doc.add(new TextField("title", "测试回测换行符", Field.Store.YES)); + doc.add(new TextField("title", "测试回车换行符", Field.Store.YES)); iwriter.addDocument(doc); } {