Skip to content

Commit

Permalink
过滤掉空白符,提高索引效率
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Aug 27, 2018
1 parent 732f8b0 commit 7a9d54a
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 5 deletions.
12 changes: 8 additions & 4 deletions src/main/java/com/hankcs/lucene/HanLPTokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
+ import com.hankcs.hanlp.utility.TextUtility;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
Expand Down Expand Up @@ -40,9 +41,8 @@ public class HanLPTokenizer extends Tokenizer
private int totalOffset = 0;

/**
- *
- * @param segment HanLP中的某个分词器
- * @param filter 停用词
+ * @param segment HanLP中的某个分词器
+ * @param filter 停用词
* @param enablePorterStemming 英文原型转换
*/
public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming)
Expand Down Expand Up @@ -74,6 +74,10 @@ final public boolean incrementToken() throws IOException
{
break;
}
+ if (TextUtility.isBlank(term.word)) // 过滤掉空白符,提高索引效率
+ {
+ continue;
+ }
if (enablePorterStemming && term.nature == Nature.nx)
{
term.word = stemmer.stem(term.word);
Expand All @@ -96,7 +100,7 @@ final public boolean incrementToken() throws IOException
positionAttr.setPositionIncrement(position);
termAtt.setEmpty().append(term.word);
offsetAtt.setOffset(correctOffset(totalOffset + term.offset),
- correctOffset(totalOffset + term.offset + term.word.length()));
+ correctOffset(totalOffset + term.offset + term.word.length()));
typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
return true;
}
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/com/hankcs/lucene/HighLighterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ public void testHightlight() throws Exception
// 加入一个文档
Document doc = new Document();
doc.add(new TextField(fieldName, "\n返回值\r\n返回", Field.Store.YES));
- doc.add(new TextField("title", "测试回测换行符", Field.Store.YES));
+ doc.add(new TextField("title", "测试回车换行符", Field.Store.YES));
iwriter.addDocument(doc);
}
{
Expand Down

0 comments on commit 7a9d54a

Please sign in to comment.