diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a6d5b5e --- /dev/null +++ b/.gitignore @@ -0,0 +1,63 @@ +# Created by .ignore support plugin (hsz.mobi) +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/dictionaries +.idea/** + + +# Sensitive or high-churn files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# CMake +cmake-build-debug/ +cmake-build-release/ + +# Mongo Explorer plugin: +.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +.idea/compiler.xml +.idea/copyright/ +.idea/libraries/ +.idea/markdown-navigator/ +.idea/misc.xml +.idea/modules.xml +.idea/preferred-vcs.xml +.idea/workspace.xml +elasticsearch-merge-token-filter.iml diff --git a/.ignore b/.ignore new file mode 100644 index 0000000..0f4a907 --- /dev/null +++ b/.ignore @@ -0,0 +1,65 @@ +# Created by .ignore support plugin (hsz.mobi) +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/dictionaries +.idea/** + + +# Sensitive or high-churn files: +.idea/**/dataSources/ 
+.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# CMake +cmake-build-debug/ +cmake-build-release/ + +# Mongo Explorer plugin: +.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +.idea/compiler.xml +.idea/copyright/ +.idea/libraries/ +.idea/markdown-navigator/ +.idea/misc.xml +.idea/modules.xml +.idea/preferred-vcs.xml +.idea/workspace.xml +.idea/encodings.xml +.idea/vcs.xml +elasticsearch-merge-token-filter.iml diff --git a/plugin-descriptor.properties b/plugin-descriptor.properties new file mode 100755 index 0000000..5b387bf --- /dev/null +++ b/plugin-descriptor.properties @@ -0,0 +1,6 @@ +description=Merge token filter +version=${elasticsearch.version} +name=elasticsearch-merge-token-filter +classname=org.elasticsearch.plugin.analysis.starstory.AnalysisMergeFilterPlugin +java.version=1.8 +elasticsearch.version=${elasticsearch.version} diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..f7d931e --- /dev/null +++ b/pom.xml @@ -0,0 +1,138 @@ + + + 4.0.0 + + org.elasticsearch + elasticsearch-merge-token-filter + 6.1.0 + + jar + + + 7.2.1 + 6.1.1 + UTF-8 + + + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + + org.elasticsearch + elasticsearch + ${elasticsearch.version} + compile + + + + org.elasticsearch.test + framework + 6.1.1 + test + + + + org.slf4j + slf4j-api + 1.7.5 + + + + org.slf4j + slf4j-simple + 1.6.4 + + + + org.apache.lucene + lucene-test-framework + ${lucene.version} + 
test + + + junit + junit + 4.12 + test + + + + org.apache.logging.log4j + log4j-core + 2.8.2 + test + + + + + org.apache.logging.log4j + log4j-api + 2.8.2 + test + + + + org.apache.logging.log4j + log4j-core + 2.8.2 + + + + + + + + + ${project.basedir}/src/test/resources + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.2 + + 1.8 + 1.8 + + + + maven-assembly-plugin + 2.3 + + ${project.build.directory}/releases/ + + ${basedir}/src/main/assemblies/plugin.xml + + + + + package + + single + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.14 + + + org.apache.maven.plugins + maven-install-plugin + 2.4 + + + + + \ No newline at end of file diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml new file mode 100644 index 0000000..c038441 --- /dev/null +++ b/src/main/assemblies/plugin.xml @@ -0,0 +1,34 @@ + + + plugin + + zip + + false + + + elasticsearch/ + true + true + + org.elasticsearch:elasticsearch + + + + elasticsearch/ + true + true + + org.slf4j:slf4j-api + org.slf4j:slf4j-simple + + + + + + plugin-descriptor.properties + elasticsearch/ + true + + + \ No newline at end of file diff --git a/src/main/java/elasticsearch/merge/MergeTokenFilter.java b/src/main/java/elasticsearch/merge/MergeTokenFilter.java new file mode 100644 index 0000000..c15f918 --- /dev/null +++ b/src/main/java/elasticsearch/merge/MergeTokenFilter.java @@ -0,0 +1,64 @@ +package elasticsearch.merge; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.AttributeSource; + +import java.io.IOException; + +/** + * Created by nobaksan on 2018. 4. 9.. 
+ */
+/**
+ * Token filter that concatenates the terms of consecutive input tokens into a
+ * single output token, joining them with a separator.
+ *
+ * <p>A token whose position increment is greater than {@link #INCREMENT_GAP}
+ * closes the current group: the terms collected so far are emitted as one token
+ * and the closing token becomes the first term of the next group.
+ *
+ * <p>Only the term text and the position increment of the first token of each
+ * group are carried over to the merged token; every other attribute (offsets,
+ * type, ...) is left at its cleared default.
+ */
+public class MergeTokenFilter extends TokenFilter {
+    /** Separator used when the caller passes {@code null}. */
+    private static final String DEFAULT_SEPARATOR = "_";
+    /** Largest position increment that still belongs to the current group. */
+    private static final int INCREMENT_GAP = 100;
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+    private final String tokenSeparator;
+    private final StringBuilder builder = new StringBuilder();
+
+    /** Captured state of the token that closed the previous group, if any. */
+    private AttributeSource.State previousState = null;
+    /** True while {@link #previousState} still has to be replayed. */
+    private boolean recheckPrevious = false;
+    /** True once the wrapped stream has returned {@code false}. */
+    private boolean exhausted = false;
+
+    /**
+     * @param input          stream whose tokens are merged
+     * @param tokenSeparator text placed between merged terms; {@code null} selects {@code "_"}
+     */
+    public MergeTokenFilter(TokenStream input, String tokenSeparator)
+    {
+        super(input);
+        this.tokenSeparator = (tokenSeparator != null ? tokenSeparator : DEFAULT_SEPARATOR);
+    }
+
+    /**
+     * Emits the next merged token.
+     *
+     * @return {@code true} if a merged token was produced, {@code false} once
+     *         the input is exhausted and nothing is left to emit
+     * @throws IOException if the wrapped stream fails
+     */
+    @Override
+    public boolean incrementToken() throws IOException
+    {
+        this.builder.setLength(0);
+        int groupIncrement = 1;
+
+        if (this.recheckPrevious)
+        {
+            // The token that closed the previous group opens this one.
+            restoreState(this.previousState);
+            this.builder.append(this.termAtt.buffer(), 0, this.termAtt.length());
+            groupIncrement = this.posIncrAtt.getPositionIncrement();
+            this.previousState = null;
+            this.recheckPrevious = false;
+        }
+
+        // Never pull from the input again after it has reported end of stream.
+        while (!this.exhausted)
+        {
+            if (!this.input.incrementToken())
+            {
+                this.exhausted = true;
+                break;
+            }
+            final int increment = this.posIncrAtt.getPositionIncrement();
+            if (this.builder.length() == 0)
+            {
+                // First term of the group, whatever its gap: nothing to close yet.
+                groupIncrement = increment;
+                this.builder.append(this.termAtt.buffer(), 0, this.termAtt.length());
+            }
+            else if (increment <= INCREMENT_GAP)
+            {
+                this.builder.append(this.tokenSeparator);
+                this.builder.append(this.termAtt.buffer(), 0, this.termAtt.length());
+            }
+            else
+            {
+                // Gap exceeded: remember this token for the next call and stop,
+                // so the group collected so far is emitted on its own.
+                this.previousState = captureState();
+                this.recheckPrevious = true;
+                break;
+            }
+        }
+
+        if (this.builder.length() == 0)
+        {
+            return false;
+        }
+
+        // Start from a clean attribute state so the merged token does not
+        // inherit attributes of the token that happened to be read last.
+        clearAttributes();
+        this.termAtt.append(this.builder);
+        this.posIncrAtt.setPositionIncrement(groupIncrement);
+        return true;
+    }
+
+    /**
+     * Clears the per-stream state so the filter can be reused for a new input.
+     *
+     * @throws IOException if resetting the wrapped stream fails
+     */
+    @Override
+    public void reset() throws IOException
+    {
+        super.reset();
+        this.builder.setLength(0);
+        this.previousState = null;
+        this.recheckPrevious = false;
+        this.exhausted = false;
+    }
+}
diff --git a/src/main/java/elasticsearch/merge/MergeTokenFilterFactory.java b/src/main/java/elasticsearch/merge/MergeTokenFilterFactory.java new file mode 100644 index 0000000..a02d694 --- /dev/null +++ b/src/main/java/elasticsearch/merge/MergeTokenFilterFactory.java @@ -0,0 +1,29 @@ +package elasticsearch.merge; + +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.settings.Settings; +import 
org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+
+/**
+ * Factory that wraps a token stream in a {@link MergeTokenFilter}.
+ *
+ * <p>The separator placed between merged terms is read from the filter setting
+ * {@code merge_by}; when the setting is missing or empty, {@code "_"} is used.
+ *
+ * Created by nobaksan on 2018. 4. 9..
+ */
+public class MergeTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    /** Separator used when {@code merge_by} is not configured. */
+    private static final String DEFAULT_MERGE_BY = "_";
+
+    private final String mergeBy;
+
+    /**
+     * @param indexSettings settings of the index the filter belongs to
+     * @param env           node environment (not used by this factory)
+     * @param name          name of the filter instance
+     * @param settings      filter settings; {@code merge_by} selects the separator
+     */
+    public MergeTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        // An absent or empty value keeps the default separator.
+        final String configured = settings.get("merge_by");
+        this.mergeBy = (configured == null || configured.isEmpty()) ? DEFAULT_MERGE_BY : configured;
+    }
+
+    /**
+     * @param tokenStream stream whose tokens are merged
+     * @return a {@link MergeTokenFilter} over {@code tokenStream} using the configured separator
+     */
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new MergeTokenFilter(tokenStream, mergeBy);
+    }
+}
diff --git a/src/main/java/org/elasticsearch/plugin/analysis/starstory/AnalysisMergeFilterPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/starstory/AnalysisMergeFilterPlugin.java new file mode 100644 index 0000000..7bc4e29 --- /dev/null +++ b/src/main/java/org/elasticsearch/plugin/analysis/starstory/AnalysisMergeFilterPlugin.java @@ -0,0 +1,25 @@ +package org.elasticsearch.plugin.analysis.starstory; + +import elasticsearch.merge.MergeTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.plugins.AnalysisPlugin; +import org.elasticsearch.plugins.Plugin; + +import java.util.HashMap; +import java.util.Map; + + +/** + * Created by nobaksan on 2018. 4. 9.. 
+ */
+/**
+ * Elasticsearch analysis plugin that registers the merge token filter under
+ * the name {@code mergeFilter}.
+ */
+public class AnalysisMergeFilterPlugin extends Plugin implements AnalysisPlugin {
+    /**
+     * @return a map with the single entry {@code "mergeFilter"}, whose provider
+     *         creates {@link MergeTokenFilterFactory} instances
+     */
+    @Override
+    public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+        Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
+        extra.put("mergeFilter", MergeTokenFilterFactory::new);
+
+        return extra;
+    }
+}
+
diff --git a/src/main/resources/es-plugin.properties b/src/main/resources/es-plugin.properties new file mode 100644 index 0000000..fec1331 --- /dev/null +++ b/src/main/resources/es-plugin.properties @@ -0,0 +1 @@ +plugin=org.elasticsearch.plugin.analysis.starstory.AnalysisMergeFilterPlugin \ No newline at end of file diff --git a/src/test/java/org/elasticsearch/index/analysis/MergeFilterAnalyzerTest.java b/src/test/java/org/elasticsearch/index/analysis/MergeFilterAnalyzerTest.java new file mode 100644 index 0000000..d44417c --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/MergeFilterAnalyzerTest.java @@ -0,0 +1,37 @@ +package org.elasticsearch.index.analysis; + + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.plugin.analysis.starstory.AnalysisMergeFilterPlugin; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; +import org.junit.Test; + +import java.io.IOException; +import java.io.StringReader; + +import static org.elasticsearch.index.analysis.AnalysisTestsHelper.createTestAnalysisFromSettings; + +/** + * Created by nobaksan on 2018. 4. 9.. 
+ */
+/**
+ * Checks that the filter registered as {@code mergeFilter} joins all tokens of
+ * a standard-tokenized input into one underscore-separated token.
+ */
+public class MergeFilterAnalyzerTest extends ESTokenStreamTestCase {
+
+    /**
+     * Builds a test analysis registry with the plugin installed, looks up the
+     * filter factory by name and asserts on the tokens it produces.
+     */
+    @Test
+    public void testMergeTokenFilter() throws IOException {
+        final String homeKey = Environment.PATH_HOME_SETTING.getKey();
+        final Settings nodeSettings = Settings.builder()
+                .put(homeKey, createTempDir().toString())
+                .build();
+
+        final ESTestCase.TestAnalysis testAnalysis =
+                createTestAnalysisFromSettings(nodeSettings, new AnalysisMergeFilterPlugin());
+        final TokenFilterFactory mergeFactory = testAnalysis.tokenFilter.get("mergeFilter");
+
+        // Four whitespace-separated words in, one merged token out.
+        final Tokenizer standard = new StandardTokenizer();
+        standard.setReader(new StringReader("아이폰 삼성 전자 아이폰"));
+
+        assertTokenStreamContents(mergeFactory.create(standard), new String[]{"아이폰_삼성_전자_아이폰" });
+    }
+}