-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
19 changed files
with
1,819 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
/* | ||
* The contents of this file are subject to the Mozilla Public License Version | ||
* 2.0 (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* https://www.mozilla.org/en-US/MPL/2.0/ | ||
* | ||
* Software distributed under the License is distributed on an "AS IS" basis, | ||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | ||
* for the specific language governing rights and limitations under the | ||
* License. | ||
* | ||
* The Original Code is Libvoikko: Library of natural language processing tools. | ||
* The Initial Developer of the Original Code is Harri Pitkänen <[email protected]>. | ||
* Portions created by the Initial Developer are Copyright (C) 2012 | ||
* the Initial Developer. All Rights Reserved. | ||
* | ||
* Raudikko, the Java port of the Initial Code is Copyright (C) 2020 by | ||
* Evident Solutions Oy. All Rights Reserved. | ||
* | ||
* Alternatively, the contents of this file may be used under the terms of | ||
* either the GNU General Public License Version 2 or later (the "GPL"), or | ||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), | ||
* in which case the provisions of the GPL or the LGPL are applicable instead | ||
* of those above. If you wish to allow use of your version of this file only | ||
* under the terms of either the GPL or the LGPL, and not to allow others to | ||
* use your version of this file under the terms of the MPL, indicate your | ||
* decision by deleting the provisions above and replace them with the notice | ||
* and other provisions required by the GPL or the LGPL. If you do not delete | ||
* the provisions above, a recipient may use your version of this file under | ||
* the terms of any one of the MPL, the GPL or the LGPL. | ||
*/ | ||
|
||
package fi.evident.raudikko; | ||
|
||
import org.jetbrains.annotations.NotNull; | ||
|
||
import java.util.List; | ||
|
||
/** | ||
* Provides suggestions for misspelled words. | ||
*/ | ||
public interface SpellingSuggester { | ||
|
||
/** | ||
* Given a word, provides a list of spelling suggestions for it. | ||
*/ | ||
@NotNull List<String> provideSpellingSuggestions(@NotNull String word); | ||
} |
194 changes: 194 additions & 0 deletions
194
src/main/java/fi/evident/raudikko/internal/suggestions/DefaultSpellingSuggester.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
/* | ||
* The contents of this file are subject to the Mozilla Public License Version | ||
* 2.0 (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* https://www.mozilla.org/en-US/MPL/2.0/ | ||
* | ||
* Software distributed under the License is distributed on an "AS IS" basis, | ||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | ||
* for the specific language governing rights and limitations under the | ||
* License. | ||
* | ||
* The Original Code is Libvoikko: Library of natural language processing tools. | ||
* The Initial Developer of the Original Code is Harri Pitkänen <[email protected]>. | ||
* Portions created by the Initial Developer are Copyright (C) 2012 | ||
* the Initial Developer. All Rights Reserved. | ||
* | ||
* Raudikko, the Java port of the Initial Code is Copyright (C) 2020 by | ||
* Evident Solutions Oy. All Rights Reserved. | ||
* | ||
* Alternatively, the contents of this file may be used under the terms of | ||
* either the GNU General Public License Version 2 or later (the "GPL"), or | ||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), | ||
* in which case the provisions of the GPL or the LGPL are applicable instead | ||
* of those above. If you wish to allow use of your version of this file only | ||
* under the terms of either the GPL or the LGPL, and not to allow others to | ||
* use your version of this file under the terms of the MPL, indicate your | ||
* decision by deleting the provisions above and replace them with the notice | ||
* and other provisions required by the GPL or the LGPL. If you do not delete | ||
* the provisions above, a recipient may use your version of this file under | ||
* the terms of any one of the MPL, the GPL or the LGPL. | ||
*/ | ||
|
||
package fi.evident.raudikko.internal.suggestions; | ||
|
||
import fi.evident.raudikko.Analyzer; | ||
import fi.evident.raudikko.AnalyzerConfiguration; | ||
import fi.evident.raudikko.Morphology; | ||
import fi.evident.raudikko.SpellingSuggester; | ||
import fi.evident.raudikko.internal.suggestions.Suggestion.SimpleSuggestion; | ||
import fi.evident.raudikko.internal.suggestions.Suggestion.SplitSuggestion; | ||
import fi.evident.raudikko.internal.utils.StringUtils; | ||
import org.jetbrains.annotations.NotNull; | ||
import org.jetbrains.annotations.Nullable; | ||
|
||
import java.util.List; | ||
import java.util.concurrent.atomic.AtomicInteger; | ||
import java.util.function.BiFunction; | ||
import java.util.function.Function; | ||
import java.util.function.UnaryOperator; | ||
import java.util.stream.Stream; | ||
|
||
import static fi.evident.raudikko.internal.suggestions.Replacements.*; | ||
import static fi.evident.raudikko.internal.utils.StringUtils.isAllUpper; | ||
import static java.lang.Character.isUpperCase; | ||
import static java.util.Collections.emptyList; | ||
import static java.util.Comparator.comparing; | ||
|
||
public final class DefaultSpellingSuggester implements SpellingSuggester { | ||
|
||
private final @NotNull SpellChecker spellChecker; | ||
|
||
/** | ||
* How many suggestions are returned to user | ||
*/ | ||
private static final int MAX_SUGGESTIONS_RETURNED = 5; | ||
|
||
/** | ||
* How many variations are generated for words? | ||
*/ | ||
private static final int MAX_VARIATIONS = 800; | ||
|
||
/** | ||
* Generate more suggestions than required so that sorting gets to pick the best ones | ||
*/ | ||
private static final int MAX_SUGGESTIONS_GENERATED = 3 * MAX_SUGGESTIONS_RETURNED; | ||
|
||
private static final int MAX_WORD_SIZE = 255; | ||
private static final @NotNull String COMMON_LETTERS = "aitesn"; | ||
private static final @NotNull String UNCOMMON_LETTERS = "ulkoämrvpyhjdögfbcw:xzqå'."; | ||
|
||
private static final @NotNull List<Function<? super String, ? extends Stream<? extends Suggestion>>> primaryGenerators = List.of( | ||
simple(Stream::of), | ||
simple(SuggestionGenerators::removeSoftHyphens) | ||
); | ||
|
||
private static final @NotNull List<Function<? super String, ? extends Stream<? extends Suggestion>>> secondaryGenerators = List.of( | ||
simple(SuggestionGenerators::vowelChange), | ||
simple(SuggestionGenerators::replace, REPLACEMENTS_1_FULL), | ||
simple(SuggestionGenerators::delete), | ||
simple(SuggestionGenerators::insertHyphen), | ||
simple(SuggestionGenerators::duplicateCharacters), | ||
SuggestionGenerators::splitWord, | ||
simple(SuggestionGenerators::replaceTwo, REPLACEMENTS_1), | ||
simple(SuggestionGenerators::replace, REPLACEMENTS_2_FULL), | ||
simple(SuggestionGenerators::insertion, COMMON_LETTERS), | ||
simple(SuggestionGenerators::swap), | ||
simple(SuggestionGenerators::replace, REPLACEMENTS_3_FULL), | ||
simple(SuggestionGenerators::insertion, UNCOMMON_LETTERS), | ||
simple(SuggestionGenerators::replace, REPLACEMENTS_4_FULL), | ||
simple(SuggestionGenerators::replaceTwo, REPLACEMENTS_2), | ||
simple(SuggestionGenerators::replaceTwo, REPLACEMENTS_3), | ||
simple(SuggestionGenerators::replaceTwo, REPLACEMENTS_4), | ||
simple(SuggestionGenerators::deleteTwo), | ||
simple(SuggestionGenerators::replace, REPLACEMENTS_5_FULL) | ||
); | ||
|
||
public DefaultSpellingSuggester(@NotNull Morphology morphology) { | ||
this.spellChecker = new SpellChecker(newAnalyzer(morphology)); | ||
} | ||
|
||
@Override | ||
public @NotNull List<String> provideSpellingSuggestions(@NotNull String word) { | ||
if (word.length() <= 1 || word.length() > MAX_WORD_SIZE) | ||
return emptyList(); | ||
|
||
var capitalizer = capitalizer(word); | ||
|
||
var results1 = generateSuggestions(word, primaryGenerators); | ||
var results2 = generateSuggestions(word, secondaryGenerators); | ||
|
||
return Stream.concat(results1, results2) | ||
.map(capitalizer.compose(WordWithPriority::word)) | ||
.distinct() | ||
.limit(MAX_SUGGESTIONS_RETURNED) | ||
.toList(); | ||
} | ||
|
||
private @NotNull Stream<WordWithPriority> generateSuggestions( | ||
@NotNull String word, | ||
@NotNull List<Function<? super String, ? extends Stream<? extends Suggestion>>> generators | ||
) { | ||
AtomicInteger count = new AtomicInteger(0); // atomicity not really needed, just box for counter | ||
|
||
return generators.stream() | ||
.flatMap(g -> g.apply(word)) | ||
.distinct() | ||
.limit(MAX_VARIATIONS) | ||
.flatMap(s -> Stream.ofNullable(processSuggestion(s, spellChecker))) | ||
.limit(MAX_SUGGESTIONS_GENERATED) | ||
.map(s -> new WordWithPriority(s.word(), s.priority() * (count.getAndIncrement() + 5))) | ||
.sorted(comparing(WordWithPriority::priority)); | ||
} | ||
|
||
private static @Nullable WordWithPriority processSuggestion(@NotNull Suggestion suggestion, @NotNull SpellChecker spellChecker) { | ||
if (suggestion instanceof SimpleSuggestion s) { | ||
return spellChecker.spellCheck(s.word()); | ||
|
||
} else if (suggestion instanceof SplitSuggestion s) { | ||
var s1 = spellChecker.spellCheck(s.word1()); | ||
if (s1 == null) | ||
return null; | ||
|
||
var s2 = spellChecker.spellCheck(s.word2()); | ||
if (s2 == null) | ||
return null; | ||
|
||
return new WordWithPriority(s1.word() + " " + s2.word(), (s1.priority() + s2.priority()) * s.priorityMultiplier()); | ||
} else { | ||
throw new IllegalStateException("unexpected suggestion: " + suggestion); | ||
} | ||
} | ||
|
||
private static @NotNull UnaryOperator<String> capitalizer(@NotNull String word) { | ||
if (isAllUpper(word)) | ||
return String::toUpperCase; | ||
else if (isUpperCase(word.charAt(0))) | ||
return StringUtils::capitalizeIfLower; | ||
else | ||
return UnaryOperator.identity(); | ||
} | ||
|
||
private static @NotNull Analyzer newAnalyzer(@NotNull Morphology morphology) { | ||
var config = new AnalyzerConfiguration(); | ||
|
||
config.setIncludeWord(true); | ||
config.setIncludeStructure(true); | ||
config.setIncludeBasicAttributes(true); | ||
config.setIncludeOrganizationNameAnalysis(true); | ||
|
||
config.setIncludeBaseForm(false); | ||
config.setIncludeBaseFormParts(false); | ||
config.setIncludeFstOutput(false); | ||
|
||
return morphology.newAnalyzer(config); | ||
} | ||
|
||
private static @NotNull Function<String, Stream<SimpleSuggestion>> simple(@NotNull Function<String, Stream<String>> f) { | ||
return w -> f.apply(w).map(SimpleSuggestion::new); | ||
} | ||
|
||
private static @NotNull <T> Function<String, Stream<SimpleSuggestion>> simple(@NotNull BiFunction<String, T, Stream<String>> f, T param) { | ||
return w -> f.apply(w, param).map(SimpleSuggestion::new); | ||
} | ||
} |
97 changes: 97 additions & 0 deletions
97
src/main/java/fi/evident/raudikko/internal/suggestions/Replacements.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
/* | ||
* The contents of this file are subject to the Mozilla Public License Version | ||
* 2.0 (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* https://www.mozilla.org/en-US/MPL/2.0/ | ||
* | ||
* Software distributed under the License is distributed on an "AS IS" basis, | ||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | ||
* for the specific language governing rights and limitations under the | ||
* License. | ||
* | ||
* The Original Code is Libvoikko: Library of natural language processing tools. | ||
* The Initial Developer of the Original Code is Harri Pitkänen <[email protected]>. | ||
* Portions created by the Initial Developer are Copyright (C) 2012 | ||
* the Initial Developer. All Rights Reserved. | ||
* | ||
* Raudikko, the Java port of the Initial Code is Copyright (C) 2020 by | ||
* Evident Solutions Oy. All Rights Reserved. | ||
* | ||
* Alternatively, the contents of this file may be used under the terms of | ||
* either the GNU General Public License Version 2 or later (the "GPL"), or | ||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), | ||
* in which case the provisions of the GPL or the LGPL are applicable instead | ||
* of those above. If you wish to allow use of your version of this file only | ||
* under the terms of either the GPL or the LGPL, and not to allow others to | ||
* use your version of this file under the terms of the MPL, indicate your | ||
* decision by deleting the provisions above and replace them with the notice | ||
* and other provisions required by the GPL or the LGPL. If you do not delete | ||
* the provisions above, a recipient may use your version of this file under | ||
* the terms of any one of the MPL, the GPL or the LGPL. | ||
*/ | ||
|
||
package fi.evident.raudikko.internal.suggestions; | ||
|
||
import fi.evident.raudikko.internal.utils.CharMap; | ||
import fi.evident.raudikko.internal.utils.CollectionUtils; | ||
import org.jetbrains.annotations.NotNull; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
|
||
import static java.lang.Character.isLowerCase; | ||
import static java.lang.Character.toUpperCase; | ||
|
||
final class Replacements { | ||
|
||
static final @NotNull Replacements REPLACEMENTS_1 = parse(".,asiuiotrtdersšsanmuilkklkgoiäömnrertvbpbpoythjjhjkdtdsdföägfghgkfgfdbpbncvcswewvxczžzxqaåoåpåäåöaeiktyea"); | ||
static final @NotNull Replacements REPLACEMENTS_2 = parse("1q2q2w3w3e4e4r5r5t6t6y7y7u8u8i9i9o0o0p+pie"); | ||
static final @NotNull Replacements REPLACEMENTS_3 = parse("essdnhujlökjopäpmkrdvgplyhhujideölgtfvbvckwaxszaqkåaaåeéaâkcscijxz"); | ||
static final @NotNull Replacements REPLACEMENTS_4 = parse("qwqswqwswdedefrfrgtftgthygyjuhukilokolpöpäsesxdrbgfefrftfcgygbgvhyhnhbhgjujmjnkikokmlolpöpöåäåzsxdcdcfcxvfbhnjnbmjewpåaqswszdwdcdxvcawazsq"); | ||
static final @NotNull Replacements REPLACEMENTS_5 = parse("aooaoutlsraieääeuvvuoddokqpvvpqeeqaddarsetteryyrtuutyiiyuoippioåhvvhhmmh"); | ||
static final @NotNull Replacements REPLACEMENTS_1_FULL = REPLACEMENTS_1.extendWithMatchingUpperCaseReplacements(); | ||
static final @NotNull Replacements REPLACEMENTS_2_FULL = REPLACEMENTS_2.extendWithMatchingUpperCaseReplacements(); | ||
static final @NotNull Replacements REPLACEMENTS_3_FULL = REPLACEMENTS_3.extendWithMatchingUpperCaseReplacements(); | ||
static final @NotNull Replacements REPLACEMENTS_4_FULL = REPLACEMENTS_4.extendWithMatchingUpperCaseReplacements(); | ||
static final @NotNull Replacements REPLACEMENTS_5_FULL = REPLACEMENTS_5.extendWithMatchingUpperCaseReplacements(); | ||
|
||
private final @NotNull CharMap<char[]> replacementMapping; | ||
private static final char @NotNull[] EMPTY_MAPPING = new char[0]; | ||
|
||
private Replacements(@NotNull CharMap<char[]> replacementMapping) { | ||
this.replacementMapping = replacementMapping; | ||
} | ||
|
||
public char @NotNull [] forCharacter(char from) { | ||
return replacementMapping.getOrDefault(from, EMPTY_MAPPING); | ||
} | ||
|
||
private @NotNull Replacements extendWithMatchingUpperCaseReplacements() { | ||
var newMapping = replacementMapping.copy(); | ||
for (char ch : newMapping.keys()) { | ||
var values = newMapping.get(ch); | ||
if (values != null && isLowerCase(ch)) | ||
newMapping.put(toUpperCase(ch), CollectionUtils.toUpperCase(values)); | ||
} | ||
|
||
return new Replacements(newMapping); | ||
} | ||
|
||
private static @NotNull Replacements parse(@NotNull String input) { | ||
if (input.length() % 2 != 0) throw new IllegalArgumentException("invalid replacement string " + input); | ||
|
||
var mapping = new HashMap<Character, List<Character>>(); | ||
for (int i = 0; i < input.length(); i += 2) { | ||
char from = input.charAt(i); | ||
char to = input.charAt(i + 1); | ||
|
||
var targets = mapping.computeIfAbsent(from, k -> new ArrayList<>()); | ||
targets.add(to); | ||
} | ||
|
||
var result = new CharMap<char[]>(); | ||
mapping.forEach((key, value) -> result.put(key, CollectionUtils.toCharArray(value))); | ||
return new Replacements(result); | ||
} | ||
} |
Oops, something went wrong.