feat: [WiP] transliterate

medzuslovjansky · May 13, 2024 · 289242d · 289242d
1 parent 25b95cf
commit 289242d
Show file tree

Hide file tree

Showing 15 changed files with 345 additions and 106 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
diff --git a/package.json b/package.json
@@ -7,7 +7,7 @@
   "types": "dist/index.d.ts",
   "files": [
     "dist",
-    "src",
+    "src/**/*.ts",
     "!**/__*__",
     "!**/*.json",
     "!**/*.test.*"

diff --git a/scripts/dictionary.txt b/scripts/dictionary.txt
diff --git a/scripts/generate-rule-exceptions.mjs b/scripts/generate-rule-exceptions.mjs
@@ -0,0 +1,140 @@
+#!/usr/bin/env node
+
+import fs from "node:fs";
+import utils from '../dist/index.js';
+import _ from 'lodash';
+
+/**
+ * Lazily yields every word found in `str`, in order of appearance.
+ * A word is a maximal run of Unicode letters (\p{L}) and combining marks (\p{M}).
+ *
+ * @param {string} str text to scan
+ * @returns {Generator<string>} the matched words
+ */
+function* extractWords(str) {
+  // A fresh /g regex per call: exec() keeps its scan position in lastIndex.
+  const wordPattern = /([\p{L}\p{M}]+)/gu;
+
+  for (let hit = wordPattern.exec(str); hit !== null; hit = wordPattern.exec(str)) {
+    yield hit[1];
+  }
+}
+
+/**
+ * Reads `filePath` synchronously as UTF-8 and yields the words it contains.
+ *
+ * @param {string} filePath path of the file to read
+ * @returns {Generator<string>} words produced by extractWords for the file's text
+ */
+function* extractWordsFromFile(filePath) {
+  yield* extractWords(fs.readFileSync(filePath, 'utf8'));
+}
+
+/**
+ * Yields the words of every input file, one file after another, in the order listed.
+ *
+ * @returns {Generator<string>} words from all sources (duplicates are not removed here)
+ */
+function* readAllWords() {
+  const sources = [
+    'src/adjective/testCases.json',
+    'src/noun/__snapshots__/declensionNoun.test.ts.snap',
+    'src/numeral/testCases.json',
+    'src/pronoun/testCases.json',
+    'src/verb/testCases.json',
+    'scripts/dictionary.txt',
+  ];
+
+  for (const source of sources) {
+    yield* extractWordsFromFile(source);
+  }
+}
+
+console.log('Reading all words...');
+// Pairs of [lowercased word, its 'art-Latn-x-interslv' transliteration], one per
+// distinct word as read (deduplication happens before lowercasing).
+const allWords = [];
+for (const word of _.uniq([...readAllWords()])) {
+  const lower = word.toLowerCase();
+  allWords.push([lower, utils.transliterate(lower, 'art-Latn-x-interslv')]);
+}
+
+/**
+ * Collects the transliterated form of every word in `allWords` accepted by `predicate`.
+ *
+ * @param {(lower: string, standard: string) => boolean} predicate called with each pair
+ * @returns {string[]} unique transliterated forms, sorted with the default Array#sort order
+ */
+function buildExceptionList(predicate) {
+  const accepted = allWords
+    .filter(([lower, standard]) => predicate(lower, standard))
+    .map(([, standard]) => standard);
+
+  return [...new Set(accepted)].sort();
+}
+
+/**
+ * Wraps `word` in '%' markers on both sides.
+ *
+ * @param {string} word
+ * @returns {string} the word with a leading and a trailing '%'
+ */
+function toTrieToken(word) {
+  return `%${word}%`;
+}
+
+/**
+ * Builds a suffix trie from `tokens` and serializes it as one line of JSON.
+ *
+ * Each token is inserted back to front, so the outermost keys are the tokens'
+ * last characters. A node is `0` when a token ends there and nothing continues
+ * past it; otherwise it is an object keyed by the preceding character. An
+ * object node carries `$: 0` when a token also ends at that node.
+ *
+ * @param {string[]} tokens
+ * @returns {string} the JSON-encoded trie followed by a newline
+ */
+function buildSuffixTrie(tokens) {
+  const trie = {};
+
+  for (const token of tokens) {
+    const letters = token.split('').reverse();
+    const lastIndex = letters.length - 1;
+    let current = trie;
+
+    letters.forEach((letter, index) => {
+      const position = current[letter];
+      const endsHere = index === lastIndex;
+
+      if (position == null) {
+        // New branch: a leaf (0) if the token ends here, an empty node otherwise.
+        current = current[letter] = endsHere ? 0 : {};
+      } else if (position === 0) {
+        // An earlier token ended on this leaf: promote it to a node and keep
+        // its end marker as `$`.
+        current = current[letter] = { $: 0 };
+      } else {
+        // Existing inner node. If this token ends here, record it; otherwise a
+        // token that is a suffix of an earlier, longer token would be lost and
+        // the result would depend on insertion order.
+        if (endsHere) {
+          position.$ = 0;
+        }
+        current = position;
+      }
+    });
+  }
+
+  return JSON.stringify(trie) + '\n';
+}
+
+/**
+ * Builds the serialized suffix trie of all words accepted by `predicate`.
+ *
+ * @param {(lower: string, standard: string) => boolean} predicate word filter
+ * @returns {string} buildSuffixTrie output for the accepted words, each wrapped by toTrieToken
+ */
+function generateRuleExceptions(predicate) {
+  const words = buildExceptionList(predicate);
+  const tokens = words.map(toTrieToken);
+  return buildSuffixTrie(tokens);
+}
+
+/**
+ * Tells whether `word` contains the sequence 'ľj' anywhere.
+ *
+ * @param {string} word
+ * @returns {boolean}
+ */
+function containsLjj(word) {
+  const sequence = 'ľj';
+  return word.includes(sequence);
+}
+
+/**
+ * Tells whether `word` contains 'ńj' while its transliterated form `standard`
+ * neither has one of the endings checked by endsWithNonTypicalNje nor ends in 'nju'.
+ *
+ * @param {string} word lowercased source word
+ * @param {string} standard transliterated form of the same word
+ * @returns {boolean}
+ */
+function containsNjj(word, standard) {
+  if (!word.includes('ńj')) {
+    return false;
+  }
+  if (endsWithNonTypicalNje(standard)) {
+    return false;
+  }
+  return !standard.endsWith('nju');
+}
+
+/**
+ * Tells whether `word` ends with one of: nja, njah, njam, njami, nje, njem.
+ *
+ * @param {string} word
+ * @returns {boolean}
+ */
+function endsWithNonTypicalNje(word) {
+  const endings = ['nja', 'njah', 'njam', 'njami', 'nje', 'njem'];
+  return endings.some((ending) => word.endsWith(ending));
+}
+
+/**
+ * Tells whether `word` ends with 'ńju' or 'ńjų'.
+ *
+ * @param {string} word
+ * @returns {boolean}
+ */
+function endsWithNonTypicalNjju(word) {
+  const endings = ['ńju', 'ńjų'];
+  return endings.some((ending) => word.endsWith(ending));
+}
+
+console.log('Generating fixtures...');
+
+// Fixtures derived from the collected words: [output file, predicate], written in order.
+const ruleExceptionOutputs = [
+  ['src/transliterate/lj-nj/list-ljj.json', containsLjj],
+  ['src/transliterate/lj-nj/list-njj.json', containsNjj],
+  ['src/transliterate/lj-nj/exceptions-nje.json', endsWithNonTypicalNje],
+  ['src/transliterate/lj-nj/exceptions-njju.json', endsWithNonTypicalNjju],
+];
+
+for (const [outputPath, predicate] of ruleExceptionOutputs) {
+  fs.writeFileSync(outputPath, generateRuleExceptions(predicate));
+}
+
+// Fixed list of endings; these tokens carry only the trailing '%' marker.
+const njeEndings = ['nja%', 'njah%', 'njam%', 'njami%', 'nje%', 'njem%'];
+
+fs.writeFileSync(
+  'src/transliterate/lj-nj/endings-nje.json',
+  buildSuffixTrie(njeEndings),
+);
diff --git a/src/transliterate/index.test.ts b/src/transliterate/index.test.ts
@@ -55,16 +55,13 @@ describe('transliterate to', () => {
     },
   );
 
-  test.failing(
-    'double transliteration should work equally from Latin and Cyrillic scripts',
-    () => {
-      const latn2cyrl = transliterate(latin, 'isv-Cyrl');
-      const cyrl2latn = transliterate(cyrillic, 'isv-Latn');
+  test('double transliteration should work equally from Latin and Cyrillic scripts', () => {
+    const latn2cyrl = transliterate(latin, 'isv-Cyrl');
+    const cyrl2latn = transliterate(cyrillic, 'isv-Latn');
 
-      expect(transliterate(latn2cyrl, 'isv-Latn')).toEqual(cyrl2latn);
-      expect(transliterate(cyrl2latn, 'isv-Cyrl')).toEqual(latn2cyrl);
-    },
-  );
+    expect(transliterate(latn2cyrl, 'isv-Latn')).toEqual(cyrl2latn);
+    expect(transliterate(cyrl2latn, 'isv-Cyrl')).toEqual(latn2cyrl);
+  });
 
   test('unknown code', () => {
     expect(() => transliterate('', 'en' as any)).toThrowErrorMatchingSnapshot();

diff --git a/src/transliterate/lj-nj/endings-nje.json b/src/transliterate/lj-nj/endings-nje.json
@@ -0,0 +1 @@
+{"%":{"a":{"j":{"n":0}},"h":{"a":{"j":{"n":0}}},"m":{"a":{"j":{"n":0}},"e":{"j":{"n":0}}},"i":{"m":{"a":{"j":{"n":0}}}},"e":{"j":{"n":0}}}}
diff --git a/src/transliterate/lj-nj/exceptions-nj.json b/src/transliterate/lj-nj/exceptions-nj.json
diff --git a/src/transliterate/lj-nj/exceptions-nje.json b/src/transliterate/lj-nj/exceptions-nje.json
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"%":{"a":{"j":{"n":0}},"h":{"a":{"j":{"n":0}}},"m":{"a":{"j":{"n":0}},"e":{"j":{"n":0}}},"i":{"m":{"a":{"j":{"n":0}}}},"e":{"j":{"n":0}}}}