Skip to content

Commit

Permalink
fix: lj and nj transliteration
Browse files Browse the repository at this point in the history
  • Loading branch information
noomorph committed Dec 16, 2023
1 parent f0d9ffe commit 9a73f0e
Show file tree
Hide file tree
Showing 14 changed files with 345 additions and 28 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"types": "dist/index.d.ts",
"files": [
"dist",
"src",
"src/**/*.ts",
"!**/*.json",
"!**/*.test.*"
],
Expand Down
1 change: 1 addition & 0 deletions scripts/dictionary.txt

Large diffs are not rendered by default.

140 changes: 140 additions & 0 deletions scripts/generate-rule-exceptions.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/usr/bin/env node

import fs from "node:fs";
import utils from '../dist/index.js';
import _ from 'lodash';

/**
 * Lazily yields every "word" found in a string, in order of appearance.
 *
 * A word is a maximal run of Unicode letters and combining marks; anything
 * else (digits, punctuation, whitespace, underscores...) acts as a separator.
 *
 * @param {string} str - text to scan
 * @returns {Generator<string>} the words found in `str`
 */
function* extractWords(str) {
  // \p{L} = letters, \p{M} = combining marks (so decomposed diacritics stay attached)
  const wordPattern = /([\p{L}\p{M}]+)/gu;

  for (let hit = wordPattern.exec(str); hit !== null; hit = wordPattern.exec(str)) {
    yield hit[1];
  }
}

/**
 * Yields every word contained in a file.
 *
 * The file is read synchronously as UTF-8 (on first iteration of the
 * generator) and its whole content is handed to `extractWords`.
 *
 * @param {string} filePath - path of the file to read
 * @returns {Generator<string>} the words found in the file
 */
function* extractWordsFromFile(filePath) {
  yield* extractWords(fs.readFileSync(filePath, 'utf8'));
}

/**
 * Yields the words of every input file used by this script, file after file,
 * in the order listed below. Duplicates are not removed here.
 *
 * @returns {Generator<string>} all words from all source files
 */
function* readAllWords() {
  const sourceFiles = [
    'src/adjective/testCases.json',
    'src/noun/__snapshots__/declensionNoun.test.ts.snap',
    'src/numeral/testCases.json',
    'src/pronoun/testCases.json',
    'src/verb/testCases.json',
    'scripts/dictionary.txt',
  ];

  for (const sourceFile of sourceFiles) {
    yield* extractWordsFromFile(sourceFile);
  }
}

console.log('Reading all words...');

// Pairs of [lowercased word, its 'art-Latn-x-interslv' transliteration].
// De-duplication happens on the original spelling, i.e. before lowercasing.
const allWords = _.uniq(Array.from(readAllWords())).map((word) => {
  const lower = word.toLowerCase();
  return [lower, utils.transliterate(lower, 'art-Latn-x-interslv')];
});

/**
 * Collects the transliterated form of every known word accepted by `predicate`.
 *
 * @param {(lower: string, standard: string) => boolean} predicate - called once
 *   per entry of `allWords` with the lowercased word and its transliteration
 * @returns {string[]} the distinct transliterations of the accepted words,
 *   sorted with the default `Array.prototype.sort` ordering
 */
function buildExceptionList(predicate) {
  const accepted = allWords
    .filter(([lower, standard]) => predicate(lower, standard))
    .map(([, standard]) => standard);

  return Array.from(new Set(accepted)).sort();
}

/**
 * Wraps a word in `%` markers on both sides, producing the token form that is
 * fed to `buildSuffixTrie`.
 *
 * @param {string} word
 * @returns {string} `word` with a leading and a trailing `%`
 */
function toTrieToken(word) {
  return `%${word}%`;
}

/**
 * Builds a suffix trie out of the given tokens and serializes it as JSON.
 *
 * Every token is inserted back-to-front, one UTF-16 code unit per level, so a
 * lookup can walk a word from its last character towards its first.
 *
 * Node encoding:
 * - `0` is a leaf: a token ends here and nothing longer continues through it;
 * - an object maps the next character to a child node, and carries a `$: 0`
 *   entry when a token ends at that node while longer tokens continue.
 *
 * The result does not depend on the order of `tokens` (apart from JSON key
 * order) and inserting the same token twice is a no-op.
 *
 * @param {string[]} tokens - tokens to index; empty strings are ignored
 * @returns {string} the trie as single-line JSON followed by a newline
 */
function buildSuffixTrie(tokens) {
  const trie = {};

  for (const token of tokens) {
    const letters = token.split('').reverse();
    let node = trie;

    letters.forEach((letter, index) => {
      const isLast = index === letters.length - 1;
      const child = node[letter];

      if (child == null) {
        // Unseen path: the final letter becomes a leaf, any other an inner node.
        node = node[letter] = isLast ? 0 : {};
      } else if (child === 0) {
        // A shorter token already ends here. Only expand the leaf when this
        // token actually continues; a duplicate token must leave it untouched.
        if (!isLast) {
          node = node[letter] = { $: 0 };
        }
      } else {
        // Existing inner node. If this token ends here, record the terminal
        // marker - otherwise a token that is a suffix of a previously inserted
        // longer token would be silently lost.
        if (isLast) {
          child.$ = 0;
        }
        node = child;
      }
    });
  }

  return JSON.stringify(trie) + '\n';
}

/**
 * Produces the serialized suffix trie of all known words accepted by `predicate`.
 *
 * @param {(lower: string, standard: string) => boolean} predicate - word filter,
 *   forwarded to `buildExceptionList`
 * @returns {string} JSON text as returned by `buildSuffixTrie`
 */
function generateRuleExceptions(predicate) {
  const exceptions = buildExceptionList(predicate);
  const tokens = exceptions.map(toTrieToken);
  return buildSuffixTrie(tokens);
}

/**
 * Tells whether a word contains the character sequence `ľj` anywhere.
 *
 * @param {string} word - word to inspect
 * @returns {boolean}
 */
function containsLjj(word) {
  const softLThenJ = 'ľj';
  return word.includes(softLThenJ);
}

/**
 * Tells whether a word contains the sequence `ńj`, excluding words whose
 * transliterated form ends with one of the endings recognized by
 * `endsWithNonTypicalNje` or with `nju`.
 *
 * @param {string} word - lowercased word to inspect
 * @param {string} standard - transliterated form of `word`
 * @returns {boolean}
 */
function containsNjj(word, standard) {
  if (!word.includes('ńj')) {
    return false;
  }
  if (endsWithNonTypicalNje(standard)) {
    return false;
  }
  return !standard.endsWith('nju');
}

/**
 * Tells whether a word ends with one of the endings
 * `nja`, `njah`, `njam`, `njami`, `nje` or `njem`.
 *
 * @param {string} word - word to inspect
 * @returns {boolean}
 */
function endsWithNonTypicalNje(word) {
  const endings = ['nja', 'njah', 'njam', 'njami', 'nje', 'njem'];
  return endings.some((ending) => word.endsWith(ending));
}

/**
 * Tells whether a word ends with `ńju` or `ńjų`.
 *
 * @param {string} word - word to inspect
 * @returns {boolean}
 */
function endsWithNonTypicalNjju(word) {
  const endings = ['ńju', 'ńjų'];
  return endings.some((ending) => word.endsWith(ending));
}

console.log('Generating fixtures...');

// Each fixture is a [target path, content producer] pair. Producers are called
// lazily so that every file is generated right before it is written, one
// after another, in the order listed here.
const fixtureWriters = [
  ['src/transliterate/lj-nj/list-ljj.json', () => generateRuleExceptions(containsLjj)],
  ['src/transliterate/lj-nj/list-njj.json', () => generateRuleExceptions(containsNjj)],
  ['src/transliterate/lj-nj/exceptions-nje.json', () => generateRuleExceptions(endsWithNonTypicalNje)],
  ['src/transliterate/lj-nj/exceptions-njju.json', () => generateRuleExceptions(endsWithNonTypicalNjju)],
  [
    'src/transliterate/lj-nj/endings-nje.json',
    // Fixed list of endings; note these tokens carry only a trailing '%'.
    () => buildSuffixTrie(['nja%', 'njah%', 'njam%', 'njami%', 'nje%', 'njem%']),
  ],
];

for (const [targetPath, produceContent] of fixtureWriters) {
  fs.writeFileSync(targetPath, produceContent());
}
19 changes: 6 additions & 13 deletions src/transliterate/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,20 +59,13 @@ describe('transliterate to', () => {
},
);

test.failing(
'double transliteration should work equally from Latin and Cyrillic scripts',
() => {
const latn2cyrl = transliterate(latin, 'art-Cyrl-x-interslv');
const cyrl2latn = transliterate(cyrillic, 'art-Latn-x-interslv');
test('double transliteration should work equally from Latin and Cyrillic scripts', () => {
const latn2cyrl = transliterate(latin, 'art-Cyrl-x-interslv');
const cyrl2latn = transliterate(cyrillic, 'art-Latn-x-interslv');

expect(transliterate(latn2cyrl, 'art-Latn-x-interslv')).toEqual(
cyrl2latn,
);
expect(transliterate(cyrl2latn, 'art-Cyrl-x-interslv')).toEqual(
latn2cyrl,
);
},
);
expect(transliterate(latn2cyrl, 'art-Latn-x-interslv')).toEqual(cyrl2latn);
expect(transliterate(cyrl2latn, 'art-Cyrl-x-interslv')).toEqual(latn2cyrl);
});

test('unknown code', () => {
expect(() => transliterate('', 'en' as any)).toThrowErrorMatchingSnapshot();
Expand Down
1 change: 1 addition & 0 deletions src/transliterate/lj-nj/endings-nje.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"%":{"a":{"j":{"n":0}},"h":{"a":{"j":{"n":0}}},"m":{"a":{"j":{"n":0}},"e":{"j":{"n":0}}},"i":{"m":{"a":{"j":{"n":0}}}},"e":{"j":{"n":0}}}}
1 change: 1 addition & 0 deletions src/transliterate/lj-nj/exceptions-nj.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/transliterate/lj-nj/exceptions-nje.json

Large diffs are not rendered by default.

Loading

0 comments on commit 9a73f0e

Please sign in to comment.