instructlab · booxter · Mar 10, 2024
diff --git a/README.md b/README.md
@@ -417,6 +417,21 @@ Below is an illustrative directory structure to show this layout:
 
 For an extensive example of this layout see, [taxonomy_tree_layout](https://github.com/instructlab/taxonomy/tree/main/docs/taxonomy_tree_layout.md) in the documentation folder.
 
+## Dynamic `qna.yaml` generation
+
+For some skills, it may make more sense to generate the `qna.yaml` file
+programmatically. In this case, instead of manually writing a `qna.yaml` file,
+you can provide a `Containerfile` that, once built, can be run to produce the
+intended `qna.yaml` file.
+
+The interface of a dynamic `qna.yaml` generator is as follows:
+
+- The taxonomy file is a `Containerfile` (or `Dockerfile`).
+- The container should define a `CMD` that, once executed, writes the intended
+  `qna.yaml` file to `/out/qna.yaml`.
+- The CLI will then run the generator and use the resulting `qna.yaml` file
+  as a seed input, as usual.
+
 ## Contribute knowledge and skills to the taxonomy!
 
 The ability to contribute to a Large Language Model (LLM) has been difficult in no small part because it is difficult to get access to the necessary compute infrastructure.

diff --git a/compositional_skills/extraction/linguistic/translit/cyrillic/Dockerfile b/compositional_skills/extraction/linguistic/translit/cyrillic/Dockerfile
@@ -0,0 +1,16 @@
+# Image that generates /out/qna.yaml when its default command is run.
+FROM python
+
+# aspell dictionaries for Russian and Ukrainian, read by /app/dict.
+# Updating and installing in a single layer keeps a cached, stale package
+# index from being reused; the index is removed afterwards to save space.
+RUN apt-get update \
+    && apt-get install -y aspell-ru aspell-uk \
+    && rm -rf /var/lib/apt/lists/*
+
+# Python libraries imported by /app/run.
+RUN pip install --no-cache-dir pyyaml cyrtranslit
+
+COPY . /app
+
+CMD ["/app/run"]
diff --git a/compositional_skills/extraction/linguistic/translit/cyrillic/dict b/compositional_skills/extraction/linguistic/translit/cyrillic/dict
@@ -0,0 +1,9 @@
+#!/bin/sh
+# Print every word of an aspell dictionary, one word per line.
+#
+# Usage: dict LANG   (LANG is an aspell language code, e.g. "ru" or "uk")
+lang=${1:?usage: dict LANG}
+# `dump master` prints the stored word list and `expand` unfolds its affix
+# flags into full word forms (several per line); `tr` then puts every form
+# on a line of its own.
+aspell -d "$lang" dump master | aspell -l "$lang" expand | tr ' ' '\n'
diff --git a/compositional_skills/extraction/linguistic/translit/cyrillic/run b/compositional_skills/extraction/linguistic/translit/cyrillic/run
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+"""Generate a `qna.yaml` with Cyrillic-to-Latin transliteration examples.
+
+For every supported language, randomly picked aspell dictionary words are
+turned into question/context/answer seed examples, which are then written to
+`/out/qna.yaml`.
+"""
+
+import itertools
+import os
+import random
+import subprocess
+
+import yaml
+from cyrtranslit import mapping
+
+
+# Language codes (used as keys into cyrtranslit's TRANSLIT_DICT) and names.
+_LANGUAGES = {
+    'ru': 'Russian',
+    'ua': 'Ukrainian',
+}
+# Codes that have to be renamed before they are passed to the aspell helper.
+_LAN_TO_ASPELL_LANG = {
+    'ua': 'uk',
+}
+_MAX_WORDS_PER_LANG = 10
+_DICT_CMD = '/app/dict'
+_OUT_PATH = '/out/qna.yaml'
+
+
+def lang_name(lang):
+    """Return the human-readable name of the language code `lang`."""
+    return _LANGUAGES[lang]
+
+
+def join_letters(word):
+    """Return the letters of `word` as an English enumeration.
+
+    Three or more letters give "a, b, and c", two give "a and b", a single
+    letter is returned unchanged and an empty word gives an empty string.
+    """
+    letters = list(word)
+    if len(letters) < 2:
+        return ''.join(letters)
+    if len(letters) == 2:
+        return ' and '.join(letters)
+    return ', '.join(letters[:-1]) + ', and ' + letters[-1]
+
+
+def mapping_snippet(word, latin_letters):
+    """Return one "- <cyrillic> = <latin>" line per letter of `word`."""
+    # No local is named `mapping` here, so the imported cyrtranslit module
+    # of that name is not shadowed.
+    return ''.join(
+        f'- {cyrillic} = {latin}\n'
+        for cyrillic, latin in zip(word, latin_letters)
+    )
+
+
+# todo: consider adjusting qna pair wording a bit to increase variability?
+def get_answer(lang, word, latin_letters):
+    """Return the answer text explaining how `word` is transliterated."""
+    return f'''
+In {lang_name(lang)}, the Cyrillic characters {join_letters(word)} can be
+translated to their corresponding Latin (or Roman) equivalents using the
+following transliteration scheme:
+
+{mapping_snippet(word, latin_letters)}
+Applying this transliteration scheme to the given Cyrillic sequence "{word}"
+results in the following Latin sequence: "{"".join(latin_letters)}".
+'''
+
+
+def tolatin(lang, word):
+    """Return the Latin counterpart of each character of `word` as a list.
+
+    Characters missing from the transliteration table are kept unchanged.
+    """
+    mapping_ = mapping.TRANSLIT_DICT[lang]["tolatin"]
+    return [mapping_.get(c, c) for c in word]
+
+
+def get_transliteration_scheme(lang):
+    """Return the whole to-Latin table of `lang` as "x = y, ..." text."""
+    return ", ".join(
+        f"{cyr} = {lat}"
+        for cyr, lat in mapping.TRANSLIT_DICT[lang]["tolatin"].items()
+    )
+
+
+def get_seed_example(lang, word):
+    """Return the question/context/answer seed example for `word`."""
+    context = f"""
+Word to transliterate: {word}
+
+Transliteration scheme: {get_transliteration_scheme(lang)}
+"""
+    return {
+        "question": f"Transform {lang_name(lang)} word into Latin letters.",
+        "context": context,
+        "answer": get_answer(lang, word, tolatin(lang, word)),
+    }
+
+
+def get_aspell_lang(lang):
+    """Return the aspell language code that corresponds to `lang`."""
+    return _LAN_TO_ASPELL_LANG.get(lang, lang)
+
+
+def get_words(lang):
+    """Yield the distinct dictionary words of `lang` in random order.
+
+    Raises:
+        subprocess.CalledProcessError: the dictionary helper exited with a
+            non-zero status.
+    """
+    # An argument list (no shell) plus check=True: a helper that exits with
+    # an error aborts the run instead of silently yielding no words.
+    result = subprocess.run(
+        [_DICT_CMD, get_aspell_lang(lang)],
+        stdout=subprocess.PIPE,
+        text=True,
+        check=True,
+    )
+    stripped = (line.strip() for line in result.stdout.splitlines())
+    # Blank lines would become empty "words"; dict.fromkeys drops duplicates
+    # while keeping a deterministic order ahead of the shuffle.
+    words = list(dict.fromkeys(word for word in stripped if word))
+    random.shuffle(words)
+    yield from words
+
+
+def main():
+    """Generate the seed examples for all languages and write the qna file."""
+    qna = {
+        'created_by': 'booxter',
+        'seed_examples': [],
+        'task_description': (
+            'Transliterating words from Cyrillic to Latin script.'
+        ),
+    }
+
+    for lang in _LANGUAGES:
+        words = itertools.islice(get_words(lang), _MAX_WORDS_PER_LANG)
+        qna['seed_examples'].extend(
+            get_seed_example(lang, word) for word in words
+        )
+
+    # The output is non-ASCII (allow_unicode=True), so the encoding is set
+    # explicitly instead of depending on the container's locale.
+    os.makedirs(os.path.dirname(_OUT_PATH), exist_ok=True)
+    with open(_OUT_PATH, 'w', encoding='utf-8') as f:
+        yaml.dump(qna, f, allow_unicode=True, default_style="|")
+
+
+if __name__ == '__main__':
+    main()