From 0291a0c836d24868b662b07bafabbe6ebb75f92a Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Sun, 10 Mar 2024 22:23:33 +0000 Subject: [PATCH] RFC: WIP: add dynamic cyrillic translit linguistic skill This is an attempt to add support for a Cyrillic-to-Latin transliteration skill that would also be language aware. This is not validated and does not work well with the default prompt defined in the cli repo. (The prompt produces new instructions that do not retain the transliteration scheme.) Whether we need to use a model to generate variations of the generated instructions is itself a topic for exploration. (Perhaps we could live with feeding the seed samples generated by the skill directly into fine tuning.) But if so, additional changes may be required on the `cli` side, e.g. to allow the use of modified (or completely new?) prompts if needed for a particular skill. This is work-in-progress and is posted as a discussion starter. Signed-off-by: Ihar Hrachyshka --- README.md | 15 +++ .../linguistic/translit/cyrillic/Dockerfile | 11 ++ .../linguistic/translit/cyrillic/dict | 3 + .../linguistic/translit/cyrillic/run | 108 ++++++++++++++++++ 4 files changed, 137 insertions(+) create mode 100644 compositional_skills/extraction/linguistic/translit/cyrillic/Dockerfile create mode 100755 compositional_skills/extraction/linguistic/translit/cyrillic/dict create mode 100755 compositional_skills/extraction/linguistic/translit/cyrillic/run diff --git a/README.md b/README.md index 35e944cbd..d27a1f7f6 100644 --- a/README.md +++ b/README.md @@ -417,6 +417,21 @@ Below is an illustrative directory structure to show this layout: For an extensive example of this layout see, [taxonomy_tree_layout](https://github.com/instructlab/taxonomy/tree/main/docs/taxonomy_tree_layout.md) in the documentation folder. +## Dynamic `qna.yaml` generation + +For some skills, it may make sense to instead generate `qna.yaml` files +programmatically.
In this case, instead of manually defining a `qna.yaml` file, +you can provide a `Containerfile` that, once built, can be executed to +produce the intended `qna.yaml` file. + +The interface of a dynamic `qna.yaml` generator is as follows: + +- The taxonomy file is a `Containerfile` (or `Dockerfile`). +- The container should define a `CMD` instruction that, once executed, writes the + intended `qna.yaml` file to the `/out/qna.yaml` location. +- The CLI will then run the generator, then use the resulting `qna.yaml` file + as a seed input, as usual. + ## Contribute knowledge and skills to the taxonomy! The ability to contribute to a Large Language Model (LLM) has been difficult in no small part because it is difficult to get access to the necessary compute infrastructure. diff --git a/compositional_skills/extraction/linguistic/translit/cyrillic/Dockerfile b/compositional_skills/extraction/linguistic/translit/cyrillic/Dockerfile new file mode 100644 index 000000000..7aef9d11d --- /dev/null +++ b/compositional_skills/extraction/linguistic/translit/cyrillic/Dockerfile @@ -0,0 +1,11 @@ +FROM python + +RUN apt-get update +RUN apt-get install -y aspell-ru aspell-uk + +RUN pip install pyyaml +RUN pip install cyrtranslit + +COPY .
/app
+
+CMD ["/app/run"]
diff --git a/compositional_skills/extraction/linguistic/translit/cyrillic/dict b/compositional_skills/extraction/linguistic/translit/cyrillic/dict
new file mode 100755
index 000000000..384fea3d0
--- /dev/null
+++ b/compositional_skills/extraction/linguistic/translit/cyrillic/dict
@@ -0,0 +1,3 @@
+#!/bin/sh
+lang=$1
+aspell -d "$lang" dump master | aspell -l "$lang" expand | tr ' ' '\n'
diff --git a/compositional_skills/extraction/linguistic/translit/cyrillic/run b/compositional_skills/extraction/linguistic/translit/cyrillic/run
new file mode 100755
index 000000000..02c2d2f1f
--- /dev/null
+++ b/compositional_skills/extraction/linguistic/translit/cyrillic/run
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+"""Generate the seed ``qna.yaml`` for Cyrillic-to-Latin transliteration.
+
+For every supported language, a few random dictionary words are taken from
+aspell and turned into question/context/answer seed examples, which are then
+written to ``/out/qna.yaml``.
+"""
+
+import itertools
+import os
+import random
+import subprocess
+
+import yaml
+from cyrtranslit import mapping
+
+
+# Language codes as understood by cyrtranslit, with human-readable names.
+_LANGUAGES = {
+    'ru': 'Russian',
+    'ua': 'Ukrainian',
+}
+# Languages for which aspell uses a different code than the one above.
+_LAN_TO_ASPELL_LANG = {
+    'ua': 'uk',
+}
+_MAX_WORDS_PER_LANG = 10
+_DICT_CMD = '/app/dict'
+_OUT_PATH = '/out/qna.yaml'
+
+
+def lang_name(lang):
+    """Return the human-readable name for language code ``lang``."""
+    return _LANGUAGES[lang]
+
+
+def join_letters(word):
+    """Return the letters of ``word`` as an English enumeration.
+
+    ``"abc"`` becomes ``"a, b, and c"`` and ``"ab"`` becomes ``"a and b"``;
+    empty and single-letter words are returned unchanged.
+    """
+    letters = list(word)
+    if len(letters) < 2:
+        return word
+    if len(letters) == 2:
+        return ' and '.join(letters)
+    return ', '.join(letters[:-1]) + ', and ' + letters[-1]
+
+
+def mapping_snippet(word, latin_letters):
+    """Return one ``- <cyrillic> = <latin>`` line per letter of ``word``.
+
+    Every line, including the last one, ends with a newline.
+    """
+    # Do not call the pairs ``mapping``: that would shadow the cyrtranslit
+    # module imported at the top of the file.
+    return ''.join(
+        f'- {cyrillic} = {latin}\n'
+        for cyrillic, latin in zip(word, latin_letters)
+    )
+
+
+# todo: consider adjusting qna pair wording a bit to increase variability?
+def get_answer(lang, word, latin_letters):
+    """Return the answer text that spells out how ``word`` is converted."""
+    return f'''
+In {lang_name(lang)}, the Cyrillic characters {join_letters(word)} can be
+translated to their corresponding Latin (or Roman) equivalents using the
+following transliteration scheme:
+
+{mapping_snippet(word, latin_letters)}
+Applying this transliteration scheme to the given Cyrillic sequence "{word}"
+results in the following Latin sequence: "{"".join(latin_letters)}".
+'''
+
+
+def tolatin(lang, word):
+    """Return the Latin equivalent of each character of ``word`` as a list.
+
+    Characters that have no mapping for ``lang`` are kept unchanged.
+    """
+    mapping_ = mapping.TRANSLIT_DICT[lang]["tolatin"]
+    return [mapping_.get(c, c) for c in word]
+
+
+def get_transliteration_scheme(lang):
+    """Return the full ``cyrillic = latin`` table for ``lang`` on one line."""
+    return ", ".join(
+        f"{cyr} = {lat}"
+        for cyr, lat in mapping.TRANSLIT_DICT[lang]["tolatin"].items()
+    )
+
+
+def get_seed_example(lang, word):
+    """Return the question/context/answer seed example for ``word``."""
+    context = f"""
+Word to transliterate: {word}
+
+Transliteration scheme: {get_transliteration_scheme(lang)}
+"""
+    return {
+        "question": f"Transform {lang_name(lang)} word into Latin letters.",
+        "context": context,
+        "answer": get_answer(lang, word, tolatin(lang, word)),
+    }
+
+
+def get_aspell_lang(lang):
+    """Return the aspell language code that corresponds to ``lang``."""
+    return _LAN_TO_ASPELL_LANG.get(lang, lang)
+
+
+def get_words(lang):
+    """Yield the aspell dictionary words for ``lang`` in random order.
+
+    Blank lines and duplicates in the dictionary dump are skipped. Raises
+    ``subprocess.CalledProcessError`` if the dump command exits non-zero.
+    """
+    # An argument list avoids going through a shell, and ``check=True`` makes
+    # a failing command an error instead of an empty word list.
+    dump = subprocess.run(
+        [_DICT_CMD, get_aspell_lang(lang)],
+        check=True,
+        stdout=subprocess.PIPE,
+        text=True,
+    )
+    words = {line.strip() for line in dump.stdout.splitlines()}
+    words.discard('')
+    # Sort before shuffling so that the order depends only on the state of
+    # ``random`` and not on set iteration order.
+    words = sorted(words)
+    random.shuffle(words)
+    yield from words
+
+
+def main():
+    """Build the seed examples and write them to ``/out/qna.yaml``."""
+    qna = {
+        'created_by': 'booxter',
+        'seed_examples': [],
+        'task_description': (
+            'Transliterating words from Cyrillic to Latin script.'
+        ),
+    }
+
+    for lang in _LANGUAGES:
+        for word in itertools.islice(get_words(lang), _MAX_WORDS_PER_LANG):
+            qna["seed_examples"].append(get_seed_example(lang, word))
+
+    # The output directory is expected to be a mounted volume, but create it
+    # if it is missing so that a plain run does not fail.
+    os.makedirs(os.path.dirname(_OUT_PATH), exist_ok=True)
+    # The file contains Cyrillic text; do not depend on the locale encoding.
+    with open(_OUT_PATH, 'w', encoding='utf-8') as f:
+        yaml.dump(qna, f, allow_unicode=True, default_style="|")
+
+
+if __name__ == '__main__':
+    main()