From d627f4b6049fbde2aab7f395d7def1e04f265d02 Mon Sep 17 00:00:00 2001 From: Brendan Molloy Date: Wed, 5 Feb 2025 01:06:57 +0100 Subject: [PATCH] Add 639-5 --- README.md | 11 +- iso639-5.json | 938 ++++++++++++++++++++++++++++++++++++++++++++ iso639-autonyms.tsv | 115 ++++++ 3 files changed, 1063 insertions(+), 1 deletion(-) create mode 100644 iso639-5.json diff --git a/README.md b/README.md index 695ac2e..6e36a51 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ While the data has initially been generated from standard sources, contributions ### Interpretation -`tag3` and `tag1` columns represent the ISO 639-3 and -1 tags respectively. The `name` field is the "most recognisable" form of the language name, typically in English, to be used as a fallback where an autonym is not available. +`tag3` and `tag1` columns represent the ISO 639-3 (or -5) and -1 tags respectively. The `name` field is the "most recognisable" form of the language name, typically in English, to be used as a fallback where an autonym is not available. The `autonym` field is the name of the language *in* that language. If this field is blank, it means that there is no confirmed autonym for this language in this database and you may use the `name` field as a fallback. @@ -23,6 +23,7 @@ Here are the currently utilised sources by this table: - `cldr` - [Unicode Common Locale Data Repository](http://cldr.unicode.org/) - `ethnologue` - [Ethnologue](https://www.ethnologue.com/), only autonyms where specified. - `iso639-3` - [ISO 639-3](http://www.sil.org/iso639-3/), reference language names only. +- `iso639-5` - [ISO 639-5](https://www.loc.gov/standards/iso639-5/id.php), reference language family names only. 
- `github` - [This repository](https://github.com/bbqsrc/iso639-databases) ## ISO 639 Default Script (ISO 15924) - iso639-default-script.tsv @@ -51,6 +52,14 @@ The LCIDs used in this table are defined by [MS-OE376: 2.1.1906 Part 4 Section 7 In effect, if a field is blank, it indicates that in BCP47 form, that segment should not be included to be equal to the given LCID. For example, `ar` is `1025`, while `ar-AE` is `14337`. +## ISO 639-5 Language Families - iso639-5.json + +This file is sourced from the authoritative listing published by the [Library of Congress](https://www.loc.gov/standards/iso639-5/id.php). + +### Interpretation + +`tag5` is the ISO 639-5 language family tag and `name` is the English name of the language family. `hierarchy` lists the family's ancestor tags from the top-level family down to the family itself (for example, `alg` has the hierarchy `nai`, `aql`, `alg`); a top-level family lists only its own tag. + # License Databases in many countries do not attract intellectual property rights, and where they do it, they very rarely attract copyright due to the raw and inexpressive nature of the data. However, to alleviate doubt, this data is being published by a resident of Sweden where sui generis database rights do not apply to non-EU datasets. CLDR and Ethnologue are both datasets published in the US, where database rights also do not apply. 
diff --git a/iso639-5.json b/iso639-5.json new file mode 100644 index 0000000..d2cc3b0 --- /dev/null +++ b/iso639-5.json @@ -0,0 +1,938 @@ +[ + { + "tag5": "aav", + "name": "Austro-Asiatic languages", + "hierarchy": [ + "aav" + ] + }, + { + "tag5": "afa", + "name": "Afro-Asiatic languages", + "hierarchy": [ + "afa" + ] + }, + { + "tag5": "alg", + "name": "Algonquian languages", + "hierarchy": [ + "nai", + "aql", + "alg" + ] + }, + { + "tag5": "alv", + "name": "Atlantic-Congo languages", + "hierarchy": [ + "nic", + "alv" + ] + }, + { + "tag5": "apa", + "name": "Apache languages", + "hierarchy": [ + "nai", + "xnd", + "ath", + "apa" + ] + }, + { + "tag5": "aqa", + "name": "Alacalufan languages", + "hierarchy": [ + "sai", + "aqa" + ] + }, + { + "tag5": "aql", + "name": "Algic languages", + "hierarchy": [ + "nai", + "aql" + ] + }, + { + "tag5": "art", + "name": "Artificial languages", + "hierarchy": [ + "art" + ] + }, + { + "tag5": "ath", + "name": "Athapascan languages", + "hierarchy": [ + "nai", + "xnd", + "ath" + ] + }, + { + "tag5": "auf", + "name": "Arauan languages", + "hierarchy": [ + "sai", + "awd", + "auf" + ] + }, + { + "tag5": "aus", + "name": "Australian languages", + "hierarchy": [ + "aus" + ] + }, + { + "tag5": "awd", + "name": "Arawakan languages", + "hierarchy": [ + "sai", + "awd" + ] + }, + { + "tag5": "azc", + "name": "Uto-Aztecan languages", + "hierarchy": [ + "nai", + "azc" + ] + }, + { + "tag5": "bad", + "name": "Banda languages", + "hierarchy": [ + "nic", + "alv", + "bad" + ] + }, + { + "tag5": "bai", + "name": "Bamileke languages", + "hierarchy": [ + "nic", + "alv", + "bai" + ] + }, + { + "tag5": "bat", + "name": "Baltic languages", + "hierarchy": [ + "ine", + "bat" + ] + }, + { + "tag5": "ber", + "name": "Berber languages", + "hierarchy": [ + "afa", + "ber" + ] + }, + { + "tag5": "bih", + "name": "Bihari languages", + "hierarchy": [ + "ine", + "iir", + "bih" + ] + }, + { + "tag5": "bnt", + "name": "Bantu languages", + "hierarchy": [ + "nic", + 
"alv", + "bnt" + ] + }, + { + "tag5": "btk", + "name": "Batak languages", + "hierarchy": [ + "map", + "poz", + "pqw", + "btk" + ] + }, + { + "tag5": "cai", + "name": "Central American Indian languages", + "hierarchy": [ + "cai" + ] + }, + { + "tag5": "cau", + "name": "Caucasian languages", + "hierarchy": [ + "cau" + ] + }, + { + "tag5": "cba", + "name": "Chibchan languages", + "hierarchy": [ + "sai", + "cba" + ] + }, + { + "tag5": "ccn", + "name": "North Caucasian languages", + "hierarchy": [ + "cau", + "ccn" + ] + }, + { + "tag5": "ccs", + "name": "South Caucasian languages", + "hierarchy": [ + "cau", + "ccs" + ] + }, + { + "tag5": "cdc", + "name": "Chadic languages", + "hierarchy": [ + "afa", + "cdc" + ] + }, + { + "tag5": "cdd", + "name": "Caddoan languages", + "hierarchy": [ + "nai", + "cdd" + ] + }, + { + "tag5": "cel", + "name": "Celtic languages", + "hierarchy": [ + "ine", + "cel" + ] + }, + { + "tag5": "cmc", + "name": "Chamic languages", + "hierarchy": [ + "map", + "poz", + "pqw", + "cmc" + ] + }, + { + "tag5": "cpe", + "name": "Creoles and pidgins, English‑based", + "hierarchy": [ + "crp", + "cpe" + ] + }, + { + "tag5": "cpf", + "name": "Creoles and pidgins, French‑based", + "hierarchy": [ + "crp", + "cpf" + ] + }, + { + "tag5": "cpp", + "name": "Creoles and pidgins, Portuguese-based", + "hierarchy": [ + "crp", + "cpp" + ] + }, + { + "tag5": "crp", + "name": "Creoles and pidgins", + "hierarchy": [ + "crp" + ] + }, + { + "tag5": "csu", + "name": "Central Sudanic languages", + "hierarchy": [ + "ssa", + "csu" + ] + }, + { + "tag5": "cus", + "name": "Cushitic languages", + "hierarchy": [ + "afa", + "cus" + ] + }, + { + "tag5": "day", + "name": "Land Dayak languages", + "hierarchy": [ + "day" + ] + }, + { + "tag5": "dmn", + "name": "Mande languages", + "hierarchy": [ + "nic", + "dmn" + ] + }, + { + "tag5": "dra", + "name": "Dravidian languages", + "hierarchy": [ + "dra" + ] + }, + { + "tag5": "egx", + "name": "Egyptian languages", + "hierarchy": [ + "afa", + 
"egx" + ] + }, + { + "tag5": "esx", + "name": "Eskimo-Aleut languages", + "hierarchy": [ + "esx" + ] + }, + { + "tag5": "euq", + "name": "Basque (family)", + "hierarchy": [ + "euq" + ] + }, + { + "tag5": "fiu", + "name": "Finno-Ugrian languages", + "hierarchy": [ + "urj", + "fiu" + ] + }, + { + "tag5": "fox", + "name": "Formosan languages", + "hierarchy": [ + "map", + "fox" + ] + }, + { + "tag5": "gem", + "name": "Germanic languages", + "hierarchy": [ + "ine", + "gem" + ] + }, + { + "tag5": "gme", + "name": "East Germanic languages", + "hierarchy": [ + "ine", + "gem", + "gme" + ] + }, + { + "tag5": "gmq", + "name": "North Germanic languages", + "hierarchy": [ + "ine", + "gem", + "gmq" + ] + }, + { + "tag5": "gmw", + "name": "West Germanic languages", + "hierarchy": [ + "ine", + "gem", + "gmw" + ] + }, + { + "tag5": "grk", + "name": "Greek languages", + "hierarchy": [ + "ine", + "grk" + ] + }, + { + "tag5": "hmx", + "name": "Hmong-Mien languages", + "hierarchy": [ + "hmx" + ] + }, + { + "tag5": "hok", + "name": "Hokan languages", + "hierarchy": [ + "nai", + "hok" + ] + }, + { + "tag5": "hyx", + "name": "Armenian (family)", + "hierarchy": [ + "ine", + "hyx" + ] + }, + { + "tag5": "iir", + "name": "Indo-Iranian languages", + "hierarchy": [ + "ine", + "iir" + ] + }, + { + "tag5": "ijo", + "name": "Ijo languages", + "hierarchy": [ + "nic", + "alv", + "ijo" + ] + }, + { + "tag5": "inc", + "name": "Indic languages", + "hierarchy": [ + "ine", + "iir", + "inc" + ] + }, + { + "tag5": "ine", + "name": "Indo-European languages", + "hierarchy": [ + "ine" + ] + }, + { + "tag5": "ira", + "name": "Iranian languages", + "hierarchy": [ + "ine", + "iir", + "ira" + ] + }, + { + "tag5": "iro", + "name": "Iroquoian languages", + "hierarchy": [ + "nai", + "iro" + ] + }, + { + "tag5": "itc", + "name": "Italic languages", + "hierarchy": [ + "ine", + "itc" + ] + }, + { + "tag5": "jpx", + "name": "Japanese (family)", + "hierarchy": [ + "jpx" + ] + }, + { + "tag5": "kar", + "name": "Karen 
languages", + "hierarchy": [ + "sit", + "tbq", + "kar" + ] + }, + { + "tag5": "kdo", + "name": "Kordofanian languages", + "hierarchy": [ + "nic", + "kdo" + ] + }, + { + "tag5": "khi", + "name": "Khoisan languages", + "hierarchy": [ + "khi" + ] + }, + { + "tag5": "kro", + "name": "Kru languages", + "hierarchy": [ + "nic", + "alv", + "kro" + ] + }, + { + "tag5": "map", + "name": "Austronesian languages", + "hierarchy": [ + "map" + ] + }, + { + "tag5": "mkh", + "name": "Mon-Khmer languages", + "hierarchy": [ + "aav", + "mkh" + ] + }, + { + "tag5": "mno", + "name": "Manobo languages", + "hierarchy": [ + "map", + "poz", + "pqw", + "phi", + "mno" + ] + }, + { + "tag5": "mun", + "name": "Munda languages", + "hierarchy": [ + "aav", + "mun" + ] + }, + { + "tag5": "myn", + "name": "Mayan languages", + "hierarchy": [ + "cai", + "myn" + ] + }, + { + "tag5": "nah", + "name": "Nahuatl languages", + "hierarchy": [ + "nai", + "azc", + "nah" + ] + }, + { + "tag5": "nai", + "name": "North American Indian languages", + "hierarchy": [ + "nai" + ] + }, + { + "tag5": "ngf", + "name": "Trans-New Guinea languages", + "hierarchy": [ + "paa", + "ngf" + ] + }, + { + "tag5": "nic", + "name": "Niger-Kordofanian languages", + "hierarchy": [ + "nic" + ] + }, + { + "tag5": "nub", + "name": "Nubian languages", + "hierarchy": [ + "ssa", + "sdv", + "nub" + ] + }, + { + "tag5": "omq", + "name": "Oto-Manguean languages", + "hierarchy": [ + "cai", + "omq" + ] + }, + { + "tag5": "omv", + "name": "Omotic languages", + "hierarchy": [ + "afa", + "omv" + ] + }, + { + "tag5": "oto", + "name": "Otomian languages", + "hierarchy": [ + "cai", + "omq", + "oto" + ] + }, + { + "tag5": "paa", + "name": "Papuan languages", + "hierarchy": [ + "paa" + ] + }, + { + "tag5": "phi", + "name": "Philippine languages", + "hierarchy": [ + "map", + "poz", + "pqw", + "phi" + ] + }, + { + "tag5": "plf", + "name": "Central Malayo-Polynesian languages", + "hierarchy": [ + "map", + "poz", + "plf" + ] + }, + { + "tag5": "poz", + 
"name": "Malayo-Polynesian languages", + "hierarchy": [ + "map", + "poz" + ] + }, + { + "tag5": "pqe", + "name": "Eastern Malayo-Polynesian languages", + "hierarchy": [ + "map", + "poz", + "pqe" + ] + }, + { + "tag5": "pqw", + "name": "Western Malayo-Polynesian languages", + "hierarchy": [ + "map", + "poz", + "pqw" + ] + }, + { + "tag5": "pra", + "name": "Prakrit languages", + "hierarchy": [ + "ine", + "iir", + "inc", + "pra" + ] + }, + { + "tag5": "qwe", + "name": "Quechuan (family)", + "hierarchy": [ + "sai", + "qwe" + ] + }, + { + "tag5": "roa", + "name": "Romance languages", + "hierarchy": [ + "ine", + "itc", + "roa" + ] + }, + { + "tag5": "sai", + "name": "South American Indian languages", + "hierarchy": [ + "sai" + ] + }, + { + "tag5": "sal", + "name": "Salishan languages", + "hierarchy": [ + "nai", + "sal" + ] + }, + { + "tag5": "sdv", + "name": "Eastern Sudanic languages", + "hierarchy": [ + "ssa", + "sdv" + ] + }, + { + "tag5": "sem", + "name": "Semitic languages", + "hierarchy": [ + "afa", + "sem" + ] + }, + { + "tag5": "sgn", + "name": "sign languages", + "hierarchy": [ + "sgn" + ] + }, + { + "tag5": "sio", + "name": "Siouan languages", + "hierarchy": [ + "nai", + "sio" + ] + }, + { + "tag5": "sit", + "name": "Sino-Tibetan languages", + "hierarchy": [ + "sit" + ] + }, + { + "tag5": "sla", + "name": "Slavic languages", + "hierarchy": [ + "ine", + "sla" + ] + }, + { + "tag5": "smi", + "name": "Sami languages", + "hierarchy": [ + "urj", + "fiu", + "smi" + ] + }, + { + "tag5": "son", + "name": "Songhai languages", + "hierarchy": [ + "ssa", + "son" + ] + }, + { + "tag5": "sqj", + "name": "Albanian languages", + "hierarchy": [ + "ine", + "sqj" + ] + }, + { + "tag5": "ssa", + "name": "Nilo-Saharan languages", + "hierarchy": [ + "ssa" + ] + }, + { + "tag5": "syd", + "name": "Samoyedic languages", + "hierarchy": [ + "urj", + "syd" + ] + }, + { + "tag5": "tai", + "name": "Tai languages", + "hierarchy": [ + "tai" + ] + }, + { + "tag5": "tbq", + "name": 
"Tibeto-Burman languages", + "hierarchy": [ + "sit", + "tbq" + ] + }, + { + "tag5": "trk", + "name": "Turkic languages", + "hierarchy": [ + "tut", + "trk" + ] + }, + { + "tag5": "tup", + "name": "Tupi languages", + "hierarchy": [ + "sai", + "tup" + ] + }, + { + "tag5": "tut", + "name": "Altaic languages", + "hierarchy": [ + "tut" + ] + }, + { + "tag5": "tuw", + "name": "Tungus languages", + "hierarchy": [ + "tut", + "tuw" + ] + }, + { + "tag5": "urj", + "name": "Uralic languages", + "hierarchy": [ + "urj" + ] + }, + { + "tag5": "wak", + "name": "Wakashan languages", + "hierarchy": [ + "nai", + "wak" + ] + }, + { + "tag5": "wen", + "name": "Sorbian languages", + "hierarchy": [ + "ine", + "sla", + "zlw", + "wen" + ] + }, + { + "tag5": "xgn", + "name": "Mongolian languages", + "hierarchy": [ + "tut", + "xgn" + ] + }, + { + "tag5": "xnd", + "name": "Na-Dene languages", + "hierarchy": [ + "nai", + "xnd" + ] + }, + { + "tag5": "ypk", + "name": "Yupik languages", + "hierarchy": [ + "esx", + "ypk" + ] + }, + { + "tag5": "zhx", + "name": "Chinese (family)", + "hierarchy": [ + "sit", + "zhx" + ] + }, + { + "tag5": "zle", + "name": "East Slavic languages", + "hierarchy": [ + "ine", + "sla", + "zle" + ] + }, + { + "tag5": "zls", + "name": "South Slavic languages", + "hierarchy": [ + "ine", + "sla", + "zls" + ] + }, + { + "tag5": "zlw", + "name": "West Slavic languages", + "hierarchy": [ + "ine", + "sla", + "zlw" + ] + }, + { + "tag5": "znd", + "name": "Zande languages", + "hierarchy": [ + "nic", + "alv", + "znd" + ] + } +] diff --git a/iso639-autonyms.tsv b/iso639-autonyms.tsv index 9d20ccd..813d34d 100644 --- a/iso639-autonyms.tsv +++ b/iso639-autonyms.tsv @@ -7848,3 +7848,118 @@ zyn Yongnan Zhuang iso639-3 zyp Zyphe Chin iso639-3 zza Zaza cldr zzj Zuojiang Zhuang iso639-3 +aav Austro-Asiatic languages iso639-5 +afa Afro-Asiatic languages iso639-5 +alg Algonquian languages iso639-5 +alv Atlantic-Congo languages iso639-5 +apa Apache languages iso639-5 +aqa Alacalufan languages 
iso639-5 +aql Algic languages iso639-5 +art Artificial languages iso639-5 +ath Athapascan languages iso639-5 +auf Arauan languages iso639-5 +aus Australian languages iso639-5 +awd Arawakan languages iso639-5 +azc Uto-Aztecan languages iso639-5 +bad Banda languages iso639-5 +bai Bamileke languages iso639-5 +bat Baltic languages iso639-5 +ber Berber languages iso639-5 +bih Bihari languages iso639-5 +bnt Bantu languages iso639-5 +btk Batak languages iso639-5 +cai Central American Indian languages iso639-5 +cau Caucasian languages iso639-5 +cba Chibchan languages iso639-5 +ccn North Caucasian languages iso639-5 +ccs South Caucasian languages iso639-5 +cdc Chadic languages iso639-5 +cdd Caddoan languages iso639-5 +cel Celtic languages iso639-5 +cmc Chamic languages iso639-5 +cpe Creoles and pidgins, English‑based iso639-5 +cpf Creoles and pidgins, French‑based iso639-5 +cpp Creoles and pidgins, Portuguese-based iso639-5 +crp Creoles and pidgins iso639-5 +csu Central Sudanic languages iso639-5 +cus Cushitic languages iso639-5 +day Land Dayak languages iso639-5 +dmn Mande languages iso639-5 +dra Dravidian languages iso639-5 +egx Egyptian languages iso639-5 +esx Eskimo-Aleut languages iso639-5 +euq Basque (family) iso639-5 +fiu Finno-Ugrian languages iso639-5 +fox Formosan languages iso639-5 +gem Germanic languages iso639-5 +gme East Germanic languages iso639-5 +gmq North Germanic languages iso639-5 +gmw West Germanic languages iso639-5 +grk Greek languages iso639-5 +hmx Hmong-Mien languages iso639-5 +hok Hokan languages iso639-5 +hyx Armenian (family) iso639-5 +iir Indo-Iranian languages iso639-5 +ijo Ijo languages iso639-5 +inc Indic languages iso639-5 +ine Indo-European languages iso639-5 +ira Iranian languages iso639-5 +iro Iroquoian languages iso639-5 +itc Italic languages iso639-5 +jpx Japanese (family) iso639-5 +kar Karen languages iso639-5 +kdo Kordofanian languages iso639-5 +khi Khoisan languages iso639-5 +kro Kru languages iso639-5 +map Austronesian languages 
iso639-5 +mkh Mon-Khmer languages iso639-5 +mno Manobo languages iso639-5 +mun Munda languages iso639-5 +myn Mayan languages iso639-5 +nah Nahuatl languages iso639-5 +nai North American Indian languages iso639-5 +ngf Trans-New Guinea languages iso639-5 +nic Niger-Kordofanian languages iso639-5 +nub Nubian languages iso639-5 +omq Oto-Manguean languages iso639-5 +omv Omotic languages iso639-5 +oto Otomian languages iso639-5 +paa Papuan languages iso639-5 +phi Philippine languages iso639-5 +plf Central Malayo-Polynesian languages iso639-5 +poz Malayo-Polynesian languages iso639-5 +pqe Eastern Malayo-Polynesian languages iso639-5 +pqw Western Malayo-Polynesian languages iso639-5 +pra Prakrit languages iso639-5 +qwe Quechuan (family) iso639-5 +roa Romance languages iso639-5 +sai South American Indian languages iso639-5 +sal Salishan languages iso639-5 +sdv Eastern Sudanic languages iso639-5 +sem Semitic languages iso639-5 +sgn sign languages iso639-5 +sio Siouan languages iso639-5 +sit Sino-Tibetan languages iso639-5 +sla Slavic languages iso639-5 +smi Sami languages iso639-5 +son Songhai languages iso639-5 +sqj Albanian languages iso639-5 +ssa Nilo-Saharan languages iso639-5 +syd Samoyedic languages iso639-5 +tai Tai languages iso639-5 +tbq Tibeto-Burman languages iso639-5 +trk Turkic languages iso639-5 +tup Tupi languages iso639-5 +tut Altaic languages iso639-5 +tuw Tungus languages iso639-5 +urj Uralic languages iso639-5 +wak Wakashan languages iso639-5 +wen Sorbian languages iso639-5 +xgn Mongolian languages iso639-5 +xnd Na-Dene languages iso639-5 +ypk Yupik languages iso639-5 +zhx Chinese (family) iso639-5 +zle East Slavic languages iso639-5 +zls South Slavic languages iso639-5 +zlw West Slavic languages iso639-5 +znd Zande languages iso639-5 \ No newline at end of file