Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/AG-34157 Translate company names/descriptions #394

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.DS_Store
.vscode
trackerdb.sql
node_modules
node_modules
out
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,18 @@ The favicon of the company website is used as the company icon. It can be checke

[https://icons.adguard.org/icon?domain=adguard.com](https://icons.adguard.org/icon?domain=adguard.com)

## Company description translations

Run the following command to translate company descriptions into different
languages (you need an OpenAI API key for that). Descriptions that have not
changed since the last run are not translated again:

```bash
OPENAI_API_KEY="YOUR_API_KEY" yarn translate
```

You can modify the list of supported languages in [translate.ts](./translate.ts).

## Policy

The detailed policy currently is under development. The decision to add a company is at the discretion of the maintainers,
Expand Down
14,168 changes: 14,168 additions & 0 deletions dist/companies_i18n.json

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"main": "index.js",
"scripts": {
"convert": "ts-node index.ts",
"translate": "ts-node translate.ts",
"lint": "eslint ."
},
"author": "AdGuard",
Expand All @@ -23,6 +24,7 @@
"dependencies": {
"consola": "^3.1.0",
"csv-stringify": "^6.4.4",
"openai": "^4.53.2",
"zod": "^3.21.4"
}
}
}
219 changes: 219 additions & 0 deletions translate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
/* eslint-disable no-restricted-syntax, guard-for-in, no-await-in-loop */
import * as fs from 'fs';
import { consola } from 'consola';
import { OpenAI } from 'openai';

// JSON file that main() reads; its `companies` property is the source of descriptions.
const inputFilePath = 'dist/companies.json';
// JSON file the translations are read from and written to.
const translationsFilePath = 'dist/companies_i18n.json';
// Key under which the original (untranslated) description is stored for each company.
const defaultLanguage = 'en';
// Number of entries taken by the default language; subtracted when counting translations.
const defaultLanguageCount = 1;
// Target language codes each description is translated into.
const languages = [
'ru',
'de',
'zh-cn',
];

// Client used by translateContent() for chat completion requests.
// NOTE(review): presumably picks up OPENAI_API_KEY from the environment, as the
// README usage suggests — confirm against the openai package documentation.
const openai = new OpenAI();

/**
 * A single company entry from the `companies` object of the input file.
 * Within this script only `description` is read.
 */
interface Company {
name: string;
websiteUrl: string;
// Text in the default language that the script translates.
description: string;
}

/**
 * Map of company id to company entry.
 */
interface Companies {
[key: string]: Company;
}

/**
 * Descriptions of one company keyed by language code. The `defaultLanguage`
 * key holds the original description.
 */
interface TranslationResult {
[lang: string]: string;
}

/**
 * Content of the translations file: per-company translations keyed by company id.
 */
interface Translations {
[companyId: string]: TranslationResult;
}

/**
 * Translates a piece of text into the given language with the OpenAI chat
 * completions API.
 *
 * @param content Text to translate.
 * @param langCode Code of the target language, inserted verbatim into the prompt.
 * @returns The translated text exactly as returned by the model.
 * @throws Error when the response carries no usable text. Errors raised by the
 * OpenAI client are not caught here and propagate to the caller.
 */
async function translateContent(content: string, langCode: string): Promise<string> {
    const prompt = `Translate the following text to language with code ${langCode}: \n${content}`;

    const completion = await openai.chat.completions.create({
        model: 'gpt-3.5-turbo',
        messages: [
            { role: 'system', content: 'You are a translation assistant.' },
            { role: 'user', content: prompt },
        ],
    });

    // The message content may be null (or the choices list empty). Fail loudly
    // instead of letting a null/blank value be stored as a translation.
    const translatedText = completion.choices[0]?.message?.content;
    if (typeof translatedText !== 'string' || translatedText.trim() === '') {
        throw new Error(`OpenAI returned an empty translation for language ${langCode}`);
    }

    return translatedText;
}

/**
 * Creates the initial translations map from the companies list.
 *
 * Every company with a non-empty description gets an entry that holds only the
 * default-language text; companies without a description are skipped.
 *
 * @param companies Companies keyed by company id.
 * @returns Freshly built translations keyed by company id.
 */
function initTranslations(companies: Companies): Translations {
    const seeded: Translations = {};

    Object.keys(companies).forEach((id) => {
        const { description } = companies[id];
        if (!description) {
            return;
        }

        // Only the default language is known at this point.
        seeded[id] = { [defaultLanguage]: description };
    });

    return seeded;
}

/**
 * Returns a copy of `translations` in which every target language holds the
 * given description as is. The input object is not modified.
 *
 * @param translations Existing translations of one company.
 * @param description Text to store for every language in `languages`.
 * @returns New object with the per-language entries overwritten.
 */
function copyBaseDescriptionToAllLang(
    translations: TranslationResult,
    description: string,
) : TranslationResult {
    const sameForEveryLanguage = languages.reduce<TranslationResult>((acc, lang) => {
        acc[lang] = description;
        return acc;
    }, {});

    // Existing keys keep their position; language keys are overwritten or appended.
    return { ...translations, ...sameForEveryLanguage };
}

/**
 * Tells whether a value carries no text at all.
 *
 * @param value String to check.
 * @returns `true` for `null`, `undefined`, an empty string, or a string made
 * of whitespace only; `false` otherwise.
 */
function isStringNullOrEmpty(value: string | null | undefined) : boolean {
    // `== null` matches both null and undefined.
    if (value == null) {
        return true;
    }

    return value.trim().length === 0;
}

/**
 * Tells whether a string holds a number, i.e. `Number(value)` is not `NaN`.
 *
 * @param value String to check.
 * @returns `true` when the text converts to a number; `false` for non-numeric
 * text and for `null`, `undefined`, empty or whitespace-only input.
 */
function isStringNumeric(value: string | null | undefined) : boolean {
    // Number(null), Number('') and Number('   ') all yield 0, so missing or
    // blank input has to be rejected before the conversion.
    if (value === null || value === undefined || value.trim() === '') {
        return false;
    }

    return !Number.isNaN(Number(value));
}

/**
 * Decides whether a description is worth sending to the translator.
 *
 * @param value Description text.
 * @returns `false` when the text is missing/blank or is just a number,
 * `true` otherwise.
 */
function isDescriptionNeedToTranslate(value: string | null | undefined) : boolean {
    // The numeric check runs only when the value is not null/blank.
    const nothingToTranslate = isStringNullOrEmpty(value) || isStringNumeric(value);

    return !nothingToTranslate;
}

/**
 * Deletes, in place, the translations of companies that are missing from
 * `companies` or that no longer have a description.
 *
 * @param translations Translations keyed by company id; mutated.
 * @param companies Current companies keyed by company id.
 */
function removeObsoleteCompanyDescriptions(translations: Translations, companies: Companies) {
    const target = translations;

    // Collect the ids first, then drop them one by one.
    const obsoleteIds = Object.keys(target).filter((id) => !companies[id]?.description);

    obsoleteIds.forEach((id) => {
        consola.info(`Removing translations for company ${id} as it doesn't exist anymore`);
        delete target[id];
    });
}

/**
 * Produces the translations of one company description for every language in
 * `languages`.
 *
 * A language is (re)translated when it has no translation yet or when the base
 * description changed. Languages are processed one after another. The input
 * object is not modified; an updated copy is returned.
 *
 * A failed translation is logged and does not abort the remaining languages.
 * When it fails, the language is left without a translation: a translation of
 * the previous description is removed rather than kept, so that the language
 * is detected as missing and retried on the next run.
 *
 * @param companyTranslations Current translations of the company.
 * @param baseDescriptionChanged Whether the default-language text differs from
 * the one the existing translations were made from.
 * @param companyId Company id, used in log messages only.
 * @param newDescription Text to translate.
 * @returns Copy of `companyTranslations` with the refreshed languages.
 */
async function generateTranslations(
    companyTranslations: TranslationResult,
    baseDescriptionChanged: boolean,
    companyId: string,
    newDescription: string,
) : Promise<TranslationResult> {
    const newCompanyTranslations = { ...companyTranslations };

    for (const lang of languages) {
        // Only translate it if there's a language missing or if the
        // base description changed.
        if (baseDescriptionChanged || newCompanyTranslations[lang] === undefined) {
            try {
                consola.debug(`Translating ${companyId} description to ${lang}`);
                newCompanyTranslations[lang] = await translateContent(newDescription, lang);
            } catch (ex) {
                consola.error(`Failed to translate ${companyId} description to ${lang}`, ex);
                // Whatever is stored for this language belongs to the old
                // description. Keeping it would make the entry look up to date
                // and the translation would never be retried.
                delete newCompanyTranslations[lang];
            }
        }
    }

    return newCompanyTranslations;
}

/**
 * Brings `translations` up to date with `companies` and saves the result to
 * `translationsFilePath`.
 *
 * For every company the default-language entry is set to the current
 * description. Descriptions that do not need translating (blank or numeric)
 * are copied to every language as is; all others go through
 * `generateTranslations`, which only translates what is missing or outdated.
 *
 * The file is written after every 10th company that received new translations
 * (the script may run for a long time) and once more after the loop, so that
 * the final state is always persisted.
 *
 * @param translations Existing translations keyed by company id; mutated in place.
 * @param companies Current companies keyed by company id.
 */
async function translateCompanyDescriptions(translations: Translations, companies: Companies) {
    const syncedTranslations = translations;
    let translatedCompaniesCount = 0;
    let translationsCount = 0;

    const saveTranslations = () => {
        fs.writeFileSync(translationsFilePath, JSON.stringify(syncedTranslations, null, 4));
    };

    consola.info('Start translate company descriptions');

    // Sync translations with the companies object.
    for (const companyId in companies) {
        const company = companies[companyId];
        const newDescription = company.description;
        const previousTranslations: TranslationResult = syncedTranslations[companyId] ?? {
            [defaultLanguage]: newDescription,
        };
        const baseDescriptionChanged = previousTranslations[defaultLanguage] !== newDescription;

        // Update the base language now.
        previousTranslations[defaultLanguage] = newDescription;

        const needsTranslation = isDescriptionNeedToTranslate(newDescription);
        const companyTranslations = needsTranslation
            ? await generateTranslations(
                previousTranslations,
                baseDescriptionChanged,
                companyId,
                newDescription,
            )
            : copyBaseDescriptionToAllLang(previousTranslations, newDescription);

        // generateTranslations returns a copy, so previousTranslations still
        // holds the old per-language texts. Count only the languages whose text
        // was actually produced in this run (a re-translation that yields the
        // very same text is not counted).
        const newTranslationsCount = needsTranslation
            ? languages.filter((lang) => (
                companyTranslations[lang] !== undefined
                && companyTranslations[lang] !== previousTranslations[lang]
            )).length
            : 0;

        // Update translations for this company.
        syncedTranslations[companyId] = companyTranslations;

        if (newTranslationsCount > 0) {
            translationsCount += newTranslationsCount;
            translatedCompaniesCount += 1;

            // Save the intermediate result once in a while so that a crash or
            // an interrupted run does not lose the translations made so far.
            if (translatedCompaniesCount % 10 === 0) {
                consola.info(`Made ${translationsCount} translations in ${translatedCompaniesCount} companies, saving to ${translationsFilePath}`);
                saveTranslations();
            }
        }
    }

    // Always persist the final state: translations made since the last
    // checkpoint and any entries changed without calling the translator.
    consola.info(`Finished: made ${translationsCount} translations in ${translatedCompaniesCount} companies, saving to ${translationsFilePath}`);
    saveTranslations();
}

/**
 * Synchronizes the translations map with the current companies list.
 *
 * @param translations Translations keyed by company id; passed to both steps.
 * @param companies Current companies keyed by company id.
 */
async function syncTranslations(translations: Translations, companies: Companies): Promise<void> {
    // Step 1: drop entries of companies that are gone or have no description.
    removeObsoleteCompanyDescriptions(translations, companies);

    // Step 2: translate the descriptions of the current companies.
    await translateCompanyDescriptions(translations, companies);
}

/**
 * Script entry point.
 *
 * Reads the companies from `inputFilePath`, loads the translations from
 * `translationsFilePath` (creating the file with default-language entries when
 * it does not exist yet) and passes both to `syncTranslations`.
 *
 * @throws Error when the input file has no `companies` object or the
 * translations file does not contain a JSON object. File system and JSON
 * parsing errors propagate unchanged.
 */
async function main(): Promise<void> {
    // Narrows a parsed JSON value to a plain (non-array) object.
    const isRecord = (value: unknown): value is Record<string, unknown> => (
        typeof value === 'object' && value !== null && !Array.isArray(value)
    );

    consola.info('Start translating companies description');

    const rawData = fs.readFileSync(inputFilePath, 'utf-8');
    const parsedData: unknown = JSON.parse(rawData);

    // Fail with a clear message instead of an opaque TypeError further down.
    if (!isRecord(parsedData) || !isRecord(parsedData.companies)) {
        throw new Error(`${inputFilePath} must contain a "companies" object`);
    }
    const companies = parsedData.companies as Companies;

    consola.info(`Found ${Object.keys(companies).length} companies in ${inputFilePath}`);

    // First, prepare the translations file.
    let translations: Translations;

    if (!fs.existsSync(translationsFilePath)) {
        consola.info(`Prepare the initial i18n file at ${translationsFilePath}`);

        // Prepare a translations file if there's nothing yet.
        translations = initTranslations(companies);
        fs.writeFileSync(translationsFilePath, JSON.stringify(translations, null, 4));
    } else {
        consola.info(`Reading existing i18n file at ${translationsFilePath}`);

        const rawTranslations = fs.readFileSync(translationsFilePath, 'utf-8');
        const parsedTranslations: unknown = JSON.parse(rawTranslations);
        if (!isRecord(parsedTranslations)) {
            throw new Error(`${translationsFilePath} must contain a JSON object`);
        }
        translations = parsedTranslations as Translations;
    }

    consola.info(`Found ${Object.keys(translations).length} companies in ${translationsFilePath}`);

    await syncTranslations(translations, companies);
}

// Run the script. A failure is logged and reported through a non-zero exit
// code instead of leaving the promise returned by main() unhandled.
main().catch((error: unknown) => {
    consola.error('Failed to translate company descriptions', error);
    process.exitCode = 1;
});
7 changes: 6 additions & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
"compilerOptions": {
"module": "commonjs",
"strict": true,
"lib": ["ESNext"],
"lib": [
"ESNext"
],
"target": "ES2015",
"outDir": "out",
"sourceMap": true
}
}
Loading