From 6cd62e3d488afc338dcbb0f5f0cda209480fd730 Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Fri, 20 Jan 2023 06:16:26 +0000 Subject: [PATCH] [fix] Fix missing russian and update readme --- model/supervised_finetuning/README.md | 17 +++++++++++++++++ .../custom_datasets/translation.py | 5 +++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/model/supervised_finetuning/README.md b/model/supervised_finetuning/README.md index 822121d8..9f200847 100644 --- a/model/supervised_finetuning/README.md +++ b/model/supervised_finetuning/README.md @@ -60,6 +60,23 @@ python trainer.py --configs defaults your-model-name --deepspeed ## Dataset choices +To select the translation pair for the [WMT](https://huggingface.co/datasets/wmt19) and [TED Talk](https://huggingface.co/datasets/ted_talks_iwslt) translation datasets, append the supported language pair as a suffix to the dataset name: + +``` + datasets: + - wmt2019_zh-en + - wmt2019_ru-en + - wmt2019_de-en + - ted_trans_nl-en + - ted_trans_de-ja +``` + +Currently only these languages are supported via prompt translation: + +``` +ar,de,fr,en,it,nl,tr,ru,ms,ko,ja,zh +``` + ## Results Experimental results in wandb diff --git a/model/supervised_finetuning/custom_datasets/translation.py b/model/supervised_finetuning/custom_datasets/translation.py index a6d46e9e..79fff0d1 100644 --- a/model/supervised_finetuning/custom_datasets/translation.py +++ b/model/supervised_finetuning/custom_datasets/translation.py @@ -63,10 +63,11 @@ TRANSLATION_PROMPT = { "{}, jemahkan ayat ini kepada bahasa melayu" ], "en": ["{}. translate to english", "{} write in english", "english translation: '{}'"], - "tr": ["{}. translate to turkish", "{} write in turkish", "turkish translation: '{}'"], + "ru": ["помогите мне перевести это на русский : {}", "{} перевести на русский язык", "russian translation: '{}'"], + "tr": ["{}. türkçeye çevi̇ri̇n", "{} write in turkish", "turkish translation: '{}'", "türkçeye çevi̇rmek: {}"], "it": ["{}. 
translate to italian", "{} write in italian", "italian translation: '{}'"], "nl": ["{}. translate to dutch", "{} write in dutch", "dutch translation: '{}'"], - "vi": ["{}. translate to vietnamese", "{} write in vietnamese", "vietnamese translation: '{}'"], + "vi": ["{}. Dịch sang tiếng việt nam", "{} write in vietnamese", "vietnamese translation: '{}'"], "ar": ["{}. translate to arabic", "{} write in arabic", "arabic translation: '{}'"], } class TranslationPair(Dataset):