JoeArmani
committed on
Commit
·
3190e1e
0
Parent(s):
Initial commit
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .DS_Store +0 -0
- __pycache__/back_translator.cpython-310.pyc +0 -0
- __pycache__/dialogue_augmenter.cpython-310.pyc +0 -0
- __pycache__/paraphraser.cpython-310.pyc +0 -0
- __pycache__/pipeline_config.cpython-310.pyc +0 -0
- __pycache__/processing_pipeline.cpython-310.pyc +0 -0
- __pycache__/quality_metrics.cpython-310.pyc +0 -0
- __pycache__/schema_guided_dialogue_processor.cpython-310.pyc +0 -0
- __pycache__/taskmaster_processor.cpython-310.pyc +0 -0
- augmented_combined_dataset.json +0 -0
- back_translator.py +56 -0
- datasets/.DS_Store +0 -0
- datasets/schema_guided/dialogues_001.json +0 -0
- datasets/schema_guided/dialogues_002.json +0 -0
- datasets/schema_guided/dialogues_003.json +0 -0
- datasets/schema_guided/dialogues_004.json +0 -0
- datasets/schema_guided/dialogues_005.json +0 -0
- datasets/schema_guided/dialogues_006.json +0 -0
- datasets/schema_guided/dialogues_007.json +0 -0
- datasets/schema_guided/dialogues_008.json +0 -0
- datasets/schema_guided/dialogues_009.json +0 -0
- datasets/schema_guided/dialogues_010.json +0 -0
- datasets/schema_guided/dialogues_011.json +0 -0
- datasets/schema_guided/dialogues_012.json +0 -0
- datasets/schema_guided/dialogues_013.json +0 -0
- datasets/schema_guided/dialogues_014.json +0 -0
- datasets/schema_guided/dialogues_015.json +0 -0
- datasets/schema_guided/dialogues_016.json +0 -0
- datasets/schema_guided/dialogues_017.json +0 -0
- datasets/schema_guided/dialogues_018.json +0 -0
- datasets/schema_guided/dialogues_019.json +0 -0
- datasets/schema_guided/dialogues_020.json +0 -0
- datasets/schema_guided/dialogues_021.json +0 -0
- datasets/schema_guided/dialogues_022.json +0 -0
- datasets/schema_guided/dialogues_023.json +0 -0
- datasets/schema_guided/dialogues_024.json +0 -0
- datasets/schema_guided/dialogues_025.json +0 -0
- datasets/schema_guided/dialogues_026.json +0 -0
- datasets/schema_guided/dialogues_027.json +0 -0
- datasets/schema_guided/dialogues_028.json +0 -0
- datasets/schema_guided/dialogues_029.json +0 -0
- datasets/schema_guided/dialogues_030.json +0 -0
- datasets/schema_guided/dialogues_031.json +0 -0
- datasets/schema_guided/dialogues_032.json +0 -0
- datasets/schema_guided/dialogues_033.json +0 -0
- datasets/schema_guided/dialogues_034.json +0 -0
- datasets/schema_guided/dialogues_035.json +0 -0
- datasets/schema_guided/dialogues_036.json +0 -0
- datasets/schema_guided/dialogues_037.json +0 -0
- datasets/schema_guided/dialogues_038.json +0 -0
.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
__pycache__/back_translator.cpython-310.pyc
ADDED
|
Binary file (2.25 kB). View file
|
|
|
__pycache__/dialogue_augmenter.cpython-310.pyc
ADDED
|
Binary file (14.5 kB). View file
|
|
|
__pycache__/paraphraser.cpython-310.pyc
ADDED
|
Binary file (1.55 kB). View file
|
|
|
__pycache__/pipeline_config.cpython-310.pyc
ADDED
|
Binary file (2.06 kB). View file
|
|
|
__pycache__/processing_pipeline.cpython-310.pyc
ADDED
|
Binary file (6.51 kB). View file
|
|
|
__pycache__/quality_metrics.cpython-310.pyc
ADDED
|
Binary file (4.49 kB). View file
|
|
|
__pycache__/schema_guided_dialogue_processor.cpython-310.pyc
ADDED
|
Binary file (5.82 kB). View file
|
|
|
__pycache__/taskmaster_processor.cpython-310.pyc
ADDED
|
Binary file (5.72 kB). View file
|
|
|
augmented_combined_dataset.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
back_translator.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import (
|
| 2 |
+
MarianMTModel,
|
| 3 |
+
MarianTokenizer,
|
| 4 |
+
)
|
| 5 |
+
|
| 6 |
+
class BackTranslator:
    """
    Perform back-translation through a pivot language to generate text variations.

    The text is translated source -> pivot -> target -> source. With the default
    arguments this is English -> German -> Spanish -> English.

    Args:
        source_lang: Source language (default: 'en')
        pivot_lang: Pivot language (default: 'de')
        target_lang: Target language (default: 'es')

    Examples:
        back_translator = BackTranslator()
        back_translator.back_translate("Hello, how are you?")
    """
    def __init__(self, source_lang='en', pivot_lang='de', target_lang='es'):
        # Forward: source -> pivot (English to German by default)
        pivot_forward_model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{pivot_lang}'
        self.tokenizer_pivot_forward = MarianTokenizer.from_pretrained(pivot_forward_model_name)
        self.model_pivot_forward = MarianMTModel.from_pretrained(pivot_forward_model_name)

        # Pivot: pivot -> target (German to Spanish by default)
        pivot_backward_model_name = f'Helsinki-NLP/opus-mt-{pivot_lang}-{target_lang}'
        self.tokenizer_pivot_backward = MarianTokenizer.from_pretrained(pivot_backward_model_name)
        self.model_pivot_backward = MarianMTModel.from_pretrained(pivot_backward_model_name)

        # Backward: target -> source (Spanish to English by default)
        backward_model_name = f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
        self.tokenizer_backward = MarianTokenizer.from_pretrained(backward_model_name)
        self.model_backward = MarianMTModel.from_pretrained(backward_model_name)

    @staticmethod
    def _translate(text, tokenizer, model):
        """Translate a single string with one tokenizer/model pair and return the decoded text."""
        # The tokenizer is given a one-element batch; truncation=True is passed so
        # over-long inputs are cut by the tokenizer rather than rejected.
        encoded = tokenizer([text], padding=True, truncation=True, return_tensors='pt')
        generated = model.generate(**encoded)
        # Only one sequence was submitted, so the first decoded entry is the result.
        return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    def back_translate(self, text):
        """
        Perform back-translation through the pivot and target languages to generate text variations.

        Args:
            text (str): The input text to be back-translated

        Returns:
            str: The back-translated text. Empty or whitespace-only input is
            returned unchanged without running the models.
        """
        # Nothing to translate: skip three model passes on blank input.
        if not text or not text.strip():
            return text

        # 1. Source to pivot (English to German by default)
        pivot_text = self._translate(text, self.tokenizer_pivot_forward, self.model_pivot_forward)

        # 2. Pivot to target (German to Spanish by default)
        tgt_text_back = self._translate(pivot_text, self.tokenizer_pivot_backward, self.model_pivot_backward)

        # 3. Target back to source (Spanish to English by default)
        return self._translate(tgt_text_back, self.tokenizer_backward, self.model_backward)
|
datasets/.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
datasets/schema_guided/dialogues_001.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_002.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_003.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_004.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_005.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_006.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_007.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_008.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_009.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_010.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_011.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_012.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_013.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_014.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_015.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_016.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_017.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_018.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_019.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_020.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_021.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_022.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_023.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_024.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_025.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_026.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_027.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_028.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_029.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_030.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_031.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_032.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_033.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_034.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_035.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_036.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_037.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/schema_guided/dialogues_038.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|