import json
import logging
import unittest
from pathlib import Path
from transformers import AutoTokenizer
from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
from axolotl.prompters import ShareGPTPrompter
logging.basicConfig(level="INFO")
class TestPromptTokenizationStrategies(unittest.TestCase):
    def setUp(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
        self.tokenizer.add_special_tokens(
            {
                "bos_token": "",
                "eos_token": "",
                "unk_token": "",
            }
        )
    def test_sharegpt_integration(self):
        print(Path(__file__).parent)
        with open(Path(__file__).parent / "fixtures/conversation.json", "r") as fin:
            data = fin.read()
            conversation = json.loads(data)
        with open(Path(__file__).parent / "fixtures/conversation.tokenized.json", "r") as fin:
            data = fin.read()
            tokenized_conversation = json.loads(data)
        prompter = ShareGPTPrompter("chat")
        strat = ShareGPTPromptTokenizingStrategy(
            prompter,
            self.tokenizer,
            False,
            2048,
        )
        example = strat.tokenize_prompt(conversation)
        for fields in ["input_ids", "attention_mask", "labels"]:
            self.assertEqual(len(example[fields]), len(tokenized_conversation[fields]))
            self.assertEqual(example[fields], tokenized_conversation[fields])
if __name__ == "__main__":
    unittest.main()