import re

import tensorflow as tf
import tensorflow_datasets as tfds
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('punkt')

lemmatizer = WordNetLemmatizer()


def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    # Put a space around sentence punctuation so each mark becomes its own token.
    sentence = re.sub(r"([?.!¿])", r" \1 ", sentence)
    # Collapse runs of quotes/spaces into a single space.
    sentence = re.sub(r'[" "]+', " ", sentence)
    # Drop most punctuation characters entirely.
    sentence = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", sentence)
    # Keep only Latin/Turkish letters and the remaining punctuation marks.
    sentence = re.sub(r"[^a-zA-ZğüşöçıİĞÜŞÖÇ?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    # Lemmatize every token (WordNet lemmatizer, default noun POS).
    sentence = ' '.join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(sentence)])
    return sentence
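
# Illustrative example (not from the original source): after the filters above,
# only ! and ¿ survive as punctuation and each token is lemmatized with the
# default noun POS, so preprocess_sentence("Cars are FAST!!") yields something
# like "car are fast ! !".
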
def load_conversations(hparams, lines_file, conversations_file):
    # Map each line id (e.g. "L1045") to its utterance text.
    id2line = {}
    with open(lines_file, encoding="utf-8", errors="ignore") as file:
        lines = file.readlines()
    for line in lines:
        parts = line.replace("\n", "").split(" +++$+++ ")
        id2line[parts[0]] = parts[4]

    questions = []
    answers = []
    with open(conversations_file, "r") as file:
        lines = file.readlines()
    for line in lines:
        parts = line.replace("\n", "").split(" +++$+++ ")
        # The last field looks like "['L194', 'L195', ...]"; strip the brackets
        # and quotes to recover the ordered list of line ids.
        conversation = [line_id[1:-1] for line_id in parts[3][1:-1].split(", ")]
        # Each consecutive pair of utterances becomes a (question, answer) sample.
        for i in range(len(conversation) - 1):
            questions.append(preprocess_sentence(id2line[conversation[i]]))
            answers.append(preprocess_sentence(id2line[conversation[i + 1]]))
            if len(questions) >= hparams.max_samples:
                return questions, answers
    return questions, answers
def tokenize(hparams, tokenizer, questions, answers):
    tokenized_inputs, tokenized_outputs = [], []
    for (question, answer) in zip(questions, answers):
        # Wrap every encoded sentence with the start and end tokens.
        sentence1 = hparams.start_token + tokenizer.encode(question) + hparams.end_token
        sentence2 = hparams.start_token + tokenizer.encode(answer) + hparams.end_token
        # Keep only pairs where both sides fit within the maximum sequence length.
        if len(sentence1) <= hparams.max_length and len(sentence2) <= hparams.max_length:
            tokenized_inputs.append(sentence1)
            tokenized_outputs.append(sentence2)
    # Pad every sequence to max_length with trailing zeros.
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=hparams.max_length, padding="post")
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=hparams.max_length, padding="post")
    return tokenized_inputs, tokenized_outputs
def get_dataset(hparams):
    lines_file = "data/lines.txt"
    conversations_file = "data/conversations.txt"
    questions, answers = load_conversations(hparams, lines_file, conversations_file)

    # Build a subword tokenizer from the corpus and persist it for later inference.
    tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        questions + answers, target_vocab_size=2**13)
    tokenizer.save_to_file('tokenizer')

    # Reserve two extra ids for the start and end tokens.
    hparams.start_token = [tokenizer.vocab_size]
    hparams.end_token = [tokenizer.vocab_size + 1]
    hparams.vocab_size = tokenizer.vocab_size + 2

    questions, answers = tokenize(hparams, tokenizer, questions, answers)

    # Decoder inputs drop the last token; targets drop the first (teacher forcing).
    dataset = tf.data.Dataset.from_tensor_slices(
        ({"inputs": questions, "dec_inputs": answers[:, :-1]}, answers[:, 1:]))
    dataset = dataset.cache()
    dataset = dataset.shuffle(len(questions))
    dataset = dataset.batch(hparams.batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset, tokenizer
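

if __name__ == "__main__":
    # Usage sketch (added for illustration, not part of the original module):
    # get_dataset() only reads max_samples, max_length and batch_size from
    # hparams and fills in start_token, end_token and vocab_size itself, so a
    # SimpleNamespace is enough for a quick smoke test. It assumes the corpus
    # files exist at the hard-coded paths data/lines.txt and
    # data/conversations.txt; the real hparams object is defined elsewhere.
    from types import SimpleNamespace

    hparams = SimpleNamespace(max_samples=25000, max_length=40, batch_size=64)
    dataset, tokenizer = get_dataset(hparams)
    print("vocab size:", hparams.vocab_size)
    for features, labels in dataset.take(1):
        print("encoder inputs:", features["inputs"].shape)
        print("decoder inputs:", features["dec_inputs"].shape)
        print("targets:", labels.shape)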