Hyacinthax committed on
Commit
e24ec36
·
verified ·
1 Parent(s): f1568c1

Upload 5 files

Browse files

Initial commit. Known issue: on Colab, an error reports that ChatbotTrainer isn't instantiated even though it is; a fix will come in the next update.

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ preprocessed_dialogs.py filter=lfs diff=lfs merge=lfs -text
chatbotChat.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from chatbotTrainer import ChatbotTrainer # Import the ChatbotTrainer class
3
+
4
+
5
def main():
    """Run an interactive console chat session.

    Creates a ``ChatbotTrainer``, asks it to load its saved model when no
    model is attached yet, then reads lines from standard input and prints
    the generated reply for each one. The loop ends when the user types
    ``exit`` or when standard input is closed.
    """
    # Initialize the chatbot
    chatbot_trainer = ChatbotTrainer()

    # Ensure the model and tokenizer are loaded
    if chatbot_trainer.model is None:
        chatbot_trainer.load_model_file()

    print("Chatbot is ready. Type 'exit' to end the conversation.")

    while True:
        try:
            user_input = input("You: ").strip()
        except EOFError:
            # stdin was closed (Ctrl-D, piped input exhausted). Every further
            # input() call would raise again, so treating this as a generic
            # error would spin forever; end the session instead.
            print("\nChatbot: Goodbye! Have a great day!")
            break

        if not user_input:
            print("Chatbot: Please say something, I'm here to help!")
            continue

        if user_input.lower() == "exit":
            print("Chatbot: Goodbye! Have a great day!")
            break

        try:
            # Generate a response
            response = chatbot_trainer.generate_response(user_input)

            # Handle empty or invalid responses
            if not response or not response.strip():
                response = "I'm sorry, I don't have a response for that."

            # NOTE(review): the reply is labelled "Alan" while the trainer's
            # ``name`` attribute is "Alex" -- confirm which one is intended.
            print(f"Alan: {response}")
        except Exception as e:
            # Top-level boundary: report the failure and keep the chat alive.
            print(f"Chatbot: An error occurred while generating a response. ({e})")
36
+
37
+
38
+ # Run the chatbot if the script is executed directly
39
+ if __name__ == "__main__":
40
+ main()
chatbotTrainer.py ADDED
@@ -0,0 +1,833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import numpy as np
4
+ import tensorflow
5
+ from keras.callbacks import Callback, ReduceLROnPlateau
6
+ from tensorflow.keras.preprocessing.text import Tokenizer
7
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
8
+ from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dropout, Flatten
9
+ from tensorflow.keras.regularizers import l2
10
+ from tensorflow.keras.models import Model, load_model, model_from_json
11
+ from tensorflow.keras.optimizers import Adam
12
+ import matplotlib.pyplot as plt
13
+ import logging
14
+ import heapq
15
+ import pickle
16
+ import time
17
+ import json
18
+ import pdb
19
+
20
+ tensorflow.keras.mixed_precision.set_global_policy('mixed_float16')
21
+
22
+
23
class BeamSearchHelper:
    """Beam-search decoder driven by saved encoder/decoder Keras models.

    The helper keeps the tokenizer, the sequence length used for padding and
    the paths of the encoder/decoder model files. ``beam_search`` loads both
    models from those paths on every call. ``model``, ``top_k`` and ``top_p``
    are stored on the instance but are not read by ``beam_search``.
    """

    def __init__(self, model, tokenizer, max_seq_length, encoder_filename, decoder_filename, top_k=5,
                 temperature=1.0, top_p=0.9, beam_width=3, scaling_factor=10, min_word=3):
        self.model = model
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.top_k = top_k
        self.encoder_filename = encoder_filename
        self.decoder_filename = decoder_filename
        self.temperature = temperature
        self.scaling_factor = scaling_factor
        self.top_p = top_p
        self.beam_width = beam_width
        self.min_word = min_word
        self.logger = self.setup_logger()

    def setup_logger(self):
        """Return the shared "ChatbotBeamSearch" logger, configuring it once.

        ``logging.getLogger`` returns the same object for the same name, so
        handlers are only attached when the logger has none yet. Attaching
        them on every instantiation would emit each record once per helper
        ever created.
        """
        logger = logging.getLogger("ChatbotBeamSearch")
        logger.setLevel(logging.DEBUG)
        if logger.handlers:
            return logger

        # Console: INFO and above, short format.
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        logger.addHandler(console_handler)

        # File: everything from DEBUG up, with timestamps.
        file_handler = logging.FileHandler("chatbotBeam.log")
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logger.addHandler(file_handler)
        return logger

    def beam_search(self, input_text):
        """Decode a reply for ``input_text`` and return it as a string.

        Each beam entry is ``(seq, state_h, state_c, score, decoded_words)``
        where ``score`` is the cumulative negative log-probability of the
        tokens chosen so far (lower is better). Beams are ranked by that
        score divided by their length. Decoding returns as soon as any
        expanded candidate is the ``<end>`` token and at least ``min_word``
        tokens have been produced; otherwise it stops after
        ``max_seq_length`` steps and returns the best-ranked beam.
        """
        # Load encoder and decoder models
        encoder_model = load_model(self.encoder_filename)
        decoder_model = load_model(self.decoder_filename)

        # Preprocess input
        input_seqs = self.tokenizer.texts_to_sequences([input_text])
        input_seqs = pad_sequences(input_seqs, maxlen=self.max_seq_length, padding='post')

        # Encode input sequence
        state_h, state_c = encoder_model.predict(input_seqs)

        # Ensure batch size of 1
        state_h = state_h[0:1, :]
        state_c = state_c[0:1, :]

        # Initialize decoder with <start> token
        start_token_index = self.tokenizer.word_index.get('<start>', 1)
        end_token_index = self.tokenizer.word_index.get('<end>', -1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = start_token_index

        # Initialize beam search candidates
        sequences = [(target_seq, state_h, state_c, 0.0, [])]

        for _ in range(self.max_seq_length):
            all_candidates = []

            for seq, beam_h, beam_c, score, decoded_words in sequences:
                # Predict the next token
                output_tokens, beam_h, beam_c = decoder_model.predict([seq, beam_h, beam_c])

                logits = output_tokens[0, -1, :] * self.scaling_factor
                logits = logits / self.temperature
                exp_logits = np.exp(logits - np.max(logits))  # Prevent overflow
                probabilities = exp_logits / np.sum(exp_logits)

                # Get the top beam_width candidate indices
                top_indices = np.argsort(probabilities)[-self.beam_width:]

                for idx in top_indices:
                    # Keep the raw cumulative negative log-likelihood. The
                    # length normalisation happens only when ranking, so an
                    # already-averaged value is never averaged again.
                    candidate_score = score - np.log(probabilities[idx] + 1e-8)

                    # Append predicted token
                    new_decoded_words = decoded_words + [idx]
                    new_seq = np.copy(seq)
                    new_seq[0, 0] = idx  # Set new token in sequence

                    # Enforce min_word before stopping at <end>
                    if idx == end_token_index:
                        if len(new_decoded_words) < self.min_word:
                            continue  # Ignore <end> if min_word isn't reached
                        return " ".join(self.tokenizer.index_word[i] for i in new_decoded_words
                                        if i in self.tokenizer.index_word)

                    # Add to candidate list
                    all_candidates.append((new_seq, beam_h, beam_c, candidate_score, new_decoded_words))

            # Select best beam_width sequences
            if not all_candidates:  # If no valid candidates, exit early
                break

            # Every candidate holds at least one token, so the division is safe.
            sequences = sorted(all_candidates, key=lambda cand: cand[3] / len(cand[4]))[:self.beam_width]

        # Convert token indices back to words
        best_sequence = sequences[0][4]
        return " ".join(self.tokenizer.index_word[idx] for idx in best_sequence
                        if idx in self.tokenizer.index_word)
122
+
123
class BeamState:
    """One beam-search hypothesis; instances order by ``score``."""

    def __init__(self, sequence, score, state, logger):
        # Plain record: the values are stored exactly as passed in.
        self.sequence, self.score = sequence, score
        self.state, self.logger = state, logger

    def __lt__(self, other):
        """Return True when this state's score is lower than ``other``'s."""
        mine, theirs = self.score, other.score
        return mine < theirs

    def log(self, message):
        """Forward ``message`` to the attached logger at DEBUG level."""
        emit = self.logger.debug
        emit(message)
135
+
136
+
137
+
138
class MonitorEarlyStopping(Callback):
    """Early-stopping callback that also records non-improving epochs.

    After each epoch the monitored metric is compared with the best value
    seen so far ('min': lower is better, 'max': higher is better). Epochs
    without improvement increase ``wait`` and are appended (1-based) to
    ``stopped_epoch_list``. Once ``wait`` reaches ``patience`` training is
    stopped and, if requested, the best weights are restored.
    """

    def __init__(self, monitor='val_loss', patience=3, mode='min', restore_best_weights=True, verbose=1):
        super().__init__()
        self.monitor = monitor
        self.patience = patience
        self.mode = mode
        self.restore_best_weights = restore_best_weights
        self.verbose = verbose
        self.best_weights = None
        self.best_epoch = None
        self.wait = 0
        self.best_value = float('inf') if mode == 'min' else -float('inf')
        self.stopped_epoch_list = []  # 1-based epochs where the metric did not improve

    def on_epoch_end(self, epoch, logs=None):
        """Update the best value / patience counter and stop when exhausted."""
        # ``logs`` defaults to None in the signature, so guard before .get().
        current_value = (logs or {}).get(self.monitor)
        if current_value is None:
            if self.verbose > 0:
                print(f"Warning: Metric '{self.monitor}' is not available in logs.")
            return

        # Check for improvement based on mode
        improved = (
            (self.mode == 'min' and current_value < self.best_value)
            or (self.mode == 'max' and current_value > self.best_value)
        )
        if improved:
            self.best_value = current_value
            self.best_weights = self.model.get_weights()
            self.best_epoch = epoch
            self.wait = 0
            if self.verbose > 0:
                print(f"Epoch {epoch + 1}: {self.monitor} improved to {self.best_value:.4f}")
        else:
            self.wait += 1
            if self.verbose > 0:
                print(f"Epoch {epoch + 1}: {self.monitor} did not improve. Patience: {self.wait}/{self.patience}")
            # NOTE(review): nesting reconstructed from a flattened diff; this
            # append is assumed to run for every non-improving epoch -- confirm.
            self.stopped_epoch_list.append(epoch + 1)

        # Stop training if patience is exceeded
        if self.wait < self.patience:
            return

        # A metric that never compares as an improvement (e.g. NaN) leaves
        # best_epoch/best_weights as None; there is then nothing to report
        # or restore, and touching them would raise.
        have_best = self.best_weights is not None
        if self.verbose > 0:
            if have_best:
                print(f"Stopping early at epoch {epoch + 1}. Best {self.monitor}: {self.best_value:.4f} at epoch {self.best_epoch + 1}")
            else:
                print(f"Stopping early at epoch {epoch + 1}. {self.monitor} never improved, so there are no best weights to restore.")
        self.model.stop_training = True
        if self.restore_best_weights and have_best:
            if self.verbose > 0:
                print(f"Restoring best model weights from epoch {self.best_epoch + 1}.")
            self.model.set_weights(self.best_weights)
182
+
183
+
184
+ class ChatbotTrainer:
185
def __init__(self):
    """Set hyper-parameters, logger and tokenizer, and load saved models.

    The tokenizer is unpickled from ``tokenizer_save_path`` when that file
    exists; otherwise a fresh ``Tokenizer`` is created and seeded with the
    ``<start>`` and ``<end>`` tokens. When the full, encoder and decoder
    model files all exist they are loaded through ``load_model_file``.
    """
    # Corpus Setup
    self.corpus = None
    self.all_vocab_size = 0

    # Model Setup
    self.model = None
    self.name = "Alex"
    self.model_filename = f"{self.name}_model.keras"
    self.encoder_filename = "encoder.keras"
    self.decoder_filename = "decoder.keras"
    self.tokenizer_save_path = "chatBotTokenizer.pkl"
    self.tokenizer = None
    self.reverse_tokenizer = None
    self.embedding_dim = 64
    self.max_seq_length = 64
    self.learning_rate = 0.0013
    self.optimizer = Adam(learning_rate=self.learning_rate, clipnorm=1.0)
    self.batch_size = 16
    self.epochs = 30
    self.early_patience = self.epochs // 2
    self.lstm_units = 128
    self.dropout = 0.1
    self.recurrent_dropout = 0.1
    self.test_size = 0.2
    self.max_vocabulary = 69000

    # Model parts: declared here, filled in later
    self.encoder_model = None
    self.encoder_inputs = None
    self.decoder_inputs = None
    self.decoder_outputs = None
    self.decoder_model = None
    self.max_vocab_size = None
    self.config = None

    # Training Setup
    self.vocabularyList = []
    self.troubleList = []
    self.running_trouble = []

    # Prediction Setup (Everything here will take priority)
    self.min_word = 10  # Only for generate_response
    self.temperature = 0.8
    self.scaling_factor = 100
    self.logger = self.setup_logger()
    self.beam_width = 9
    self.top_p = 0.7
    self.top_k = 3

    # Log Metrics...
    # NOTE(review): the indentation inside this multi-line message was not
    # recoverable from the flattened diff; only whitespace may differ.
    self.logger.info(f"""Metrics:\n
        Embedding/MaxSeqLength:({self.embedding_dim}, {self.max_seq_length})\n
        Batch Size: {self.batch_size}\n
        LSTM Units: {self.lstm_units}\n
        Epochs: {self.epochs}\n
        Dropout: ({self.dropout}, {self.recurrent_dropout})\n
        Test Split: {self.test_size}\n\n""")

    # Tokenizer setup & propagation
    if os.path.exists(self.tokenizer_save_path):
        # NOTE: unpickling runs code embedded in the file; only load
        # tokenizer files that this project wrote itself.
        with open(self.tokenizer_save_path, 'rb') as tokenizer_load_file:
            self.tokenizer = pickle.load(tokenizer_load_file)
        self.reverse_tokenizer = {index: word for word, index in self.tokenizer.word_index.items()}
        self.all_vocab_size = self.tokenizer.num_words
        # word_index keys are unique and the list starts empty, so copying
        # the keys gives the same list as appending each unseen word, without
        # a linear membership scan per word.
        self.vocabularyList = list(self.tokenizer.word_index)
        self.logger.info("Tokenizer loaded successfully.")
    else:
        self.logger.warning("Tokenizer not found, making now... ")
        self.tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-/.:;=?@[\\]^_`{|}~\t\n')

        # Save '<start>' and '<end>' to the word index, numbering from 0
        self.tokenizer.num_words = 0
        self.vocabularyList = ['<start>', '<end>']
        for token in self.vocabularyList:
            if token not in self.tokenizer.word_index:
                self.tokenizer.word_index[token] = self.tokenizer.num_words
                self.tokenizer.index_word[self.tokenizer.num_words] = token
                self.all_vocab_size += 1
                self.tokenizer.num_words += 1

        # NOTE(review): nesting reconstructed from a flattened diff; the
        # three statements below are assumed to belong to the new-tokenizer
        # branch only -- confirm against the real file.
        # Set Tokenizer Values:
        self.tokenizer.num_words = len(self.tokenizer.word_index)
        self.tokenizer.oov_token = "<oov>"

        self.logger.info(f"New Tokenizer Index's: {self.tokenizer.word_index}")

    # Load previously saved models only when all three files are present.
    if (os.path.exists(self.model_filename)
            and os.path.exists(self.encoder_filename)
            and os.path.exists(self.decoder_filename)):
        self.model, self.encoder_model, self.decoder_model = self.load_model_file()
285
+
286
def save_full_weights(self, encoder_path="encoder.weights.h5", decoder_path="decoder.weights.h5"):
    """Write the encoder and decoder weights to the given paths.

    Existing files at either path are removed first. When either model is
    missing, only a warning is logged.
    """
    models_ready = self.encoder_model is not None and self.decoder_model is not None
    if not models_ready:
        self.logger.warning(
            "Encoder or Decoder model does not exist. Ensure models are initialized before saving weights.")
        return

    # Clear out stale weight files before writing the new ones.
    for stale_path in (encoder_path, decoder_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    self.encoder_model.save_weights(encoder_path)
    self.decoder_model.save_weights(decoder_path)
    self.logger.info(f"Encoder weights saved at {encoder_path}.")
    self.logger.info(f"Decoder weights saved at {decoder_path}.")
299
+
300
+
301
def load_corpus(self, corpus_path):
    """Load a ConvoKit corpus from ``corpus_path`` into ``self.corpus``."""
    # Imported lazily so the dependency is only needed when a corpus is loaded.
    import convokit

    log = self.logger.info
    log("Loading and preprocessing corpus...")
    loaded = convokit.Corpus(filename=corpus_path)
    self.corpus = loaded
    log("Corpus loaded and preprocessed successfully.")
306
+
307
def load_full_weights(self, encoder_path="encoder.weights.h5", decoder_path="decoder.weights.h5"):
    """Load encoder and decoder weights from the given paths.

    When either model is missing, nothing is loaded and a warning is logged.
    """
    if self.encoder_model is None or self.decoder_model is None:
        self.logger.warning(
            "Encoder or Decoder model does not exist. Ensure models are initialized before loading weights.")
        return

    # Encoder first, then decoder; the log lines follow once both succeed.
    self.encoder_model.load_weights(encoder_path)
    self.decoder_model.load_weights(decoder_path)
    for message in (f"Encoder weights loaded from {encoder_path}.",
                    f"Decoder weights loaded from {decoder_path}."):
        self.logger.info(message)
316
+
317
def plot_and_save_training_metrics(self, history, speaker):
    """Draw loss and accuracy curves from a Keras ``History`` object.

    Two side-by-side panels are plotted (training vs. validation). The
    figure is not written to disk and ``speaker`` is not used; the returned
    string says so.
    """
    plt.figure(figsize=(10, 6))

    # (subplot position, train key, validation key, train label,
    #  validation label, title, y-axis label)
    panels = (
        (1, 'loss', 'val_loss', 'Training Loss', 'Validation Loss',
         'Training and Validation Loss', 'Loss'),
        (2, 'accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
         'Training and Validation Accuracy', 'Accuracy'),
    )
    for position, train_key, val_key, train_label, val_label, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    # Saving the figure to "<speaker>_training_metrics.png" is disabled.
    return "Did Not Save in Jupyter Notebook. See plot_and_save_training_metrics"
346
+
347
+
348
def setup_logger(self):
    """Return the shared "ChatbotTrainer" logger, configuring it once.

    The logger gets a console handler (INFO and above) and a file handler
    writing DEBUG and above to ``chatbot.log``. ``logging.getLogger``
    returns the same object for the same name, so handlers are attached
    only when none are present. Adding them on every call would print each
    record once per trainer ever created in the process.
    """
    logger = logging.getLogger("ChatbotTrainer")
    logger.setLevel(logging.DEBUG)
    if logger.handlers:
        return logger

    # Console handler at INFO for progress reports
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
    logger.addHandler(console_handler)

    # File handler at DEBUG so progress reports and errors are both captured
    file_handler = logging.FileHandler("chatbot.log")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)

    return logger
367
+
368
+ # This function allows to reformat the embedding weights to a new max_vocabulary
369
+ # If max_vocabulary(defined in build_model) is changed incrementally (or set large to begin with; this is N/A)
370
def redo_embeddings(self):
    """Rebuild the "embedding" layer's matrix with ``max_vocabulary`` rows.

    A matrix of normally distributed random values is created, the existing
    rows are copied into its top, and the result is handed back to the layer.

    NOTE(review): Keras ``set_weights`` presumably rejects an array whose
    shape differs from the layer's current weights, and the row copy below
    requires ``max_vocabulary`` to be at least the current row count --
    confirm this is only called when that holds.
    """
    layer_name = "embedding"
    current = self.model.get_layer(layer_name).get_weights()[0]
    old_rows, width = current.shape[0], current.shape[1]

    # Random initialisation for the rows that have no trained values yet.
    target_rows = self.max_vocabulary
    resized = np.random.normal(size=(target_rows, width))

    # Keep the trained rows in their original positions.
    resized[:old_rows, :] = current

    self.model.get_layer(layer_name).set_weights([resized])
384
+
385
def save_tokenizer(self, texts=None):
    """Register new tokens, refit the tokenizer and pickle it to disk.

    Args:
        texts: optional iterable of tokens. Each token missing from the
            tokenizer's ``word_index`` is given the next index while
            ``num_words`` stays below ``max_vocabulary``; the tokenizer is
            then fitted on ``texts``.

    The tokenizer is pickled to ``tokenizer_save_path`` and, afterwards,
    ``num_words`` is reset to the size of ``word_index``. When no tokenizer
    exists only a warning is logged.
    """
    if self.tokenizer:
        if texts:
            for token in texts:
                if token not in self.tokenizer.word_index and self.tokenizer.num_words < self.max_vocabulary:
                    self.tokenizer.word_index[token] = self.tokenizer.num_words
                    self.all_vocab_size += 1
                    self.tokenizer.num_words += 1
            self.max_vocab_size = self.tokenizer.num_words

            # NOTE(review): nesting reconstructed from a flattened diff; the
            # fit is assumed to run only when ``texts`` is given. Keras
            # documents fit_on_texts as rebuilding word_index from word
            # counts, which would replace the manual indices assigned
            # above -- confirm that is intended.
            self.tokenizer.fit_on_texts(texts)

        with open(self.tokenizer_save_path, 'wb') as tokenizer_save_file:
            pickle.dump(self.tokenizer, tokenizer_save_file)

        # Reset after pickling, as before: the file keeps the incremented value.
        self.tokenizer.num_words = len(self.tokenizer.word_index)

    elif self.tokenizer is None:
        # Identity check: ``== None`` can be hijacked by a custom __eq__.
        self.logger.warning("No tokenizer to save.")
406
+
407
def save_embedding_weights(self, filepath="embedding_weights.npy"):
    """Save the "embedding" layer's weight matrix to ``filepath`` with NumPy.

    An existing file at ``filepath`` is removed first. When no model is
    attached, only a warning is logged.
    """
    if self.model is None:
        self.logger.warning("No model exists to extract embedding weights.")
        return

    # get_weights() returns a list; the matrix is its first element.
    matrix = self.model.get_layer('embedding').get_weights()[0]

    if os.path.exists(filepath):
        os.remove(filepath)

    np.save(filepath, matrix)
    self.logger.info(f"Embedding weights saved successfully at {filepath}.")
422
+
423
def load_embedding_weights(self, filepath="embedding_weights.npy"):
    """Load a saved matrix from ``filepath`` into the "embedding" layer.

    The matrix is applied only when its first two dimensions equal the
    layer's ``input_dim`` and ``output_dim``; otherwise an error is logged.
    When no model is attached, only a warning is logged.
    """
    if self.model is None:
        self.logger.warning("No model exists to load embedding weights into.")
        return

    layer = self.model.get_layer('embedding')
    stored = np.load(filepath)

    # Rows are checked first; columns are only compared when the rows match.
    rows_match = layer.input_dim == stored.shape[0]
    if rows_match and layer.output_dim == stored.shape[1]:
        layer.set_weights([stored])
        self.logger.info(f"Embedding weights loaded successfully from {filepath}.")
        return

    self.logger.error("Mismatch in embedding weights shape. Ensure the model and weights are compatible.")
439
+
440
def clean_text(self, text):
    """Normalise ``text`` and record its words in ``self.vocabularyList``.

    The text is lower-cased and stripped, known contractions are expanded,
    every character other than letters, digits, apostrophes and spaces is
    replaced by a space, and runs of whitespace are collapsed. Each
    resulting word not yet in ``self.vocabularyList`` is appended to it.

    Returns:
        The cleaned string.
    """
    # (contraction, expansion) pairs, applied in this order.
    expansions = (
        ("i'm", "i am"), ("he's", "he is"), ("she's", "she is"), ("that's", "that is"),
        ("what's", "what is"), ("where's", "where is"), ("who's", "who is"), ("how's", "how is"),
        ("it's", "it is"), ("let's", "let us"), ("they're", "they are"), ("we're", "we are"),
        ("you're", "you are"), ("i've", "i have"), ("you've", "you have"), ("we've", "we have"),
        ("they've", "they have"), ("i'd", "i would"), ("you'd", "you would"), ("he'd", "he would"),
        ("she'd", "she would"), ("we'd", "we would"), ("they'd", "they would"), ("i'll", "i will"),
        ("you'll", "you will"), ("he'll", "he will"), ("she'll", "she will"), ("we'll", "we will"),
        ("they'll", "they will"), ("don't", "do not"), ("doesn't", "does not"), ("didn't", "did not"),
        ("won't", "will not"), ("wouldn't", "would not"), ("can't", "cannot"), ("couldn't", "could not"),
        ("shouldn't", "should not"), ("mightn't", "might not"), ("mustn't", "must not"),
        ("isn't", "is not"), ("aren't", "are not"), ("wasn't", "was not"), ("weren't", "were not"),
        ("haven't", "have not"), ("hasn't", "has not"), ("hadn't", "had not"),
    )

    cleaned = text.lower().strip()

    # Whole-word replacement of each contraction.
    for short_form, long_form in expansions:
        pattern = r"\b" + re.escape(short_form) + r"\b"
        cleaned = re.sub(pattern, long_form, cleaned)

    # Anything that is not a letter, digit, apostrophe or space becomes a space...
    cleaned = re.sub(r"[^a-zA-Z0-9' ]", " ", cleaned)
    # ...and whitespace runs shrink to a single space.
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # Remember every word that has not been seen before.
    vocabulary = self.vocabularyList
    for token in cleaned.split():
        if token in vocabulary:
            continue
        vocabulary.append(token)

    return cleaned
473
+
474
+ # Training
475
def preprocess_texts(self, input_texts, target_texts):
    """Turn two whitespace-separated strings into padded id matrices.

    Each string is split on single spaces, every piece is cleaned with
    ``clean_text``, the tokenizer is refreshed with the accumulated
    vocabulary, and each non-empty cleaned piece is wrapped as
    ``<start> piece <end>`` before being converted to ids and padded or
    truncated (at the end) to ``max_seq_length``.

    Returns:
        ``(input_sequences, target_sequences)`` with the same number of rows.
    """
    cleaned_inputs = [self.clean_text(piece) for piece in input_texts.split(" ")]
    cleaned_targets = [self.clean_text(piece) for piece in target_texts.split(" ")]
    self.save_tokenizer(self.vocabularyList)

    # Filter on each cleaned piece. The previous condition tested the list
    # itself, which is always truthy, so pieces that cleaned down to an empty
    # string were still emitted as "<start>  <end>" samples.
    input_texts = [f"<start> {piece} <end>" for piece in cleaned_inputs if piece]
    target_texts = [f"<start> {piece} <end>" for piece in cleaned_targets if piece]

    input_sequences = self.tokenizer.texts_to_sequences(input_texts)
    target_sequences = self.tokenizer.texts_to_sequences(target_texts)

    # One padding pass per side is enough; padding an already padded matrix
    # to the same length leaves it unchanged.
    input_sequences = pad_sequences(input_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')
    target_sequences = pad_sequences(target_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')

    # Ensure target_sequences has enough samples
    if target_sequences.shape[0] != input_sequences.shape[0]:
        print(f"Padding mismatch! Input: {input_sequences.shape}, Target: {target_sequences.shape}")
        # np.resize repeats or cuts the data to reach the requested shape.
        target_sequences = np.resize(target_sequences, input_sequences.shape)

    # Ensure both sides have the same number of sequences
    min_samples = min(len(input_sequences), len(target_sequences))
    input_sequences = input_sequences[:min_samples]
    target_sequences = target_sequences[:min_samples]

    print(f"Preprocessed Encoder Input Shape: {input_sequences.shape}")
    print(f"Preprocessed Decoder Input Shape: {target_sequences.shape}")
    print(f"Preprocessed Decoder Target Shape: {target_sequences.shape}")

    return input_sequences, target_sequences
505
+
506
+ # Prediction
507
def preprocess_input(self, texts):
    """Convert one user utterance into a padded ``(1, max_seq_length)`` id array.

    The text is cleaned with ``clean_text``, wrapped in ``<start>`` and
    ``<end>``, converted to token ids by the tokenizer and padded at the end.
    Over-long inputs are truncated at the end as well, matching the
    ``truncating='post'`` used by ``preprocess_texts`` for training data.
    The default front truncation would drop ``<start>`` instead.
    """
    cleaned = self.clean_text(texts)

    # clean_text already lower-cases and collapses whitespace, so a plain
    # split yields the words without empty entries.
    tokens = ["<start>", *cleaned.split(), "<end>"]

    # Convert words to token IDs; a single pre-split "text" is passed in.
    id_lists = self.tokenizer.texts_to_sequences([tokens])
    flat_ids = [token_id for id_list in id_lists for token_id in id_list]

    batch = np.array(flat_ids).reshape(1, -1)  # (1, length)

    return pad_sequences(batch, maxlen=self.max_seq_length, padding='post', truncating='post')
525
+
526
def build_model(self):
    """Create the encoder, decoder and combined training model if needed.

    Nothing is rebuilt when ``self.model`` is already set. Layers are
    created in a fixed order (encoder first, then decoder) because they are
    unnamed and other methods look the embedding up by the name "embedding".

    Returns:
        ``(self.model, self.encoder_model, self.decoder_model)``.
    """
    if self.model:
        return self.model, self.encoder_model, self.decoder_model

    vocab_size = self.max_vocabulary
    units = self.lstm_units

    # ----- Encoder: token ids -> embedding -> LSTM final states -----
    self.encoder_inputs = Input(shape=(self.max_seq_length,))
    enc_embedded = Embedding(
        input_dim=vocab_size,
        output_dim=self.embedding_dim,
        mask_zero=True,
        embeddings_regularizer=l2(0.01)
    )(self.encoder_inputs)
    enc_lstm = LSTM(
        units,
        return_state=True,
        return_sequences=False,
        dropout=self.dropout,
        recurrent_dropout=self.recurrent_dropout
    )
    _, enc_h, enc_c = enc_lstm(enc_embedded)
    enc_states = [enc_h, enc_c]
    self.encoder_model = Model(self.encoder_inputs, enc_states)

    # ----- Decoder: token ids + incoming states -> vocabulary distribution -----
    self.decoder_inputs = Input(shape=(None,), name='decoder_input')
    dec_embedded = Embedding(
        input_dim=vocab_size,
        output_dim=self.embedding_dim,
        mask_zero=True
    )(self.decoder_inputs)
    dec_lstm = LSTM(
        units,
        return_sequences=True,
        return_state=True,
        dropout=self.dropout,
        recurrent_dropout=self.recurrent_dropout,
        kernel_regularizer=l2(0.001)
    )
    state_in_h = Input(shape=(units,))
    state_in_c = Input(shape=(units,))
    state_inputs = [state_in_h, state_in_c]
    step_output, dec_h, dec_c = dec_lstm(dec_embedded, initial_state=state_inputs)
    dec_states = [dec_h, dec_c]
    dec_dense = Dense(vocab_size, activation='softmax', kernel_regularizer=l2(0.001),
                      bias_regularizer=l2(0.001))
    self.decoder_outputs = dec_dense(step_output)
    # Stand-alone decoder: its initial states are supplied as inputs.
    self.decoder_model = Model([self.decoder_inputs] + state_inputs,
                               [self.decoder_outputs] + dec_states)

    # ----- Combined model: decoder starts from the encoder's states -----
    train_output, _, _ = dec_lstm(dec_embedded, initial_state=enc_states)
    self.decoder_outputs = dec_dense(train_output)
    self.model = Model([self.encoder_inputs, self.decoder_inputs], self.decoder_outputs)
    self.model.compile(
        optimizer=self.optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return self.model, self.encoder_model, self.decoder_model
582
+
583
def load_model_config(self, config_filename="model_config.json"):
    """Rebuild and compile the model described by a JSON config file.

    The file is expected to hold a ``"model_config"`` entry (passed to
    ``model_from_json``) and an ``"optimizer"`` entry (passed to
    ``Adam.from_config``).

    Returns:
        The compiled model, or ``None`` when ``config_filename`` does not exist.
    """
    if not os.path.exists(config_filename):
        return None

    with open(config_filename, "r", encoding="utf-8") as config_file:
        stored = json.load(config_file)
    self.logger.info(f"Loading model config from {config_filename}")

    # Architecture first, then the optimizer it will be compiled with.
    self.model = model_from_json(stored["model_config"])
    self.optimizer = Adam.from_config(stored["optimizer"])

    compile_options = {
        "optimizer": self.optimizer,
        "loss": 'sparse_categorical_crossentropy',
        "metrics": ['accuracy'],
    }
    self.model.compile(**compile_options)
    self.logger.info("Model compiled successfully after loading config.")
    return self.model
604
+
605
+ def train_model(self, input_texts, target_texts, conversation_id, speaker):
606
+ # We Define running_trouble at the start of a new training
607
+ self.running_trouble = []
608
+
609
+ # We make sure everything to do with the model is loaded properly, or generated if it doesn't exist
610
+ loaded_model = self.load_model_config(config_filename="model_config.json")
611
+ if os.path.exists(self.model_filename) and os.path.exists(self.encoder_filename) and os.path.exists(
612
+ self.decoder_filename):
613
+ self.model, self.encoder_model, self.decoder_model = self.load_model_file()
614
+ self.logger.info("Loaded full model from saved files.")
615
+
616
+ elif not os.path.exists(self.model_filename) and not os.path.exists(self.encoder_filename) and not os.path.exists(
617
+ self.decoder_filename) and loaded_model:
618
+ self.model = loaded_model
619
+ elif not self.model and not self.encoder_model and not self.decoder_model:
620
+ self.logger.info("Building new model...")
621
+ self.model, self.encoder_model, self.decoder_model = self.build_model()
622
+
623
+ # Once everything loads properly we start training:
624
+ self.logger.info(f"Training Model for ConversationID: {conversation_id}")
625
+
626
+ if self.corpus is None or self.tokenizer is None:
627
+ raise ValueError("Corpus or tokenizer is not initialized.")
628
+
629
+ # Preprocess the texts into sequences
630
+ input_sequences, target_sequences = input_texts, target_texts
631
+
632
+ # Debug Lines
633
+ # for token in ['<start>', '<end>', '<oov>']:
634
+ # print(f"Index of {token}: {self.tokenizer.word_index.get(token)}")
635
+
636
+ # Stats
637
+ self.logger.info(f"Num Words: {self.tokenizer.num_words}")
638
+ self.logger.info(f"Vocabulary Size: {len(self.tokenizer.word_index)}")
639
+ self.logger.info(f"Length of Vocabulary List: {len(self.vocabularyList)}")
640
+
641
+ # Prepare training data
642
+ encoder_input_data = input_sequences
643
+ decoder_input_data = target_sequences[:, :-1]
644
+ decoder_target_data = target_sequences[:, 1:]
645
+
646
+ min_samples = min(encoder_input_data.shape[0], decoder_input_data.shape[0])
647
+
648
+ encoder_input_data = encoder_input_data[:min_samples]
649
+ decoder_input_data = decoder_input_data[:min_samples]
650
+ decoder_target_data = decoder_target_data[:min_samples]
651
+
652
+ self.logger.info(f"Encoder Input Data Shape: {encoder_input_data.shape}")
653
+ self.logger.info(f"Decoder Input Data Shape: {decoder_input_data.shape}")
654
+ self.logger.info(f"Decoder Target Data Shape: {decoder_target_data.shape}")
655
+
656
+ # Instantiate the callback
657
+ early_stopping = MonitorEarlyStopping(
658
+ monitor='val_loss',
659
+ patience=self.early_patience,
660
+ mode='min',
661
+ restore_best_weights=True,
662
+ verbose=1
663
+ )
664
+
665
+ lr_patience = self.early_patience // 3
666
+ lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=lr_patience, verbose=1)
667
+
668
+ # Train the model
669
+ history = self.model.fit(
670
+ [encoder_input_data, decoder_input_data],
671
+ np.expand_dims(decoder_target_data, -1),
672
+ batch_size=self.batch_size,
673
+ epochs=self.epochs,
674
+ validation_split=self.test_size,
675
+ callbacks=[early_stopping, lr_scheduler]
676
+ )
677
+
678
+ # Log any early stopping events
679
+ if len(early_stopping.stopped_epoch_list) > 0:
680
+ self.troubleList.append(speaker)
681
+
682
+ # Reset stopped epoch list & save to running trouble
683
+ self.running_trouble = [item for item in early_stopping.stopped_epoch_list]
684
+ early_stopping.stopped_epoch_list = []
685
+
686
+ # Evaluate the model on the training data
687
+ test_loss, test_accuracy = self.model.evaluate(
688
+ [encoder_input_data, decoder_input_data],
689
+ np.expand_dims(decoder_target_data, -1),
690
+ batch_size=self.batch_size
691
+ )
692
+
693
+ # Save training metrics as a plot
694
+ plot_filename = self.plot_and_save_training_metrics(history, speaker)
695
+ self.logger.info(f"Training metrics plot saved as {plot_filename}")
696
+ self.logger.info(f"Test loss for Conversation {speaker}: {test_loss}")
697
+ self.logger.info(f"Test accuracy for Conversation {speaker}: {test_accuracy}")
698
+ self.logger.info(f"Model trained and saved successfully for speaker: {speaker}")
699
+
700
+ # Compile the model before saving
701
+ self.model.compile(
702
+ optimizer=self.optimizer,
703
+ loss='sparse_categorical_crossentropy',
704
+ metrics=['accuracy']
705
+ )
706
+
707
+ # Save the model after training
708
+ self.save_tokenizer(self.vocabularyList)
709
+ self.save_model(self.model, self.encoder_model, self.decoder_model)
710
+
711
def save_model(self, model, encoder_model, decoder_model):
    """Save the given models to their configured files, then the weight snapshots.

    Args:
        model: Full training model; written to ``self.model_filename``.
        encoder_model: Inference encoder; written to ``self.encoder_filename``.
        decoder_model: Inference decoder; written to ``self.decoder_filename``.

    If ``model`` is None nothing is written and a warning is logged.
    """
    self.logger.info("Saving Model...")
    if model is None:
        self.logger.warning("No model to save.")
        return

    # Save the objects that were actually passed in (the original checked
    # `model` but then wrote self.model / self.encoder_model / self.decoder_model).
    components = (
        ("Encoder", encoder_model, self.encoder_filename),
        ("Decoder", decoder_model, self.decoder_filename),
        ("Model", model, self.model_filename),
    )
    for label, component, filename in components:
        if component is None:
            self.logger.warning(f"{label} is missing; skipping save.")
            continue
        component.save(filename)
        self.logger.info(f"{label} saved.")
        # NOTE(review): one-second pause kept from the original; its purpose is
        # not visible here (possibly letting the filesystem settle) - confirm.
        time.sleep(1)

    self.save_full_weights()
    self.save_embedding_weights()
728
+
729
def load_model_file(self):
    """Load the saved full, encoder and decoder models and attach them to the trainer.

    The full model is loaded without its optimizer state and recompiled with a
    fresh Adam optimizer. The loaded models are stored on ``self`` so callers
    that ignore the return value (e.g. the chat script) still end up with a
    usable trainer, and they are also returned for callers that unpack them.

    Returns:
        tuple: ``(model, encoder_model, decoder_model)``.
    """
    self.logger.info("Loading Model and Tokenizer...")

    # Load model without the optimizer first
    model = load_model(self.model_filename, compile=False)

    # Manually recompile with a fresh Adam optimizer
    self.optimizer = Adam(learning_rate=self.learning_rate, clipnorm=1.0)
    model.compile(optimizer=self.optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    print("Model Loaded... \nNow loading encoder/decoder models... ")

    encoder_model = load_model(self.encoder_filename)
    decoder_model = load_model(self.decoder_filename)

    print("Decoder and Encoder Loaded... ")

    # Attach before restoring weights so the weight loaders see the models that
    # were just read from disk (presumably they operate on self.model - confirm).
    self.model = model
    self.encoder_model = encoder_model
    self.decoder_model = decoder_model

    self.load_full_weights()
    self.load_embedding_weights()

    return model, encoder_model, decoder_model
750
+
751
def beam_search(self, input_text):
    """Generate a reply for ``input_text`` using beam search.

    The text is preprocessed with ``self.preprocess_input``, decoded by a
    ``BeamSearchHelper`` configured from this trainer's settings, and the
    resulting token indices are mapped back to words.

    Returns:
        str: The decoded words joined by single spaces.
    """
    # Same preprocessing as generate_response
    encoded_input = self.preprocess_input(input_text)

    helper_settings = dict(
        model=self.model,
        tokenizer=self.tokenizer,
        max_seq_length=self.max_seq_length,
        encoder_filename=self.encoder_filename,
        decoder_filename=self.decoder_filename,
        top_k=self.top_k,
        temperature=self.temperature,
        top_p=self.top_p,
        beam_width=self.beam_width,
        scaling_factor=self.scaling_factor,
    )
    searcher = BeamSearchHelper(**helper_settings)

    # Run the search and translate known token indices back into words
    token_ids = searcher.beam_search(encoded_input)
    index_to_word = self.tokenizer.index_word
    words = []
    for token_id in token_ids:
        if token_id in index_to_word:
            words.append(index_to_word[token_id])

    return " ".join(words)
776
+
777
def generate_response(self, input_seq):
    """Generate a reply for raw user text by sampling from the decoder.

    The input is preprocessed and encoded once; the decoder is then stepped
    token by token, sampling each next token from a softmax over the decoder
    output, until ``<end>`` is produced (after at least ``self.min_word``
    words) or ``self.max_seq_length`` steps have run.

    Args:
        input_seq: Raw input text (passed to ``self.preprocess_input``).

    Returns:
        str: The generated words joined by spaces, or the literal ``"Error"``
        if anything raised during generation (the error is logged).
    """
    try:
        # Clean and tokenize input text
        input_seqs = self.preprocess_input(input_seq)

        # Encode the input; verbose=0 keeps Keras progress bars out of the chat
        state_h, state_c = self.encoder_model.predict(input_seqs, verbose=0)
        state_h = state_h[0:1, :]  # Ensure batch size 1
        state_c = state_c[0:1, :]

        # Initialize the decoder input with the <start> token
        start_token_index = self.tokenizer.word_index.get('<start>', 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = start_token_index

        decoded_sentence = []
        special_tokens = ("<oov>", "<start>", "<end>")

        for _ in range(self.max_seq_length):
            output_tokens, state_h, state_c = self.decoder_model.predict(
                [target_seq, state_h, state_c], verbose=0
            )

            logits = output_tokens[0, -1, :]

            # Softmax, shifted by the max to prevent overflow
            exp_logits = np.exp(logits - np.max(logits))
            probabilities = exp_logits / np.sum(exp_logits)

            predicted_token_index = np.random.choice(len(probabilities), p=probabilities)
            predicted_word = self.reverse_tokenizer.get(predicted_token_index, '<oov>')

            # Diagnostics go to the logger instead of stdout so they do not
            # interleave with the conversation.
            self.logger.debug("Logits: %s", logits[:10])
            self.logger.debug("Softmax Probabilities: %s", probabilities[:10])

            if predicted_word == "<end>":
                if len(decoded_sentence) < self.min_word:
                    # Too short to stop: resample from the same input token
                    # (decoder states have already advanced).
                    continue
                break

            if predicted_word not in special_tokens:
                decoded_sentence.append(predicted_word)

            # Update target sequence for the next iteration
            target_seq[0, 0] = predicted_token_index

        return " ".join(decoded_sentence).strip()

    except Exception as e:
        self.logger.error(f"Error in generate_response: {str(e)}")
        return "Error"
preprocessed_dialogs.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0661a037f5c51dcd9919cc932f77820d04412f5085afabb8af4b0927f94eb90
3
+ size 23851164
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ convokit
2
+ matplotlib
3
+ numpy
4
+ playsound3
5
+ scikit-learn
6
+ tensorflow
7
+ keras
8
+ nltk
runCorpus.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # runCorpus.py
2
+ import os
3
+ from playsound3 import playsound
4
+ import tensorflow
5
+ import time
6
+ import random
7
+ import pdb
8
+ import sys
9
+ import matplotlib.pyplot as plt
10
+
11
+ # Personal files
12
+ from preprocessed_dialogs import dialog_data
13
+ from chatbotTrainer import ChatbotTrainer
14
+
15
+
16
class CorpusTrainer:
    """Train a ChatbotTrainer one randomly chosen conversation at a time.

    Conversations come from ``dialog_data`` (imported by this module). Speakers
    whose training finished cleanly are appended to ``trained_speakers.txt``;
    speakers whose training hit early stopping too often are appended to
    ``troubled_speakers.txt``. When the share of troubled conversations exceeds
    ``self.percent_reset`` the failure count is logged/plotted and the run is
    restarted through ``main``.
    """

    def __init__(self, chatbot_trainer=None):
        """Set up run state and load/clean the speaker bookkeeping files.

        Args:
            chatbot_trainer: Optional trainer. When given, its corpus is loaded
                from ``self.corpus_path``. When omitted (the original call
                style), the trainer is supplied later through ``main``.
        """
        self.chatbot_trainer = chatbot_trainer
        self.running_trouble = None
        self.runningTrouble = []
        self.choices_yes = ["yes", "ya", "yeah", "yessir", "yesir", "y", "ye", "yah"]
        self.exit_commands = ["exit", "quit", "stop", "x", "q", ""]

        # File name kept as-is so previously written failure logs are still found.
        self.log_file = "self.failure_history.txt"
        self.counter = 0
        self.bad_count = 0
        self.top_num = 0
        # Failure-rate threshold (percent) above which the run is reset.
        self.percent_reset = 10.0
        self.time_sleep = 10
        self.processed_dialogs = dialog_data
        self.notification_sound = "AlienNotification.mp3"  # Set notification sound here

        # Remembered from the last main() call so a reset can re-enter main().
        self.user_choice = None
        self.dialog_data = None

        self.conversation_id = None
        self.all_input_texts = []
        self.all_target_texts = []
        self.failure_history = []
        self.speakerList = []
        self.speaker_input_texts = []
        self.speaker_target_texts = []
        self.troubleList = []
        self.allTogether = []
        # Failsafe Trigger
        self.failsafe_trigger = False

        # Import speakers; a missing file is treated as an empty list.
        self.speakerListData = self._read_lines('trained_speakers.txt')
        self.troubleListData = self._read_lines('troubled_speakers.txt')

        # The troubled file starts empty for every run.
        self.resetTroubled()

        # De-duplicate the trained list and drop speakers that were troubled last run.
        self.speakerList = self.cleanupTrained(self.speakerListData)
        print("Num GPUs Available: ", len(tensorflow.config.experimental.list_physical_devices('GPU')))

        # Corpus location (needed for convo-kit when reading/creating the dialogs).
        self.corpus_path = '/root/.convokit/saved-corpora/movie-corpus'
        # The original called load_corpus on a trainer that was always None here,
        # which crashed construction; only load when a trainer was supplied.
        if self.chatbot_trainer is not None:
            self.chatbot_trainer.load_corpus(self.corpus_path)

    @staticmethod
    def _read_lines(path):
        """Return the lines of ``path`` without newlines, or [] if it does not exist."""
        try:
            with open(path, 'r') as handle:
                return handle.read().splitlines()
        except FileNotFoundError:
            return []

    def main(self, chatbot_trainer, user_choice, dialog_data, topConvo=0, top_num=0, play_notification=0):
        """Run one pass of random conversation training.

        Args:
            chatbot_trainer: Trainer used for preprocessing and training.
            user_choice: Raw answer to "Run Supervised?"; any value in
                ``self.choices_yes`` selects the interactive path (``user_yes``).
            dialog_data: Passed through unchanged (returned by the failsafe and
                handed back to ``main`` on reset).
            topConvo: Running conversation counter carried across resets.
            top_num: Unused; ``self.top_num`` is tracked instead.
            play_notification: ``1`` or a yes-answer enables the notification sound.

        Returns:
            A tuple ``(chatbot_trainer, user_choice, dialog_data, topConvo,
            top_num, failsafe_trigger)`` when the vocabulary failsafe stops the
            run; otherwise ``None``.
        """
        # Accept the raw prompt answer as well as an already-normalized 1, so the
        # setting survives the recursive call made on reset.
        play_notification = 1 if (play_notification == 1 or play_notification in self.choices_yes) else 0

        self.chatbot_trainer = chatbot_trainer
        self.user_choice = user_choice
        self.dialog_data = dialog_data
        supervised = user_choice in self.choices_yes

        self.cleanupTroubled()
        conversation_keys = list(self.processed_dialogs.keys())
        for _ in range(len(conversation_keys)):
            topConvo += 1
            self.counter += 1
            # Sample with replacement from the keys that actually exist.
            key = random.choice(conversation_keys)
            speaker = str(key)
            dialog_pairs = self.processed_dialogs[key]

            # Collect the non-empty pairs; remember the last pair seen.
            last_input = last_target = None
            for last_input, last_target in dialog_pairs:
                if last_input != "" and last_target != "":
                    self.speaker_input_texts.append(last_input)
                    self.all_input_texts.append(last_input)
                    self.speaker_target_texts.append(last_target)
                    self.all_target_texts.append(last_target)

            if self.failsafe_trigger is not False or speaker in self.speakerList:
                continue
            if last_input is None:
                # Conversation without any pair: nothing to train on.
                continue

            self.conversation_id = int(speaker)
            if self.conversation_id > self.top_num:
                self.top_num = self.conversation_id

            print(f"Conversation: {self.conversation_id}")
            # NOTE(review): as in the original, only the LAST pair of the
            # conversation is preprocessed and trained on; the accumulated
            # speaker_input_texts/speaker_target_texts are never used. Confirm
            # against preprocess_texts whether whole lists were intended.
            input_data, target_data = chatbot_trainer.preprocess_texts(last_input, last_target)
            data = [input_data, target_data]

            # Close-to-patience runs count as trouble: limit is early_patience - 3.
            limit = self.chatbot_trainer.early_patience - 3

            # Failsafe for too long num_words
            if self.chatbot_trainer.tokenizer.num_words > self.chatbot_trainer.max_vocabulary:
                print("MAXIMUM Vocabulary Reached! Quitting Now... ")
                if play_notification == 1:
                    playsound(self.notification_sound)
                return self.chatbot_trainer, user_choice, dialog_data, topConvo, self.top_num, self.failsafe_trigger

            if supervised:
                self.user_yes(speaker=speaker, data=data, limit=limit, play_notification=play_notification)
            else:
                self.user_no(speaker=speaker, data=data, limit=limit, play_notification=play_notification)

    def _record_training_outcome(self, data, speaker, limit):
        """Train on one conversation and file the speaker as trained or troubled.

        Returns:
            int: Number of distinct speakers seen so far (trained + troubled).
        """
        self.chatbot_trainer.train_model(data[0], data[1], str(self.conversation_id), speaker)
        self.runningTrouble = self.chatbot_trainer.running_trouble or []
        trouble_count = len(self.runningTrouble)

        if speaker not in self.speakerList and trouble_count < limit:
            self.speakerList.append(speaker)
            with open("trained_speakers.txt", 'a') as f:
                f.write(f"{speaker}\n")
        elif trouble_count > limit:
            self.bad_count += 1
            self.troubleList.append(speaker)
            with open("troubled_speakers.txt", 'a') as f:
                f.write(f"{speaker}\n")

        self.allTogether = self.resetTogether(self.speakerList, self.troubleList)
        self.bad_count = len(self.troubleList)
        return len(self.allTogether)

    def user_yes(self, data, speaker, limit, play_notification):
        """Supervised step: train, report, and prompt the user before continuing.

        When the failure rate (troubled / conversations this run) exceeds
        ``self.percent_reset`` the user is asked whether to show/save the
        failure plot and whether to clear ``trained_speakers.txt``; the run is
        then restarted via ``main``. Answering with an exit command quits.
        """
        topConvo = self._record_training_outcome(data, speaker, limit)

        percent_running = self.runningPercent(len(self.troubleList), self.counter)
        self.failure_history.append(len(self.troubleList))
        if percent_running is None:
            percent_running = 0.0
        self.chatbot_trainer.logger.info(f"Running Percentage Failure: {percent_running}%")

        print("Now is the time to quit if need be... ")
        if play_notification == 1:
            playsound(self.notification_sound)

        # Speakers only reach the trained list on success; anything else will be
        # trained again the next time it is drawn.
        if percent_running > self.percent_reset:
            print(f"Logging Failures... Resetting... Failure Rate is Greater than {self.percent_reset}%...")
            answer_1 = input("Show Failures for this Run? \n>")
            if answer_1 in self.exit_commands:
                raise SystemExit
            show_file = answer_1 in self.choices_yes
            answer_2 = input("Save Failures for this Run? \n>")
            if answer_2 in self.exit_commands:
                raise SystemExit
            save_file = answer_2 in self.choices_yes
            self.log_failures(len(self.troubleList), self.log_file)
            self.plot_failures(self.log_file, show_file=show_file, save_file=save_file)
            print("Plotting Failures... See failures_plot.png for more information... ")
            # Clear Speakers
            delete_speakers = input("Would you like to clear trained_speakers.txt? \nThis is useful for touching on successful conversations... \n>")
            if delete_speakers in self.choices_yes:
                with open('trained_speakers.txt', 'w') as f:
                    f.write("")

            # NOTE(review): the reset re-enters main() recursively and does not
            # clear troubleList/counter, so nesting can grow - confirm intent.
            input('Enter to Continue... (This will reset the run) ')
            return self.main(self.chatbot_trainer, self.user_choice, self.dialog_data,
                             topConvo, self.top_num, play_notification)

        input("\nEnter to Continue... ")

    def user_no(self, data, speaker, limit, play_notification):
        """Unattended step: train, print run statistics, and reset on high failure rate.

        The failure rate here is troubled / distinct speakers seen. On reset the
        failure count is logged, the plot is saved to ``failures_plot.png`` and
        the run is restarted via ``main``.
        """
        topConvo = self._record_training_outcome(data, speaker, limit)

        # Debug lines; but pretty useful to see how it works
        print(f"Trouble List: {len(self.troubleList)}")
        print(f"Bad Count: {self.bad_count}")
        print(f"Number of Conversations(This Run): {self.counter}")
        print(f"Number of Conversations Combined: {topConvo}")
        print(f"Running Trouble: {len(self.runningTrouble)}")

        percent_running = self.runningPercent(len(self.troubleList), topConvo)
        self.failure_history.append(len(self.troubleList))
        if percent_running is None:
            percent_running = 0.0
        self.chatbot_trainer.logger.info(f"Running Percentage Failure: {percent_running}%")

        print("Now is the time to quit if need be... ")
        if play_notification == 1:
            playsound(self.notification_sound)
        # Countdown is printed only; the per-second sleep is disabled in the original.
        for x in range(self.time_sleep):
            print(f"Next convo in:{self.time_sleep-x}")

        if percent_running > self.percent_reset:
            self.log_failures(len(self.troubleList), self.log_file)
            print("Plotting Failures... See failures_plot.png for more information... ")
            # Save the plot so the message above is true when nobody is watching.
            self.plot_failures(self.log_file, save_file=True)
            if play_notification == 1:
                playsound(self.notification_sound)
            print(f"Resetting... Failure Rate is Greater than {self.percent_reset}%... For this run.")
            return self.main(self.chatbot_trainer, self.user_choice, self.dialog_data,
                             topConvo, self.top_num, play_notification)

    def resetTogether(self, speakerList=None, troubleList=None):
        """Merge trained and troubled speakers into ``self.allTogether``.

        Args:
            speakerList: Trained speakers; defaults to ``self.speakerList``.
            troubleList: Troubled speakers; defaults to ``self.troubleListData``.

        Returns:
            list[str]: Sorted (lexicographically) list of all speakers merged so far.
        """
        trained = self.speakerList if speakerList is None else speakerList
        troubled = self.troubleListData if troubleList is None else troubleList
        for entry in list(trained or []) + list(troubled or []):
            name = str(entry)
            if name not in self.allTogether:
                self.allTogether.append(name)
        self.allTogetherSorted = sorted(self.allTogether)

        return self.allTogetherSorted

    def cleanupTrained(self, speakerList):
        """De-duplicate ``speakerList``, drop troubled speakers, and rewrite the file.

        Speakers present in ``self.troubleListData`` are excluded. The cleaned,
        sorted list is written to ``trained_speakers.txt``, stored on
        ``self.speakerList`` and returned.
        """
        troubled = set(self.troubleListData or [])
        seen = set()
        cleaned = []
        for entry in speakerList or []:
            name = entry.strip()
            if name and name not in seen and name not in troubled:
                seen.add(name)
                cleaned.append(name)
        cleaned.sort()

        with open('trained_speakers.txt', 'w') as f:
            f.writelines(f"{name}\n" for name in cleaned)

        self.speakerList = cleaned
        return self.speakerList

    def resetTroubled(self):
        """Truncate (or create) ``troubled_speakers.txt``."""
        # Opening with 'w' already empties the file; no need to remove it first,
        # which raised when the file did not exist.
        with open('troubled_speakers.txt', 'w') as f:
            f.write("")

    def cleanupTroubled(self):
        """Rewrite ``troubled_speakers.txt`` with unique, sorted, non-blank entries."""
        unique = set()
        for line in self._read_lines('troubled_speakers.txt'):
            name = line.strip()
            if name:
                unique.add(name)

        with open('troubled_speakers.txt', 'w') as fw:
            fw.writelines(f"{name}\n" for name in sorted(unique))

    def runningPercent(self, list1, list2):
        """Return ``list1 / list2`` as a percentage rounded to 2 decimals.

        Despite the names, both arguments are counts. Returns ``0.0`` when
        ``list1`` is 0 and ``None`` when no percentage can be computed
        (e.g. a non-positive ``list2`` with a positive ``list1``).
        """
        if list1 > 0 and list2 > 0:
            return round(list1 / list2 * 100, 2)
        if list1 == 0:
            return 0.0
        return None

    def plot_failures(self, log_file, show_file=False, save_file=False):
        """Plot the failure counts stored in ``log_file``.

        Args:
            log_file: Text file with one integer failure count per line.
            show_file: Display the figure interactively.
            save_file: Save the figure to ``failures_plot.png``.

        ``self.failure_history`` is replaced by the values read from the file.
        """
        if not os.path.exists(log_file):
            print("No failure data found.")
            return

        with open(log_file, "r") as f:
            self.failure_history = [int(line) for line in f if line.strip()]

        if not self.failure_history:
            print("No failure data to plot.")
            return

        # Plot actual failure values
        plt.figure(figsize=(10, 6))
        plt.plot(self.failure_history, marker='o', linestyle='-', color='red', label='Failures Per Run')
        plt.xlabel("Run Iteration")
        plt.ylabel("Number of Failures")
        plt.title("Failures Before Restart Over Time")
        plt.legend()
        plt.grid(True)

        if save_file:
            plt.savefig("failures_plot.png")

        if show_file:
            plt.show()

        # Release the figure; this method is called repeatedly during long runs.
        plt.close()

    def log_failures(self, num_failures, log_file):
        """Append ``num_failures`` as one line to ``log_file``."""
        with open(log_file, "a") as f:
            f.write(f"{num_failures}\n")

        print(f"Logged {num_failures} failures.")
339
+
340
+
341
if __name__ == "__main__":
    # Each pass builds a fresh trainer, loads the corpus, asks how to run,
    # and then hands control to CorpusTrainer.main(); the loop repeats forever.
    while True:
        # Corpus location (needed for convo-kit when reading/creating the dialogs)
        corpus_path = '/root/.convokit/saved-corpora/movie-corpus'

        chatbot_trainer = ChatbotTrainer()
        chatbot_trainer.load_corpus(corpus_path)
        app = CorpusTrainer()

        user_choice = input(f"Run Supervised?({chatbot_trainer.model_filename})\n>")
        play_notification = input(
            "Would you like to play a notification after each training?\n"
            "Helps with manual stopping before max_vocabulary reached... \n>"
        )

        app.main(
            chatbot_trainer=chatbot_trainer,
            user_choice=user_choice,
            dialog_data=dialog_data,
            play_notification=play_notification,
        )
351
+