import os
import re
import time
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# === HYPERPARAMETERS ===
max_worker = os.cpu_count() or 1      # os.cpu_count() can return None
workers = max(1, max_worker)
vector_size = 1000
window_size = 50
min_count = 1
context_length = 256
max_length = context_length * 4       # currently unused
sentence_length = 5
top_k = 500                           # currently unused
temperature = 1.0                     # a value of 0 would divide by zero in the sampling step
min_probability = 0.5                 # currently unused
max_probability = 1                   # currently unused

# === READ THE DATASET ===
file_path = input("Enter the path of the dataset file: ")
try:
    with open(file_path, "r", encoding="utf-8") as f:
        dataset = f.readlines()
except FileNotFoundError:
    print("File not found!")
    exit()

# === SPLIT INTO WORDS ===
tokenized_sentences = [re.findall(r'\b\w+\b', sentence.lower()) for sentence in dataset]

# === BUILD THE WORD2VEC MODEL ===
model = Word2Vec(tokenized_sentences, vector_size=vector_size, window=window_size,
                 min_count=min_count, workers=workers)

# === TOKENIZATION ===
tokenizer = Tokenizer()
tokenizer.fit_on_texts([' '.join(sentence) for sentence in tokenized_sentences])
sequences = tokenizer.texts_to_sequences([' '.join(sentence) for sentence in tokenized_sentences])

# Keep only sentences that provide at least one context word and a target word;
# the input is the sentence without its last word, the label is that last word.
sequences = [seq for seq in sequences if len(seq) > 1]
X = pad_sequences([seq[:-1] for seq in sequences], maxlen=context_length, padding='post')
y = np.array([seq[-1] for seq in sequences])

# === NEURAL NETWORK MODEL ===
nn_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=vector_size,
              input_length=context_length),
    # Deep stack of LSTM layers, narrowing from 256 units down to a single unit
    LSTM(256, return_sequences=True),
    LSTM(128, return_sequences=True),
    LSTM(128, return_sequences=True),
    LSTM(128, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(32, return_sequences=True),
    LSTM(32, return_sequences=True),
    LSTM(32, return_sequences=True),
    LSTM(16, return_sequences=True),
    LSTM(16, return_sequences=True),
    LSTM(16, return_sequences=True),
    LSTM(8, return_sequences=True),
    LSTM(8, return_sequences=True),
    LSTM(8, return_sequences=True),
    LSTM(4, return_sequences=True),
    LSTM(4, return_sequences=True),
    LSTM(4, return_sequences=True),
    LSTM(2, return_sequences=True),
    LSTM(1),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])
nn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
nn_model.fit(X, y, epochs=20, batch_size=32)

# === SENTENCE GENERATION (with temperature sampling) ===
def generate_sentence(start_word, sentence_length, temperature=1.0):
    sentence = [start_word]
    for _ in range(sentence_length - 1):
        sequence = tokenizer.texts_to_sequences([' '.join(sentence)])
        sequence = pad_sequences(sequence, maxlen=context_length, padding='post')
        predicted_probs = nn_model.predict(sequence)[0]

        # Rescale the probabilities with the temperature parameter
        predicted_probs = np.asarray(predicted_probs).astype('float64')
        predicted_probs = np.log(predicted_probs + 1e-10) / temperature
        predicted_probs = np.exp(predicted_probs) / np.sum(np.exp(predicted_probs))

        # Sample the next word from the temperature-adjusted distribution
        predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
        next_word = tokenizer.index_word.get(predicted_index, '')
        if not next_word:
            break
        sentence.append(next_word)
    return ' '.join(sentence)

# Read the start word and generate a sentence
start_word = input("Enter a start word: ")
print("\nGenerated sentence:", generate_sentence(start_word, sentence_length, temperature=temperature))
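
# --- OPTIONAL: seed the Embedding layer with the Word2Vec vectors ---
# The Word2Vec model trained above is never connected to the Keras network.
# Below is a minimal sketch of how its vectors could initialise the Embedding
# layer; build_embedding_matrix is a hypothetical helper, not part of the
# original script.
def build_embedding_matrix(w2v_model, keras_tokenizer, dim):
    vocab_size = len(keras_tokenizer.word_index) + 1
    embedding_matrix = np.zeros((vocab_size, dim))
    for word, index in keras_tokenizer.word_index.items():
        if word in w2v_model.wv:                      # copy the vector when the word is known
            embedding_matrix[index] = w2v_model.wv[word]
    return embedding_matrix

# Usage sketch: pass weights=[build_embedding_matrix(model, tokenizer, vector_size)]
# (optionally with trainable=False) to the Embedding layer above.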
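
# --- OPTIONAL: top-k filtering on top of temperature sampling ---
# top_k is defined above but never used. This is a minimal sketch (reusing the
# temperature-scaled probabilities from generate_sentence) of restricting the
# sampled distribution to the k most likely words; sample_top_k is a
# hypothetical helper, not part of the original script.
def sample_top_k(probs, k):
    probs = np.asarray(probs).astype('float64')
    top_indices = np.argsort(probs)[-k:]              # indices of the k highest probabilities
    top_probs = probs[top_indices]
    top_probs = top_probs / np.sum(top_probs)          # renormalise over the kept words
    return np.random.choice(top_indices, p=top_probs)

# Usage sketch: inside generate_sentence, replace the np.random.choice(...) call with
# predicted_index = sample_top_k(predicted_probs, top_k)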