import os
import re
import time
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# === HYPERPARAMETERS ===
max_worker = os.cpu_count() or 1      # os.cpu_count() can return None
workers = max(1, max_worker)
vector_size = 1000
window_size = 50
min_count = 1
context_length = 256
max_length = context_length * 4       # currently unused
sentence_length = 5
top_k = 500                           # currently unused
temperature = 1.0                     # a value of 0 would divide by zero in the sampling step
min_probability = 0.5                 # currently unused
max_probability = 1                   # currently unused

# === READ THE DATASET ===
file_path = input("Enter the path of the dataset file: ")
try:
    with open(file_path, "r", encoding="utf-8") as f:
        dataset = f.readlines()
except FileNotFoundError:
    print("File not found!")
    exit()

# === SPLIT INTO WORDS ===
tokenized_sentences = [re.findall(r'\b\w+\b', sentence.lower()) for sentence in dataset]

# === BUILD THE WORD2VEC MODEL ===
model = Word2Vec(tokenized_sentences, vector_size=vector_size, window=window_size,
                 min_count=min_count, workers=workers)

# === TOKENIZATION ===
tokenizer = Tokenizer()
tokenizer.fit_on_texts([' '.join(sentence) for sentence in tokenized_sentences])
sequences = tokenizer.texts_to_sequences([' '.join(sentence) for sentence in tokenized_sentences])

# Keep only sentences that provide at least one context word and a target word;
# the input is the sentence without its last word, the label is that last word.
sequences = [seq for seq in sequences if len(seq) > 1]
X = pad_sequences([seq[:-1] for seq in sequences], maxlen=context_length, padding='post')
y = np.array([seq[-1] for seq in sequences])

# === NEURAL NETWORK MODEL ===
nn_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=vector_size,
              input_length=context_length),
    # Deep stack of LSTM layers, narrowing from 256 units down to a single unit
    LSTM(256, return_sequences=True),
    LSTM(128, return_sequences=True),
    LSTM(128, return_sequences=True),
    LSTM(128, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(32, return_sequences=True),
    LSTM(32, return_sequences=True),
    LSTM(32, return_sequences=True),
    LSTM(16, return_sequences=True),
    LSTM(16, return_sequences=True),
    LSTM(16, return_sequences=True),
    LSTM(8, return_sequences=True),
    LSTM(8, return_sequences=True),
    LSTM(8, return_sequences=True),
    LSTM(4, return_sequences=True),
    LSTM(4, return_sequences=True),
    LSTM(4, return_sequences=True),
    LSTM(2, return_sequences=True),
    LSTM(1),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])
nn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
nn_model.fit(X, y, epochs=20, batch_size=32)

# === SENTENCE GENERATION (with temperature sampling) ===
def generate_sentence(start_word, sentence_length, temperature=1.0):
    sentence = [start_word]
    for _ in range(sentence_length - 1):
        sequence = tokenizer.texts_to_sequences([' '.join(sentence)])
        sequence = pad_sequences(sequence, maxlen=context_length, padding='post')
        predicted_probs = nn_model.predict(sequence)[0]

        # Rescale the probabilities with the temperature parameter
        predicted_probs = np.asarray(predicted_probs).astype('float64')
        predicted_probs = np.log(predicted_probs + 1e-10) / temperature
        predicted_probs = np.exp(predicted_probs) / np.sum(np.exp(predicted_probs))

        # Sample the next word from the temperature-adjusted distribution
        predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
        next_word = tokenizer.index_word.get(predicted_index, '')
        if not next_word:
            break
        sentence.append(next_word)
    return ' '.join(sentence)

# Read the start word and generate a sentence
start_word = input("Enter a start word: ")
print("\nGenerated sentence:", generate_sentence(start_word, sentence_length, temperature=temperature))
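
# --- OPTIONAL: seed the Embedding layer with the Word2Vec vectors ---
# The Word2Vec model trained above is never connected to the Keras network.
# Below is a minimal sketch of how its vectors could initialise the Embedding
# layer; build_embedding_matrix is a hypothetical helper, not part of the
# original script.
def build_embedding_matrix(w2v_model, keras_tokenizer, dim):
    vocab_size = len(keras_tokenizer.word_index) + 1
    embedding_matrix = np.zeros((vocab_size, dim))
    for word, index in keras_tokenizer.word_index.items():
        if word in w2v_model.wv:                      # copy the vector when the word is known
            embedding_matrix[index] = w2v_model.wv[word]
    return embedding_matrix

# Usage sketch: pass weights=[build_embedding_matrix(model, tokenizer, vector_size)]
# (optionally with trainable=False) to the Embedding layer above.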
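
# --- OPTIONAL: top-k filtering on top of temperature sampling ---
# top_k is defined above but never used. This is a minimal sketch (reusing the
# temperature-scaled probabilities from generate_sentence) of restricting the
# sampled distribution to the k most likely words; sample_top_k is a
# hypothetical helper, not part of the original script.
def sample_top_k(probs, k):
    probs = np.asarray(probs).astype('float64')
    top_indices = np.argsort(probs)[-k:]              # indices of the k highest probabilities
    top_probs = probs[top_indices]
    top_probs = top_probs / np.sum(top_probs)          # renormalise over the kept words
    return np.random.choice(top_indices, p=top_probs)

# Usage sketch: inside generate_sentence, replace the np.random.choice(...) call with
# predicted_index = sample_top_k(predicted_probs, top_k)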