|
import os
|
|
import re
|
|
import time
|
|
import numpy as np
|
|
from gensim.models import Word2Vec
|
|
from tensorflow.keras.models import Sequential
|
|
from tensorflow.keras.layers import Dense, LSTM, Embedding
|
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
|
|
# --- Hyperparameters -------------------------------------------------------

# os.cpu_count() may return None when the core count cannot be determined;
# the original `max_worker - 0` would then raise TypeError. Fall back to 1.
max_worker = os.cpu_count() or 1
workers = max(1, max_worker)            # Word2Vec training threads

vector_size = 1000                      # Word2Vec embedding dimensionality
window_size = 50                        # Word2Vec context window (words)
min_count = 1                           # keep every token, even hapaxes
context_length = 256                    # padded sequence length fed to the model
max_length = context_length * 4         # NOTE(review): unused below — confirm intent
sentence_length = 5                     # words per generated sentence (incl. start word)
top_k = 500                             # NOTE(review): unused below
temperature = 0                         # NOTE(review): unused; generation passes 1.0 explicitly
min_probability = 0.5                   # NOTE(review): unused below
max_probability = 1                     # NOTE(review): unused below
|
|
|
|
|
|
# Prompt the user for the dataset path and read it line-by-line; downstream
# code treats each line as one sentence. Prompt/messages are user-facing
# Turkish ("Enter the dataset file path" / "File not found!").
file_path = input("Veri setinin dosya yolunu giriniz: ")
try:
    with open(file_path, "r", encoding="utf-8") as f:
        dataset = f.readlines()
except FileNotFoundError:
    print("Dosya bulunamadı!")
    # NOTE(review): bare exit() depends on the `site` module being loaded;
    # sys.exit() (or raise SystemExit) is the robust form — confirm and swap.
    exit()
|
|
|
|
|
|
tokenized_sentences = [re.findall(r'\b\w+\b', sentence.lower()) for sentence in dataset]
|
|
|
|
|
|
model = Word2Vec(tokenized_sentences, vector_size=vector_size, window=window_size, min_count=min_count, workers=workers)
|
|
|
|
|
|
# Fit a Keras tokenizer on the corpus and turn each sentence into a sequence
# of integer word ids, padded (post) to `context_length`.
# Hoisted: the joined-text list was previously built twice (once for
# fit_on_texts, once for texts_to_sequences); build it once and reuse it.
joined_texts = [' '.join(sentence) for sentence in tokenized_sentences]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(joined_texts)
sequences = tokenizer.texts_to_sequences(joined_texts)

X = pad_sequences(sequences, maxlen=context_length, padding='post')
# Target = last word id of each sequence; 0 (the padding id) when a line
# tokenized to nothing.
# NOTE(review): the target word is also present inside X itself — confirm
# this leakage is intended for this toy setup.
y = np.array([seq[-1] if seq else 0 for seq in sequences])
|
|
|
|
|
|
# Next-word prediction model.
#
# The original stacked 21 LSTM layers funnelling down to LSTM(1): the final
# softmax over the whole vocabulary was computed from a single scalar
# feature, which makes the classifier effectively untrainable. Two LSTM
# layers feeding the softmax keep the same external interface (same input
# shape, same softmax-over-vocabulary output) with usable capacity.
vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0

nn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=vector_size, input_length=context_length),
    LSTM(256, return_sequences=True),
    LSTM(128),  # last recurrent layer: return a single vector, not a sequence
    Dense(vocab_size, activation='softmax'),
])
|
|
|
|
# Targets are integer word ids, hence the sparse categorical cross-entropy.
nn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
nn_model.fit(X, y, batch_size=32, epochs=20)
|
|
|
|
|
|
def generate_sentence(start_word, sentence_length, temperature=1.0):
    """Sample a sentence of up to `sentence_length` words from the model.

    Args:
        start_word: First word of the sentence, used verbatim.
        sentence_length: Total number of words aimed for, including
            `start_word`.
        temperature: Softmax temperature; >1 flattens the distribution,
            <1 sharpens it. Must be positive.

    Returns:
        The generated words joined by single spaces. Generation stops early
        when the sampled index maps to no word (e.g. the padding index 0).
    """
    sentence = [start_word]
    for _ in range(sentence_length - 1):
        # Re-encode the growing partial sentence every step.
        sequence = tokenizer.texts_to_sequences([' '.join(sentence)])
        sequence = pad_sequences(sequence, maxlen=context_length, padding='post')
        predicted_probs = nn_model.predict(sequence)[0]

        # Temperature rescaling in log space, then a numerically stable
        # softmax: subtracting the max logit before exp prevents overflow
        # (the original exponentiated unshifted logits, risking inf/NaN).
        logits = np.log(np.asarray(predicted_probs, dtype='float64') + 1e-10) / temperature
        logits -= logits.max()
        predicted_probs = np.exp(logits)
        predicted_probs /= predicted_probs.sum()

        predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
        next_word = tokenizer.index_word.get(predicted_index, '')

        # No word mapped to the sampled index (padding): stop generating.
        if not next_word:
            break
        sentence.append(next_word)

    return ' '.join(sentence)
|
|
|
|
|
|
# Entry point: prompt for a seed word and print one generated sentence.
# Prompt/label are user-facing Turkish ("Enter a start word" /
# "Generated sentence").
start_word = input("Başlangıç kelimesi giriniz: ")
print("\nÜretilen Cümle:", generate_sentence(start_word, sentence_length, temperature=1.0))
|
|
|