Upload Br-T-1.1.py
Br-T-1.1.py  ADDED  (+101 -0)
@@ -0,0 +1,101 @@
import os
import re
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# os.cpu_count() can return None, so fall back to a single worker.
workers = max(1, os.cpu_count() or 1)
vector_size = 1000      # dimensionality of the Word2Vec vectors
window_size = 50        # Word2Vec context window
min_count = 1           # keep every word, however rare
context_length = 256    # padded input length for the Keras model
max_length = context_length * 4   # unused below
sentence_length = 5     # total words to generate, including the start word
top_k = 500             # unused by generate_sentence below; see the top-k sketch after it
temperature = 0         # unused; generation is invoked with temperature=1.0
min_probability = 0.5   # unused
max_probability = 1     # unused

# === READ THE DATASET ===
file_path = input("Enter the dataset file path: ")
try:
    with open(file_path, "r", encoding="utf-8") as f:
        dataset = f.readlines()
except FileNotFoundError:
    raise SystemExit("File not found!")

# === SPLIT INTO WORDS ===
tokenized_sentences = [re.findall(r'\b\w+\b', sentence.lower()) for sentence in dataset]

# === BUILD THE WORD2VEC MODEL ===
# Note: these vectors are trained but never wired into the Keras model below;
# a sketch after the network definition shows one way to connect them.
model = Word2Vec(tokenized_sentences, vector_size=vector_size, window=window_size,
                 min_count=min_count, workers=workers)

# === TOKENIZATION ===
tokenizer = Tokenizer()
joined_sentences = [' '.join(sentence) for sentence in tokenized_sentences]
tokenizer.fit_on_texts(joined_sentences)
sequences = tokenizer.texts_to_sequences(joined_sentences)
X = pad_sequences(sequences, maxlen=context_length, padding='post')
# The target is the last token of each (unpadded) sequence; with post-padding
# that token is still visible inside X itself, so the model can copy rather
# than predict. A sliding-window alternative is sketched below.
y = np.array([seq[-1] if len(seq) > 0 else 0 for seq in sequences])
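
# --- Sketch (not in the original script): sliding-window next-word pairs ---
# A hypothetical helper, assuming the tokenizer and pad_sequences above: each
# prefix of a sequence predicts the token that follows it, so the target is
# never contained in the input.
def make_next_word_pairs(sequences, maxlen):
    contexts, targets = [], []
    for seq in sequences:
        for i in range(1, len(seq)):
            contexts.append(seq[:i])  # tokens before position i
            targets.append(seq[i])    # token to predict
    return pad_sequences(contexts, maxlen=maxlen, padding='post'), np.array(targets)
# Opt-in replacement for the X, y built above:
# X, y = make_next_word_pairs(sequences, context_length)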

# === NEURAL NETWORK MODEL ===
# A deep stack of LSTMs, narrowing from 256 units down to a single unit
# before the softmax over the vocabulary.
nn_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=vector_size, input_length=context_length),
    LSTM(256, return_sequences=True),
    LSTM(128, return_sequences=True),
    LSTM(128, return_sequences=True),
    LSTM(128, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(32, return_sequences=True),
    LSTM(32, return_sequences=True),
    LSTM(32, return_sequences=True),
    LSTM(16, return_sequences=True),
    LSTM(16, return_sequences=True),
    LSTM(16, return_sequences=True),
    LSTM(8, return_sequences=True),
    LSTM(8, return_sequences=True),
    LSTM(8, return_sequences=True),
    LSTM(4, return_sequences=True),
    LSTM(4, return_sequences=True),
    LSTM(4, return_sequences=True),
    LSTM(2, return_sequences=True),
    LSTM(1),   # final LSTM collapses the sequence to a single feature
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])
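
# --- Sketch (not in the original script): initialise the Embedding layer from
# the Word2Vec vectors trained above, so the network starts from the learned
# word geometry instead of random weights. Assumes gensim 4's model.wv API;
# words Word2Vec never saw keep zero vectors.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, vector_size))
for word, idx in tokenizer.word_index.items():
    if word in model.wv:
        embedding_matrix[idx] = model.wv[word]
# e.g. swap the first layer for:
# Embedding(input_dim=embedding_matrix.shape[0], output_dim=vector_size,
#           weights=[embedding_matrix], input_length=context_length, trainable=False)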

nn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
nn_model.fit(X, y, epochs=20, batch_size=32)

# === SENTENCE GENERATION (with temperature sampling) ===
def generate_sentence(start_word, sentence_length, temperature=1.0):
    sentence = [start_word]
    for _ in range(sentence_length - 1):
        sequence = tokenizer.texts_to_sequences([' '.join(sentence)])
        sequence = pad_sequences(sequence, maxlen=context_length, padding='post')
        predicted_probs = nn_model.predict(sequence)[0]

        # Rescale the probabilities with the temperature parameter
        # (must be > 0: below 1 sharpens the distribution, above 1 flattens it).
        predicted_probs = np.asarray(predicted_probs).astype('float64')
        predicted_probs = np.log(predicted_probs + 1e-10) / temperature
        predicted_probs = np.exp(predicted_probs) / np.sum(np.exp(predicted_probs))

        # Sample the next word from the rescaled distribution.
        predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
        next_word = tokenizer.index_word.get(predicted_index, '')

        # Index 0 is the padding id and maps to no word; stop if it is drawn.
        if not next_word:
            break
        sentence.append(next_word)

    return ' '.join(sentence)
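
# --- Sketch (not in the original script): the config defines top_k but the
# loop above never uses it. Restricting sampling to the k most probable words
# before renormalising is a common way to avoid very unlikely tokens:
def sample_top_k(probs, k, temperature=1.0):
    probs = np.log(np.asarray(probs, dtype='float64') + 1e-10) / temperature
    probs = np.exp(probs) / np.sum(np.exp(probs))
    top_indices = np.argpartition(probs, -k)[-k:]   # ids of the k largest probs
    top_probs = probs[top_indices] / probs[top_indices].sum()
    return int(np.random.choice(top_indices, p=top_probs))
# Drop-in use inside generate_sentence, replacing the np.random.choice line:
# predicted_index = sample_top_k(nn_model.predict(sequence)[0],
#                                min(top_k, len(tokenizer.word_index) + 1), temperature)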

# Ask for a starting word and generate a sentence.
start_word = input("Enter a starting word: ")
print("\nGenerated sentence:", generate_sentence(start_word, sentence_length, temperature=1.0))