Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +68 -52
src/streamlit_app.py
CHANGED
@@ -1,10 +1,4 @@
|
|
1 |
import os
|
2 |
-
|
3 |
-
# Обеспечить, что Streamlit пишет конфиги в доступную директорию
|
4 |
-
os.environ["HOME"] = os.getenv("HOME", "/tmp")
|
5 |
-
os.environ["XDG_CONFIG_HOME"] = os.getenv("XDG_CONFIG_HOME", "/tmp/.config")
|
6 |
-
os.environ["STREAMLIT_CONFIG_DIR"] = os.getenv("STREAMLIT_CONFIG_DIR", "tv_show_symantic/src/streamlit_app.py")
|
7 |
-
|
8 |
import streamlit as st
|
9 |
import pandas as pd
|
10 |
import numpy as np
|
@@ -17,12 +11,16 @@ import ast
|
|
17 |
import random
|
18 |
import tempfile
|
19 |
|
20 |
-
#
|
|
|
|
|
|
|
21 |
HERE = os.path.dirname(os.path.abspath(__file__))
|
22 |
CSV_PATH = os.path.join(HERE, "tvshows_processed2.csv")
|
23 |
EMB_PATH = os.path.join(HERE, "embeddings.npy")
|
24 |
FAISS_PATH = os.path.join(HERE, "faiss_index.index")
|
25 |
|
|
|
26 |
BASIC_GENRES = [
|
27 |
"комедия", "драма", "боевик", "фэнтези", "ужасы", "триллер", "романтика",
|
28 |
"научная фантастика", "приключения", "криминал", "мюзикл",
|
@@ -33,6 +31,7 @@ BAD_ACTORS = [
|
|
33 |
"нет актёров", "нет актеров", "unknown", "—", ""
|
34 |
]
|
35 |
|
|
|
36 |
def list_str_to_text(x):
|
37 |
try:
|
38 |
lst = ast.literal_eval(x) if isinstance(x, str) else x
|
@@ -59,7 +58,7 @@ def extract_intro_paragraph(text, max_sentences=4):
|
|
59 |
|
60 |
def clean_tvshows_data(path):
|
61 |
if not os.path.exists(path):
|
62 |
-
st.error(f"Файл данных не найден: {path}.
|
63 |
st.stop()
|
64 |
df = pd.read_csv(path)
|
65 |
df["actors"] = df["actors"].apply(list_str_to_text).apply(clean_actors_string)
|
@@ -97,7 +96,7 @@ def clean_tvshows_data(path):
|
|
97 |
df[col] = None
|
98 |
return df.reset_index(drop=True)
|
99 |
|
100 |
-
@st.
|
101 |
def load_data():
|
102 |
return clean_tvshows_data(CSV_PATH)
|
103 |
|
@@ -118,7 +117,7 @@ def load_embeddings_and_index():
|
|
118 |
|
119 |
def semantic_search(query, embedder, index, df, genre=None, year=None, country=None, vtype=None, k=5):
|
120 |
if not query.strip():
|
121 |
-
return
|
122 |
query_embedding = embedder.encode([query])
|
123 |
faiss.normalize_L2(query_embedding)
|
124 |
dists, idxs = index.search(query_embedding, max(k*3, k))
|
@@ -139,13 +138,13 @@ def init_groq_llm():
|
|
139 |
key = os.environ.get("GROQ_API_KEY") or (st.secrets.get("GROQ_API_KEY") if hasattr(st, "secrets") else None) or st.text_input("🔐 Введите API-ключ Groq:", type="password")
|
140 |
if not key:
|
141 |
st.warning("Введите Groq API ключ.")
|
142 |
-
|
143 |
os.environ["GROQ_API_KEY"] = key
|
144 |
try:
|
145 |
return ChatGroq(model="deepseek-r1-distill-llama-70b", temperature=0, max_tokens=2000)
|
146 |
except Exception as e:
|
147 |
st.error(f"Ошибка инициализации Groq: {e}")
|
148 |
-
|
149 |
|
150 |
def format_docs_for_prompt(results_df):
|
151 |
parts = []
|
@@ -168,68 +167,74 @@ def main():
|
|
168 |
st.set_page_config(page_title="Поиск фильмов и сериалов + AI", layout="wide")
|
169 |
st.title("Семантический поиск фильмов и сериалов с AI")
|
170 |
|
171 |
-
#
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
|
180 |
df = load_data()
|
181 |
-
if "type" not in df.columns:
|
182 |
-
df["type"] = df["num_seasons"].apply(lambda x: "Сериал" if x > 1 else "Фильм")
|
183 |
-
|
184 |
embedder = init_embedder()
|
185 |
_, index = load_embeddings_and_index()
|
186 |
llm = init_groq_llm()
|
187 |
|
188 |
-
# Фильтры
|
189 |
colf1, colf2, colf3, colf4 = st.columns(4)
|
190 |
with colf1:
|
191 |
genres = ["Все"] + sorted(set(sum([g.split(", ") for g in df["basic_genres"].dropna().unique()], [])))
|
192 |
-
st.session_state.genre_filter = st.selectbox("Жанр", genres,
|
193 |
with colf2:
|
194 |
years = ["Все"] + [str(y) for y in sorted(df["year"].unique())]
|
195 |
-
st.session_state.year_filter = st.selectbox("Год", years,
|
196 |
with colf3:
|
197 |
countries = ["Все"] + sorted([c for c in df["country"].dropna().unique()])
|
198 |
-
st.session_state.country_filter = st.selectbox("Страна", countries,
|
199 |
with colf4:
|
200 |
vtypes = ["Все"] + sorted(df["type"].dropna().unique())
|
201 |
-
st.session_state.type_filter = st.selectbox("Тип", vtypes,
|
202 |
|
203 |
-
k = st.slider("Количество результатов:", 1, 20, 5)
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
|
|
|
|
207 |
st.session_state.ai_clicked = False
|
208 |
|
209 |
nav1, nav2, nav3, nav4 = st.columns(4)
|
210 |
with nav1:
|
211 |
if st.button("Случайный фильм/сериал"):
|
212 |
-
|
213 |
-
st.session_state.search_clicked = True
|
214 |
with nav2:
|
215 |
if st.button("ТОП по жанру") and st.session_state.genre_filter != "Все":
|
216 |
-
st.session_state.
|
217 |
-
st.session_state.search_clicked = True
|
218 |
with nav3:
|
219 |
if st.button("Новинки"):
|
220 |
-
|
221 |
-
st.session_state.search_clicked = True
|
222 |
with nav4:
|
223 |
if st.button("Искать"):
|
224 |
-
st.session_state.
|
225 |
|
226 |
-
|
227 |
-
if st.session_state.search_clicked and st.session_state.
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
|
|
|
|
233 |
if results.empty:
|
234 |
st.warning("Ничего не найдено.")
|
235 |
else:
|
@@ -237,7 +242,13 @@ def main():
|
|
237 |
for _, row in results.iterrows():
|
238 |
col1, col2 = st.columns([1, 3])
|
239 |
with col1:
|
240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
with col2:
|
242 |
st.markdown(f"### {row['tvshow_title']} ({row['year']})")
|
243 |
st.caption(f"{row['basic_genres']} | {row['country'] or '—'} | {row['rating'] or '—'} | {row['type']} | {row['num_seasons']} сез.")
|
@@ -247,16 +258,21 @@ def main():
|
|
247 |
if row["url"]:
|
248 |
st.markdown(f"[Подробнее]({row['url']})")
|
249 |
st.divider()
|
250 |
-
|
|
|
|
|
251 |
st.session_state.ai_clicked = True
|
252 |
-
|
|
|
253 |
st.session_state.search_clicked = False
|
254 |
|
255 |
-
|
|
|
256 |
st.markdown("### Рекомендации AI:")
|
257 |
-
st.
|
|
|
258 |
|
259 |
st.sidebar.write(f"Всего записей: {len(df)}")
|
260 |
|
261 |
if __name__ == "__main__":
|
262 |
-
main()
|
|
|
1 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import streamlit as st
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
|
|
11 |
import random
|
12 |
import tempfile
|
13 |
|
14 |
+
# 1. Удаляем блок с ручной установкой env vars
|
15 |
+
# Streamlit должен работать с .streamlit/config.toml, который вы создали
|
16 |
+
|
17 |
+
# 2. Пути к файлам относительно текущего скрипта
|
18 |
# Directory that contains this script; every data artifact below is resolved
# against it, so the paths do not depend on the current working directory.
HERE = os.path.dirname(os.path.abspath(__file__))
CSV_PATH = os.path.join(HERE, "tvshows_processed2.csv")
EMB_PATH = os.path.join(HERE, "embeddings.npy")
FAISS_PATH = os.path.join(HERE, "faiss_index.index")
|
22 |
|
23 |
+
# Статические данные (не изменяются во время работы)
|
24 |
BASIC_GENRES = [
|
25 |
"комедия", "драма", "боевик", "фэнтези", "ужасы", "триллер", "романтика",
|
26 |
"научная фантастика", "приключения", "криминал", "мюзикл",
|
|
|
31 |
"нет актёров", "нет актеров", "unknown", "—", ""
|
32 |
]
|
33 |
|
34 |
+
# Функции, которые можно кэшировать
|
35 |
def list_str_to_text(x):
|
36 |
try:
|
37 |
lst = ast.literal_eval(x) if isinstance(x, str) else x
|
|
|
58 |
|
59 |
def clean_tvshows_data(path):
|
60 |
if not os.path.exists(path):
|
61 |
+
st.error(f"Файл данных не найден: {path}.")
|
62 |
st.stop()
|
63 |
df = pd.read_csv(path)
|
64 |
df["actors"] = df["actors"].apply(list_str_to_text).apply(clean_actors_string)
|
|
|
96 |
df[col] = None
|
97 |
return df.reset_index(drop=True)
|
98 |
|
99 |
+
@st.cache_data
def load_data():
    """Return the cleaned dataset built from the CSV file at ``CSV_PATH``.

    Thin wrapper around ``clean_tvshows_data``. The ``st.cache_data``
    decorator is applied so that Streamlit can reuse the returned value
    instead of re-running the cleaning step on every script rerun.
    """
    return clean_tvshows_data(CSV_PATH)
|
102 |
|
|
|
117 |
|
118 |
def semantic_search(query, embedder, index, df, genre=None, year=None, country=None, vtype=None, k=5):
|
119 |
if not query.strip():
|
120 |
+
return pd.DataFrame() # Возвращаем пустой DataFrame, если запрос пуст
|
121 |
query_embedding = embedder.encode([query])
|
122 |
faiss.normalize_L2(query_embedding)
|
123 |
dists, idxs = index.search(query_embedding, max(k*3, k))
|
|
|
138 |
key = os.environ.get("GROQ_API_KEY") or (st.secrets.get("GROQ_API_KEY") if hasattr(st, "secrets") else None) or st.text_input("🔐 Введите API-ключ Groq:", type="password")
|
139 |
if not key:
|
140 |
st.warning("Введите Groq API ключ.")
|
141 |
+
st.stop()
|
142 |
os.environ["GROQ_API_KEY"] = key
|
143 |
try:
|
144 |
return ChatGroq(model="deepseek-r1-distill-llama-70b", temperature=0, max_tokens=2000)
|
145 |
except Exception as e:
|
146 |
st.error(f"Ошибка инициализации Groq: {e}")
|
147 |
+
st.stop()
|
148 |
|
149 |
def format_docs_for_prompt(results_df):
|
150 |
parts = []
|
|
|
167 |
st.set_page_config(page_title="Поиск фильмов и сериалов + AI", layout="wide")
|
168 |
st.title("Семантический поиск фильмов и сериалов с AI")
|
169 |
|
170 |
+
# 3. Инициализация состояния
|
171 |
+
if "search_clicked" not in st.session_state:
|
172 |
+
st.session_state.search_clicked = False
|
173 |
+
if "ai_clicked" not in st.session_state:
|
174 |
+
st.session_state.ai_clicked = False
|
175 |
+
if "query_input" not in st.session_state:
|
176 |
+
st.session_state.query_input = ""
|
177 |
+
if "genre_filter" not in st.session_state:
|
178 |
+
st.session_state.genre_filter = "Все"
|
179 |
+
if "year_filter" not in st.session_state:
|
180 |
+
st.session_state.year_filter = "Все"
|
181 |
+
if "country_filter" not in st.session_state:
|
182 |
+
st.session_state.country_filter = "Все"
|
183 |
+
if "type_filter" not in st.session_state:
|
184 |
+
st.session_state.type_filter = "Все"
|
185 |
|
186 |
df = load_data()
|
|
|
|
|
|
|
187 |
embedder = init_embedder()
|
188 |
_, index = load_embeddings_and_index()
|
189 |
llm = init_groq_llm()
|
190 |
|
191 |
+
# 4. Фильтры
|
192 |
colf1, colf2, colf3, colf4 = st.columns(4)
|
193 |
with colf1:
|
194 |
genres = ["Все"] + sorted(set(sum([g.split(", ") for g in df["basic_genres"].dropna().unique()], [])))
|
195 |
+
st.session_state.genre_filter = st.selectbox("Жанр", genres, key="genre_filter_key")
|
196 |
with colf2:
|
197 |
years = ["Все"] + [str(y) for y in sorted(df["year"].unique())]
|
198 |
+
st.session_state.year_filter = st.selectbox("Год", years, key="year_filter_key")
|
199 |
with colf3:
|
200 |
countries = ["Все"] + sorted([c for c in df["country"].dropna().unique()])
|
201 |
+
st.session_state.country_filter = st.selectbox("Страна", countries, key="country_filter_key")
|
202 |
with colf4:
|
203 |
vtypes = ["Все"] + sorted(df["type"].dropna().unique())
|
204 |
+
st.session_state.type_filter = st.selectbox("Тип", vtypes, key="type_filter_key")
|
205 |
|
206 |
+
k = st.slider("Количество результатов:", 1, 20, 5, key="k_slider")
|
207 |
+
st.text_input("Введите ключевые слова или сюжет:", key="query_input")
|
208 |
+
|
209 |
+
# 5. Обработка кнопок
|
210 |
+
def handle_search(query):
    """Store *query* as the active search text and flag a pending search.

    Writes two entries in ``st.session_state``: ``query_input`` receives
    *query* and ``search_clicked`` is set to ``True``.
    """
    # NOTE(review): "query_input" is also the key of the st.text_input widget
    # created just before this helper. Streamlit's session-state docs state
    # that a widget-backed key cannot be modified after the widget has been
    # instantiated in the same run (StreamlitAPIException) -- confirm, and
    # consider doing this assignment from a button ``on_click`` callback.
    st.session_state.query_input = query
    st.session_state.search_clicked = True
|
213 |
st.session_state.ai_clicked = False
|
214 |
|
215 |
nav1, nav2, nav3, nav4 = st.columns(4)
|
216 |
with nav1:
|
217 |
if st.button("Случайный фильм/сериал"):
|
218 |
+
handle_search(random.choice(df["tvshow_title"]))
|
|
|
219 |
with nav2:
|
220 |
if st.button("ТОП по жанру") and st.session_state.genre_filter != "Все":
|
221 |
+
handle_search(st.session_state.genre_filter)
|
|
|
222 |
with nav3:
|
223 |
if st.button("Новинки"):
|
224 |
+
handle_search(str(max(df["year"])))
|
|
|
225 |
with nav4:
|
226 |
if st.button("Искать"):
|
227 |
+
handle_search(st.session_state.query_input)
|
228 |
|
229 |
+
# 6. Отображение результатов
|
230 |
+
if st.session_state.search_clicked and st.session_state.query_input.strip():
|
231 |
+
with st.spinner("Поиск..."):
|
232 |
+
results = semantic_search(
|
233 |
+
st.session_state.query_input, embedder, index, df,
|
234 |
+
st.session_state.genre_filter, st.session_state.year_filter,
|
235 |
+
st.session_state.country_filter, st.session_state.type_filter, k
|
236 |
+
)
|
237 |
+
|
238 |
if results.empty:
|
239 |
st.warning("Ничего не найдено.")
|
240 |
else:
|
|
|
242 |
for _, row in results.iterrows():
|
243 |
col1, col2 = st.columns([1, 3])
|
244 |
with col1:
|
245 |
+
if row["image_url"]:
|
246 |
+
try:
|
247 |
+
st.image(row["image_url"], use_container_width=True)
|
248 |
+
except:
|
249 |
+
st.info("Нет изображения или не удалось загрузить")
|
250 |
+
else:
|
251 |
+
st.info("Нет изображения")
|
252 |
with col2:
|
253 |
st.markdown(f"### {row['tvshow_title']} ({row['year']})")
|
254 |
st.caption(f"{row['basic_genres']} | {row['country'] or '—'} | {row['rating'] or '—'} | {row['type']} | {row['num_seasons']} сез.")
|
|
|
258 |
if row["url"]:
|
259 |
st.markdown(f"[Подробнее]({row['url']})")
|
260 |
st.divider()
|
261 |
+
|
262 |
+
# Кнопка для AI-рекомендаций должна быть вне цикла
|
263 |
+
if st.button("AI: почему эти подходят и что ещё посмотреть", key="ai_button"):
|
264 |
st.session_state.ai_clicked = True
|
265 |
+
|
266 |
+
# Сброс флага поиска
|
267 |
st.session_state.search_clicked = False
|
268 |
|
269 |
+
|
270 |
+
if st.session_state.ai_clicked and "results" in locals() and not results.empty and llm is not None:
|
271 |
st.markdown("### Рекомендации AI:")
|
272 |
+
with st.spinner("Генерация ответа AI..."):
|
273 |
+
st.write(generate_rag_response(st.session_state.query_input, results, llm))
|
274 |
|
275 |
st.sidebar.write(f"Всего записей: {len(df)}")
|
276 |
|
277 |
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|