Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +54 -51
src/streamlit_app.py
CHANGED
|
@@ -189,7 +189,7 @@ def semantic_search(query, embedder, index, df, genre=None, year=None, country=N
|
|
| 189 |
query_embedding = embedder.encode([query])
|
| 190 |
faiss.normalize_L2(query_embedding)
|
| 191 |
|
| 192 |
-
n_search = 500
|
| 193 |
dists, idxs = index.search(query_embedding, n_search)
|
| 194 |
|
| 195 |
valid_idxs = [i for i in idxs[0] if i >= 0 and i < len(df)]
|
|
@@ -199,7 +199,6 @@ def semantic_search(query, embedder, index, df, genre=None, year=None, country=N
|
|
| 199 |
res = df.iloc[valid_idxs].copy()
|
| 200 |
res["score"] = dists[0][:len(valid_idxs)]
|
| 201 |
|
| 202 |
-
# Применяем фильтрацию
|
| 203 |
if genre and genre != "Все":
|
| 204 |
genre_lower = genre.lower()
|
| 205 |
res = res[res["basic_genres"].str.lower().str.contains(genre_lower, na=False)]
|
|
@@ -220,15 +219,12 @@ def semantic_search(query, embedder, index, df, genre=None, year=None, country=N
|
|
| 220 |
if res.empty:
|
| 221 |
return res
|
| 222 |
|
| 223 |
-
# --- УЛУЧШЕННОЕ ГИБРИДНОЕ РАНЖИРОВАНИЕ НА ОСНОВЕ КЛЮЧЕВЫХ СЛОВ ---
|
| 224 |
query_lower = query.lower()
|
| 225 |
|
| 226 |
-
# Добавляем очень большой бонус за точное совпадение фразы в названии
|
| 227 |
res['exact_match_title'] = res['tvshow_title'].str.lower() == query_lower
|
| 228 |
|
| 229 |
-
# Добавляем средний бонус за совпадение ключевых слов в названии/описании
|
| 230 |
query_words = re.findall(r'\b\w+\b', query_lower)
|
| 231 |
-
keyword_pattern = '|'.join([re.escape(word) for word in query_words if len(word) > 2])
|
| 232 |
|
| 233 |
if keyword_pattern:
|
| 234 |
res['has_keyword'] = res.apply(
|
|
@@ -241,13 +237,9 @@ def semantic_search(query, embedder, index, df, genre=None, year=None, country=N
|
|
| 241 |
|
| 242 |
res['final_score'] = res['score']
|
| 243 |
|
| 244 |
-
# Применяем веса:
|
| 245 |
-
# Самый большой бонус за точное совпадение названия
|
| 246 |
res['final_score'] = np.where(res['exact_match_title'], res['final_score'] + 1.5, res['final_score'])
|
| 247 |
-
# Умеренный бонус за совпадение ключевых слов
|
| 248 |
res['final_score'] = np.where(res['has_keyword'], res['final_score'] + 0.4, res['final_score'])
|
| 249 |
|
| 250 |
-
# Сортируем по новой, более точной оценке
|
| 251 |
sorted_results = res.sort_values(by="final_score", ascending=False)
|
| 252 |
|
| 253 |
return sorted_results.head(k)
|
|
@@ -274,7 +266,6 @@ def generate_rag_response(user_query, search_results, llm):
|
|
| 274 |
|
| 275 |
ctx = format_docs_for_prompt(search_results)
|
| 276 |
|
| 277 |
-
# УЛУЧШЕННЫЙ ПРОМПТ ДЛЯ LLM
|
| 278 |
prompt_template = """
|
| 279 |
Ты — эксперт по кино и сериалам. Твоя задача — помочь пользователю, основываясь на предоставленных ниже результатах поиска.
|
| 280 |
|
|
@@ -308,7 +299,6 @@ def main():
|
|
| 308 |
st.set_page_config(page_title="Поиск фильмов и сериалов + AI", layout="wide")
|
| 309 |
st.title("Семантический поиск фильмов и сериалов с AI")
|
| 310 |
|
| 311 |
-
# Инициализация состояния
|
| 312 |
if "df" not in st.session_state:
|
| 313 |
try:
|
| 314 |
st.session_state.df = cached_load_data(CSV_PATH)
|
|
@@ -342,19 +332,25 @@ def main():
|
|
| 342 |
index = st.session_state.index
|
| 343 |
llm = st.session_state.llm
|
| 344 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
# ====== Форма поиска ======
|
| 346 |
with st.container():
|
| 347 |
st.markdown("---")
|
| 348 |
with st.form(key='search_form'):
|
| 349 |
colf1, colf2, colf3, colf4 = st.columns(4)
|
| 350 |
with colf1:
|
| 351 |
-
basic_genres_list = []
|
| 352 |
-
for g in
|
| 353 |
-
for part in str(g).split(","):
|
| 354 |
-
p = part.strip()
|
| 355 |
-
if p:
|
| 356 |
-
basic_genres_list.append(p)
|
| 357 |
-
genres = ["Все"] + sorted(set(basic_genres_list))
|
| 358 |
genre_filter = st.selectbox("Жанр", genres, index=0, key="genre_filter_key")
|
| 359 |
with colf2:
|
| 360 |
years = ["Все"] + [str(y) for y in sorted(df["year"].unique()) if y != 0]
|
|
@@ -369,48 +365,47 @@ def main():
|
|
| 369 |
k = st.slider("Количество результатов:", 1, 20, 5, key="k_slider")
|
| 370 |
user_input = st.text_input("Введите ключевые слова или сюжет:", key="user_input_key")
|
| 371 |
|
| 372 |
-
|
| 373 |
-
with
|
| 374 |
random_search = st.form_submit_button("Случайный фильм/сериал")
|
| 375 |
-
with
|
| 376 |
genre_search = st.form_submit_button("ТОП по жанру")
|
| 377 |
-
with
|
| 378 |
new_search = st.form_submit_button("Новинки")
|
| 379 |
-
with
|
| 380 |
text_search = st.form_submit_button("Искать")
|
| 381 |
|
| 382 |
# Логика обработки нажатий кнопок
|
| 383 |
-
performed_search = False
|
| 384 |
if text_search and user_input:
|
| 385 |
st.session_state.last_query = user_input
|
| 386 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
elif random_search:
|
| 388 |
random_query = random.choice(df["tvshow_title"].tolist())
|
| 389 |
st.session_state.last_query = random_query
|
| 390 |
-
|
| 391 |
-
|
|
|
|
|
|
|
|
|
|
| 392 |
elif genre_search and genre_filter != "Все":
|
| 393 |
-
st.session_state.last_query = genre_filter
|
| 394 |
-
|
| 395 |
-
|
|
|
|
|
|
|
|
|
|
| 396 |
elif new_search:
|
| 397 |
-
new_query =
|
| 398 |
st.session_state.last_query = new_query
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
st.session_state.results = semantic_search(
|
| 405 |
-
user_input, embedder, index, df,
|
| 406 |
-
genre_filter, year_filter, country_filter, type_filter, k
|
| 407 |
-
)
|
| 408 |
-
st.session_state.ai_clicked = False
|
| 409 |
-
else:
|
| 410 |
-
if 'results' not in st.session_state:
|
| 411 |
-
st.session_state.results = pd.DataFrame()
|
| 412 |
-
if 'ai_clicked' not in st.session_state:
|
| 413 |
-
st.session_state.ai_clicked = False
|
| 414 |
|
| 415 |
# ====== Отрисовка результатов ======
|
| 416 |
results_container = st.container()
|
|
@@ -419,8 +414,9 @@ def main():
|
|
| 419 |
with results_container:
|
| 420 |
st.markdown("## Результаты поиска")
|
| 421 |
results_exist = isinstance(st.session_state.get("results"), pd.DataFrame) and not st.session_state.results.empty
|
|
|
|
| 422 |
if not results_exist:
|
| 423 |
-
if
|
| 424 |
st.warning(f"Ничего не найдено по запросу: '{st.session_state.last_query}'.")
|
| 425 |
else:
|
| 426 |
st.info("Введите запрос и нажмите «Искать», или выберите «Случайный фильм/сериал».")
|
|
@@ -448,11 +444,18 @@ def main():
|
|
| 448 |
st.markdown(f"[Подробнее]({row['url']})")
|
| 449 |
st.divider()
|
| 450 |
|
| 451 |
-
|
| 452 |
-
|
|
|
|
|
|
|
| 453 |
|
| 454 |
with ai_response_container:
|
| 455 |
-
if st.session_state.get("ai_clicked"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
st.markdown("### Рекомендации AI:")
|
| 457 |
with st.spinner("Генерация ответа AI..."):
|
| 458 |
rag = generate_rag_response(st.session_state.last_query, st.session_state.results, llm)
|
|
|
|
| 189 |
query_embedding = embedder.encode([query])
|
| 190 |
faiss.normalize_L2(query_embedding)
|
| 191 |
|
| 192 |
+
n_search = 500
|
| 193 |
dists, idxs = index.search(query_embedding, n_search)
|
| 194 |
|
| 195 |
valid_idxs = [i for i in idxs[0] if i >= 0 and i < len(df)]
|
|
|
|
| 199 |
res = df.iloc[valid_idxs].copy()
|
| 200 |
res["score"] = dists[0][:len(valid_idxs)]
|
| 201 |
|
|
|
|
| 202 |
if genre and genre != "Все":
|
| 203 |
genre_lower = genre.lower()
|
| 204 |
res = res[res["basic_genres"].str.lower().str.contains(genre_lower, na=False)]
|
|
|
|
| 219 |
if res.empty:
|
| 220 |
return res
|
| 221 |
|
|
|
|
| 222 |
query_lower = query.lower()
|
| 223 |
|
|
|
|
| 224 |
res['exact_match_title'] = res['tvshow_title'].str.lower() == query_lower
|
| 225 |
|
|
|
|
| 226 |
query_words = re.findall(r'\b\w+\b', query_lower)
|
| 227 |
+
keyword_pattern = '|'.join([re.escape(word) for word in query_words if len(word) > 2])
|
| 228 |
|
| 229 |
if keyword_pattern:
|
| 230 |
res['has_keyword'] = res.apply(
|
|
|
|
| 237 |
|
| 238 |
res['final_score'] = res['score']
|
| 239 |
|
|
|
|
|
|
|
| 240 |
res['final_score'] = np.where(res['exact_match_title'], res['final_score'] + 1.5, res['final_score'])
|
|
|
|
| 241 |
res['final_score'] = np.where(res['has_keyword'], res['final_score'] + 0.4, res['final_score'])
|
| 242 |
|
|
|
|
| 243 |
sorted_results = res.sort_values(by="final_score", ascending=False)
|
| 244 |
|
| 245 |
return sorted_results.head(k)
|
|
|
|
| 266 |
|
| 267 |
ctx = format_docs_for_prompt(search_results)
|
| 268 |
|
|
|
|
| 269 |
prompt_template = """
|
| 270 |
Ты — эксперт по кино и сериалам. Твоя задача — помочь пользователю, основываясь на предоставленных ниже результатах поиска.
|
| 271 |
|
|
|
|
| 299 |
st.set_page_config(page_title="Поиск фильмов и сериалов + AI", layout="wide")
|
| 300 |
st.title("Семантический поиск фильмов и сериалов с AI")
|
| 301 |
|
|
|
|
| 302 |
if "df" not in st.session_state:
|
| 303 |
try:
|
| 304 |
st.session_state.df = cached_load_data(CSV_PATH)
|
|
|
|
| 332 |
index = st.session_state.index
|
| 333 |
llm = st.session_state.llm
|
| 334 |
|
| 335 |
+
# Инициализация переменных состояния
|
| 336 |
+
if 'last_query' not in st.session_state:
|
| 337 |
+
st.session_state.last_query = ""
|
| 338 |
+
if 'results' not in st.session_state:
|
| 339 |
+
st.session_state.results = pd.DataFrame()
|
| 340 |
+
if 'ai_clicked' not in st.session_state:
|
| 341 |
+
st.session_state.ai_clicked = False
|
| 342 |
+
if 'search_query' not in st.session_state:
|
| 343 |
+
st.session_state.search_query = ""
|
| 344 |
+
|
| 345 |
+
|
| 346 |
# ====== Форма поиска ======
|
| 347 |
with st.container():
|
| 348 |
st.markdown("---")
|
| 349 |
with st.form(key='search_form'):
|
| 350 |
colf1, colf2, colf3, colf4 = st.columns(4)
|
| 351 |
with colf1:
|
| 352 |
+
basic_genres_list = sorted(list(set(g.strip() for g in ", ".join(df["basic_genres"].dropna().unique()).split(","))))
|
| 353 |
+
genres = ["Все"] + [g for g in basic_genres_list if g]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
genre_filter = st.selectbox("Жанр", genres, index=0, key="genre_filter_key")
|
| 355 |
with colf2:
|
| 356 |
years = ["Все"] + [str(y) for y in sorted(df["year"].unique()) if y != 0]
|
|
|
|
| 365 |
k = st.slider("Количество результатов:", 1, 20, 5, key="k_slider")
|
| 366 |
user_input = st.text_input("Введите ключевые слова или сюжет:", key="user_input_key")
|
| 367 |
|
| 368 |
+
col_buttons = st.columns(4)
|
| 369 |
+
with col_buttons[0]:
|
| 370 |
random_search = st.form_submit_button("Случайный фильм/сериал")
|
| 371 |
+
with col_buttons[1]:
|
| 372 |
genre_search = st.form_submit_button("ТОП по жанру")
|
| 373 |
+
with col_buttons[2]:
|
| 374 |
new_search = st.form_submit_button("Новинки")
|
| 375 |
+
with col_buttons[3]:
|
| 376 |
text_search = st.form_submit_button("Искать")
|
| 377 |
|
| 378 |
# Логика обработки нажатий кнопок
|
|
|
|
| 379 |
if text_search and user_input:
|
| 380 |
st.session_state.last_query = user_input
|
| 381 |
+
st.session_state.results = semantic_search(
|
| 382 |
+
user_input, embedder, index, df,
|
| 383 |
+
genre_filter, year_filter, country_filter, type_filter, k
|
| 384 |
+
)
|
| 385 |
+
st.session_state.ai_clicked = False
|
| 386 |
elif random_search:
|
| 387 |
random_query = random.choice(df["tvshow_title"].tolist())
|
| 388 |
st.session_state.last_query = random_query
|
| 389 |
+
st.session_state.results = semantic_search(
|
| 390 |
+
random_query, embedder, index, df,
|
| 391 |
+
genre_filter, year_filter, country_filter, type_filter, k
|
| 392 |
+
)
|
| 393 |
+
st.session_state.ai_clicked = False
|
| 394 |
elif genre_search and genre_filter != "Все":
|
| 395 |
+
st.session_state.last_query = f"Лучшие фильмы и сериалы в жанре {genre_filter}"
|
| 396 |
+
st.session_state.results = semantic_search(
|
| 397 |
+
st.session_state.last_query, embedder, index, df,
|
| 398 |
+
genre_filter, year_filter, country_filter, type_filter, k
|
| 399 |
+
)
|
| 400 |
+
st.session_state.ai_clicked = False
|
| 401 |
elif new_search:
|
| 402 |
+
new_query = f"Самые новые фильмы и сериалы {df['year'].max()}"
|
| 403 |
st.session_state.last_query = new_query
|
| 404 |
+
st.session_state.results = semantic_search(
|
| 405 |
+
new_query, embedder, index, df,
|
| 406 |
+
genre_filter, year_filter, country_filter, type_filter, k
|
| 407 |
+
)
|
| 408 |
+
st.session_state.ai_clicked = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
|
| 410 |
# ====== Отрисовка результатов ======
|
| 411 |
results_container = st.container()
|
|
|
|
| 414 |
with results_container:
|
| 415 |
st.markdown("## Результаты поиска")
|
| 416 |
results_exist = isinstance(st.session_state.get("results"), pd.DataFrame) and not st.session_state.results.empty
|
| 417 |
+
|
| 418 |
if not results_exist:
|
| 419 |
+
if st.session_state.last_query:
|
| 420 |
st.warning(f"Ничего не найдено по запросу: '{st.session_state.last_query}'.")
|
| 421 |
else:
|
| 422 |
st.info("Введите запрос и нажмите «Искать», или выберите «Случайный фильм/сериал».")
|
|
|
|
| 444 |
st.markdown(f"[Подробнее]({row['url']})")
|
| 445 |
st.divider()
|
| 446 |
|
| 447 |
+
# Кнопка для AI-генерации вне формы
|
| 448 |
+
if st.session_state.llm and not st.session_state.results.empty:
|
| 449 |
+
if st.button("AI: почему эти подходят и что ещё посмотреть", key="ai_button"):
|
| 450 |
+
st.session_state.ai_clicked = True
|
| 451 |
|
| 452 |
with ai_response_container:
|
| 453 |
+
if st.session_state.get("ai_clicked") and st.session_state.get("last_query"):
|
| 454 |
+
st.markdown("### Рекомендации AI:")
|
| 455 |
+
with st.spinner("Генерация ответа AI..."):
|
| 456 |
+
rag = generate_rag_response(st.session_state.last_query, st.session_state.results, llm)
|
| 457 |
+
st.write(rag)
|
| 458 |
+
elif st.session_state.get("ai_clicked") and not st.session_state.get("results").empty:
|
| 459 |
st.markdown("### Рекомендации AI:")
|
| 460 |
with st.spinner("Генерация ответа AI..."):
|
| 461 |
rag = generate_rag_response(st.session_state.last_query, st.session_state.results, llm)
|