Spaces:
Sleeping
Sleeping
Upload 13 files
Browse files- .gitignore +160 -0
- LICENSE +21 -0
- README.md +13 -20
- __init__.py +0 -0
- answer_rag.py +88 -0
- app.py +73 -0
- loader.py +51 -0
- rag.py +73 -0
- rag_local.py +72 -0
- reader_llm.py +28 -0
- requirements.txt +0 -0
- reranker.py +42 -0
- retrieval.py +24 -0
.gitignore
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/#use-with-ide
|
110 |
+
.pdm.toml
|
111 |
+
|
112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
113 |
+
__pypackages__/
|
114 |
+
|
115 |
+
# Celery stuff
|
116 |
+
celerybeat-schedule
|
117 |
+
celerybeat.pid
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
.rag_venv
|
125 |
+
env/
|
126 |
+
venv/
|
127 |
+
ENV/
|
128 |
+
env.bak/
|
129 |
+
venv.bak/
|
130 |
+
|
131 |
+
# Spyder project settings
|
132 |
+
.spyderproject
|
133 |
+
.spyproject
|
134 |
+
|
135 |
+
# Rope project settings
|
136 |
+
.ropeproject
|
137 |
+
|
138 |
+
# mkdocs documentation
|
139 |
+
/site
|
140 |
+
|
141 |
+
# mypy
|
142 |
+
.mypy_cache/
|
143 |
+
.dmypy.json
|
144 |
+
dmypy.json
|
145 |
+
|
146 |
+
# Pyre type checker
|
147 |
+
.pyre/
|
148 |
+
|
149 |
+
# pytype static type analyzer
|
150 |
+
.pytype/
|
151 |
+
|
152 |
+
# Cython debug symbols
|
153 |
+
cython_debug/
|
154 |
+
|
155 |
+
# PyCharm
|
156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
+
#.idea/
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2025 Maria
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,20 +1,13 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
# Welcome to Streamlit!
|
16 |
-
|
17 |
-
Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
|
18 |
-
|
19 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
20 |
-
forums](https://discuss.streamlit.io).
|
|
|
1 |
+
Проект по NLP - туристический RAG-путеводитель
|
2 |
+
|
3 |
+
Проект посвящен созданию туристического путеводителя по 4 городам России – Владимир, Ярославль, Екатеринбург и Нижний Новгород с помощью метода NLP, а именно подхода RAG.
|
4 |
+
|
5 |
+
Данные:
|
6 |
+
|
7 |
+
Данные представляют собой таблицу (combinated_cities.csv), в которой по столбцам содержится информация о памятниках, архитектуре и других значимых местах для 4 городов. В данных есть столбцы:
|
8 |
+
|
9 |
+
* Name - название достопримечательности;
|
10 |
+
* City - город расположения;
|
11 |
+
* Lon, Lat - координаты долготы и широты;
|
12 |
+
* description - описание (извлеченное из WikiData);
|
13 |
+
* image - изображение в формате base64;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__init__.py
ADDED
File without changes
|
answer_rag.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import Pipeline
# from langchain.vectorstores import FAISS
from langchain_community.vectorstores import FAISS
from reranker import get_reranker, get_rag_prompt_template
from typing import List, Tuple
from langchain.docstore.document import Document as LangchainDocument
import base64
from io import BytesIO

import streamlit as st
from PIL import Image


def answer_with_rag2(
    question: str,
    llm: Pipeline,
    knowledge_index: FAISS,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 10,
) -> Tuple[str, List[LangchainDocument]]:
    """Answer *question* with a retrieve -> rerank -> generate RAG pipeline.

    Renders progress messages, the generated answer, and the source
    documents (with optional coordinates and images) directly into the
    Streamlit page.

    Args:
        question: The user's question.
        llm: A ``transformers`` text-generation pipeline used as the reader.
        knowledge_index: FAISS vector store over the knowledge base.
        num_retrieved_docs: How many candidates the retriever fetches.
        num_docs_final: How many documents survive reranking and reach the prompt.

    Returns:
        Tuple of (generated answer, final reranked documents).
    """
    # Retrieve candidate documents from the vector store.
    st.write("=> Retrieving documents...")
    relevant_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_contents = [doc.page_content for doc in relevant_docs]

    # The reranker is a sentence-transformers CrossEncoder.
    reranker = get_reranker()

    st.write("=> Reranking documents...")
    try:
        # A CrossEncoder scores (query, passage) pairs directly,
        # unlike ColBERT-style late-interaction rankers.
        scores = reranker.predict([(question, doc) for doc in relevant_contents])

        # Sort documents by descending relevance and keep the top ones.
        scored_docs = sorted(zip(relevant_docs, scores), key=lambda x: x[1], reverse=True)
        full_docs = [doc for doc, _score in scored_docs[:num_docs_final]]
        relevant_contents = [doc.page_content for doc in full_docs]
    except Exception as e:
        # Best-effort fallback: keep the retriever's ordering if reranking fails.
        st.error(f"Ошибка при реранкинге: {e}")
        full_docs = relevant_docs[:num_docs_final]
        relevant_contents = relevant_contents[:num_docs_final]

    # Build the context section of the prompt.
    context = "\nExtracted documents:\n"
    context += "".join(f"Document {i}:::\n{doc}\n" for i, doc in enumerate(relevant_contents))

    # Generate the answer.
    st.write("=> Generating answer...")
    RAG_PROMPT_TEMPLATE = get_rag_prompt_template()
    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)
    answer = llm(final_prompt)[0]["generated_text"]

    # Render the answer and the sources (with coordinates / images).
    st.markdown("\n## Ответ")
    st.write(answer)

    st.markdown("## Использованные источники")
    for i, doc in enumerate(full_docs[:num_docs_final]):
        with st.expander(f"Документ {i+1}"):
            st.write(doc.page_content)

            # Fix: all metadata reads (including "image") now sit inside the
            # hasattr guard — previously the image lookup could hit a document
            # without a metadata mapping.
            if hasattr(doc, 'metadata') and doc.metadata:
                if "longitude" in doc.metadata and "latitude" in doc.metadata:
                    st.write(f"📍 Широта: {doc.metadata['latitude']}, Долгота: {doc.metadata['longitude']}")

                if "image" in doc.metadata and doc.metadata["image"]:
                    try:
                        if isinstance(doc.metadata["image"], str):
                            # '/9j/' is the base64 prefix of a JPEG payload.
                            if doc.metadata["image"].startswith('/9j/'):
                                img_bytes = base64.b64decode(doc.metadata["image"])
                                img = Image.open(BytesIO(img_bytes))
                                st.image(img, caption=f"Изображение из документа {i+1}")
                            else:
                                # Otherwise assume the string is a file path.
                                st.image(doc.metadata["image"], caption=f"Изображение из документа {i+1}")
                        elif isinstance(doc.metadata["image"], bytes):
                            # Raw image bytes can be passed to st.image directly.
                            st.image(doc.metadata["image"], caption=f"Изображение из документа {i+1}")
                    except Exception as e:
                        st.error(f"Ошибка загрузки изображения: {str(e)}")

    return answer, full_docs[:num_docs_final]
|
app.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

# Must be set before the HF / transformers stack is imported.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["NO_CUDA_EXT"] = "1"

from reader_llm import get_reader_llm
from retrieval import get_retriever
from answer_rag import answer_with_rag2
import streamlit as st

# Page setup.
st.set_page_config(page_title="RAG", layout="wide")
st.title("Туристический путеводитель")
st.header("Города: Ярославль, Екатеринбург, Нижний Новгород, Владимир")


@st.cache_resource
def load_models():
    """Load the reader LLM and the FAISS retriever once per server process."""
    READER_LLM = get_reader_llm(name="Vikhrmodels/Vikhr-Llama-3.2-1B-Instruct")
    # READER_LLM = get_reader_llm(name="microsoft/phi-2")  # lighter model for the hosted Streamlit app
    embedding_model, KNOWLEDGE_VECTOR_DATABASE = get_retriever()
    return READER_LLM, embedding_model, KNOWLEDGE_VECTOR_DATABASE


READER_LLM, _, KNOWLEDGE_VECTOR_DATABASE = load_models()

# Chat history lives in the session state so it survives Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Задайте Ваш вопрос"):
    st.session_state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Ищу информацию..."):
            # answer_with_rag2 renders the answer's sources itself.
            answer, sources = answer_with_rag2(
                question=prompt,
                llm=READER_LLM,
                knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
            )
            st.markdown(answer)

    st.session_state.messages.append({"role": "assistant", "content": answer})
|
loader.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter


def load_and_split_markdown(filepath='https://drive.google.com/u/0/uc?id=1JQswhvNz6yNKKzJW0nrXU7AmUQaGevxA&export=download'):
    """Load the attractions CSV and split it into retrieval-ready chunks.

    Args:
        filepath: Path or URL of the CSV; defaults to the shared Google
            Drive export of ``combinated_cities.csv``.

    Returns:
        list[LangchainDocument]: chunked documents carrying coordinate and
        image metadata for each attraction.
    """
    # Load the data (pandas reads the URL directly).
    data_cities = pd.read_csv(filepath)

    # One document per attraction: "City | Name | description".
    RAW_KNOWLEDGE_BASE = [
        LangchainDocument(
            page_content=f"{row['City']} | {row['Name']} | {row['description']}",
            metadata={
                "longitude": row['Lon'],
                "latitude": row['Lat'],
                "image": row['image'],
                # "english_description": row['en_txt']
            },
        )
        for _, row in data_cities.iterrows()
    ]

    # Split on markdown-ish boundaries first, then progressively smaller units.
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    # split_documents accepts the whole list — the manual per-document
    # append loop was unnecessary.
    return text_splitter.split_documents(RAW_KNOWLEDGE_BASE)
|
rag.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

# Must be set before the HF / transformers stack is imported.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["NO_CUDA_EXT"] = "1"

from reader_llm import get_reader_llm
from retrieval import get_retriever
from answer_rag import answer_with_rag2
import streamlit as st

# NOTE(review): this file duplicates app.py — consider keeping only one entry point.

# Page setup.
st.set_page_config(page_title="RAG", layout="wide")
st.title("Туристический путеводитель")
st.header("Города: Ярославль, Екатеринбург, Нижний Новгород, Владимир")


@st.cache_resource
def load_models():
    """Load the reader LLM and the FAISS retriever once per server process."""
    READER_LLM = get_reader_llm(name="Vikhrmodels/Vikhr-Llama-3.2-1B-Instruct")
    # READER_LLM = get_reader_llm(name="microsoft/phi-2")  # lighter model for the hosted Streamlit app
    embedding_model, KNOWLEDGE_VECTOR_DATABASE = get_retriever()
    return READER_LLM, embedding_model, KNOWLEDGE_VECTOR_DATABASE


READER_LLM, _, KNOWLEDGE_VECTOR_DATABASE = load_models()

# Chat history lives in the session state so it survives Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Задайте Ваш вопрос"):
    st.session_state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Ищу информацию..."):
            # answer_with_rag2 renders the answer's sources itself.
            answer, sources = answer_with_rag2(
                question=prompt,
                llm=READER_LLM,
                knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
            )
            st.markdown(answer)

    st.session_state.messages.append({"role": "assistant", "content": answer})
|
rag_local.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

# Must be set before the HF / transformers stack is imported.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["NO_CUDA_EXT"] = "1"

from reader_llm import get_reader_llm
from retrieval import get_retriever
from answer_rag import answer_with_rag2
import streamlit as st

# Page setup.
st.set_page_config(page_title="RAG", layout="wide")
st.title("Туристический путеводитель")
st.header("Города: Ярославль, Екатеринбург, Нижний Новгород, Владимир")


@st.cache_resource
def load_models():
    """Load the default reader LLM and the FAISS retriever once per process."""
    READER_LLM = get_reader_llm()  # default (larger) model for local runs
    embedding_model, KNOWLEDGE_VECTOR_DATABASE = get_retriever()
    return READER_LLM, embedding_model, KNOWLEDGE_VECTOR_DATABASE


READER_LLM, _, KNOWLEDGE_VECTOR_DATABASE = load_models()

# Chat history lives in the session state so it survives Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Задайте Ваш вопрос"):
    st.session_state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Ищу информацию..."):
            # answer_with_rag2 renders the answer's sources itself.
            answer, sources = answer_with_rag2(
                question=prompt,
                llm=READER_LLM,
                knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
            )
            st.markdown(answer)

    st.session_state.messages.append({"role": "assistant", "content": answer})
|
reader_llm.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


def get_reader_llm(name="Qwen/Qwen2.5-3B-Instruct"):
    """Build a text-generation pipeline to use as the RAG reader.

    Args:
        name: Hugging Face model id of the causal LM to load.

    Returns:
        A ``transformers`` text-generation pipeline with sampling enabled
        and a small ``max_new_tokens`` budget.
    """
    READER_MODEL_NAME = name

    # CPU-only deployment: device_map is deliberately not used.
    # NOTE(review): float16 weights on CPU can be slow or unsupported for
    # some ops — confirm this works on the deployment target.
    model = AutoModelForCausalLM.from_pretrained(
        READER_MODEL_NAME,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

    # No explicit device argument: the model already lives on CPU.
    return pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        do_sample=True,
        temperature=0.2,
        repetition_penalty=1.1,
        return_full_text=False,
        max_new_tokens=50,  # kept small for reliability
    )
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|
reranker.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

os.environ["NO_CUDA_EXT"] = "1"  # fully disable the C++ extensions

from functools import lru_cache
from typing import Optional

from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer


def get_reranker(name: Optional[str] = None) -> CrossEncoder:
    """Build the reranker as a sentence-transformers CrossEncoder.

    Args:
        name: Optional model id; defaults to a lightweight MS MARCO
            cross-encoder.

    Returns:
        A ready-to-use ``CrossEncoder`` instance.
    """
    # Use a light model by default.
    model_name = name or 'cross-encoder/ms-marco-MiniLM-L-6-v2'
    return CrossEncoder(model_name)


@lru_cache(maxsize=1)
def get_rag_prompt_template():
    """Return the RAG prompt rendered through the reader model's chat template.

    The returned string still contains literal ``{context}`` and
    ``{question}`` placeholders for a later ``str.format`` call.
    Cached with ``lru_cache`` so the Qwen tokenizer is downloaded/loaded
    only once per process instead of on every call.
    """
    prompt_in_chat_format = [
        {
            "role": "system",
            "content": """Используй информацию из контекста, чтобы дать полный ответ на вопрос.
Отвечай только на заданный вопрос, ответ должен быть чётким и соответствующим вопросу.
Указывай номер исходного документа, когда это уместно.
Если ответ нельзя вывести из контекста, дай ответ,который знаешь, но обязательно напиши,что ответ дан не из контекста.
Отвечай строго на русском языке, даже если контекст содержит текст на других языках.""",  # Russian-only answers required
        },
        {
            "role": "user",
            "content": """Контекст:
{context}
---
Вот вопрос, на который нужно ответить.

Вопрос: {question}""",
        },
    ]
    READER_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
    return tokenizer.apply_chat_template(
        prompt_in_chat_format,
        tokenize=False,
        add_generation_prompt=True,
    )
|
retrieval.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
from loader import load_and_split_markdown
# from langchain.vectorstores import FAISS
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from huggingface_hub.utils import disable_progress_bars

disable_progress_bars()  # silence hub download progress bars


def get_retriever(name='intfloat/multilingual-e5-large'):
    """Create the embedding model and a FAISS index over the knowledge base.

    Args:
        name: Hugging Face id of the sentence-embedding model.

    Returns:
        Tuple of (embedding model, FAISS vector store using cosine distance).
    """
    # multi_process is intentionally not used (problematic on Windows).
    embedding_model = HuggingFaceEmbeddings(
        model_name=name,
        model_kwargs={"device": "cpu"},
        encode_kwargs={
            "normalize_embeddings": True,
            "batch_size": 4,  # small batches for CPU
        },
    )

    chunks = load_and_split_markdown()
    vector_store = FAISS.from_documents(
        documents=chunks,
        embedding=embedding_model,
        distance_strategy=DistanceStrategy.COSINE,
    )

    return embedding_model, vector_store
|