Deepak Sahu committed · commit 2fe32bb · Parent(s): 694021b

adding vector store
Browse files:
- .gitattributes +2 -0
- app.py +10 -3
- cache_vector_store_images/index.faiss +3 -0
- cache_vector_store_images/index.pkl +3 -0
- cache_vector_store_text/index.faiss +3 -0
- cache_vector_store_text/index.pkl +3 -0
- requriements.txt +7 -0
- z_document_reader.py +63 -0
- z_embedding.py +127 -0
- z_generate.py +9 -2
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+cache_vector_store_text/** filter=lfs diff=lfs merge=lfs -text
+cache_vector_store_images/** filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,6 +1,7 @@
 import numpy as np
 import gradio as gr
 from z_generate import ServerlessInference
+from z_embedding import load_vector_store
 
 # STATIC TEXT DISPLAY
 TXT_APP_DESCRIPTION = '''
@@ -12,17 +13,23 @@ Manually Downloaded as HTML files:
 1. https://en.wikipedia.org/wiki/MS_Dhoni
 2. https://en.wikipedia.org/wiki/Jharkhand
 3. https://en.wikipedia.org/wiki/Cricket_World_Cup
+
+## Details
+
+1. The vector store is built with FAISS before this app starts. Although the store is only a few hundred KB on disk, creating and loading it takes ~10 GB of RAM and about 5 minutes, hence it is **NOT BUILT DURING APP RUNTIME**.
+
 '''
 
 
 # UI Interface
 demo = gr.Blocks()
-
-llm = ServerlessInference()
+vector_text, vector_image = load_vector_store()
+llm = ServerlessInference(vector_store_text=vector_text, vector_store_images=vector_image)
 
 # Processing Functions
 def update_response(query:str = "something"):
-
+    response_text = llm.perform_rag(query)
+    return response_text
 
 def update_gallery(text:str = "hell"):
     imgs = [
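For context, the wiring of update_response into the Blocks UI lies outside this hunk. A minimal sketch of how it would typically be connected, assuming hypothetical component names (query_box, answer_box):

    # Hypothetical wiring sketch; component names are assumptions, not from this diff.
    with demo:
        query_box = gr.Textbox(label="Ask about the indexed Wikipedia pages")
        answer_box = gr.Textbox(label="Answer")
        query_box.submit(fn=update_response, inputs=query_box, outputs=answer_box)
    demo.launch()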
cache_vector_store_images/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe6d0d8806548cb057ca3dce003bed7827a90fc6cf3ca6792c09601498a716e9
+size 49197

(This and the three files below are Git LFS pointer files: the repository records only the object id and size, while the binary index itself lives in LFS storage, per the .gitattributes rules above.)
cache_vector_store_images/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a29bba3d9f601e708d17266e7f77791dcff9f94de806dd265aa1ae8fb7da0a6
+size 8187
cache_vector_store_text/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07b4ebc87655d64c3e18879cf33630be391a239771b1ac074e204ee2c07c56a1
+size 454701
cache_vector_store_text/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cb504a190a91ca3dd8a62d4ec7a9f0362e59c6541fc1c1b0207c49287fb2b6a
+size 315789
requriements.txt CHANGED
@@ -51,3 +51,10 @@ tzdata==2025.1
 urllib3==2.3.0
 uvicorn==0.34.0
 websockets==14.2
+langchain
+pillow
+transformers
+langchain-community
+sentence-transformers
+faiss-cpu
+beautifulsoup4
z_document_reader.py ADDED
@@ -0,0 +1,63 @@
+# First creating Document reader
+
+from typing import List
+from langchain.docstore.document import Document as LangchainDocument
+from bs4 import BeautifulSoup
+import re
+
+
+def read_wiki_html(filename: str) -> List[List]:
+    """
+    Reads an HTML file, extracts the contents of the <body> tag,
+    finds all <figure> tags with their hrefs and <figcaption>,
+    and returns the processed content as two document lists.
+
+    Args:
+        filename (str): The path to the HTML file.
+
+    Returns:
+        TEXT_KB: list of text extracted from the html
+        FIG_KB: list of figure captions extracted
+    """
+    try:
+        with open(filename, 'r', encoding='utf-8') as file:
+            content = file.read()
+
+        # Parse the HTML content
+        soup = BeautifulSoup(content, 'html.parser')
+
+        # Focus only on the <body> tag
+        body = soup.body
+        if body is None:
+            return "Error: No <body> tag found in the HTML file."
+
+        # Collapse runs of newlines in the extracted body text
+        body_text = re.sub(r'\n+', '\n', body.get_text(separator="\n").strip())
+
+        TEXT_KB = [
+            LangchainDocument(page_content=body_text)
+        ]
+
+        # Extract all <figure> tags with their href and figcaption
+        FIG_KB = []
+        for figure in body.find_all('figure'):
+            href = figure.find('a').get('href', 'No href') if figure.find('a') else 'No href'
+            figcaption = figure.find('figcaption').get_text(strip=True) if figure.find('figcaption') else 'No figcaption'
+            FIG_KB.append(
+                LangchainDocument(page_content=figcaption, metadata={"url": href})
+            )
+
+        # Return both knowledge bases
+        return (TEXT_KB, FIG_KB)
+
+    except FileNotFoundError:
+        return f"Error: File '{filename}' not found."
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+if __name__=="__main__":
+    contents = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")
+    # read_pdf()
+    pass
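A quick usage sketch of the two knowledge bases the reader returns (the path mirrors the __main__ guard above; the output comments describe the code paths in the function):

    # Usage sketch of read_wiki_html.
    text_kb, fig_kb = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")
    print(len(text_kb))               # 1: the whole <body> text as a single LangchainDocument
    print(fig_kb[0].page_content)     # caption of the first <figure>, or 'No figcaption'
    print(fig_kb[0].metadata["url"])  # href of the <a> inside that figure, or 'No href'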
z_embedding.py ADDED
@@ -0,0 +1,127 @@
+from typing import List, Optional
+from uuid import uuid4
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from transformers import AutoTokenizer
+from langchain.docstore.document import Document as LangchainDocument
+from tqdm import tqdm
+
+from langchain.vectorstores import FAISS
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores.utils import DistanceStrategy
+
+from z_document_reader import read_wiki_html
+
+EMBEDDING_MODEL_NAME = "thenlper/gte-small"
+
+def get_embedding_model():
+    embedding_model = HuggingFaceEmbeddings(
+        model_name=EMBEDDING_MODEL_NAME,
+        multi_process=True,
+        model_kwargs={"device": "cpu"},
+        encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
+    )
+    return embedding_model
+
+def split_documents(
+    chunk_size: int,
+    knowledge_base: List[LangchainDocument],
+    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
+) -> List[LangchainDocument]:
+    """
+    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
+    """
+    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+        AutoTokenizer.from_pretrained(tokenizer_name),
+        chunk_size=chunk_size,
+        chunk_overlap=int(chunk_size / 10),
+        add_start_index=True,
+        strip_whitespace=True,
+        # separators=MARKDOWN_SEPARATORS,
+    )
+
+    docs_processed = []
+    for doc in knowledge_base:
+        docs_processed += text_splitter.split_documents([doc])
+
+    # Remove duplicates
+    unique_texts = {}
+    docs_processed_unique = []
+    for doc in docs_processed:
+        if doc.page_content not in unique_texts:
+            unique_texts[doc.page_content] = True
+            docs_processed_unique.append(doc)
+
+    return docs_processed_unique
+
+def construct_vector_db(docs_processed, emb_model):
+    vdb = FAISS.from_documents(
+        docs_processed, emb_model, distance_strategy=DistanceStrategy.COSINE
+    )
+    return vdb
+    # from langchain_chroma import Chroma
+    # vector_store = Chroma(
+    #     collection_name="example_collection",
+    #     embedding_function=emb_model,
+    #     persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
+    # )
+    # return vector_store
+
+def get_data_files(location:str ="_data/") -> list:
+    """
+    Returns html file paths
+    """
+    from glob import glob
+    files = glob(location + "*.html")
+    files += glob(location + "*.htm")
+    return files
+
+def generate_and_save_vector_store(vector_store_location:str="cache_vector_store"):
+    """
+    One-time function to create and save the vector stores
+    """
+    data_files = get_data_files()
+    TEXT_KBs, IMAGE_KBs = list(), list()
+    for file in data_files:
+        TEXT_KB, IMAGE_KB = read_wiki_html(file)
+        TEXT_KBs.extend(TEXT_KB)
+        IMAGE_KBs.extend(IMAGE_KB)
+
+    docs_text_processed = split_documents(
+        512,  # We choose a chunk size adapted to our model
+        TEXT_KBs,
+        tokenizer_name=EMBEDDING_MODEL_NAME,
+    )
+    docs_imgs_processed = split_documents(
+        512,  # We choose a chunk size adapted to our model
+        IMAGE_KBs,
+        tokenizer_name=EMBEDDING_MODEL_NAME,
+    )
+
+    emb_model = get_embedding_model()
+
+    vector_store_text = construct_vector_db(docs_text_processed, emb_model)
+    vector_store_images = construct_vector_db(docs_imgs_processed, emb_model)
+
+    vector_store_text.save_local(vector_store_location+"_text")
+    vector_store_images.save_local(vector_store_location+"_images")
+
+def load_vector_store(vector_store_location:str="cache_vector_store"):
+    '''Returns two vector stores; one for text and another for images
+    '''
+    emb_model = get_embedding_model()
+
+    vs_text = FAISS.load_local(
+        vector_store_location+"_text", emb_model, allow_dangerous_deserialization=True
+    )
+    vs_image = FAISS.load_local(
+        vector_store_location+"_images", emb_model, allow_dangerous_deserialization=True
+    )
+
+    return vs_text, vs_image
+
+if __name__ == "__main__":
+    # generate_and_save_vector_store()
+    load_vector_store()
+    pass
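Putting the module together, a sketch of the intended two-phase workflow: the expensive build runs once offline, and the app only loads the cached indexes (the query string is illustrative):

    # Offline, one-time (heavy: ~10 GB RAM, ~5 min per the app description):
    from z_embedding import generate_and_save_vector_store, load_vector_store
    generate_and_save_vector_store()   # writes cache_vector_store_text/ and cache_vector_store_images/

    # At app startup (cheap: just deserializes the FAISS indexes):
    vs_text, vs_image = load_vector_store()
    hits = vs_text.similarity_search("Where was MS Dhoni born?", k=3)  # illustrative query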
z_generate.py CHANGED
@@ -2,9 +2,11 @@ from huggingface_hub import InferenceClient
 import os
 
 class ServerlessInference:
-    def __init__(self):
+    def __init__(self, vector_store_text = None, vector_store_images = None):
         self.model:str = "HuggingFaceH4/zephyr-7b-beta"
         self.client = InferenceClient(api_key=os.getenv("HF_SERVELESS_API"))
+        self.vs_text = vector_store_text
+        self.vs_images = vector_store_images
 
     def test(self, query:str) -> str:
         '''Responds to query using llm'''
@@ -20,4 +22,9 @@ class ServerlessInference:
             max_tokens=500
         )
 
-        return completion.choices[0].message.content
+        return completion.choices[0].message.content
+
+    def perform_rag(self, query:str):
+        # First perform text search
+        relevant_docs = self.vs_text.similarity_search(query=query, k=5)
+        return relevant_docs[0].page_content
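As committed, perform_rag returns the raw top-scoring chunk rather than a generated answer. A hedged sketch of a likely next step, reusing the same client and model to condition the LLM on the retrieved context (the method name and prompt wording are assumptions, not part of this commit):

    # Sketch only; not in this commit. Prompt text is an assumption.
    def perform_rag_generate(self, query: str) -> str:
        docs = self.vs_text.similarity_search(query=query, k=5)
        context = "\n\n".join(d.page_content for d in docs)
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "Answer using only the provided context."},
                {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
            ],
            max_tokens=500,
        )
        return completion.choices[0].message.content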