Deepak Sahu committed
Commit 2fe32bb · 1 Parent(s): 694021b

adding vector store

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ cache_vector_store_text/** filter=lfs diff=lfs merge=lfs -text
+ cache_vector_store_images/** filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,6 +1,7 @@
  import numpy as np
  import gradio as gr
  from z_generate import ServerlessInference
+ from z_embedding import load_vector_store

  # STATIC TEXT DISPLAY
  TXT_APP_DESCRIPTION = '''
@@ -12,17 +13,23 @@ Manually Downloaded as HTML files:
  1. https://en.wikipedia.org/wiki/MS_Dhoni
  2. https://en.wikipedia.org/wiki/Jharkhand
  3. https://en.wikipedia.org/wiki/Cricket_World_Cup
+
+ ## Details
+
+ 1. The vector store is built with FAISS before this app starts. Although the store is only a few KB on disk, creating and loading it takes ~10 GB of RAM and about 5 minutes, hence it is **NOT BUILT DURING THE RUNTIME OF THE APP**.
+
  '''


  # UI Interface
  demo = gr.Blocks()
-
- llm = ServerlessInference()
+ vector_text, vector_image = load_vector_store()
+ llm = ServerlessInference(vector_store_text=vector_text, vector_store_images=vector_image)

  # Processing Functions
  def update_response(query:str = "something"):
-     return llm.test(query)
+     response_text = llm.perform_rag(query)
+     return response_text

  def update_gallery(text:str = "hell"):
      imgs = [
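As the `## Details` note above says, the FAISS stores are built offline before the app starts. A minimal sketch of that one-time step, using `generate_and_save_vector_store` from `z_embedding.py` (added later in this commit):

```python
# One-time offline step, not run inside the app: build and cache the FAISS
# stores that load_vector_store() reads at startup (~10 GB RAM, ~5 min).
from z_embedding import generate_and_save_vector_store

if __name__ == "__main__":
    # Writes cache_vector_store_text/ and cache_vector_store_images/,
    # which .gitattributes routes through Git LFS.
    generate_and_save_vector_store()
```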
cache_vector_store_images/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe6d0d8806548cb057ca3dce003bed7827a90fc6cf3ca6792c09601498a716e9
+ size 49197

cache_vector_store_images/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a29bba3d9f601e708d17266e7f77791dcff9f94de806dd265aa1ae8fb7da0a6
+ size 8187

cache_vector_store_text/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07b4ebc87655d64c3e18879cf33630be391a239771b1ac074e204ee2c07c56a1
+ size 454701

cache_vector_store_text/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3cb504a190a91ca3dd8a62d4ec7a9f0362e59c6541fc1c1b0207c49287fb2b6a
+ size 315789
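The four `ADDED` files above are Git LFS pointer files, matched by the `cache_vector_store_text/**` and `cache_vector_store_images/**` patterns added to `.gitattributes`: only the `version`/`oid`/`size` stanza lives in the repo, while the FAISS blobs themselves sit in LFS storage. A small illustrative parser for this format (the helper name is hypothetical, not part of the repo):

```python
# Illustrative helper (not in this commit): parse a Git LFS pointer file
# into its spec version URL, sha256 oid, and blob size in bytes.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# e.g. parse_lfs_pointer("cache_vector_store_text/index.faiss")
# -> {"version": "https://git-lfs.github.com/spec/v1",
#     "oid": "sha256:07b4...", "size": "454701"}
```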
requriements.txt CHANGED
@@ -51,3 +51,10 @@ tzdata==2025.1
  urllib3==2.3.0
  uvicorn==0.34.0
  websockets==14.2
+ langchain
+ pillow
+ transformers
+ langchain-community
+ sentence-transformers
+ faiss-cpu
+ beautifulsoup4
z_document_reader.py ADDED
@@ -0,0 +1,56 @@
+ # First creating Document reader
+
+ from typing import List, Tuple
+
+ import re
+
+ from bs4 import BeautifulSoup
+ from langchain.docstore.document import Document as LangchainDocument
+
+
+ def read_wiki_html(filename: str) -> Tuple[List[LangchainDocument], List[LangchainDocument]]:
+     """
+     Reads an HTML file, extracts the text content of the <body> tag,
+     finds all <figure> tags with their hrefs and <figcaption>s,
+     and returns both as lists of LangChain documents.
+
+     Args:
+         filename (str): The path to the HTML file.
+
+     Returns:
+         TEXT_KB: list of text documents extracted from the HTML
+         FIG_KB: list of figure-caption documents extracted from the HTML
+     """
+     with open(filename, 'r', encoding='utf-8') as file:
+         content = file.read()
+
+     # Parse the HTML content
+     soup = BeautifulSoup(content, 'html.parser')
+
+     # Focus only on the <body> tag
+     body = soup.body
+     if body is None:
+         # Callers unpack a (TEXT_KB, FIG_KB) tuple, so raise instead of returning an error string
+         raise ValueError(f"No <body> tag found in '{filename}'.")
+
+     # Collapse runs of newlines in the extracted body text
+     body_text = re.sub(r'\n+', '\n', body.get_text(separator="\n").strip())
+
+     TEXT_KB = [
+         LangchainDocument(page_content=body_text)
+     ]
+
+     # Extract all <figure> tags with their href and figcaption
+     FIG_KB = []
+     for figure in body.find_all('figure'):
+         href = figure.find('a').get('href', 'No href') if figure.find('a') else 'No href'
+         figcaption = figure.find('figcaption').get_text(strip=True) if figure.find('figcaption') else 'No figcaption'
+         FIG_KB.append(
+             LangchainDocument(page_content=figcaption, metadata={"url": href})
+         )
+
+     return (TEXT_KB, FIG_KB)
+
+
+ if __name__ == "__main__":
+     contents = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")
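A quick usage sketch for the reader above (a minimal example, assuming the same `_data/` HTML file referenced in the `__main__` block):

```python
# Usage sketch: unpack the two knowledge bases returned by read_wiki_html.
from z_document_reader import read_wiki_html

text_kb, fig_kb = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")
print(f"{len(text_kb)} text document(s), {len(fig_kb)} figure caption(s)")
for fig in fig_kb[:3]:
    # Each figure document carries its image URL in metadata
    print(fig.metadata["url"], "->", fig.page_content[:60])
```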
z_embedding.py ADDED
@@ -0,0 +1,121 @@
+ from typing import List, Optional
+ from glob import glob
+
+ from transformers import AutoTokenizer
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.docstore.document import Document as LangchainDocument
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores.utils import DistanceStrategy
+
+ from z_document_reader import read_wiki_html
+
+ EMBEDDING_MODEL_NAME = "thenlper/gte-small"
+
+
+ def get_embedding_model():
+     embedding_model = HuggingFaceEmbeddings(
+         model_name=EMBEDDING_MODEL_NAME,
+         multi_process=True,
+         model_kwargs={"device": "cpu"},
+         encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
+     )
+     return embedding_model
+
+
+ def split_documents(
+     chunk_size: int,
+     knowledge_base: List[LangchainDocument],
+     tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
+ ) -> List[LangchainDocument]:
+     """
+     Split documents into chunks of at most `chunk_size` tokens and return the list of chunks.
+     """
+     text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+         AutoTokenizer.from_pretrained(tokenizer_name),
+         chunk_size=chunk_size,
+         chunk_overlap=int(chunk_size / 10),
+         add_start_index=True,
+         strip_whitespace=True,
+     )
+
+     docs_processed = []
+     for doc in knowledge_base:
+         docs_processed += text_splitter.split_documents([doc])
+
+     # Remove duplicate chunks
+     unique_texts = {}
+     docs_processed_unique = []
+     for doc in docs_processed:
+         if doc.page_content not in unique_texts:
+             unique_texts[doc.page_content] = True
+             docs_processed_unique.append(doc)
+
+     return docs_processed_unique
+
+
+ def construct_vector_db(docs_processed, emb_model):
+     # Alternative backend: langchain_chroma.Chroma with a persist_directory.
+     vdb = FAISS.from_documents(
+         docs_processed, emb_model, distance_strategy=DistanceStrategy.COSINE
+     )
+     return vdb
+
+
+ def get_data_files(location: str = "_data/") -> list:
+     """
+     Returns HTML file paths.
+     """
+     files = glob(location + "*.html")
+     files += glob(location + "*.htm")
+     return files
+
+
+ def generate_and_save_vector_store(vector_store_location: str = "cache_vector_store"):
+     """
+     One-time function to create and save the text and image vector stores.
+     """
+     data_files = get_data_files()
+     TEXT_KBs, IMAGE_KBs = list(), list()
+     for file in data_files:
+         TEXT_KB, IMAGE_KB = read_wiki_html(file)
+         TEXT_KBs.extend(TEXT_KB)
+         IMAGE_KBs.extend(IMAGE_KB)
+
+     docs_text_processed = split_documents(
+         512,  # chunk size adapted to the embedding model
+         TEXT_KBs,
+         tokenizer_name=EMBEDDING_MODEL_NAME,
+     )
+     docs_imgs_processed = split_documents(
+         512,  # chunk size adapted to the embedding model
+         IMAGE_KBs,
+         tokenizer_name=EMBEDDING_MODEL_NAME,
+     )
+
+     emb_model = get_embedding_model()
+
+     vector_store_text = construct_vector_db(docs_text_processed, emb_model)
+     vector_store_images = construct_vector_db(docs_imgs_processed, emb_model)
+
+     vector_store_text.save_local(vector_store_location + "_text")
+     vector_store_images.save_local(vector_store_location + "_images")
+
+
+ def load_vector_store(vector_store_location: str = "cache_vector_store"):
+     '''Returns two vector stores: one for text and another for images.'''
+     emb_model = get_embedding_model()
+
+     vs_text = FAISS.load_local(
+         vector_store_location + "_text", emb_model, allow_dangerous_deserialization=True
+     )
+     vs_image = FAISS.load_local(
+         vector_store_location + "_images", emb_model, allow_dangerous_deserialization=True
+     )
+
+     return vs_text, vs_image
+
+
+ if __name__ == "__main__":
+     # generate_and_save_vector_store()
+     load_vector_store()
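A minimal query sketch against the loaded stores (assumes the `cache_vector_store_*` directories committed above already exist locally; the query string is illustrative):

```python
# Sketch: load the cached FAISS stores and run a text similarity search.
from z_embedding import load_vector_store

vs_text, vs_image = load_vector_store()
hits = vs_text.similarity_search("When did MS Dhoni win the Cricket World Cup?", k=3)
for doc in hits:
    print(doc.page_content[:80])
```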
z_generate.py CHANGED
@@ -2,9 +2,11 @@ from huggingface_hub import InferenceClient
  import os

  class ServerlessInference:
-     def __init__(self):
+     def __init__(self, vector_store_text=None, vector_store_images=None):
          self.model:str = "HuggingFaceH4/zephyr-7b-beta"
          self.client = InferenceClient(api_key=os.getenv("HF_SERVELESS_API"))
+         self.vs_text = vector_store_text
+         self.vs_images = vector_store_images

      def test(self, query:str) -> str:
          '''Responds to query using llm'''
@@ -20,4 +22,9 @@ class ServerlessInference:
              max_tokens=500
          )

-         return completion.choices[0].message.content
+         return completion.choices[0].message.content
+
+     def perform_rag(self, query:str):
+         # First perform a text similarity search over the cached FAISS store
+         relevant_docs = self.vs_text.similarity_search(query=query, k=5)
+         return relevant_docs[0].page_content
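As committed, `perform_rag` returns the top retrieved chunk verbatim. A hedged sketch of the natural next step, feeding the retrieved chunks to the LLM (the method name and prompt are illustrative and not part of this commit; it assumes `InferenceClient.chat_completion`, which matches the `completion.choices[0].message.content` access pattern in `test()`):

```python
# Hypothetical extension (not in this commit): generate an answer from the
# retrieved chunks instead of returning the top chunk verbatim.
def perform_rag_with_generation(self, query: str) -> str:
    relevant_docs = self.vs_text.similarity_search(query=query, k=5)
    context = "\n\n".join(doc.page_content for doc in relevant_docs)
    completion = self.client.chat_completion(
        model=self.model,
        messages=[
            {"role": "system", "content": "Answer using only the provided context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
        ],
        max_tokens=500,
    )
    return completion.choices[0].message.content
```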