Deepak Sahu committed
Commit 2fe32bb · 1 Parent(s): 694021b

adding vector store

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ cache_vector_store_text/** filter=lfs diff=lfs merge=lfs -text
+ cache_vector_store_images/** filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,6 +1,7 @@
  import numpy as np
  import gradio as gr
  from z_generate import ServerlessInference
+ from z_embedding import load_vector_store

  # STATIC TEXT DISPLAY
  TXT_APP_DESCRIPTION = '''
@@ -12,17 +13,23 @@ Manually Downloaded as HTML files:
  1. https://en.wikipedia.org/wiki/MS_Dhoni
  2. https://en.wikipedia.org/wiki/Jharkhand
  3. https://en.wikipedia.org/wiki/Cricket_World_Cup
+
+ ## Details
+
+ 1. The vector store is built with FAISS before this app starts. Although the store is only a few KB on disk, creating and loading it takes ~10 GB of RAM and about 5 minutes, hence it is **NOT BUILT DURING THE RUNTIME OF THE APP**.
+
  '''


  # UI Interface
  demo = gr.Blocks()
-
- llm = ServerlessInference()
+ vector_text, vector_image = load_vector_store()
+ llm = ServerlessInference(vector_store_text=vector_text, vector_store_images=vector_image)

  # Processing Functions
  def update_response(query:str = "something"):
-     return llm.test(query)
+     response_text = llm.perform_rag(query)
+     return response_text

  def update_gallery(text:str = "hell"):
      imgs = [
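As the `## Details` note above says, the FAISS stores are built offline before the app starts. A minimal sketch of that one-time step, using `generate_and_save_vector_store` from `z_embedding.py` (added later in this commit):

```python
# One-time offline step, not run inside the app: build and cache the FAISS
# stores that load_vector_store() reads at startup (~10 GB RAM, ~5 min).
from z_embedding import generate_and_save_vector_store

if __name__ == "__main__":
    # Writes cache_vector_store_text/ and cache_vector_store_images/,
    # which .gitattributes routes through Git LFS.
    generate_and_save_vector_store()
```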
cache_vector_store_images/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe6d0d8806548cb057ca3dce003bed7827a90fc6cf3ca6792c09601498a716e9
+ size 49197

cache_vector_store_images/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a29bba3d9f601e708d17266e7f77791dcff9f94de806dd265aa1ae8fb7da0a6
+ size 8187

cache_vector_store_text/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07b4ebc87655d64c3e18879cf33630be391a239771b1ac074e204ee2c07c56a1
+ size 454701

cache_vector_store_text/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3cb504a190a91ca3dd8a62d4ec7a9f0362e59c6541fc1c1b0207c49287fb2b6a
+ size 315789
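The four `ADDED` files above are Git LFS pointer files, matched by the `cache_vector_store_text/**` and `cache_vector_store_images/**` patterns added to `.gitattributes`: only the `version`/`oid`/`size` stanza lives in the repo, while the FAISS blobs themselves sit in LFS storage. A small illustrative parser for this format (the helper name is hypothetical, not part of the repo):

```python
# Illustrative helper (not in this commit): parse a Git LFS pointer file
# into its spec version URL, sha256 oid, and blob size in bytes.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# e.g. parse_lfs_pointer("cache_vector_store_text/index.faiss")
# -> {"version": "https://git-lfs.github.com/spec/v1",
#     "oid": "sha256:07b4...", "size": "454701"}
```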
requriements.txt CHANGED
@@ -51,3 +51,10 @@ tzdata==2025.1
  urllib3==2.3.0
  uvicorn==0.34.0
  websockets==14.2
+ langchain
+ pillow
+ transformers
+ langchain-community
+ sentence-transformers
+ faiss-cpu
+ beautifulsoup4
z_document_reader.py ADDED
@@ -0,0 +1,56 @@
+ # First creating Document reader
+
+ from typing import List, Tuple
+
+ import re
+
+ from bs4 import BeautifulSoup
+ from langchain.docstore.document import Document as LangchainDocument
+
+
+ def read_wiki_html(filename: str) -> Tuple[List[LangchainDocument], List[LangchainDocument]]:
+     """
+     Reads an HTML file, extracts the text content of the <body> tag,
+     finds all <figure> tags with their hrefs and <figcaption>s,
+     and returns both as lists of LangChain documents.
+
+     Args:
+         filename (str): The path to the HTML file.
+
+     Returns:
+         TEXT_KB: list of text documents extracted from the HTML
+         FIG_KB: list of figure-caption documents extracted from the HTML
+     """
+     with open(filename, 'r', encoding='utf-8') as file:
+         content = file.read()
+
+     # Parse the HTML content
+     soup = BeautifulSoup(content, 'html.parser')
+
+     # Focus only on the <body> tag
+     body = soup.body
+     if body is None:
+         # Callers unpack a (TEXT_KB, FIG_KB) tuple, so raise instead of returning an error string
+         raise ValueError(f"No <body> tag found in '{filename}'.")
+
+     # Collapse runs of newlines in the extracted body text
+     body_text = re.sub(r'\n+', '\n', body.get_text(separator="\n").strip())
+
+     TEXT_KB = [
+         LangchainDocument(page_content=body_text)
+     ]
+
+     # Extract all <figure> tags with their href and figcaption
+     FIG_KB = []
+     for figure in body.find_all('figure'):
+         href = figure.find('a').get('href', 'No href') if figure.find('a') else 'No href'
+         figcaption = figure.find('figcaption').get_text(strip=True) if figure.find('figcaption') else 'No figcaption'
+         FIG_KB.append(
+             LangchainDocument(page_content=figcaption, metadata={"url": href})
+         )
+
+     return (TEXT_KB, FIG_KB)
+
+
+ if __name__ == "__main__":
+     contents = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")
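A quick usage sketch for the reader above (a minimal example, assuming the same `_data/` HTML file referenced in the `__main__` block):

```python
# Usage sketch: unpack the two knowledge bases returned by read_wiki_html.
from z_document_reader import read_wiki_html

text_kb, fig_kb = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")
print(f"{len(text_kb)} text document(s), {len(fig_kb)} figure caption(s)")
for fig in fig_kb[:3]:
    # Each figure document carries its image URL in metadata
    print(fig.metadata["url"], "->", fig.page_content[:60])
```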
z_embedding.py ADDED
@@ -0,0 +1,121 @@
+ from typing import List, Optional
+ from glob import glob
+
+ from transformers import AutoTokenizer
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.docstore.document import Document as LangchainDocument
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores.utils import DistanceStrategy
+
+ from z_document_reader import read_wiki_html
+
+ EMBEDDING_MODEL_NAME = "thenlper/gte-small"
+
+
+ def get_embedding_model():
+     embedding_model = HuggingFaceEmbeddings(
+         model_name=EMBEDDING_MODEL_NAME,
+         multi_process=True,
+         model_kwargs={"device": "cpu"},
+         encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
+     )
+     return embedding_model
+
+
+ def split_documents(
+     chunk_size: int,
+     knowledge_base: List[LangchainDocument],
+     tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
+ ) -> List[LangchainDocument]:
+     """
+     Split documents into chunks of at most `chunk_size` tokens and return the list of chunks.
+     """
+     text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+         AutoTokenizer.from_pretrained(tokenizer_name),
+         chunk_size=chunk_size,
+         chunk_overlap=int(chunk_size / 10),
+         add_start_index=True,
+         strip_whitespace=True,
+     )
+
+     docs_processed = []
+     for doc in knowledge_base:
+         docs_processed += text_splitter.split_documents([doc])
+
+     # Remove duplicate chunks
+     unique_texts = {}
+     docs_processed_unique = []
+     for doc in docs_processed:
+         if doc.page_content not in unique_texts:
+             unique_texts[doc.page_content] = True
+             docs_processed_unique.append(doc)
+
+     return docs_processed_unique
+
+
+ def construct_vector_db(docs_processed, emb_model):
+     # Alternative backend: langchain_chroma.Chroma with a persist_directory.
+     vdb = FAISS.from_documents(
+         docs_processed, emb_model, distance_strategy=DistanceStrategy.COSINE
+     )
+     return vdb
+
+
+ def get_data_files(location: str = "_data/") -> list:
+     """
+     Returns HTML file paths.
+     """
+     files = glob(location + "*.html")
+     files += glob(location + "*.htm")
+     return files
+
+
+ def generate_and_save_vector_store(vector_store_location: str = "cache_vector_store"):
+     """
+     One-time function to create and save the text and image vector stores.
+     """
+     data_files = get_data_files()
+     TEXT_KBs, IMAGE_KBs = list(), list()
+     for file in data_files:
+         TEXT_KB, IMAGE_KB = read_wiki_html(file)
+         TEXT_KBs.extend(TEXT_KB)
+         IMAGE_KBs.extend(IMAGE_KB)
+
+     docs_text_processed = split_documents(
+         512,  # chunk size adapted to the embedding model
+         TEXT_KBs,
+         tokenizer_name=EMBEDDING_MODEL_NAME,
+     )
+     docs_imgs_processed = split_documents(
+         512,  # chunk size adapted to the embedding model
+         IMAGE_KBs,
+         tokenizer_name=EMBEDDING_MODEL_NAME,
+     )
+
+     emb_model = get_embedding_model()
+
+     vector_store_text = construct_vector_db(docs_text_processed, emb_model)
+     vector_store_images = construct_vector_db(docs_imgs_processed, emb_model)
+
+     vector_store_text.save_local(vector_store_location + "_text")
+     vector_store_images.save_local(vector_store_location + "_images")
+
+
+ def load_vector_store(vector_store_location: str = "cache_vector_store"):
+     '''Returns two vector stores: one for text and another for images.'''
+     emb_model = get_embedding_model()
+
+     vs_text = FAISS.load_local(
+         vector_store_location + "_text", emb_model, allow_dangerous_deserialization=True
+     )
+     vs_image = FAISS.load_local(
+         vector_store_location + "_images", emb_model, allow_dangerous_deserialization=True
+     )
+
+     return vs_text, vs_image
+
+
+ if __name__ == "__main__":
+     # generate_and_save_vector_store()
+     load_vector_store()
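A minimal query sketch against the loaded stores (assumes the `cache_vector_store_*` directories committed above already exist locally; the query string is illustrative):

```python
# Sketch: load the cached FAISS stores and run a text similarity search.
from z_embedding import load_vector_store

vs_text, vs_image = load_vector_store()
hits = vs_text.similarity_search("When did MS Dhoni win the Cricket World Cup?", k=3)
for doc in hits:
    print(doc.page_content[:80])
```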
z_generate.py CHANGED
@@ -2,9 +2,11 @@ from huggingface_hub import InferenceClient
  import os

  class ServerlessInference:
-     def __init__(self):
+     def __init__(self, vector_store_text=None, vector_store_images=None):
          self.model:str = "HuggingFaceH4/zephyr-7b-beta"
          self.client = InferenceClient(api_key=os.getenv("HF_SERVELESS_API"))
+         self.vs_text = vector_store_text
+         self.vs_images = vector_store_images

      def test(self, query:str) -> str:
          '''Responds to query using llm'''
@@ -20,4 +22,9 @@ class ServerlessInference:
              max_tokens=500
          )

-         return completion.choices[0].message.content
+         return completion.choices[0].message.content
+
+     def perform_rag(self, query:str):
+         # First perform a text similarity search over the cached FAISS store
+         relevant_docs = self.vs_text.similarity_search(query=query, k=5)
+         return relevant_docs[0].page_content
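As committed, `perform_rag` returns the top retrieved chunk verbatim. A hedged sketch of the natural next step, feeding the retrieved chunks to the LLM (the method name and prompt are illustrative and not part of this commit; it assumes `InferenceClient.chat_completion`, which matches the `completion.choices[0].message.content` access pattern in `test()`):

```python
# Hypothetical extension (not in this commit): generate an answer from the
# retrieved chunks instead of returning the top chunk verbatim.
def perform_rag_with_generation(self, query: str) -> str:
    relevant_docs = self.vs_text.similarity_search(query=query, k=5)
    context = "\n\n".join(doc.page_content for doc in relevant_docs)
    completion = self.client.chat_completion(
        model=self.model,
        messages=[
            {"role": "system", "content": "Answer using only the provided context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
        ],
        max_tokens=500,
    )
    return completion.choices[0].message.content
```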