Ali Moughnieh committed
Commit 5446629 · 1 Parent(s): 62478f7

initial commit

Files changed (5):
  1. .gitignore +6 -0
  2. 1_curate_data.py +29 -0
  3. 2_ingest.py +68 -0
  4. app.py +88 -0
  5. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
+ data
+ .git
+ .idea
+ __pycache__
+ venv
+ .env
1_curate_data.py ADDED
@@ -0,0 +1,29 @@
+ import os
+ import json
+ from datasets import load_dataset
+
+ full_dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split='train')
+
+ dataset = full_dataset.shuffle(seed=42).select(range(50000))
+
+ script_dir = os.getcwd()
+ data_folder = os.path.join(script_dir, 'data', 'raw_documents')
+
+ if not os.path.exists(data_folder):
+     os.makedirs(data_folder)
+
+ for article in dataset:
+     article_data = {
+         'id': article['id'],
+         'url': article['url'],
+         'title': article['title'],
+         'text': article['text'],
+     }
+     file_path = os.path.join(data_folder, f"{article['id']}.json")
+     if not os.path.exists(file_path):
+         with open(file_path, 'w', encoding='utf-8') as f:
+             print(f.name, 'does not exist. creating file..')
+             json.dump(article_data, f, indent=4)
+
+ if __name__ == '__main__':
+     pass
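
To spot-check the curated output before ingesting it, something like the following minimal sketch can be run from the repository root (not part of the commit; it only assumes the data/raw_documents layout and JSON keys produced by the script above):

import os
import json

# Same layout as 1_curate_data.py: one JSON file per article.
data_folder = os.path.join(os.getcwd(), 'data', 'raw_documents')
files = os.listdir(data_folder)
print(f"{len(files)} article files found")

# Inspect one exported article.
with open(os.path.join(data_folder, files[0]), 'r', encoding='utf-8') as f:
    sample = json.load(f)
print(sample['id'], '|', sample['title'], '|', sample['url'])
print(sample['text'][:200])
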
2_ingest.py ADDED
@@ -0,0 +1,68 @@
+ import os
+ import json
+ from langchain_core.documents import Document
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_chroma import Chroma
+
+ embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ data_folder = os.path.join(script_dir, 'data', 'raw_documents')
+
+ files = os.listdir(data_folder)
+
+ db_path = os.path.join(script_dir, 'data', 'chroma_db')
+
+ if not os.path.exists(db_path):
+     document_to_store = []
+     for file in files:
+         with open(os.path.join(data_folder, file), 'r', encoding='utf-8') as f:
+             json_dict = json.load(f)
+             content = json_dict['text']
+             metadata = {key: value for key, value in json_dict.items() if key != 'text'}
+             document = Document(page_content=content,
+                                 metadata=metadata)
+             document_to_store.append(document)
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     texts = text_splitter.split_documents(document_to_store)
+     min_chunk_size = 50
+     long_texts = [doc for doc in texts if len(doc.page_content) > min_chunk_size]
+     print(f"Original number of chunks: {len(texts)}")
+     print(f"Number of chunks after filtering: {len(long_texts)}")
+
+     # creating vector database using filtered chunks
+     print('Creating the vector database...')
+     db = Chroma.from_documents(long_texts,
+                                embedding_function,
+                                persist_directory=db_path)
+
+     print('Finished creating the vector database.')
+
+ else:
+     print('Vector database already exists. Loading...')
+     db = Chroma(
+         persist_directory=db_path,
+         embedding_function=embedding_function
+     )
+     print('Vector database loaded')
+
+ print("Checking titles in the database...")
+
+ retrieved_items = db.get(
+     limit=1000000,
+     include=['metadatas']
+ )
+
+ unique_titles = set()
+ for metadata in retrieved_items['metadatas']:
+     if 'title' in metadata:
+         unique_titles.add((metadata['title'], metadata['id']))
+
+ print(f"\n--- {len(unique_titles)} Unique Article Titles Found ---")
+ for title in sorted(list(unique_titles)):
+     print(title)
+
+ if __name__ == '__main__':
+     pass
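
After the database has been persisted, a quick similarity search is a useful sanity check on the ingest step. A minimal sketch (not part of the commit; it reuses the db_path and embedding model from the script above, and the query string is only an example):

import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

script_dir = os.path.dirname(os.path.abspath(__file__))
db_path = os.path.join(script_dir, 'data', 'chroma_db')

# Load the persisted collection with the same embedding model used at ingest time.
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma(persist_directory=db_path, embedding_function=embedding_function)

# Example query; scores are distances, so lower means a closer match.
for doc, score in db.similarity_search_with_score("history of the printing press", k=3):
    print(round(score, 3), doc.metadata.get('title'), doc.page_content[:100])
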
app.py ADDED
@@ -0,0 +1,88 @@
+ import streamlit as st
+
+
+ from langchain_chroma import Chroma
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain.chains import create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from dotenv import load_dotenv
+ import os
+
+ load_dotenv()
+
+ st.title("AI-Powered Wikipedia Explorer")
+
+ @st.cache_resource
+ def load_chain():
+     script_dir = os.path.dirname(os.path.abspath(__file__))
+     db_path = os.path.join(script_dir, 'data')
+
+     persist_directory = os.path.join(db_path, 'chroma_db')
+
+     embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+     db = Chroma(
+         persist_directory=persist_directory,
+         embedding_function=embedding_function
+     )
+     print(db._collection.metadata)
+
+     llm = ChatGoogleGenerativeAI(
+         model="gemini-2.5-flash-lite",
+         google_api_key=os.getenv("GOOGLE_API_KEY")
+     )
+
+     template = '''
+     Answer the question based only on the following knowledge base:
+     {context}
+
+     Question: {input}
+
+     Please remember, if the knowledge base does not include relevant information
+     pertaining to the question, do not provide information from your own
+     memory, only provide information from the given knowledge base.
+     '''
+     prompt = ChatPromptTemplate.from_template(template)
+
+     retriever = db.as_retriever(
+         search_type="similarity_score_threshold",
+         search_kwargs={'score_threshold': 0.3,
+                        'k': 6}
+     )
+
+     document_chain = create_stuff_documents_chain(llm, prompt)
+
+     retrieval_chain = create_retrieval_chain(retriever, document_chain)
+
+     return retrieval_chain
+
+ chain = load_chain()
+
+ user_question = st.text_input("Ask a question about the articles:")
+
+ if st.button("Get Answer"):
+     if user_question:
+         with st.spinner("Thinking..."):
+             response = chain.invoke({"input": user_question})
+
+         if not response["context"]:
+             st.header("Answer")
+             st.write("I'm sorry, I couldn't find any relevant information in the documents to answer your question.")
+             with st.expander("Show Sources"):
+                 st.write("Number of documents: 0")
+         else:
+             st.header("Answer")
+             st.write(response["answer"])
+
+             with st.expander("Show Sources"):
+                 for doc in response["context"]:
+                     st.write(f"**Source:** {doc.metadata.get('title', 'Unknown Title')}, **ID:** {doc.metadata.get('id', 'Unknown ID')}")
+                     st.write(f"**URL:** {doc.metadata.get('url', 'No URL')}")
+                     st.write(f"**Content:** {doc.page_content}")
+                     st.write("---")
+                 st.write(f"Number of documents: {len(response['context'])}")
+     else:
+         st.warning("Please enter a question first.")
+
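
The retrieval chain can also be exercised outside the Streamlit UI, which makes it easier to debug retrieval quality. A minimal sketch of that idea (not part of the commit; it rebuilds the chain from app.py with a shortened prompt, assumes data/chroma_db was created by 2_ingest.py and GOOGLE_API_KEY is set in .env, and the question is only an example):

import os
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

load_dotenv()  # expects GOOGLE_API_KEY

script_dir = os.path.dirname(os.path.abspath(__file__))
db = Chroma(
    persist_directory=os.path.join(script_dir, 'data', 'chroma_db'),
    embedding_function=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
)
retriever = db.as_retriever(search_type="similarity_score_threshold",
                            search_kwargs={'score_threshold': 0.3, 'k': 6})

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite",
                             google_api_key=os.getenv("GOOGLE_API_KEY"))
prompt = ChatPromptTemplate.from_template(
    "Answer only from the following knowledge base:\n{context}\n\nQuestion: {input}"
)
chain = create_retrieval_chain(retriever, create_stuff_documents_chain(llm, prompt))

# Example question; prints the generated answer and the source article titles.
response = chain.invoke({"input": "Who was Ada Lovelace?"})
print(response["answer"])
print([doc.metadata.get('title') for doc in response["context"]])
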
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ streamlit
+ datasets
+ langchain
+ langchain-google-genai
+ langchain-chroma
+ langchain-huggingface
+ langchain-text-splitters
+ sentence-transformers
+ python-dotenv
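
With the dependencies installed (pip install -r requirements.txt), the run order follows the file numbering: python 1_curate_data.py downloads and exports the Wikipedia sample, python 2_ingest.py builds the Chroma database under data/chroma_db, and streamlit run app.py starts the explorer. app.py reads the Gemini API key as GOOGLE_API_KEY from a .env file, which the .gitignore above keeps out of the repository.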