Spaces:

Raja4321123
/

AI-Research-Assistant

Build error

App Files Files Community

Raja4321123 committed on Apr 20, 2024

Commit

42fa84c

verified ·

1 Parent(s): 974d2f2

Upload 28 files

Browse files

Files changed (29) hide show

.gitattributes +1 -0
.gitignore +161 -0
Data/peft.pdf +0 -0
Data/reft paper.pdf +3 -0
Dockerfile +25 -0
app.py +72 -0
main.py +5 -0
requirements.txt +13 -0
research/trials.ipynb +0 -0
setup.py +30 -0
src/research_assistant_app.egg-info/PKG-INFO +10 -0
src/research_assistant_app.egg-info/SOURCES.txt +10 -0
src/research_assistant_app.egg-info/dependency_links.txt +1 -0
src/research_assistant_app.egg-info/top_level.txt +1 -0
src/research_assistant_app/__init__.py +18 -0
src/research_assistant_app/__pycache__/__init__.cpython-39.pyc +0 -0
src/research_assistant_app/components/__init__.py +0 -0
src/research_assistant_app/components/__pycache__/__init__.cpython-39.pyc +0 -0
src/research_assistant_app/components/__pycache__/data_indexing.cpython-39.pyc +0 -0
src/research_assistant_app/components/__pycache__/data_ingestion.cpython-39.pyc +0 -0
src/research_assistant_app/components/__pycache__/data_querying.cpython-39.pyc +0 -0
src/research_assistant_app/components/data_indexing.py +94 -0
src/research_assistant_app/components/data_ingestion.py +69 -0
src/research_assistant_app/components/data_querying.py +92 -0
src/research_assistant_app/constants/__init__.py +15 -0
src/research_assistant_app/constants/__pycache__/__init__.cpython-39.pyc +0 -0
src/research_assistant_app/utils/__init__.py +0 -0
src/research_assistant_app/utils/exception.py +23 -0
template.py +35 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Data/reft[[:space:]]paper.pdf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,161 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+# *.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# .github/workflows
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

Data/peft.pdf ADDED Viewed

Binary file (563 kB). View file

Data/reft paper.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3d1dcf3e057506a4c470b7f0d4e3fa53abec21d216f2b3451b7dd736cb61e66
+size 1496447

Dockerfile ADDED Viewed

	@@ -0,0 +1,25 @@

+# Base image: the slim variant of the official Python 3.10 image.
+FROM python:3.10-slim
+# Later relative paths, including the final `streamlit run app.py`, resolve against /app.
+WORKDIR /app
+# Copy the entire build context into the image before dependencies are installed.
+COPY . /app
+# Install CA certificates and netbase, then delete the apt package lists.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        netbase \
+        && rm -rf /var/lib/apt/lists/*
+# requirements.txt ends with `-e .`, so this step also installs the local package via setup.py.
+RUN pip3 install -r requirements.txt
+# Build arguments are re-exported as runtime environment variables.
+# NOTE(review): values set with ENV are stored in the image configuration, so a key passed
+# at build time is presumably readable by anyone who can pull the image; confirm this is intended.
+ARG GEMINI_API_KEY1
+ENV GEMINI_API_KEY=$GEMINI_API_KEY1
+ARG PINECONE_API_KEY1
+ENV PINECONE_API_KEY=$PINECONE_API_KEY1
+# Presumably Streamlit's default port; no port is configured explicitly in this file.
+EXPOSE 8501
+# ENTRYPOINT and CMD combine into `streamlit run app.py`.
+ENTRYPOINT ["streamlit", "run"]
+CMD ["app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import streamlit as st
+from pathlib import Path
+import os
+import google.generativeai as genai
+from research_assistant_app.components.data_ingestion import (
+    get_cleaned_dir_docs,
+    get_cleaned_input_docs,
+)
+from research_assistant_app.components.data_querying import user_query
+from research_assistant_app.components.data_indexing import run_indexing_pipeline
+from dotenv import load_dotenv
+# Load variables from a local .env file (if any) into the process environment.
+load_dotenv()
+# The Dockerfile and research_assistant_app.constants supply the key as GEMINI_API_KEY;
+# GOOGLE_API_KEY is kept as a fallback so existing .env files keep working.
+genai.configure(api_key=os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"))
+st.set_page_config("Chat PDF")
+st.header("Your research assistant here to help💁 (Powered by Gemini)")
+user_question = st.text_input(
+    "Chat with existing Pdfs in Pinecone data base or Your added PDF"
+)
+if user_question:
+    response = user_query(user_question)
+    st.write(response)
+File = st.file_uploader(
+    "Upload Your new PDF file to store in Pinecone DB", type="pdf", key="pdf"
+)
+if File:
+    # Keep only the final path component so an upload name cannot point outside Data/.
+    save_path = Path("Data") / Path(File.name).name
+    # Streamlit re-runs this whole script on every widget interaction. Remember which
+    # upload has been indexed so it is not written and upserted again on each rerun.
+    upload_id = (File.name, File.size)
+    if st.session_state.get("indexed_upload") != upload_id:
+        # Create Data/ on first use instead of failing when the folder is missing.
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+        save_path.write_bytes(File.getvalue())
+        st.success(f"File {File.name} is successfully saved!")
+        res = get_cleaned_input_docs(str(save_path))
+        index_stats = run_indexing_pipeline(res)
+        if index_stats is not None:
+            st.success(f"File {File.name} is successfully upserted in Pinecone DB!")
+        st.session_state["indexed_upload"] = upload_id
+    user_question_pdf = st.text_input("Ask a Question from the PDF File")
+    if user_question_pdf:
+        response = user_query(user_question_pdf)
+        st.write(response)

main.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from research_assistant_app.components.data_querying import user_query
+# Smoke test: send one fixed query through the retrieval pipeline and print the answer.
+# Guarded so that merely importing this module does not trigger the query.
+if __name__ == "__main__":
+    ans = user_query("generate a summary based on the information you have")
+    print(ans)

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+streamlit
+google-generativeai
+python-dotenv
+langchain
+langchain_google_genai
+llama-index>=0.9.31
+pinecone-client>=3.0.0
+regex
+llama-index-llms-gemini
+IPython
+llama-index-embeddings-gemini
+llama-index-vector-stores-pinecone
+-e .

research/trials.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

setup.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import setuptools
+# The README doubles as the long description of the package.
+with open("README.md", "r", encoding="utf-8") as f:
+    long_description = f.read()
+__version__ = "0.0.0.1"
+REPO_NAME = "AI_reasearch_assistant"
+AUTHOR_USER_NAME = "Rajarshi12321"
+SRC_REPO = "research_assistant_app"
+AUTHOR_EMAIL = "[email protected]"
+setuptools.setup(
+    name=SRC_REPO,
+    version=__version__,
+    author=AUTHOR_USER_NAME,
+    author_email=AUTHOR_EMAIL,
+    description="A small python package for the AI research assistant app",
+    long_description=long_description,
+    # The setuptools keyword is `long_description_content_type`; the shorter spelling
+    # used before is not a recognised option, so the Markdown content type was never applied.
+    long_description_content_type="text/markdown",
+    url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}",
+    project_urls={
+        "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues",
+    },
+    # Packages live under src/ (src layout).
+    package_dir={"": "src"},
+    packages=setuptools.find_packages(where="src"),
+)

src/research_assistant_app.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,10 @@

+Metadata-Version: 2.1
+Name: research-assistant-app
+Version: 0.0.0.1
+Summary: A small python package for sentiment analysis app
+Home-page: https://github.com/Rajarshi12321/AI_reasearch_assistant
+Author: Rajarshi12321
+Author-email: [email protected]
+Project-URL: Bug Tracker, https://github.com/Rajarshi12321/AI_reasearch_assistant/issues
+"# AI_reasearch_assistant"

src/research_assistant_app.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+README.md
+setup.py
+src/research_assistant_app/__init__.py
+src/research_assistant_app.egg-info/PKG-INFO
+src/research_assistant_app.egg-info/SOURCES.txt
+src/research_assistant_app.egg-info/dependency_links.txt
+src/research_assistant_app.egg-info/top_level.txt
+src/research_assistant_app/components/__init__.py
+src/research_assistant_app/utils/__init__.py
+src/research_assistant_app/utils/exception.py

src/research_assistant_app.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

src/research_assistant_app.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ research_assistant_app

src/research_assistant_app/__init__.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import os
+import sys
+import logging
+# Log files are written to ./logs, which is created on import if it is missing.
+log_dir = "logs"
+os.makedirs(log_dir, exist_ok=True)
+log_filepath = os.path.join(log_dir, "running_logs.log")
+# Record layout: timestamp, level, originating module, message.
+logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
+# Send every INFO-or-higher record both to the log file and to stdout.
+logging.basicConfig(
+    handlers=[logging.FileHandler(log_filepath), logging.StreamHandler(sys.stdout)],
+    format=logging_str,
+    level=logging.INFO,
+)
+# Named logger for the package.
+logger = logging.getLogger("research_assistantLogger")

src/research_assistant_app/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (590 Bytes). View file

src/research_assistant_app/components/__init__.py ADDED Viewed

File without changes

src/research_assistant_app/components/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (174 Bytes). View file

src/research_assistant_app/components/__pycache__/data_indexing.cpython-39.pyc ADDED Viewed

Binary file (2.03 kB). View file

src/research_assistant_app/components/__pycache__/data_ingestion.cpython-39.pyc ADDED Viewed

Binary file (1.52 kB). View file

src/research_assistant_app/components/__pycache__/data_querying.cpython-39.pyc ADDED Viewed

Binary file (2.43 kB). View file

src/research_assistant_app/components/data_indexing.py ADDED Viewed

	@@ -0,0 +1,94 @@

+from llama_index.core import StorageContext
+from llama_index.embeddings.gemini import GeminiEmbedding
+import google.generativeai as genai
+from llama_index.vector_stores.pinecone import PineconeVectorStore
+from pinecone import Pinecone
+from llama_index.core.node_parser import SemanticSplitterNodeParser
+from llama_index.core.ingestion import IngestionPipeline
+from research_assistant_app.constants import gemini_api_key, pinecone_api_key
+from research_assistant_app.components.data_ingestion import get_cleaned_dir_docs
+from research_assistant_app.constants import gemini_api_key, pinecone_api_key
+from llama_index.embeddings.gemini import GeminiEmbedding
+from llama_index.llms.gemini import Gemini
+import google.generativeai as genai
+from llama_index.core import Settings
+from llama_index.core.node_parser import SentenceSplitter
+# Configure the google-generativeai SDK with the project's Gemini key at import time.
+genai.configure(api_key=gemini_api_key)
+# `model_name` is the parameter that selects the model; the `models=` keyword used
+# previously is not a parameter of the Gemini wrapper.
+model = Gemini(model_name="models/gemini-pro", api_key=gemini_api_key, temperature=0.3)
+# Pass the key explicitly so the embedding client does not depend on a separate
+# GOOGLE_API_KEY variable being present in the environment.
+gemini_embed_model = GeminiEmbedding(
+    model_name="models/embedding-001", api_key=gemini_api_key
+)
+embed_model = gemini_embed_model
+# Global llama-index defaults used by any component that is not given its own.
+Settings.llm = model
+Settings.embed_model = gemini_embed_model
+Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
+Settings.num_output = 512
+Settings.context_window = 3900
+pc = Pinecone(api_key=pinecone_api_key)
+pinecone_index = pc.Index(
+    "ai-research-assistant"
+)  # `ai-research-assistant` is the index name
+vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+# Ingestion pipeline: split documents semantically, embed the resulting nodes, and
+# write them to the Pinecone-backed vector store. The earlier store-less pipeline
+# was immediately overwritten by this one, so only this definition is kept.
+pipeline = IngestionPipeline(
+    transformations=[
+        SemanticSplitterNodeParser(
+            buffer_size=1,
+            breakpoint_percentile_threshold=95,
+            embed_model=embed_model,
+        ),
+        embed_model,
+    ],
+    vector_store=vector_store,
+)
+# Now we run our pipeline!
+def run_indexing_pipeline(docs):
+    """
+    Run the module-level ingestion pipeline on ``docs`` and report the index statistics.
+    :param docs: Documents passed to ``pipeline.run`` as its ``documents`` argument.
+    :return: The value returned by ``pinecone_index.describe_index_stats()`` after the run.
+    """
+    # Re-apply the Gemini key immediately before the pipeline runs.
+    genai.configure(api_key=gemini_api_key)
+    pipeline.run(documents=docs)
+    stats = pinecone_index.describe_index_stats()
+    return stats
+# >>> {'dimension': 1536,
+# >>> 'index_fullness': 0.0,
+# >>> 'namespaces': {'': {'vector_count': 46}},
+# >>> 'total_vector_count': 46}
+if __name__ == "__main__":
+    # Manual run: index only the first three cleaned documents found under Data/.
+    sample_docs = get_cleaned_dir_docs("Data")[:3]
+    print(run_indexing_pipeline(sample_docs), "pincone index")

src/research_assistant_app/components/data_ingestion.py ADDED Viewed

	@@ -0,0 +1,69 @@

+from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
+import re
+def clean_up_text(content: str) -> str:
+    """
+    Strip extraction artefacts from a piece of text and normalise its spacing.
+    :param content: Text input.
+    :return: The text with the artefact patterns removed, hyphen spacing tightened and
+        every run of whitespace collapsed to a single space.
+    """
+    # Rejoin words that were split by a hyphen at a line break.
+    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", content)
+    # Each entry is applied as a regular expression, in this order, and its matches deleted.
+    noise = (
+        "\\n",
+        "  —",
+        "——————————",
+        "—————————",
+        "—————",
+        r"\\u[\dA-Fa-f]{4}",
+        r"\uf075",
+        r"\uf0b7",
+    )
+    for regex in noise:
+        text = re.sub(regex, "", text)
+    # Remove any whitespace around a hyphen that sits between two word characters.
+    text = re.sub(r"(\w)\s*-\s*(\w)", r"\1-\2", text)
+    return re.sub(r"\s+", " ", text)
+def get_cleaned_dir_docs(pdf_file_dir):
+    """
+    Load every document under a directory and clean each document's text.
+    :param pdf_file_dir: Directory path handed to ``SimpleDirectoryReader``.
+    :return: List of the loaded documents, each with its ``text`` replaced by the
+        output of ``clean_up_text``.
+    """
+    print(pdf_file_dir)
+    reader = SimpleDirectoryReader(pdf_file_dir)
+    result = []
+    for doc in reader.load_data():
+        # The document is updated in place and then collected.
+        doc.text = clean_up_text(doc.text)
+        result.append(doc)
+    return result
+def get_cleaned_input_docs(pdf_file):
+    """
+    Load the documents of a single file and clean each document's text.
+    :param pdf_file: Path of the file, passed to ``SimpleDirectoryReader`` via ``input_files``.
+    :return: List of the loaded documents, each with its ``text`` replaced by the
+        output of ``clean_up_text``.
+    """
+    reader = SimpleDirectoryReader(input_files=[pdf_file])
+    result = []
+    for doc in reader.load_data():
+        # The document is updated in place and then collected.
+        doc.text = clean_up_text(doc.text)
+        result.append(doc)
+    return result
+if __name__ == "__main__":
+    # Manual check: load and clean every document under a local folder.
+    # Raw string so the backslashes of the Windows path are never read as escape sequences.
+    docs = get_cleaned_dir_docs(r"E:\projects\AI research assistant\Data")
+    print(docs)

src/research_assistant_app/components/data_querying.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from llama_index.core import VectorStoreIndex
+from llama_index.vector_stores.pinecone import PineconeVectorStore
+from pinecone import Pinecone
+from research_assistant_app.constants import gemini_api_key, pinecone_api_key
+import google.generativeai as genai
+# Pinecone client and index handle, created once when this module is imported.
+pc = Pinecone(api_key=pinecone_api_key)
+pinecone_index = pc.Index(
+    "ai-research-assistant"
+)  # `ai-research-assistant` is the index name
+# llama-index vector-store wrapper around that Pinecone index; user_query passes it on.
+vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
+from llama_index.core.retrievers import VectorIndexRetriever
+from llama_index.core.query_engine import RetrieverQueryEngine
+from llama_index.core import PromptTemplate
+def get_vector_retriever(Pinecone_vector_store):
+    """
+    Build a query engine and a vector index on top of an existing vector store.
+    :param Pinecone_vector_store: Vector store passed to ``VectorStoreIndex.from_vector_store``.
+    :return: Tuple ``(query_engine, vector_index)``; the engine uses a retriever
+        configured with ``similarity_top_k=5``.
+    """
+    index = VectorStoreIndex.from_vector_store(vector_store=Pinecone_vector_store)
+    print(index, "check indexes")
+    # The retriever is limited to the five most similar results.
+    top_five = VectorIndexRetriever(index=index, similarity_top_k=5)
+    return RetrieverQueryEngine(retriever=top_five), index
+def get_full_prompt_template(cur_instr: str, prompt_tmpl):
+    """
+    Prefix an existing prompt template with an instruction block.
+    :param cur_instr: Instruction text placed before the template, separated by a newline.
+    :param prompt_tmpl: Object whose ``get_template()`` returns the base template string.
+    :return: A new ``PromptTemplate`` built from the combined string.
+    """
+    return PromptTemplate(cur_instr + "\n" + prompt_tmpl.get_template())
+def proper_prompting(my_query_enginge, my_vector_index):
+    """
+    Create a query engine whose QA prompt is prefixed with the research-assistant instructions.
+    :param my_query_enginge: Query engine whose ``get_prompts()`` supplies the base QA prompt.
+    :param my_vector_index: Index used to build the returned engine via ``as_query_engine``.
+    :return: Query engine that uses the combined prompt as its ``text_qa_template``.
+    """
+    QA_PROMPT_KEY = "response_synthesizer:text_qa_template"
+    # Base QA prompt, without any instruction prefix.
+    base_qa_prompt = my_query_enginge.get_prompts()[QA_PROMPT_KEY]
+    # Instruction prefix; the wording typos of the earlier version are corrected here.
+    initial_instr = """\
+    You are a QA assistant specifically designed to help in research work as a research assistant.
+    ---------------------
+    Context information is below. Given the context information and not prior knowledge, \
+    "{context_str}\n"
+    ---------------------
+    answer the query. \
+    It is very important that If the context is not relevant,
+    please answer the question by using your own knowledge about the topic
+    """
+    # Prefix the instructions to the base prompt and use the result when querying.
+    old_qa_prompt = get_full_prompt_template(initial_instr, base_qa_prompt)
+    query_engine = my_vector_index.as_query_engine(text_qa_template=old_qa_prompt)
+    return query_engine
+## This will be the main function that we would call for querying
+def user_query(qus):
+    """
+    Answer a question against the module-level Pinecone vector store.
+    :param qus: Question passed to the query engine's ``query`` method.
+    :return: The ``response`` attribute of the query result.
+    """
+    # Apply the Gemini key before anything calls the model.
+    genai.configure(api_key=gemini_api_key)
+    base_engine, index = get_vector_retriever(vector_store)
+    answer = proper_prompting(base_engine, index).query(qus)
+    return answer.response

src/research_assistant_app/constants/__init__.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# loading secret key
+import os
+from dotenv import load_dotenv
+from llama_index.core import VectorStoreIndex
+from llama_index.core import ServiceContext
+from llama_index.core import StorageContext, load_index_from_storage
+from llama_index.embeddings.gemini import GeminiEmbedding
+from llama_index.llms.gemini import Gemini
+import google.generativeai as genai
+# Populate the process environment from a .env file, if one is found.
+load_dotenv()
+# Each value is None when the corresponding environment variable is not set.
+gemini_api_key = os.getenv("GEMINI_API_KEY")
+pinecone_api_key = os.getenv("PINECONE_API_KEY")

src/research_assistant_app/constants/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (681 Bytes). View file

src/research_assistant_app/utils/__init__.py ADDED Viewed

File without changes

src/research_assistant_app/utils/exception.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import sys
+from research_assistant_app import logging
+def error_message_detail(error, error_detail: sys):
+    """
+    Build a message naming the file and line recorded in the active exception's traceback.
+    :param error: The error (or message); its ``str()`` is embedded in the result.
+    :param error_detail: The ``sys`` module, or any object exposing ``exc_info()``.
+    :return: Formatted message string. When no exception is being handled, the file
+        name and line number are reported as ``<unknown>`` instead of raising.
+    """
+    _, _, exc_tb = error_detail.exc_info()
+    if exc_tb is None:
+        # Called outside an ``except`` block: there is no traceback to inspect.
+        file_name = "<unknown>"
+        line_number = "<unknown>"
+    else:
+        file_name = exc_tb.tb_frame.f_code.co_filename
+        line_number = exc_tb.tb_lineno
+    error_message = "Error ocurred in python script name [{0}] line number [{1}] error message [{2}]".format(
+        file_name, line_number, str(error)
+    )
+    return error_message
+class CustomException(Exception):
+    """Exception whose string form is the detailed message built by ``error_message_detail``."""
+    def __init__(self, error_message, error_detail: sys):
+        """
+        :param error_message: Original error or message; also passed to ``Exception.__init__``.
+        :param error_detail: Forwarded to ``error_message_detail`` as ``error_detail``.
+        """
+        super().__init__(error_message)
+        detailed = error_message_detail(error_message, error_detail=error_detail)
+        self.error_message = detailed
+    def __str__(self):
+        # Show the detailed message rather than the raw one.
+        return self.error_message

template.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import os
+from pathlib import Path
+import logging
+# logging string
+# Timestamped INFO logging for the scaffold run.
+logging.basicConfig(level=logging.INFO, format="[%(asctime)s]: %(message)s:")
+project_name = "research_assistant_app"
+# Files the project skeleton must contain; parent folders are created as needed.
+list_of_files = [
+    ".github/workflows/.gitkeep",
+    f"src/{project_name}/__init__.py",
+    f"src/{project_name}/components/__init__.py",
+    f"src/{project_name}/utils/__init__.py",
+    "requirements.txt",
+    "setup.py",
+    "research/trials.ipynb",
+]
+for entry in list_of_files:
+    target = Path(entry)
+    filedir, filename = os.path.split(target)
+    if filedir:
+        os.makedirs(filedir, exist_ok=True)
+        logging.info(f"Creating directory; {filedir} for the file: {filename}")
+    # A file that exists and has content is left untouched.
+    if os.path.exists(target) and os.path.getsize(target) != 0:
+        logging.info(f"{filename} is already exists")
+        continue
+    # Opening in "w" mode creates the file, or truncates an existing empty one.
+    with open(target, "w"):
+        logging.info(f"Creating empty file: {target}")