Commit d60948f · First commit
Parent(s): none (initial commit)

Files changed:
- README.md (+9, -0)
- app.py (+111, -0)
- cfg.py (+130, -0)
- generate_embeddings.py (+87, -0)
- requirements.txt (+3, -0)
    	
README.md ADDED
@@ -0,0 +1,9 @@
+---
+title: Buster
+emoji: 🤖
+colorFrom: red
+colorTo: blue
+sdk: gradio
+app_file: app.py
+python_version: 3.10.8
+---
    	
app.py ADDED
@@ -0,0 +1,111 @@
+import cfg
+import gradio as gr
+import pandas as pd
+from cfg import setup_buster
+
+buster = setup_buster(cfg.buster_cfg)
+
+
+def format_sources(matched_documents: pd.DataFrame) -> str:
+    if len(matched_documents) == 0:
+        return ""
+
+    matched_documents.similarity_to_answer = (
+        matched_documents.similarity_to_answer * 100
+    )
+
+    # Print the page instead of the heading; more meaningful for hf docs
+    matched_documents["page"] = matched_documents.apply(
+        lambda x: x.url.split("/")[-1], axis=1
+    )
+
+    documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
+    document_template: str = "[🔗 {document.page}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %"
+
+    documents = "\n".join(
+        [
+            document_template.format(document=document)
+            for _, document in matched_documents.iterrows()
+        ]
+    )
+    footnote: str = "I'm a bot 🤖 and not always perfect."
+
+    return documents_answer_template.format(documents=documents, footnote=footnote)
+
+
+def add_sources(history, completion):
+    if completion.answer_relevant:
+        formatted_sources = format_sources(completion.matched_documents)
+        history.append([None, formatted_sources])
+
+    return history
+
+
+def user(user_input, history):
+    """Adds the user's question immediately to the chat."""
+    return "", history + [[user_input, None]]
+
+
+def chat(history):
+    user_input = history[-1][0]
+
+    completion = buster.process_input(user_input)
+
+    history[-1][1] = ""
+
+    for token in completion.answer_generator:
+        history[-1][1] += token
+
+        yield history, completion
+
+
+block = gr.Blocks()
+
+with block:
+    gr.Markdown(
+        """<h1><center>Buster 🤖: A Question-Answering Bot for your documentation</center></h1>"""
+    )
+    gr.Markdown(
+        """
+    #### This chatbot is designed to answer any questions related to the [huggingface transformers](https://huggingface.co/docs/transformers/index) library.
+    #### It uses ChatGPT + embeddings to search the docs for relevant sections and uses them to answer questions. It can then cite its sources back to you so you can verify the information.
+    #### Note that LLMs are prone to hallucination, so all outputs should always be vetted by users.
+
+    #### The code is open-sourced and available on [GitHub](https://www.github.com/jerpint/buster).
+    """
+    )
+
+    chatbot = gr.Chatbot()
+
+    with gr.Row():
+        with gr.Column(scale=4):
+            question = gr.Textbox(
+                label="What's your question?",
+                placeholder="Ask a question to AI stackoverflow here...",
+                lines=1,
+            )
+        submit = gr.Button(value="Send", variant="secondary")
+
+    examples = gr.Examples(
+        examples=[
+            "What kind of models should I use for images and text?",
+            "When should I finetune a model vs. training it from scratch?",
+            "Can you give me some python code to quickly finetune a model on my sentiment analysis dataset?",
+        ],
+        inputs=question,
+    )
+
+    gr.HTML("<center> Created with ❤️ by @jerpint and @hadrienbertrand.")
+
+    response = gr.State()
+
+    submit.click(user, [question, chatbot], [question, chatbot], queue=False).then(
+        chat, inputs=[chatbot], outputs=[chatbot, response]
+    ).then(add_sources, inputs=[chatbot, response], outputs=[chatbot])
+    question.submit(user, [question, chatbot], [question, chatbot], queue=False).then(
+        chat, inputs=[chatbot], outputs=[chatbot, response]
+    ).then(add_sources, inputs=[chatbot, response], outputs=[chatbot])
+
+
+block.queue(concurrency_count=16)
+block.launch(debug=True, share=False)
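
The app only ever touches Buster through `process_input` and the attributes of the returned completion (`answer_generator`, `answer_relevant`, `matched_documents`); Gradio merely chains `user` → `chat` → `add_sources`. Below is a minimal command-line sketch of that same flow using only objects defined in this commit. The question string and the printing are illustrative, importing `cfg` triggers the same dataset download `app.py` relies on, and an OpenAI API key is assumed to be configured for the ChatGPT completer.

    # Sketch only: the same question-answering flow as app.py, without the Gradio UI.
    import cfg
    from cfg import setup_buster

    buster = setup_buster(cfg.buster_cfg)

    question = "How do I load a pretrained model?"  # illustrative question
    completion = buster.process_input(question)

    # Stream the answer token by token, as chat() does for the Chatbot widget.
    for token in completion.answer_generator:
        print(token, end="", flush=True)
    print()

    # Mirror add_sources(): only surface sources when the validator deemed the answer relevant.
    if completion.answer_relevant:
        for _, doc in completion.matched_documents.iterrows():
            print(f"- {doc.url} (similarity: {doc.similarity_to_answer:.2f})")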
    	
cfg.py ADDED
@@ -0,0 +1,130 @@
+from buster.busterbot import Buster, BusterConfig
+from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
+from buster.formatters.documents import DocumentsFormatter
+from buster.formatters.prompts import PromptFormatter
+from buster.retriever import DeepLakeRetriever, Retriever
+from buster.tokenizers import GPTTokenizer
+from buster.validators import QuestionAnswerValidator, Validator
+from buster.utils import extract_zip
+
+from huggingface_hub import hf_hub_download
+
+
+HUB_DB_FILE = "deeplake_store.zip"
+REPO_ID = "jerpint/hf_buster_data"
+
+hf_hub_download(
+    repo_id=REPO_ID,
+    repo_type="dataset",
+    filename=HUB_DB_FILE,
+    local_dir=".",
+)
+
+extract_zip(zip_file_path=HUB_DB_FILE, output_path=".")
+
+
+buster_cfg = BusterConfig(
+    validator_cfg={
+        "unknown_response_templates": [
+            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+        ],
+        "unknown_threshold": 0.85,
+        "embedding_model": "text-embedding-ada-002",
+        "use_reranking": True,
+        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
+        "check_question_prompt": """You are a chatbot answering technical questions on the huggingface documentation, a library used to train and do inference on open-source artificial intelligence models.
+
+Your job is to determine whether or not a question is valid, and should be answered.
+More general questions are not considered valid, even if you might know the response.
+Questions that are likely to be related to the huggingface library are considered valid.
+A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
+
+For example:
+
+Q: How can I train a vision model?
+true
+
+Q: What is the meaning of life?
+false
+
+A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+            "stream": False,
+            "temperature": 0,
+        },
+    },
+    retriever_cfg={
+        "path": "deeplake_store",
+        "top_k": 3,
+        "thresh": 0.7,
+        "max_tokens": 2000,
+        "embedding_model": "text-embedding-ada-002",
+    },
+    documents_answerer_cfg={
+        "no_documents_message": "No documents are available for this question.",
+    },
+    completion_cfg={
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+            "stream": True,
+            "temperature": 0,
+        },
+    },
+    tokenizer_cfg={
+        "model_name": "gpt-3.5-turbo",
+    },
+    documents_formatter_cfg={
+        "max_tokens": 3500,
+        "formatter": "{content}",
+    },
+    prompt_formatter_cfg={
+        "max_tokens": 3500,
+        "text_before_docs": (
+            "You are a chatbot answering technical questions on the huggingface transformers library. "
+            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
+            "If the answer is in the documentation, summarize it in a helpful way to the user. "
+            "If it isn't, simply reply that you cannot answer the question. "
+            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "Here is the documentation: "
+            "<DOCUMENTS> "
+        ),
+        "text_after_docs": (
+            "<\\DOCUMENTS>\n"
+            "REMEMBER:\n"
+            "You are a chatbot answering technical questions on the huggingface transformers library. "
+            "Here are the rules you must follow:\n"
+            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
+            "2) Make sure to format your answers in Markdown format, including code blocks and snippets.\n"
+            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
+            "4) Do not refer to the documentation directly, but use the instructions provided within it to answer questions.\n"
+            "5) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
+            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'\n"
+            "For example:\n"
+            "What is the meaning of life for a qa bot?\n"
+            "I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with? "
+            "Now answer the following question:\n"
+        ),
+    },
+)
+
+
+def setup_buster(buster_cfg: BusterConfig):
+    """Initialize Buster from a BusterConfig instance."""
+    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
+    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
+    document_answerer: DocumentAnswerer = DocumentAnswerer(
+        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
+        documents_formatter=DocumentsFormatter(
+            tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
+        ),
+        prompt_formatter=PromptFormatter(
+            tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
+        ),
+        **buster_cfg.documents_answerer_cfg,
+    )
+    validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
+    buster: Buster = Buster(
+        retriever=retriever, document_answerer=document_answerer, validator=validator
+    )
+    return buster
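
Because `buster_cfg` is a `BusterConfig` whose fields are plain dictionaries, it can be copied and adjusted before being handed to `setup_buster`. A small sketch of that pattern follows; the specific values and the `gpt-4` model name are illustrative assumptions, not part of this commit.

    # Sketch only: derive a variant configuration from the one defined above.
    from copy import deepcopy

    import cfg
    from cfg import setup_buster

    my_cfg = deepcopy(cfg.buster_cfg)
    my_cfg.retriever_cfg["top_k"] = 5       # retrieve more candidate sections
    my_cfg.retriever_cfg["thresh"] = 0.6    # accept slightly less similar matches
    my_cfg.completion_cfg["completion_kwargs"]["model"] = "gpt-4"  # illustrative model swap

    buster = setup_buster(my_cfg)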
    	
generate_embeddings.py ADDED
@@ -0,0 +1,87 @@
+import os
+import zipfile
+import requests
+import pandas as pd
+import time
+
+from buster.documents_manager import DeepLakeDocumentsManager
+
+from buster.docparser import get_all_documents
+from buster.parser import HuggingfaceParser
+
+hf_transformers_zip_url = "https://huggingface.co/datasets/hf-doc-build/doc-build/resolve/main/transformers/main.zip"
+
+
+def download_and_unzip(zip_url, target_dir, overwrite=False):
+    """Download a zip file from zip_url and unzip it to target_dir.
+
+    # Example usage
+    zip_url = "https://example.com/example.zip"
+    target_dir = "downloaded_files"
+    download_and_unzip(zip_url, target_dir, overwrite=True)
+
+    ChatGPT generated.
+    """
+    # Create the target directory if it doesn't exist
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+
+    # Get the filename from the zip_url
+    zip_filename = os.path.basename(zip_url)
+    target_path = os.path.join(target_dir, zip_filename)
+
+    # Check if the file already exists
+    if os.path.exists(target_path) and not overwrite:
+        print(f"{zip_filename} already exists in the target directory.")
+        return
+
+    # Download the zip file
+    response = requests.get(zip_url, stream=True)
+    if response.status_code == 200:
+        with open(target_path, "wb") as file:
+            for chunk in response.iter_content(chunk_size=8192):
+                file.write(chunk)
+        print(f"{zip_filename} downloaded successfully.")
+
+        # Unzip the file
+        with zipfile.ZipFile(target_path, "r") as zip_ref:
+            zip_ref.extractall(target_dir)
+        print(f"{zip_filename} extracted successfully.")
+    else:
+        print(f"Failed to download {zip_filename}. Status code: {response.status_code}")
+
+
+# Download the transformers html pages and unzip them
+download_and_unzip(zip_url=hf_transformers_zip_url, target_dir=".")
+
+# Extract all documents from the html into a dataframe
+df = get_all_documents(
+    root_dir="transformers/main/en/",
+    base_url="https://huggingface.co/docs/transformers/main/en/",
+    parser_cls=HuggingfaceParser,
+    min_section_length=100,
+    max_section_length=1000,
+)
+
+# Add the source column
+df["source"] = "hf_transformers"
+
+# Save the .csv with chunks to disk
+df.to_csv("hf_transformers.csv")
+
+# Initialize the vector store
+dm = DeepLakeDocumentsManager(
+    vector_store_path="deeplake_store",
+    overwrite=True,
+    required_columns=["url", "content", "source", "title"],
+)
+
+# Add all embeddings to the vector store
+dm.batch_add(
+    df=df,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    csv_filename="embeddings.csv",
+    csv_overwrite=False,
+)
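
This script writes the vector store to a local `deeplake_store` directory, while `cfg.py` expects to download it as `deeplake_store.zip` from the `jerpint/hf_buster_data` dataset repo. The plumbing in between, zipping the store and pushing it with `huggingface_hub`, is not part of this commit; the sketch below shows one way it could look, assuming a write token is configured.

    # Sketch only: package the generated store and upload it to the dataset repo cfg.py reads from.
    import os
    import zipfile

    from huggingface_hub import HfApi

    store_dir = "deeplake_store"
    zip_path = "deeplake_store.zip"  # matches cfg.py's HUB_DB_FILE
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(store_dir):
            for name in files:
                full_path = os.path.join(root, name)
                # Keep the "deeplake_store/..." prefix so extract_zip(...) in cfg.py recreates the directory.
                zf.write(full_path, arcname=os.path.relpath(full_path, start="."))

    api = HfApi()
    api.upload_file(
        path_or_fileobj=zip_path,
        path_in_repo=zip_path,
        repo_id="jerpint/hf_buster_data",  # matches cfg.py's REPO_ID
        repo_type="dataset",
    )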
    	
requirements.txt ADDED
@@ -0,0 +1,3 @@
+git+https://github.com/jerpint/buster
+huggingface-hub
+gradio