jerpint committed on
Commit
d60948f
0 Parent(s):

First commit

Browse files
Files changed (5) hide show
  1. README.md +9 -0
  2. app.py +111 -0
  3. cfg.py +130 -0
  4. generate_embeddings.py +87 -0
  5. requirements.txt +3 -0
README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Buster
3
+ emoji: 🤖
4
+ colorFrom: red
5
+ colorTo: blue
6
+ sdk: gradio
7
+ app_file: app.py
8
+ python_version: 3.10.8
9
+ ---
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cfg
2
+ import gradio as gr
3
+ import pandas as pd
4
+ from cfg import setup_buster
5
+
6
+ buster = setup_buster(cfg.buster_cfg)
7
+
8
+
9
def format_sources(matched_documents: pd.DataFrame) -> str:
    """Render the matched documents as a Markdown list of sources.

    Args:
        matched_documents: One row per matched document. The ``url`` and
            ``similarity_to_answer`` columns are read; the frame itself is
            left untouched.

    Returns:
        A Markdown message with one link per document followed by its
        relevance as a percentage, or ``""`` when there are no documents.
    """
    if len(matched_documents) == 0:
        return ""

    # Work on a copy: scaling the similarity and adding the "page" column on
    # the caller's frame would change the data it still holds, and a second
    # call on the same frame would scale the relevance by 100 again.
    sources = matched_documents.copy()
    sources["similarity_to_answer"] = sources["similarity_to_answer"] * 100

    # print the page instead of the heading, more meaningful for hf docs
    sources["page"] = sources["url"].map(lambda url: url.split("/")[-1])

    documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
    document_template: str = "[🔗 {document.page}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %"

    documents = "\n".join(
        document_template.format(document=document)
        for _, document in sources.iterrows()
    )
    footnote: str = "I'm a bot 🤖 and not always perfect."

    return documents_answer_template.format(documents=documents, footnote=footnote)
34
+
35
+
36
def add_sources(history, completion):
    """Append the formatted sources to the chat when the answer is relevant.

    Args:
        history: Chat history, a list of ``[user_message, bot_message]``
            pairs. It is extended in place.
        completion: Object exposing ``answer_relevant`` and
            ``matched_documents``.

    Returns:
        The same ``history`` list, with one extra bot-only turn holding the
        sources when ``completion.answer_relevant`` is truthy.
    """
    if not completion.answer_relevant:
        return history

    sources_message = format_sources(completion.matched_documents)
    history.append([None, sources_message])
    return history
42
+
43
+
44
def user(user_input, history):
    """Add the user's question to the chat right away.

    Returns:
        A pair: an empty string (used to clear the input box) and a new
        history list ending with ``[user_input, None]``. The ``history``
        passed in is not modified.
    """
    pending_turn = [user_input, None]
    updated_history = history + [pending_turn]
    return "", updated_history
47
+
48
+
49
def chat(history):
    """Stream the answer to the latest question into the chat history.

    The question is read from the last history entry and passed to
    ``buster.process_input``. The answer slot of that entry is reset to an
    empty string and then grown token by token.

    Yields:
        ``(history, completion)`` after each token taken from
        ``completion.answer_generator``.
    """
    latest_turn = history[-1]
    completion = buster.process_input(latest_turn[0])

    latest_turn[1] = ""
    for token in completion.answer_generator:
        latest_turn[1] += token
        yield history, completion
60
+
61
+
62
# Gradio UI: a header, the chat window, a question box with a send button,
# a few example questions and the event wiring that drives the bot.
block = gr.Blocks()

with block:
    gr.Markdown(
        """<h1><center>Buster 🤖: A Question-Answering Bot for your documentation</center></h1>"""
    )
    # The GitHub link needs an explicit scheme, otherwise Markdown treats it
    # as a path relative to the page the app is served from.
    gr.Markdown(
        """
    #### This chatbot is designed to answer any questions related to the [huggingface transformers](https://huggingface.co/docs/transformers/index) library.
    #### It uses ChatGPT + embeddings to search the docs for relevant sections and uses them to answer questions. It can then cite its sources back to you to verify the information.
    #### Note that LLMs are prone to hallucination, so all outputs should always be vetted by users.

    #### The Code is open-sourced and available on [Github](https://github.com/jerpint/buster)
    """
    )

    chatbot = gr.Chatbot()

    # NOTE(review): nesting reconstructed as Row > [Column(Textbox), Button];
    # confirm against the original layout.
    with gr.Row():
        with gr.Column(scale=4):
            question = gr.Textbox(
                label="What's your question?",
                placeholder="Ask a question to AI stackoverflow here...",
                lines=1,
            )
        submit = gr.Button(value="Send", variant="secondary")

    examples = gr.Examples(
        examples=[
            "What kind of models should I use for images and text?",
            "When should I finetune a model vs. training it from scratch?",
            "Can you give me some python code to quickly finetune a model on my sentiment analysis dataset?",
        ],
        inputs=question,
    )

    gr.HTML("<center> Created with ❤️ by @jerpint and @hadrienbertrand.</center>")

    # Carries the completion object from `chat` over to `add_sources`.
    response = gr.State()

    # Clicking "Send" and pressing enter in the textbox run the same chain:
    # show the question, stream the answer, then append the sources.
    for trigger in (submit.click, question.submit):
        trigger(user, [question, chatbot], [question, chatbot], queue=False).then(
            chat, inputs=[chatbot], outputs=[chatbot, response]
        ).then(add_sources, inputs=[chatbot, response], outputs=[chatbot])


# NOTE(review): `concurrency_count` may not be accepted by every Gradio
# release and requirements.txt does not pin one -- confirm against the
# installed version.
block.queue(concurrency_count=16)
block.launch(debug=True, share=False)
cfg.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from buster.busterbot import Buster, BusterConfig
2
+ from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
3
+ from buster.formatters.documents import DocumentsFormatter
4
+ from buster.formatters.prompts import PromptFormatter
5
+ from buster.retriever import DeepLakeRetriever, Retriever
6
+ from buster.tokenizers import GPTTokenizer
7
+ from buster.validators import QuestionAnswerValidator, Validator
8
+ from buster.utils import extract_zip
9
+
10
+ from huggingface_hub import hf_hub_download
11
+
12
+
13
# Dataset repository on the Hugging Face Hub and the archive to fetch from it.
REPO_ID = "jerpint/hf_buster_data"
HUB_DB_FILE = "deeplake_store.zip"

# Runs at import time: download the archive into the current directory ...
hf_hub_download(
    filename=HUB_DB_FILE,
    repo_id=REPO_ID,
    repo_type="dataset",
    local_dir=".",
)

# ... and unpack it in place.
extract_zip(output_path=".", zip_file_path=HUB_DB_FILE)
24
+
25
+
26
# Full configuration of the bot: one dict of keyword arguments per component
# that `setup_buster` instantiates.
buster_cfg = BusterConfig(
    # Keyword arguments for QuestionAnswerValidator.
    validator_cfg={
        "unknown_response_templates": [
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
        ],
        "unknown_threshold": 0.85,
        "embedding_model": "text-embedding-ada-002",
        "use_reranking": True,
        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
        "check_question_prompt": """You are a chatbot answering technical questions on the huggingface documentation, a library used to train and do inference on open-source artificial intelligence models.

Your job is to determine wether or not a question is valid, and should be answered.
More general questions are not considered valid, even if you might know the response.
Questions that are likely to be related to the huggingface library are considered valid.
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.

For example:

Q: How can I train a vision model?
true

Q: What is the meaning of life?
false

A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": False,
            "temperature": 0,
        },
    },
    # Keyword arguments for DeepLakeRetriever.
    retriever_cfg={
        "path": "deeplake_store",
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_model": "text-embedding-ada-002",
    },
    # Extra keyword arguments for DocumentAnswerer.
    documents_answerer_cfg={
        "no_documents_message": "No documents are available for this question.",
    },
    # Keyword arguments for ChatGPTCompleter.
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": True,
            "temperature": 0,
        },
    },
    # Keyword arguments for GPTTokenizer.
    tokenizer_cfg={
        "model_name": "gpt-3.5-turbo",
    },
    # Keyword arguments for DocumentsFormatter.
    documents_formatter_cfg={
        "max_tokens": 3500,
        "formatter": "{content}",
    },
    # Keyword arguments for PromptFormatter.
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_before_docs": (
            "You are an chatbot answering technical questions on the huggingface transformers library. "
            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
            "If the answer is in the documentation, summarize it in a helpful way to the user. "
            "If it isn't, simply reply that you cannot answer the question. "
            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "Here is the documentation: "
            "<DOCUMENTS> "
        ),
        "text_after_docs": (
            # Closing tag for <DOCUMENTS>; the previous "<\D..." spelling was
            # an invalid escape sequence and did not match the opening tag.
            "</DOCUMENTS>\n"
            "REMEMBER:\n"
            "You are an chatbot answering technical questions on the huggingface transformers library. "
            "Here are the rules you must follow:\n"
            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
            # Rule 4, the quoted refusal and the example answer each end with
            # a newline so they no longer run into the line that follows.
            "4) Do not refer to the documentation directly, but use the instructions provided within it to answer questions.\n"
            "5) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'\n"
            "For example:\n"
            "What is the meaning of life for an qa bot?\n"
            "I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?\n"
            "Now answer the following question:\n"
        ),
    },
)
110
+
111
+
112
def setup_buster(buster_cfg: BusterConfig):
    """Build a ``Buster`` instance from a ``BusterConfig``.

    Each ``*_cfg`` attribute of ``buster_cfg`` is unpacked as keyword
    arguments into the matching component; the tokenizer is shared by the
    documents formatter and the prompt formatter.

    Returns:
        A ``Buster`` wired with a retriever, a document answerer and a
        validator.
    """
    retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)

    completer = ChatGPTCompleter(**buster_cfg.completion_cfg)
    documents_formatter = DocumentsFormatter(
        tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
    )
    prompt_formatter = PromptFormatter(
        tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
    )
    document_answerer = DocumentAnswerer(
        completer=completer,
        documents_formatter=documents_formatter,
        prompt_formatter=prompt_formatter,
        **buster_cfg.documents_answerer_cfg,
    )

    validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)

    return Buster(
        retriever=retriever,
        document_answerer=document_answerer,
        validator=validator,
    )
generate_embeddings.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ import requests
4
+ import pandas as pd
5
+ import time
6
+
7
+ from buster.documents_manager import DeepLakeDocumentsManager
8
+
9
+ from buster.docparser import get_all_documents
10
+ from buster.parser import HuggingfaceParser
11
+
12
+ hf_transformers_zip_url = "https://huggingface.co/datasets/hf-doc-build/doc-build/resolve/main/transformers/main.zip"
13
+
14
+
15
def download_and_unzip(zip_url, target_dir, overwrite=False):
    """Download a zip file from zip_url and unzip it to target_dir.

    Args:
        zip_url: URL of the archive; its last path component is used as the
            local file name.
        target_dir: Directory that receives both the archive and its
            extracted contents. Created if missing.
        overwrite: When False and the archive is already present in
            ``target_dir``, nothing is downloaded or extracted.

    Returns:
        None. Progress and failures are reported with ``print``; a non-200
        response is reported, not raised.

    # Example usage
    zip_url = "https://example.com/example.zip"
    target_dir = "downloaded_files"
    download_and_unzip(zip_url, target_dir, overwrite=True)
    """
    # exist_ok avoids the race between checking for the directory and creating it
    os.makedirs(target_dir, exist_ok=True)

    # Get the filename from the zip_url
    zip_filename = os.path.basename(zip_url)
    target_path = os.path.join(target_dir, zip_filename)

    # Check if the file already exists
    if os.path.exists(target_path) and not overwrite:
        print(f"{zip_filename} already exists in the target directory.")
        return

    # Download the zip file. The context manager releases the streamed
    # connection on every path, including the failure one.
    with requests.get(zip_url, stream=True) as response:
        if response.status_code != 200:
            print(f"Failed to download {zip_filename}. Status code: {response.status_code}")
            return

        with open(target_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    print(f"{zip_filename} downloaded successfully.")

    # Unzip the file
    with zipfile.ZipFile(target_path, "r") as zip_ref:
        zip_ref.extractall(target_dir)
    print(f"{zip_filename} extracted successfully.")
52
+
53
+
54
# Fetch the transformers html pages archive and unzip it in the current directory
download_and_unzip(target_dir=".", zip_url=hf_transformers_zip_url)

# Parse the extracted html into a dataframe of documents
df = get_all_documents(
    parser_cls=HuggingfaceParser,
    root_dir="transformers/main/en/",
    base_url="https://huggingface.co/docs/transformers/main/en/",
    min_section_length=100,
    max_section_length=1000,
)

# Tag every row with its source
df["source"] = "hf_transformers"

# Keep a .csv copy of the chunks on disk
df.to_csv("hf_transformers.csv")

# Create the vector store, overwriting any existing one at that path
dm = DeepLakeDocumentsManager(
    vector_store_path="deeplake_store",
    required_columns=["url", "content", "source", "title"],
    overwrite=True,
)

# Add all embeddings to the vector store, in batches
dm.batch_add(
    df=df,
    num_workers=32,
    batch_size=3000,
    min_time_interval=60,
    csv_filename="embeddings.csv",
    csv_overwrite=False,
)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/jerpint/buster
2
+ huggingface-hub
3
+ gradio