Commit d60948f · First commit
Parent(s): none (initial commit)

Files changed:
- README.md (+9, -0)
- app.py (+111, -0)
- cfg.py (+130, -0)
- generate_embeddings.py (+87, -0)
- requirements.txt (+3, -0)
    	
README.md ADDED
@@ -0,0 +1,9 @@
+---
+title: Buster
+emoji: 🤖
+colorFrom: red
+colorTo: blue
+sdk: gradio
+app_file: app.py
+python_version: 3.10.8
+---
    	
app.py ADDED
@@ -0,0 +1,111 @@
+import cfg
+import gradio as gr
+import pandas as pd
+from cfg import setup_buster
+
+buster = setup_buster(cfg.buster_cfg)
+
+
+def format_sources(matched_documents: pd.DataFrame) -> str:
+    if len(matched_documents) == 0:
+        return ""
+
+    matched_documents.similarity_to_answer = (
+        matched_documents.similarity_to_answer * 100
+    )
+
+    # Print the page instead of the heading; more meaningful for hf docs
+    matched_documents["page"] = matched_documents.apply(
+        lambda x: x.url.split("/")[-1], axis=1
+    )
+
+    documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
+    document_template: str = "[🔗 {document.page}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %"
+
+    documents = "\n".join(
+        [
+            document_template.format(document=document)
+            for _, document in matched_documents.iterrows()
+        ]
+    )
+    footnote: str = "I'm a bot 🤖 and not always perfect."
+
+    return documents_answer_template.format(documents=documents, footnote=footnote)
+
+
+def add_sources(history, completion):
+    if completion.answer_relevant:
+        formatted_sources = format_sources(completion.matched_documents)
+        history.append([None, formatted_sources])
+
+    return history
+
+
+def user(user_input, history):
+    """Adds the user's question immediately to the chat."""
+    return "", history + [[user_input, None]]
+
+
+def chat(history):
+    user_input = history[-1][0]
+
+    completion = buster.process_input(user_input)
+
+    history[-1][1] = ""
+
+    for token in completion.answer_generator:
+        history[-1][1] += token
+
+        yield history, completion
+
+
+block = gr.Blocks()
+
+with block:
+    gr.Markdown(
+        """<h1><center>Buster 🤖: A Question-Answering Bot for your documentation</center></h1>"""
+    )
+    gr.Markdown(
+        """
+    #### This chatbot is designed to answer any questions related to the [huggingface transformers](https://huggingface.co/docs/transformers/index) library.
+    #### It uses ChatGPT + embeddings to search the docs for relevant sections and uses them to answer questions. It can then cite its sources back to you so you can verify the information.
+    #### Note that LLMs are prone to hallucination, so all outputs should always be vetted by users.
+
+    #### The code is open-sourced and available on [GitHub](https://www.github.com/jerpint/buster).
+    """
+    )
+
+    chatbot = gr.Chatbot()
+
+    with gr.Row():
+        with gr.Column(scale=4):
+            question = gr.Textbox(
+                label="What's your question?",
+                placeholder="Ask a question to AI stackoverflow here...",
+                lines=1,
+            )
+        submit = gr.Button(value="Send", variant="secondary")
+
+    examples = gr.Examples(
+        examples=[
+            "What kind of models should I use for images and text?",
+            "When should I finetune a model vs. training it from scratch?",
+            "Can you give me some python code to quickly finetune a model on my sentiment analysis dataset?",
+        ],
+        inputs=question,
+    )
+
+    gr.HTML("<center> Created with ❤️ by @jerpint and @hadrienbertrand.")
+
+    response = gr.State()
+
+    submit.click(user, [question, chatbot], [question, chatbot], queue=False).then(
+        chat, inputs=[chatbot], outputs=[chatbot, response]
+    ).then(add_sources, inputs=[chatbot, response], outputs=[chatbot])
+    question.submit(user, [question, chatbot], [question, chatbot], queue=False).then(
+        chat, inputs=[chatbot], outputs=[chatbot, response]
+    ).then(add_sources, inputs=[chatbot, response], outputs=[chatbot])
+
+
+block.queue(concurrency_count=16)
+block.launch(debug=True, share=False)
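
The app only ever touches Buster through `process_input` and the attributes of the returned completion (`answer_generator`, `answer_relevant`, `matched_documents`); Gradio merely chains `user` → `chat` → `add_sources`. Below is a minimal command-line sketch of that same flow using only objects defined in this commit. The question string and the printing are illustrative, importing `cfg` triggers the same dataset download `app.py` relies on, and an OpenAI API key is assumed to be configured for the ChatGPT completer.

    # Sketch only: the same question-answering flow as app.py, without the Gradio UI.
    import cfg
    from cfg import setup_buster

    buster = setup_buster(cfg.buster_cfg)

    question = "How do I load a pretrained model?"  # illustrative question
    completion = buster.process_input(question)

    # Stream the answer token by token, as chat() does for the Chatbot widget.
    for token in completion.answer_generator:
        print(token, end="", flush=True)
    print()

    # Mirror add_sources(): only surface sources when the validator deemed the answer relevant.
    if completion.answer_relevant:
        for _, doc in completion.matched_documents.iterrows():
            print(f"- {doc.url} (similarity: {doc.similarity_to_answer:.2f})")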
    	
cfg.py ADDED
@@ -0,0 +1,130 @@
+from buster.busterbot import Buster, BusterConfig
+from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
+from buster.formatters.documents import DocumentsFormatter
+from buster.formatters.prompts import PromptFormatter
+from buster.retriever import DeepLakeRetriever, Retriever
+from buster.tokenizers import GPTTokenizer
+from buster.validators import QuestionAnswerValidator, Validator
+from buster.utils import extract_zip
+
+from huggingface_hub import hf_hub_download
+
+
+HUB_DB_FILE = "deeplake_store.zip"
+REPO_ID = "jerpint/hf_buster_data"
+
+hf_hub_download(
+    repo_id=REPO_ID,
+    repo_type="dataset",
+    filename=HUB_DB_FILE,
+    local_dir=".",
+)
+
+extract_zip(zip_file_path=HUB_DB_FILE, output_path=".")
+
+
+buster_cfg = BusterConfig(
+    validator_cfg={
+        "unknown_response_templates": [
+            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+        ],
+        "unknown_threshold": 0.85,
+        "embedding_model": "text-embedding-ada-002",
+        "use_reranking": True,
+        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
+        "check_question_prompt": """You are a chatbot answering technical questions on the huggingface documentation, a library used to train and do inference on open-source artificial intelligence models.
+
+Your job is to determine whether or not a question is valid, and should be answered.
+More general questions are not considered valid, even if you might know the response.
+Questions that are likely to be related to the huggingface library are considered valid.
+A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
+
+For example:
+
+Q: How can I train a vision model?
+true
+
+Q: What is the meaning of life?
+false
+
+A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+            "stream": False,
+            "temperature": 0,
+        },
+    },
+    retriever_cfg={
+        "path": "deeplake_store",
+        "top_k": 3,
+        "thresh": 0.7,
+        "max_tokens": 2000,
+        "embedding_model": "text-embedding-ada-002",
+    },
+    documents_answerer_cfg={
+        "no_documents_message": "No documents are available for this question.",
+    },
+    completion_cfg={
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+            "stream": True,
+            "temperature": 0,
+        },
+    },
+    tokenizer_cfg={
+        "model_name": "gpt-3.5-turbo",
+    },
+    documents_formatter_cfg={
+        "max_tokens": 3500,
+        "formatter": "{content}",
+    },
+    prompt_formatter_cfg={
+        "max_tokens": 3500,
+        "text_before_docs": (
+            "You are a chatbot answering technical questions on the huggingface transformers library. "
+            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
+            "If the answer is in the documentation, summarize it in a helpful way to the user. "
+            "If it isn't, simply reply that you cannot answer the question. "
+            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "Here is the documentation: "
+            "<DOCUMENTS> "
+        ),
+        "text_after_docs": (
+            "<\\DOCUMENTS>\n"
+            "REMEMBER:\n"
+            "You are a chatbot answering technical questions on the huggingface transformers library. "
+            "Here are the rules you must follow:\n"
+            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
+            "2) Make sure to format your answers in Markdown format, including code blocks and snippets.\n"
+            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
+            "4) Do not refer to the documentation directly, but use the instructions provided within it to answer questions.\n"
+            "5) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
+            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'\n"
+            "For example:\n"
+            "What is the meaning of life for a qa bot?\n"
+            "I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with? "
+            "Now answer the following question:\n"
+        ),
+    },
+)
+
+
+def setup_buster(buster_cfg: BusterConfig):
+    """Initialize Buster from a BusterConfig instance."""
+    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
+    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
+    document_answerer: DocumentAnswerer = DocumentAnswerer(
+        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
+        documents_formatter=DocumentsFormatter(
+            tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
+        ),
+        prompt_formatter=PromptFormatter(
+            tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
+        ),
+        **buster_cfg.documents_answerer_cfg,
+    )
+    validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
+    buster: Buster = Buster(
+        retriever=retriever, document_answerer=document_answerer, validator=validator
+    )
+    return buster
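
Because `buster_cfg` is a `BusterConfig` whose fields are plain dictionaries, it can be copied and adjusted before being handed to `setup_buster`. A small sketch of that pattern follows; the specific values and the `gpt-4` model name are illustrative assumptions, not part of this commit.

    # Sketch only: derive a variant configuration from the one defined above.
    from copy import deepcopy

    import cfg
    from cfg import setup_buster

    my_cfg = deepcopy(cfg.buster_cfg)
    my_cfg.retriever_cfg["top_k"] = 5       # retrieve more candidate sections
    my_cfg.retriever_cfg["thresh"] = 0.6    # accept slightly less similar matches
    my_cfg.completion_cfg["completion_kwargs"]["model"] = "gpt-4"  # illustrative model swap

    buster = setup_buster(my_cfg)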
    	
generate_embeddings.py ADDED
@@ -0,0 +1,87 @@
+import os
+import zipfile
+import requests
+import pandas as pd
+import time
+
+from buster.documents_manager import DeepLakeDocumentsManager
+
+from buster.docparser import get_all_documents
+from buster.parser import HuggingfaceParser
+
+hf_transformers_zip_url = "https://huggingface.co/datasets/hf-doc-build/doc-build/resolve/main/transformers/main.zip"
+
+
+def download_and_unzip(zip_url, target_dir, overwrite=False):
+    """Download a zip file from zip_url and unzip it to target_dir.
+
+    # Example usage
+    zip_url = "https://example.com/example.zip"
+    target_dir = "downloaded_files"
+    download_and_unzip(zip_url, target_dir, overwrite=True)
+
+    ChatGPT generated.
+    """
+    # Create the target directory if it doesn't exist
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+
+    # Get the filename from the zip_url
+    zip_filename = os.path.basename(zip_url)
+    target_path = os.path.join(target_dir, zip_filename)
+
+    # Check if the file already exists
+    if os.path.exists(target_path) and not overwrite:
+        print(f"{zip_filename} already exists in the target directory.")
+        return
+
+    # Download the zip file
+    response = requests.get(zip_url, stream=True)
+    if response.status_code == 200:
+        with open(target_path, "wb") as file:
+            for chunk in response.iter_content(chunk_size=8192):
+                file.write(chunk)
+        print(f"{zip_filename} downloaded successfully.")
+
+        # Unzip the file
+        with zipfile.ZipFile(target_path, "r") as zip_ref:
+            zip_ref.extractall(target_dir)
+        print(f"{zip_filename} extracted successfully.")
+    else:
+        print(f"Failed to download {zip_filename}. Status code: {response.status_code}")
+
+
+# Download the transformers html pages and unzip them
+download_and_unzip(zip_url=hf_transformers_zip_url, target_dir=".")
+
+# Extract all documents from the html into a dataframe
+df = get_all_documents(
+    root_dir="transformers/main/en/",
+    base_url="https://huggingface.co/docs/transformers/main/en/",
+    parser_cls=HuggingfaceParser,
+    min_section_length=100,
+    max_section_length=1000,
+)
+
+# Add the source column
+df["source"] = "hf_transformers"
+
+# Save the .csv with chunks to disk
+df.to_csv("hf_transformers.csv")
+
+# Initialize the vector store
+dm = DeepLakeDocumentsManager(
+    vector_store_path="deeplake_store",
+    overwrite=True,
+    required_columns=["url", "content", "source", "title"],
+)
+
+# Add all embeddings to the vector store
+dm.batch_add(
+    df=df,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    csv_filename="embeddings.csv",
+    csv_overwrite=False,
+)
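
This script writes the vector store to a local `deeplake_store` directory, while `cfg.py` expects to download it as `deeplake_store.zip` from the `jerpint/hf_buster_data` dataset repo. The plumbing in between, zipping the store and pushing it with `huggingface_hub`, is not part of this commit; the sketch below shows one way it could look, assuming a write token is configured.

    # Sketch only: package the generated store and upload it to the dataset repo cfg.py reads from.
    import os
    import zipfile

    from huggingface_hub import HfApi

    store_dir = "deeplake_store"
    zip_path = "deeplake_store.zip"  # matches cfg.py's HUB_DB_FILE
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(store_dir):
            for name in files:
                full_path = os.path.join(root, name)
                # Keep the "deeplake_store/..." prefix so extract_zip(...) in cfg.py recreates the directory.
                zf.write(full_path, arcname=os.path.relpath(full_path, start="."))

    api = HfApi()
    api.upload_file(
        path_or_fileobj=zip_path,
        path_in_repo=zip_path,
        repo_id="jerpint/hf_buster_data",  # matches cfg.py's REPO_ID
        repo_type="dataset",
    )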
    	
requirements.txt ADDED
@@ -0,0 +1,3 @@
+git+https://github.com/jerpint/buster
+huggingface-hub
+gradio