Spaces:

JiakaiDu
/

RAG_Test

Paused

App Files Files Community

JiakaiDu committed on Sep 26, 2024

Commit

c27d36b

verified ·

1 Parent(s): 078c925

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +873 -0

app.py ADDED Viewed

	@@ -0,0 +1,873 @@

+import os
+os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false"
+from pathlib import Path
+import requests
+import shutil
+import io
+from pathlib import Path
+import openvino as ov
+import torch
+from transformers import (
+    TextIteratorStreamer,
+    StoppingCriteria,
+    StoppingCriteriaList,
+)
+from llm_config import (
+    SUPPORTED_EMBEDDING_MODELS,
+    SUPPORTED_RERANK_MODELS,
+    SUPPORTED_LLM_MODELS,
+)
+from huggingface_hub import login
+config_shared_path = Path("../../utils/llm_config.py")
+config_dst_path = Path("llm_config.py")
+text_example_en_path = Path("text_example_en.pdf")
+text_example_cn_path = Path("text_example_cn.pdf")
+text_example_en = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039728/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final.pdf"
+text_example_cn = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039713/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final_CH.pdf"
+if not config_dst_path.exists():
+    if config_shared_path.exists():
+        try:
+            os.symlink(config_shared_path, config_dst_path)
+        except Exception:
+            shutil.copy(config_shared_path, config_dst_path)
+    else:
+        r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py")
+        with open("llm_config.py", "w", encoding="utf-8") as f:
+            f.write(r.text)
+elif not os.path.islink(config_dst_path):
+    print("LLM config will be updated")
+    if config_shared_path.exists():
+        shutil.copy(config_shared_path, config_dst_path)
+    else:
+        r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py")
+        with open("llm_config.py", "w", encoding="utf-8") as f:
+            f.write(r.text)
+if not text_example_en_path.exists():
+    r = requests.get(url=text_example_en)
+    content = io.BytesIO(r.content)
+    with open("text_example_en.pdf", "wb") as f:
+        f.write(content.read())
+if not text_example_cn_path.exists():
+    r = requests.get(url=text_example_cn)
+    content = io.BytesIO(r.content)
+    with open("text_example_cn.pdf", "wb") as f:
+        f.write(content.read())
+# Language and LLM selection; both values are used as keys into SUPPORTED_LLM_MODELS.
+model_language = "English"
+llm_model_id= "llama-3-8b-instruct"
+llm_model_configuration = SUPPORTED_LLM_MODELS[model_language][llm_model_id]
+print(f"Selected LLM model {llm_model_id}")
+# Flags selecting which weight formats are converted further below.
+prepare_int4_model = True   # Prepare INT4 model
+prepare_int8_model = False  # Do not prepare INT8 model
+prepare_fp16_model = False  # Do not prepare FP16 model
+# Read by convert_to_int4 to decide whether AWQ options are added to the export command.
+enable_awq = False
+# Get the token from the environment variable
+hf_token = os.getenv("HUGGINGFACE_TOKEN")
+if hf_token is None:
+    raise ValueError(
+        "HUGGINGFACE_TOKEN environment variable not set. "
+        "Please set it in your environment variables or repository secrets."
+    )
+# Log in to Hugging Face Hub
+login(token=hf_token)
+# Source model id passed to the export command's --model option.
+pt_model_id = llm_model_configuration["model_id"]
+# pt_model_name = llm_model_id.value.split("-")[0]
+# Output directories, one per weight format, all under a folder named after the model id.
+fp16_model_dir = Path(llm_model_id) / "FP16"
+int8_model_dir = Path(llm_model_id) / "INT8_compressed_weights"
+int4_model_dir = Path(llm_model_id) / "INT4_compressed_weights"
+def convert_to_fp16():
+    if (fp16_model_dir / "openvino_model.xml").exists():
+        return
+    remote_code = llm_model_configuration.get("remote_code", False)
+    export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format fp16".format(pt_model_id)
+    if remote_code:
+        export_command_base += " --trust-remote-code"
+    export_command = export_command_base + " " + str(fp16_model_dir)
+def convert_to_int8():
+    if (int8_model_dir / "openvino_model.xml").exists():
+        return
+    int8_model_dir.mkdir(parents=True, exist_ok=True)
+    remote_code = llm_model_configuration.get("remote_code", False)
+    export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int8".format(pt_model_id)
+    if remote_code:
+        export_command_base += " --trust-remote-code"
+    export_command = export_command_base + " " + str(int8_model_dir)
+def convert_to_int4():
+    compression_configs = {
+        "zephyr-7b-beta": {
+            "sym": True,
+            "group_size": 64,
+            "ratio": 0.6,
+        },
+        "mistral-7b": {
+            "sym": True,
+            "group_size": 64,
+            "ratio": 0.6,
+        },
+        "minicpm-2b-dpo": {
+            "sym": True,
+            "group_size": 64,
+            "ratio": 0.6,
+        },
+        "gemma-2b-it": {
+            "sym": True,
+            "group_size": 64,
+            "ratio": 0.6,
+        },
+        "notus-7b-v1": {
+            "sym": True,
+            "group_size": 64,
+            "ratio": 0.6,
+        },
+        "neural-chat-7b-v3-1": {
+            "sym": True,
+            "group_size": 64,
+            "ratio": 0.6,
+        },
+        "llama-2-chat-7b": {
+            "sym": True,
+            "group_size": 128,
+            "ratio": 0.8,
+        },
+        "llama-3-8b-instruct": {
+            "sym": True,
+            "group_size": 128,
+            "ratio": 0.8,
+        },
+        "gemma-7b-it": {
+            "sym": True,
+            "group_size": 128,
+            "ratio": 0.8,
+        },
+        "chatglm2-6b": {
+            "sym": True,
+            "group_size": 128,
+            "ratio": 0.72,
+        },
+        "qwen-7b-chat": {"sym": True, "group_size": 128, "ratio": 0.6},
+        "red-pajama-3b-chat": {
+            "sym": False,
+            "group_size": 128,
+            "ratio": 0.5,
+        },
+        "default": {
+            "sym": False,
+            "group_size": 128,
+            "ratio": 0.8,
+        },
+    }
+    model_compression_params = compression_configs.get(llm_model_id, compression_configs["default"])
+    if (int4_model_dir / "openvino_model.xml").exists():
+        return
+    remote_code = llm_model_configuration.get("remote_code", False)
+    export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4".format(pt_model_id)
+    int4_compression_args = " --group-size {} --ratio {}".format(model_compression_params["group_size"], model_compression_params["ratio"])
+    if model_compression_params["sym"]:
+        int4_compression_args += " --sym"
+    if enable_awq.value:
+        int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
+    export_command_base += int4_compression_args
+    if remote_code:
+        export_command_base += " --trust-remote-code"
+    export_command = export_command_base + " " + str(int4_model_dir)
+if prepare_fp16_model:
+    convert_to_fp16()
+if prepare_int8_model:
+    convert_to_int8()
+if prepare_int4_model:
+    convert_to_int4()
+fp16_weights = fp16_model_dir / "openvino_model.bin"
+int8_weights = int8_model_dir / "openvino_model.bin"
+int4_weights = int4_model_dir / "openvino_model.bin"
+if fp16_weights.exists():
+    print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB")
+for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]):
+    if compressed_weights.exists():
+        print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB")
+    if compressed_weights.exists() and fp16_weights.exists():
+        print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}")
+embedding_model_id = 'bge-small-en-v1.5'                  #'bge-small-en-v1.5', 'bge-large-en-v1.5', 'bge-m3'), value='bge-small-en-v1.5'
+embedding_model_configuration = SUPPORTED_EMBEDDING_MODELS[model_language][embedding_model_id]
+print(f"Selected {embedding_model_id} model")
+export_command_base = "optimum-cli export openvino --model {} --task feature-extraction".format(embedding_model_configuration["model_id"])
+export_command = export_command_base + " " + str(embedding_model_id)
+rerank_model_id = "bge-reranker-v2-m3"                               #'bge-reranker-v2-m3', 'bge-reranker-large', 'bge-reranker-base')
+rerank_model_configuration = SUPPORTED_RERANK_MODELS[rerank_model_id]
+print(f"Selected {rerank_model_id} model")
+export_command_base = "optimum-cli export openvino --model {} --task text-classification".format(rerank_model_configuration["model_id"])
+export_command = export_command_base + " " + str(rerank_model_id)
+embedding_device = "CPU"
+USING_NPU = embedding_device == "NPU"
+npu_embedding_dir = embedding_model_id + "-npu"
+npu_embedding_path = Path(npu_embedding_dir) / "openvino_model.xml"
+if USING_NPU and not Path(npu_embedding_dir).exists():
+    r = requests.get(
+        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
+    )
+    with open("notebook_utils.py", "w") as f:
+        f.write(r.text)
+    import notebook_utils as utils
+    shutil.copytree(embedding_model_id, npu_embedding_dir)
+    utils.optimize_bge_embedding(Path(embedding_model_id) / "openvino_model.xml", npu_embedding_path)
+rerank_device = "CPU"
+llm_device = "CPU"
+from langchain_community.embeddings import OpenVINOBgeEmbeddings
+# Load the embedding model from the NPU-specific folder when targeting NPU, otherwise from the regular one.
+embedding_model_name = npu_embedding_dir if USING_NPU else embedding_model_id
+batch_size = 1 if USING_NPU else 4
+# "compile": False defers compilation; compile() is called explicitly below, after the optional reshape.
+embedding_model_kwargs = {"device": embedding_device, "compile": False}
+encode_kwargs = {
+    "mean_pooling": embedding_model_configuration["mean_pooling"],
+    "normalize_embeddings": embedding_model_configuration["normalize_embeddings"],
+    "batch_size": batch_size,
+}
+embedding = OpenVINOBgeEmbeddings(
+    model_name_or_path=embedding_model_name,
+    model_kwargs=embedding_model_kwargs,
+    encode_kwargs=encode_kwargs,
+)
+if USING_NPU:
+    # Reshape to a fixed (1, 512) input before compiling; only done on the NPU path.
+    embedding.ov_model.reshape(1, 512)
+embedding.ov_model.compile()
+# Smoke test: embed one sentence.
+text = "This is a test document."
+embedding_result = embedding.embed_query(text)
+# Bare expression: its value is discarded when this runs as a script.
+embedding_result[:3]
+from langchain_community.document_compressors.openvino_rerank import OpenVINOReranker
+# Reranker loaded from the folder named after rerank_model_id; top_n is changed later by
+# create_vectordb / update_retriever.
+rerank_model_name = rerank_model_id
+rerank_model_kwargs = {"device": rerank_device}
+rerank_top_n = 2
+reranker = OpenVINOReranker(
+    model_name_or_path=rerank_model_name,
+    model_kwargs=rerank_model_kwargs,
+    top_n=rerank_top_n,
+)
+model_to_run = "INT4"
+from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
+if model_to_run == "INT4":
+    model_dir = int4_model_dir
+elif model_to_run == "INT8":
+    model_dir = int8_model_dir
+else:
+    model_dir = fp16_model_dir
+print(f"Loading model from {model_dir}")
+ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
+if "GPU" in llm_device and "qwen2-7b-instruct" in llm_model_id:
+    ov_config["GPU_ENABLE_SDPA_OPTIMIZATION"] = "NO"
+# On a GPU device a model is executed in FP16 precision. For red-pajama-3b-chat model there known accuracy
+# issues caused by this, which we avoid by setting precision hint to "f32".
+if llm_model_id == "red-pajama-3b-chat" and "GPU" in core.available_devices and llm_device in ["GPU", "AUTO"]:
+    ov_config["INFERENCE_PRECISION_HINT"] = "f32"
+llm = HuggingFacePipeline.from_model_id(
+    model_id=str(model_dir),
+    task="text-generation",
+    backend="openvino",
+    model_kwargs={
+        "device": llm_device,
+        "ov_config": ov_config,
+        "trust_remote_code": True,
+    },
+    pipeline_kwargs={"max_new_tokens": 2},
+)
+llm.invoke("2 + 2 =")
+import re
+from typing import List
+from langchain.text_splitter import (
+    CharacterTextSplitter,
+    RecursiveCharacterTextSplitter,
+    MarkdownTextSplitter,
+)
+from langchain.document_loaders import (
+    CSVLoader,
+    EverNoteLoader,
+    PyPDFLoader,
+    TextLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredODTLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredWordDocumentLoader,
+)
+class ChineseTextSplitter(CharacterTextSplitter):
+    def __init__(self, pdf: bool = False, **kwargs):
+        super().__init__(**kwargs)
+        self.pdf = pdf
+    def split_text(self, text: str) -> List[str]:
+        if self.pdf:
+            text = re.sub(r"\n{3,}", "\n", text)
+            text = text.replace("\n\n", "")
+        sent_sep_pattern = re.compile('([﹒﹔﹖﹗．。！？]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')
+        sent_list = []
+        for ele in sent_sep_pattern.split(text):
+            if sent_sep_pattern.match(ele) and sent_list:
+                sent_list[-1] += ele
+            elif ele:
+                sent_list.append(ele)
+        return sent_list
+# Splitter classes selectable by name; the keys match the "Text Spliter" dropdown choices in the UI.
+TEXT_SPLITERS = {
+    "Character": CharacterTextSplitter,
+    "RecursiveCharacter": RecursiveCharacterTextSplitter,
+    "Markdown": MarkdownTextSplitter,
+    "Chinese": ChineseTextSplitter,
+}
+# File extension -> (loader class, extra keyword arguments passed to the loader constructor).
+LOADERS = {
+    ".csv": (CSVLoader, {}),
+    ".doc": (UnstructuredWordDocumentLoader, {}),
+    ".docx": (UnstructuredWordDocumentLoader, {}),
+    ".enex": (EverNoteLoader, {}),
+    ".epub": (UnstructuredEPubLoader, {}),
+    ".html": (UnstructuredHTMLLoader, {}),
+    ".md": (UnstructuredMarkdownLoader, {}),
+    ".odt": (UnstructuredODTLoader, {}),
+    ".pdf": (PyPDFLoader, {}),
+    ".ppt": (UnstructuredPowerPointLoader, {}),
+    ".pptx": (UnstructuredPowerPointLoader, {}),
+    ".txt": (TextLoader, {"encoding": "utf8"}),
+}
+chinese_examples = [
+    ["英特尔®酷睿™ Ultra处理器可以降低多少功耗？"],
+    ["相比英特尔之前的移动处理器产品，英特尔®酷睿™ Ultra处理器的AI推理性能提升了多少？"],
+    ["英特尔博锐® Enterprise系统提供哪些功能？"],
+]
+english_examples = [
+    ["How much power consumption can Intel® Core™ Ultra Processors help save?"],
+    ["Compared to Intel’s previous mobile processor, what is the advantage of Intel® Core™ Ultra Processors for Artificial Intelligence?"],
+    ["What can Intel vPro® Enterprise systems offer?"],
+]
+if model_language == "English":
+    # text_example_path = "text_example_en.pdf"
+    text_example_path = ['Supervisors-Guide-Accurate-Timekeeping_AH edits.docx','Salary-vs-Hourly-Guide_AH edits.docx','Employee-Guide-Accurate-Timekeeping_AH edits.docx','Eller Overtime Guidelines.docx','Eller FLSA information 9.2024_AH edits.docx','Accurate Timekeeping Supervisors 12.2.20_AH edits.docx']
+else:
+    text_example_path = "text_example_cn.pdf"
+examples = chinese_examples if (model_language == "Chinese") else english_examples
+from langchain.prompts import PromptTemplate
+from langchain_community.vectorstores import FAISS
+from langchain.chains.retrieval import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.docstore.document import Document
+from langchain.retrievers import ContextualCompressionRetriever
+from threading import Thread
+import gradio as gr
+# Optional stop tokens (None when the model configuration has no "stop_tokens" entry)
+# and the prompt template used for generation with and without RAG.
+stop_tokens = llm_model_configuration.get("stop_tokens")
+rag_prompt_template = llm_model_configuration["rag_prompt_template"]
+class StopOnTokens(StoppingCriteria):
+    def __init__(self, token_ids):
+        self.token_ids = token_ids
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        for stop_id in self.token_ids:
+            if input_ids[0][-1] == stop_id:
+                return True
+        return False
+# Turn the configured stop tokens into a one-element list holding a StopOnTokens criterion;
+# string tokens are first converted to ids with the LLM tokenizer.
+# NOTE(review): stop_tokens[0] raises IndexError if the configuration supplies an empty list.
+if stop_tokens is not None:
+    if isinstance(stop_tokens[0], str):
+        stop_tokens = llm.pipeline.tokenizer.convert_tokens_to_ids(stop_tokens)
+    stop_tokens = [StopOnTokens(stop_tokens)]
+def load_single_document(file_path: str) -> List[Document]:
+    """
+    helper for loading a single document
+    Params:
+      file_path: document path
+    Returns:
+      documents loaded
+    """
+    ext = "." + file_path.rsplit(".", 1)[-1]
+    if ext in LOADERS:
+        loader_class, loader_args = LOADERS[ext]
+        loader = loader_class(file_path, **loader_args)
+        return loader.load()
+    raise ValueError(f"File does not exist '{ext}'")
+def default_partial_text_processor(partial_text: str, new_text: str):
+    """
+    helper for updating partially generated answer, used by default
+    Params:
+      partial_text: text buffer for storing previosly generated text
+      new_text: text update for the current step
+    Returns:
+      updated text string
+    """
+    partial_text += new_text
+    return partial_text
+text_processor = llm_model_configuration.get("partial_text_processor", default_partial_text_processor)
+def create_vectordb(
+    docs, spliter_name, chunk_size, chunk_overlap, vector_search_top_k, vector_rerank_top_n, run_rerank, search_method, score_threshold, progress=gr.Progress()
+):
+    """
+    Initialize a vector database
+    Params:
+      doc: orignal documents provided by user
+      spliter_name: spliter method
+      chunk_size:  size of a single sentence chunk
+      chunk_overlap: overlap size between 2 chunks
+      vector_search_top_k: Vector search top k
+      vector_rerank_top_n: Search rerank top n
+      run_rerank: whether run reranker
+      search_method: top k search method
+      score_threshold: score threshold when selecting 'similarity_score_threshold' method
+    """
+    global db
+    global retriever
+    global combine_docs_chain
+    global rag_chain
+    if vector_rerank_top_n > vector_search_top_k:
+        gr.Warning("Search top k must >= Rerank top n")
+    documents = []
+    for doc in docs:
+        if type(doc) is not str:
+            doc = doc.name
+        documents.extend(load_single_document(doc))
+    text_splitter = TEXT_SPLITERS[spliter_name](chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    texts = text_splitter.split_documents(documents)
+    db = FAISS.from_documents(texts, embedding)
+    if search_method == "similarity_score_threshold":
+        search_kwargs = {"k": vector_search_top_k, "score_threshold": score_threshold}
+    else:
+        search_kwargs = {"k": vector_search_top_k}
+    retriever = db.as_retriever(search_kwargs=search_kwargs, search_type=search_method)
+    if run_rerank:
+        reranker.top_n = vector_rerank_top_n
+        retriever = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever)
+    prompt = PromptTemplate.from_template(rag_prompt_template)
+    combine_docs_chain = create_stuff_documents_chain(llm, prompt)
+    rag_chain = create_retrieval_chain(retriever, combine_docs_chain)
+    return "Vector database is Ready"
+def update_retriever(vector_search_top_k, vector_rerank_top_n, run_rerank, search_method, score_threshold):
+    """
+    Update retriever
+    Params:
+      vector_search_top_k: Vector search top k
+      vector_rerank_top_n: Search rerank top n
+      run_rerank: whether run reranker
+      search_method: top k search method
+      score_threshold: score threshold when selecting 'similarity_score_threshold' method
+    """
+    global db
+    global retriever
+    global combine_docs_chain
+    global rag_chain
+    if vector_rerank_top_n > vector_search_top_k:
+        gr.Warning("Search top k must >= Rerank top n")
+    if search_method == "similarity_score_threshold":
+        search_kwargs = {"k": vector_search_top_k, "score_threshold": score_threshold}
+    else:
+        search_kwargs = {"k": vector_search_top_k}
+    retriever = db.as_retriever(search_kwargs=search_kwargs, search_type=search_method)
+    if run_rerank:
+        retriever = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever)
+        reranker.top_n = vector_rerank_top_n
+    rag_chain = create_retrieval_chain(retriever, combine_docs_chain)
+    return "Vector database is Ready"
+def user(message, history):
+    """
+    callback function for updating user messages in interface on submit button click
+    Params:
+      message: current message
+      history: conversation history
+    Returns:
+      None
+    """
+    # Append the user's message to the conversation history
+    return "", history + [[message, ""]]
+def bot(history, temperature, top_p, top_k, repetition_penalty, hide_full_prompt, do_rag):
+    """
+    callback function for running chatbot on submit button click
+    Params:
+      history: conversation history
+      temperature:  parameter for control the level of creativity in AI-generated text.
+                    By adjusting the `temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse.
+      top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability.
+      top_k: parameter for control the range of tokens considered by the AI model based on their cumulative probability, selecting number of tokens with highest probability.
+      repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.
+      hide_full_prompt: whether to show searching results in promopt.
+      do_rag: whether do RAG when generating texts.
+    """
+    streamer = TextIteratorStreamer(
+        llm.pipeline.tokenizer,
+        timeout=60.0,
+        skip_prompt=hide_full_prompt,
+        skip_special_tokens=True,
+    )
+    llm.pipeline._forward_params = dict(
+        max_new_tokens=512,
+        temperature=temperature,
+        do_sample=temperature > 0.0,
+        top_p=top_p,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty,
+        streamer=streamer,
+    )
+    if stop_tokens is not None:
+        llm.pipeline._forward_params["stopping_criteria"] = StoppingCriteriaList(stop_tokens)
+    if do_rag:
+        t1 = Thread(target=rag_chain.invoke, args=({"input": history[-1][0]},))
+    else:
+        input_text = rag_prompt_template.format(input=history[-1][0], context="")
+        t1 = Thread(target=llm.invoke, args=(input_text,))
+    t1.start()
+    # Initialize an empty string to store the generated text
+    partial_text = ""
+    for new_text in streamer:
+        partial_text = text_processor(partial_text, new_text)
+        history[-1][1] = partial_text
+        yield history
+def request_cancel():
+    llm.pipeline.model.request.cancel()
+def clear_files():
+    return "Vector Store is Not ready"
+# Initialize the vector store with the example documents at start-up; the argument values
+# mirror the defaults of the corresponding UI controls defined below.
+create_vectordb(
+    text_example_path,  # example documents selected above
+    "RecursiveCharacter",
+    chunk_size=400,
+    chunk_overlap=50,
+    vector_search_top_k=10,
+    vector_rerank_top_n=2,
+    run_rerank=True,
+    search_method="similarity_score_threshold",
+    score_threshold=0.5,
+)
+# Gradio UI layout: a narrow left column (documents, vector store and generation settings)
+# and a wide right column (chat, examples and retriever settings).
+with gr.Blocks(
+    theme=gr.themes.Soft(),
+    css=".disclaimer {font-variant-caps: all-small-caps;}",
+) as demo:
+    gr.Markdown("""<h1><center>QA over Document</center></h1>""")
+    gr.Markdown(f"""<center>Powered by OpenVINO and {llm_model_id} </center>""")
+    with gr.Row():
+        # Left column: document upload, vector store configuration, generation configuration.
+        with gr.Column(scale=1):
+            docs = gr.File(
+                label="Step 1: Load text files",
+                value=text_example_path,    # pre-filled with the example documents
+                file_count="multiple",
+                file_types=[
+                    ".csv",
+                    ".doc",
+                    ".docx",
+                    ".enex",
+                    ".epub",
+                    ".html",
+                    ".md",
+                    ".odt",
+                    ".pdf",
+                    ".ppt",
+                    ".pptx",
+                    ".txt",
+                ],
+            )
+            load_docs = gr.Button("Step 2: Build Vector Store", variant="primary")
+            db_argument = gr.Accordion("Vector Store Configuration", open=False)
+            with db_argument:
+                # Choices must match the keys of TEXT_SPLITERS.
+                spliter = gr.Dropdown(
+                    ["Character", "RecursiveCharacter", "Markdown", "Chinese"],
+                    value="RecursiveCharacter",
+                    label="Text Spliter",
+                    info="Method used to splite the documents",
+                    multiselect=False,
+                )
+                chunk_size = gr.Slider(
+                    label="Chunk size",
+                    value=400,
+                    minimum=50,
+                    maximum=2000,
+                    step=50,
+                    interactive=True,
+                    info="Size of sentence chunk",
+                )
+                chunk_overlap = gr.Slider(
+                    label="Chunk overlap",
+                    value=50,
+                    minimum=0,
+                    maximum=400,
+                    step=10,
+                    interactive=True,
+                    info=("Overlap between 2 chunks"),
+                )
+            # Read-only status box updated by create_vectordb / update_retriever / clear_files.
+            langchain_status = gr.Textbox(
+                label="Vector Store Status",
+                value="Vector Store is Ready",
+                interactive=False,
+            )
+            do_rag = gr.Checkbox(
+                value=True,
+                label="RAG is ON",
+                interactive=True,
+                info="Whether to do RAG for generation",
+            )
+            # Sampling parameters passed to the bot callback.
+            with gr.Accordion("Generation Configuration", open=False):
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Row():
+                            temperature = gr.Slider(
+                                label="Temperature",
+                                value=0.1,
+                                minimum=0.0,
+                                maximum=1.0,
+                                step=0.1,
+                                interactive=True,
+                                info="Higher values produce more diverse outputs",
+                            )
+                    with gr.Column():
+                        with gr.Row():
+                            top_p = gr.Slider(
+                                label="Top-p (nucleus sampling)",
+                                value=1.0,
+                                minimum=0.0,
+                                maximum=1,
+                                step=0.01,
+                                interactive=True,
+                                info=(
+                                    "Sample from the smallest possible set of tokens whose cumulative probability "
+                                    "exceeds top_p. Set to 1 to disable and sample from all tokens."
+                                ),
+                            )
+                    with gr.Column():
+                        with gr.Row():
+                            top_k = gr.Slider(
+                                label="Top-k",
+                                value=50,
+                                minimum=0.0,
+                                maximum=200,
+                                step=1,
+                                interactive=True,
+                                info="Sample from a shortlist of top-k tokens — 0 to disable and sample from all tokens.",
+                            )
+                    with gr.Column():
+                        with gr.Row():
+                            repetition_penalty = gr.Slider(
+                                label="Repetition Penalty",
+                                value=1.1,
+                                minimum=1.0,
+                                maximum=2.0,
+                                step=0.1,
+                                interactive=True,
+                                info="Penalize repetition — 1.0 to disable.",
+                            )
+        # Right column: chat window, message box with buttons, examples, retriever configuration.
+        with gr.Column(scale=4):
+            chatbot = gr.Chatbot(
+                height=800,
+                label="Step 3: Input Query",
+            )
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        msg = gr.Textbox(
+                            label="QA Message Box",
+                            placeholder="Chat Message Box",
+                            show_label=False,
+                            container=False,
+                        )
+                with gr.Column():
+                    with gr.Row():
+                        submit = gr.Button("Submit", variant="primary")
+                        stop = gr.Button("Stop")
+                        clear = gr.Button("Clear")
+            gr.Examples(examples, inputs=msg, label="Click on any example and press the 'Submit' button")
+            retriever_argument = gr.Accordion("Retriever Configuration", open=True)
+            with retriever_argument:
+                with gr.Row():
+                    with gr.Row():
+                        do_rerank = gr.Checkbox(
+                            value=True,
+                            label="Rerank searching result",
+                            interactive=True,
+                        )
+                        # Passed to the bot callback as hide_full_prompt.
+                        hide_context = gr.Checkbox(
+                            value=True,
+                            label="Hide searching result in prompt",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        search_method = gr.Dropdown(
+                            ["similarity_score_threshold", "similarity", "mmr"],
+                            value="similarity_score_threshold",
+                            label="Searching Method",
+                            info="Method used to search vector store",
+                            multiselect=False,
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        score_threshold = gr.Slider(
+                            0.01,
+                            0.99,
+                            value=0.5,
+                            step=0.01,
+                            label="Similarity Threshold",
+                            info="Only working for 'similarity score threshold' method",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        vector_rerank_top_n = gr.Slider(
+                            1,
+                            10,
+                            value=2,
+                            step=1,
+                            label="Rerank top n",
+                            info="Number of rerank results",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        vector_search_top_k = gr.Slider(
+                            1,
+                            50,
+                            value=10,
+                            step=1,
+                            label="Search top k",
+                            info="Search top k must >= Rerank top n",
+                            interactive=True,
+                        )
+    # Event wiring. Clearing the file list only updates the status text.
+    docs.clear(clear_files, outputs=[langchain_status], queue=False)
+    # "Build Vector Store" re-indexes the uploaded documents with the current settings.
+    load_docs.click(
+        create_vectordb,
+        inputs=[docs, spliter, chunk_size, chunk_overlap, vector_search_top_k, vector_rerank_top_n, do_rerank, search_method, score_threshold],
+        outputs=[langchain_status],
+        queue=False,
+    )
+    # Pressing Enter in the message box and clicking Submit run the same two steps:
+    # `user` records the message, then `bot` streams the answer into the chatbot.
+    submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+        bot,
+        [chatbot, temperature, top_p, top_k, repetition_penalty, hide_context, do_rag],
+        chatbot,
+        queue=True,
+    )
+    submit_click_event = submit.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+        bot,
+        [chatbot, temperature, top_p, top_k, repetition_penalty, hide_context, do_rag],
+        chatbot,
+        queue=True,
+    )
+    # Stop calls request_cancel and cancels both pending submit events.
+    stop.click(
+        fn=request_cancel,
+        inputs=None,
+        outputs=None,
+        cancels=[submit_event, submit_click_event],
+        queue=False,
+    )
+    # Clear resets the chatbot to None.
+    clear.click(lambda: None, None, chatbot, queue=False)
+    # Any change to a retriever setting rebuilds the retriever on the existing vector store.
+    vector_search_top_k.release(
+        update_retriever,
+        [vector_search_top_k, vector_rerank_top_n, do_rerank, search_method, score_threshold],
+        outputs=[langchain_status],
+    )
+    vector_rerank_top_n.release(
+        update_retriever,
+        inputs=[vector_search_top_k, vector_rerank_top_n, do_rerank, search_method, score_threshold],
+        outputs=[langchain_status],
+    )
+    do_rerank.change(
+        update_retriever,
+        inputs=[vector_search_top_k, vector_rerank_top_n, do_rerank, search_method, score_threshold],
+        outputs=[langchain_status],
+    )
+    search_method.change(
+        update_retriever,
+        inputs=[vector_search_top_k, vector_rerank_top_n, do_rerank, search_method, score_threshold],
+        outputs=[langchain_status],
+    )
+    score_threshold.change(
+        update_retriever,
+        inputs=[vector_search_top_k, vector_rerank_top_n, do_rerank, search_method, score_threshold],
+        outputs=[langchain_status],
+    )
+# Enable the request queue, then start the app.
+demo.queue()
+# If you are launching remotely, specify server_name and server_port, e.g.:
+# demo.launch(server_port=8082)
+# share=True asks Gradio to create a publicly shareable link for the interface.
+demo.launch(share=True)
+# Read more in the docs: https://gradio.app/docs/
+# Alternative without a share link: demo.launch()