Spaces:

mohcineelharras
/

llama-index-docs-spaces

Runtime error

App Files Files Community

mohcineelharras committed on Nov 21, 2023

Commit

77b04d1

1 Parent(s): e6e7a99

templates done

Browse files

Files changed (3) hide show

README.md +2 -1
app.py +114 -54
data/doctest.txt +4 -2

README.md CHANGED Viewed

@@ -9,4 +9,5 @@ app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 ---
+How to pick chunks that are pertinent?
+How to stream the response word by word?

app.py CHANGED Viewed

@@ -12,6 +12,8 @@ from llama_index.embeddings import InstructorEmbedding
 from llama_index import ServiceContext, VectorStoreIndex, SimpleDirectoryReader
 from tqdm.notebook import tqdm
 from dotenv import load_dotenv
 # --------------------------------env variables-----------------------------------
@@ -22,12 +24,92 @@ no_proxy = os.getenv("no_proxy")
 OPENAI_API_KEY =  os.getenv("OPENAI_API_KEY")
 OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
 # --------------------------------cache LLM-----------------------------------
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
 llama_debug = LlamaDebugHandler(print_trace_on_end=True)
 callback_manager = CallbackManager([llama_debug])
 # LLM
 @st.cache_resource
 def load_llm_model():
@@ -40,7 +122,7 @@ def load_llm_model():
         model_path="models/dolphin-2.1-mistral-7b.Q4_K_S.gguf",
         temperature=0.0,
         max_new_tokens=100,
-        context_window=1024,
         generate_kwargs={},
         model_kwargs={"n_gpu_layers": 20},
         messages_to_prompt=messages_to_prompt,
@@ -49,8 +131,6 @@ def load_llm_model():
     )
     return llm
-llm = load_llm_model()
 # --------------------------------cache Embedding model-----------------------------------
 @st.cache_resource
@@ -62,14 +142,13 @@ def load_emb_model():
     embed_model_inst = InstructorEmbedding("models/hkunlp_instructor-base"
         #model_name="hkunlp/instructor-base"
         )
-    service_context = ServiceContext.from_defaults(embed_model=embed_model_inst, llm=llm)
     documents = SimpleDirectoryReader("data").load_data()
     print(f"Number of documents: {len(documents)}")
     index = VectorStoreIndex.from_documents(
         documents, service_context=service_context, show_progress=True)
-    return index.as_query_engine()
-query_engine = load_emb_model()
 # ------------------------------------layout----------------------------------------
@@ -77,7 +156,7 @@ with st.sidebar:
     api_server_info = st.text_input("Local LLM API server", OPENAI_API_BASE ,key="openai_api_base")
     st.title("🤖 Llama Index 📚")
     if st.button('Clear Memory'):
-        st.session_state.memory = ""
     st.write("Local LLM API server in this demo is useles, we are loading local model using llama_index integration of llama cpp")
     st.write("🚀 This app allows you to chat with local LLM using api server or loaded in cache")
     st.subheader("💻 System Requirements: ")
@@ -89,43 +168,26 @@ with st.sidebar:
 # Define your app's tabs
 tab1, tab2, tab3 = st.tabs(["LLM only", "LLM RAG QA with database", "One single document Q&A"])
-# -----------------------------------LLM only---------------------------------------------
 if 'memory' not in st.session_state:
     st.session_state.memory = ""
-#token_count = 0
 with tab1:
     st.title("💬 LLM only")
     prompt = st.text_input(
         "Ask your question here",
-        placeholder="Who is Lionel Messi",
-    )
-    template = (
-        "system\n"
-        "You are Dolphin, a helpful AI assistant. Your responses should be based solely on the content of documents you have access to. "
-        "Do not provide information that is not contained in the documents. "
-        "If a question is asked about content not in the documents, respond with 'I do not have that information.' "
-        "Always respond in the same language as the question was asked. Be concise.\n"
-        "user\n"
-        "{prompt}\n"
-        "assistant\n"
     )
     if prompt:
         contextual_prompt = st.session_state.memory + "\n" + prompt
-        formatted_prompt = template.format(prompt=contextual_prompt)
-        response = llm.complete(formatted_prompt,max_tokens=100, temperature=0, top_p=0.95, top_k=10)
-        #print(response)
         text_response = response
-        #---------------------------------------------
-        # text_response = response["choices"][0]["text"]
-        # token_count += response["usage"]["total_tokens"]
-        # st.write("LLM's Response:\n", text_response)
-        # st.write("Token count:\n", token_count)
-        #---------------------------------------------
-        st.write("LLM's Response:\n",text_response)
         st.session_state.memory = f"Prompt: {contextual_prompt}\nResponse:\n {text_response}"
-        #st.write("Memory:\n", memory)
         with open("short_memory.txt", 'w') as file:
             file.write(st.session_state.memory)
@@ -133,34 +195,30 @@ with tab1:
 with tab2:
     st.title("💬 LLM RAG QA with database")
-    st.write("To consult files that are available in the database, go to https://huggingface.co/spaces/mohcineelharras/llama-index-docs-spaces/blob/main/data")
     prompt = st.text_input(
         "Ask your question here",
         placeholder="How does the blockchain work ?",
     )
     if prompt:
-        response = query_engine.query(prompt)
-        st.write("Your prompt: ", prompt)
-        st.write("LLM's Response:\n"+ response.response)
         with st.expander("Document Similarity Search"):
             for i, node in enumerate(response.source_nodes):
                 dict_source_i = node.node.metadata
                 dict_source_i.update({"Text":node.node.text})
                 st.write("Source n°"+str(i+1), dict_source_i)
-                st.write()
-# -----------------------------------Upload File Q&A-----------------------------------------
-def load_emb_uploaded_document(filename):
-    # You may want to add a check to prevent execution during initialization.
-    if 'init' in st.session_state:
-        embed_model_inst = InstructorEmbedding("models/hkunlp_instructor-base")
-        service_context = ServiceContext.from_defaults(embed_model=embed_model_inst, llm=llm)
-        documents = SimpleDirectoryReader(input_files=[filename]).load_data()
-        index = VectorStoreIndex.from_documents(
-            documents, service_context=service_context, show_progress=True)
-        return index.as_query_engine()
-    return None
 with tab3:
     st.title("📝 One single document Q&A with Llama Index using local open llms")
@@ -190,11 +248,13 @@ with tab3:
         st.write("File ",uploaded_file.name, "was loaded successfully")
     if uploaded_file and question and api_server_info:
-        response = prompt = f"""Based on the context presented. Respond to the question below to the best of your ability.
-        \n\n{question}"""
-        response = query_engine.query(prompt)
         st.write("### Answer")
-        st.write(response.response)
         with st.expander("Document Similarity Search"):
             #st.write(len(response.source_nodes))
             for i, node in enumerate(response.source_nodes):

 from llama_index import ServiceContext, VectorStoreIndex, SimpleDirectoryReader
 from tqdm.notebook import tqdm
 from dotenv import load_dotenv
+from llama_index.llms import ChatMessage, MessageRole
+from llama_index.prompts import ChatPromptTemplate
 # --------------------------------env variables-----------------------------------
 OPENAI_API_KEY =  os.getenv("OPENAI_API_KEY")
 OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
+# Text QA Prompt
+chat_text_qa_msgs = [
+    ChatMessage(
+        role=MessageRole.SYSTEM,
+        content=(
+            "You are Dolphin, a helpful AI assistant. "
+            "Answer questions based solely on the context provided. "
+            "Do not use information outside of the context. "
+            "Respond in the same language as the question. Be concise."
+        ),
+    ),
+    ChatMessage(
+        role=MessageRole.USER,
+        content=(
+            "Context information is below:\n"
+            "---------------------\n"
+            "{context_str}\n"
+            "---------------------\n"
+            "Based on this context, answer the question: {query_str}\n"
+        ),
+    ),
+]
+text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)
+# Refine Prompt
+chat_refine_msgs = [
+    ChatMessage(
+        role=MessageRole.SYSTEM,
+        content=(
+            "You are Dolphin, focused on refining answers with additional context. "
+            "Use new context to refine the answer. "
+            "If the new context isn't useful, restate the original answer. "
+            "Be precise and match the language of the query."
+        ),
+    ),
+    ChatMessage(
+        role=MessageRole.USER,
+        content=(
+            "New context for refinement:\n"
+            "------------\n"
+            "{context_msg}\n"
+            "------------\n"
+            "Refine the original answer with this context for the question: {query_str}. "
+            "Original Answer: {existing_answer}"
+        ),
+    ),
+]
+refine_template = ChatPromptTemplate(chat_refine_msgs)
+template = (
+    "system\n"
+    "\"You are Dolphin, a helpful AI assistant. Your responses should be based solely on the content of documents you have access to, "
+    "including the specific context provided below. Do not provide information that is not contained in the documents or the context. "
+    "If a question is asked about content not in the documents or context, respond with 'I do not have that information.' "
+    "Always respond in the same language as the question was asked. Be concise.\n"
+    "Respond to the best of your ability. Try to respond in markdown.\"\n"
+    "context\n"
+    "{context}\n"
+    "user\n"
+    "{prompt}\n"
+    "assistant\n"
+)
 # --------------------------------cache LLM-----------------------------------
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
 llama_debug = LlamaDebugHandler(print_trace_on_end=True)
 callback_manager = CallbackManager([llama_debug])
+#One doc embedding
+def load_emb_uploaded_document(filename):
+    # You may want to add a check to prevent execution during initialization.
+    if 'init' in st.session_state:
+        embed_model_inst = InstructorEmbedding("models/hkunlp_instructor-base")
+        service_context = ServiceContext.from_defaults(embed_model=embed_model_inst, llm=llm, chunk_size_limit=500)
+        documents = SimpleDirectoryReader(input_files=[filename]).load_data()
+        index = VectorStoreIndex.from_documents(
+            documents, service_context=service_context, show_progress=True)
+        return index.as_query_engine(text_qa_template=text_qa_template, refine_template=refine_template)
+    return None
 # LLM
 @st.cache_resource
 def load_llm_model():
         model_path="models/dolphin-2.1-mistral-7b.Q4_K_S.gguf",
         temperature=0.0,
         max_new_tokens=100,
+        context_window=2048,
         generate_kwargs={},
         model_kwargs={"n_gpu_layers": 20},
         messages_to_prompt=messages_to_prompt,
     )
     return llm
 # --------------------------------cache Embedding model-----------------------------------
 @st.cache_resource
     embed_model_inst = InstructorEmbedding("models/hkunlp_instructor-base"
         #model_name="hkunlp/instructor-base"
         )
+    service_context = ServiceContext.from_defaults(embed_model=embed_model_inst,
+                                                   llm=llm)
     documents = SimpleDirectoryReader("data").load_data()
     print(f"Number of documents: {len(documents)}")
     index = VectorStoreIndex.from_documents(
         documents, service_context=service_context, show_progress=True)
+    return index.as_query_engine(text_qa_template=text_qa_template, refine_template=refine_template)
 # ------------------------------------layout----------------------------------------
     api_server_info = st.text_input("Local LLM API server", OPENAI_API_BASE ,key="openai_api_base")
     st.title("🤖 Llama Index 📚")
     if st.button('Clear Memory'):
+        del st.session_state["memory"]
     st.write("Local LLM API server in this demo is useles, we are loading local model using llama_index integration of llama cpp")
     st.write("🚀 This app allows you to chat with local LLM using api server or loaded in cache")
     st.subheader("💻 System Requirements: ")
 # Define your app's tabs
 tab1, tab2, tab3 = st.tabs(["LLM only", "LLM RAG QA with database", "One single document Q&A"])
 if 'memory' not in st.session_state:
     st.session_state.memory = ""
+llm = load_llm_model()
+query_engine = load_emb_model()
+# -----------------------------------LLM only---------------------------------------------
 with tab1:
     st.title("💬 LLM only")
     prompt = st.text_input(
         "Ask your question here",
+        placeholder="Who is Mohcine",
     )
     if prompt:
         contextual_prompt = st.session_state.memory + "\n" + prompt
+        response = llm.complete(prompt,max_tokens=100, temperature=0, top_p=0.95, top_k=10)
         text_response = response
+        st.write("### Answer")
+        st.markdown(text_response)
         st.session_state.memory = f"Prompt: {contextual_prompt}\nResponse:\n {text_response}"
         with open("short_memory.txt", 'w') as file:
             file.write(st.session_state.memory)
 with tab2:
     st.title("💬 LLM RAG QA with database")
+    st.write("To consult files that are available in the database, go to https://huggingface.co/spaces/mohcineelharras/llama-index-docs-spaces/tree/main/data")
     prompt = st.text_input(
         "Ask your question here",
         placeholder="How does the blockchain work ?",
     )
     if prompt:
+        contextual_prompt = st.session_state.memory + "\n" + prompt
+        response = query_engine.query(contextual_prompt)
+        text_response = response.response
+        st.write("### Answer")
+        st.markdown(text_response)
+        st.session_state.memory = f"Prompt: {contextual_prompt}\nResponse:\n {text_response}"
         with st.expander("Document Similarity Search"):
             for i, node in enumerate(response.source_nodes):
                 dict_source_i = node.node.metadata
                 dict_source_i.update({"Text":node.node.text})
                 st.write("Source n°"+str(i+1), dict_source_i)
+                break
+        st.session_state.memory = f"Prompt: {contextual_prompt}\nResponse:\n {text_response}"
+        with open("short_memory.txt", 'w') as file:
+            file.write(st.session_state.memory)
+# -----------------------------------Upload File Q&A-----------------------------------------
 with tab3:
     st.title("📝 One single document Q&A with Llama Index using local open llms")
         st.write("File ",uploaded_file.name, "was loaded successfully")
     if uploaded_file and question and api_server_info:
+        contextual_prompt = st.session_state.memory + "\n" + question
+        response = query_engine.query(contextual_prompt)
+        text_response = response.response
         st.write("### Answer")
+        st.session_state.memory = f"Prompt: {contextual_prompt}\nResponse:\n {text_response}"
+        with open("short_memory.txt", 'w') as file:
+            file.write(st.session_state.memory)
         with st.expander("Document Similarity Search"):
             #st.write(len(response.source_nodes))
             for i, node in enumerate(response.source_nodes):

data/doctest.txt CHANGED Viewed

@@ -1,3 +1,5 @@
-Hi my name is Mohcine,
 I am 25 years old
-I am a freelancer

+Hi my name is Mohcine
 I am 25 years old
+I am a freelancer
+I am interested in crypto
+I worked at EDF and Enedis