MBAL_chatbot

Running

App Files Files Community

ngcanh committed on Jul 17

Commit

d657a45

verified ·

1 Parent(s): ff8c629

Update app.py

Browse files

Files changed (1) hide show

app.py +204 -145

app.py CHANGED Viewed

@@ -1,149 +1,208 @@
-import streamlit as st
-from langchain.llms import HuggingFacePipeline
-from langchain.memory import ConversationBufferMemory
-from langchain.chains import ConversationalRetrievalChain
-from langchain.prompts.prompt import PromptTemplate
-from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from langchain.schema import Document
-from langchain_community.llms import HuggingFaceEndpoint
-from langchain.vectorstores import Chroma
-from transformers import TextStreamer
-from langchain.llms import HuggingFacePipeline
-from langchain.prompts import ChatPromptTemplate
-from langchain.llms import HuggingFaceHub
 import os
-import pandas as pd
-# from langchain.vectorstores import FAISS
-import subprocess
-from langchain_community.llms import HuggingFaceHub
-import pandas as pd
-# Configuración del modelo
 TOKEN=os.getenv('HF_TOKEN')
 subprocess.run(["huggingface-cli", "login", "--token", TOKEN, "--add-to-git-credential"])
-######
-# set this key as an environment variable
-os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
-# Initialize tokenizer
-@st.cache_resource
-def load_model():
-    TOKEN=os.getenv('HF_TOKEN')
-    subprocess.run(["huggingface-cli", "login", "--token", TOKEN, "--add-to-git-credential"])
-    os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
-    MODEL_NAME = "google/gemma-2b-it"
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME
-        # quantization_config=nf4_config, # add config
-        # torch_dtype=torch.bfloat16, # save memory using float16
-        # low_cpu_mem_usage=True,
-        # token= TOKEN
-    ).to("cuda")
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model_pipeline = pipeline(
-        'text-generation',
-        model=model,
-        tokenizer=tokenizer,
-        max_new_tokens=1024, # output token
-        device_map="auto" # auto allocate GPU if available
-    )
-    return HuggingFacePipeline(pipeline=model_pipeline)
-# Initialize embeddings
-@st.cache_resource
-def load_embeddings():
-    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/bkai-foundation-models/vietnamese-bi-encoder')
-    # embeddings = OpenAIEmbeddings()
-    return embeddings
-# Chroma Vector store
-@st.cache_resource
-def setup_vector():
-    chunks = []
-    df = pd.read_excel(r"chunk_metadata_template.xlsx")
-    for _, row in df.iterrows():
-        chunk_with_metadata = Document(
-            page_content=row['page_content'],
-            metadata={
-                'chunk_id': row['chunk_id'],
-                'document_title': row['document_title'],
-            }
-        )
-        chunks.append(chunk_with_metadata)
-    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/bkai-foundation-models/vietnamese-bi-encoder')
-    return Chroma.from_documents(chunks, embedding=embeddings)
-# Set up chain
-def setup_conversation_chain():
-    llm = load_model()
-    vector = setup_vector()
-    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-    template = """Bạn là một chuyên viên tư vấn cho khách hàng về sản phẩm bảo hiểm của công ty MB Ageas Life tại Việt Nam.
-    Hãy trả lời  chuyên nghiệp, chính xác, cung cấp thông tin trước rồi hỏi câu tiếp theo. Tất cả các thông tin cung cấp đều trong phạm vi MBAL. Khi có đủ thông tin khách hàng thì mới mời khách hàng đăng ký để nhận tư vấn trên https://www.mbageas.life/
-        {context}
-        Câu hỏi: {question}
-        Trả lời:"""
-    # PROMPT = ChatPromptTemplate.from_template(template=template)
-    # chain = ConversationalRetrievalChain.from_llm(
-    #     llm=llm,
-    #     retriever=vector.as_retriever(search_kwargs={'k': 5}),
-    #     memory=memory,
-    #     combine_docs_chain_kwargs={"prompt": PROMPT}
-    #     # condense_question_prompt=CUSTOM_QUESTION_PROMPT
-    # )
-    chain = (
-    {"context": vector.as_retriever(search_kwargs={'k': 5}) | format_docs, "question": RunnablePassthrough()}
-    | prompt
-    | llm
-    | parser
-    )
-    return chain
-# Streamlit
 def main():
-    st.title("🛡️ MBAL Chatbot 🛡️")
-    # Inicializar la cadena de conversación
-    if 'conversation_chain' not in st.session_state:
-        st.session_state.conversation_chain = setup_conversation_chain()
-    # Mostrar mensajes del chat
-    if 'messages' not in st.session_state:
-        st.session_state.messages = []
-    for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
-    # Campo de entrada para el usuario
-    if prompt := st.chat_input("Bạn cần tư vấn về điều gì? Hãy chia sẻ nhu cầu và thông tin của bạn nhé!"):
-        st.session_state.messages.append({"role": "user", "content": prompt})
-        with st.chat_message("user"):
-            st.markdown(prompt)
-        with st.chat_message("assistant"):
-            message_placeholder = st.empty()
-            full_response = ""
-            # Generar respuesta
-            response = st.session_state.conversation_chain({"question": prompt, "chat_history": []})
-            full_response = response['answer']
-            # full_response = response.get("answer", "No response generated.")
-            message_placeholder.markdown(full_response)
-        st.session_state.messages.append({"role": "assistant", "content": full_response})
-# if __name__ == "__main__":
-main()

 import os
+import streamlit as st
+from openai import AzureOpenAI
+import PyPDF2
+import openai
+from io import BytesIO
+from typing import List, Dict
+from dotenv import load_dotenv
# Load environment variables.
# `subprocess` is imported here because the login call below needs it and the
# file's import block does not bring it into scope.
import subprocess

# Merge a local .env file (if any) into the process environment; by default
# python-dotenv leaves variables that are already set untouched.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API")
TOKEN = os.getenv('HF_TOKEN')

# Only attempt the Hugging Face CLI login when a token is configured: a None
# entry in the argument list makes subprocess.run raise TypeError.
if TOKEN:
    subprocess.run(["huggingface-cli", "login", "--token", TOKEN, "--add-to-git-credential"])

# NOTE(review): this Streamlit call executes at import time, i.e. before
# main() reaches st.set_page_config(). Streamlit releases that require
# set_page_config to be the first Streamlit command will reject that order --
# confirm against the deployed Streamlit version.
st.sidebar.title("Welcome to MBAL Chatbot")
class PDFChatbot:
    """Answer questions about an uploaded insurance PDF through the OpenAI chat API.

    The PDF text is split into word-aligned chunks, the chunks sharing the most
    keywords with the question are selected as context, and that context is
    sent to the chat-completions endpoint together with recent conversation
    turns.

    Attributes:
        azure_client: ``openai.OpenAI`` client used for completions. The
            attribute name is kept for backward compatibility even though the
            client is not an Azure client.
        model_name: Model identifier string passed as ``model=`` on each request.
        conversation_history: Chronological list of ``{"role", "content"}``
            dicts holding the raw user questions and assistant answers.
        pdf_content: Text of the most recently processed PDF.
    """

    # Characters trimmed from both ends of a word before keyword matching, so
    # that e.g. "deductible?" in a question matches "deductible" in the policy.
    _EDGE_PUNCTUATION = ".,;:!?()[]{}\"'"

    # Number of trailing history messages replayed to the model per request.
    _HISTORY_WINDOW = 6

    _SYSTEM_PROMPT = """You are an experienced insurance agent assistant who helps customers understand their insurance policies and coverage details. Follow these guidelines:
                   1. Only provide information based on the PDF content provided
                   2. If the answer is not in the PDF, clearly state that the information is not available in the document
                   3. Provide clear, concise, and helpful responses in a professional manner
                   4. Always respond in English using proper grammar and formatting
                   5. When possible, reference specific sections or clauses from the policy
                   6. Use insurance terminology appropriately but explain complex terms when necessary
                   7. Be empathetic and patient, as insurance can be confusing for customers
                   8. If asked about claims, coverage limits, deductibles, or policy terms, provide accurate information from the document
                   9. Always prioritize customer understanding and satisfaction
                   10. If multiple interpretations are possible, explain the different scenarios clearly
                   Remember: You are here to help customers understand their insurance coverage better."""

    def __init__(self):
        """Create the API client and start with empty conversation and PDF state."""
        # With api_key=None the OpenAI SDK falls back to the OPENAI_API_KEY
        # environment variable, so an unset OPENAI_API behaves like OpenAI().
        self.azure_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        # chat.completions.create() expects the model *name*, not a client object.
        self.model_name = "gpt-3.5-turbo-0125"
        self.conversation_history = []
        self.pdf_content = ""

    def extract_text_from_pdf(self, pdf_file):
        """Extract the text of every page of *pdf_file*.

        Args:
            pdf_file: Path or binary file-like object accepted by
                ``PyPDF2.PdfReader``.

        Returns:
            The page texts joined with newlines and stripped, or ``None`` when
            the PDF cannot be read (the error is reported via ``st.error``).
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # `or ""` is defensive: a page without extractable text must not
            # break the concatenation.
            page_texts = [(page.extract_text() or "") + "\n" for page in pdf_reader.pages]
        except Exception as e:
            # UI boundary: surface any parser failure to the user instead of crashing.
            st.error(f"Error reading PDF: {str(e)}")
            return None
        return "".join(page_texts).strip()

    def chunk_text(self, text: str, chunk_size: int = 3000) -> List[str]:
        """Split *text* into whitespace-normalised chunks of at most *chunk_size* characters.

        Words are never split and never dropped: a single word longer than
        *chunk_size* becomes a chunk of its own.

        Args:
            text: Text to split; words are separated on any whitespace.
            chunk_size: Maximum length of each space-joined chunk.

        Returns:
            The chunks in document order; an empty list for blank input.
        """
        chunks: List[str] = []
        current_words: List[str] = []
        current_length = 0  # exact length of " ".join(current_words)
        for word in text.split():
            # One extra character for the joining space, except for the first word.
            projected = current_length + len(word) + (1 if current_words else 0)
            if current_words and projected > chunk_size:
                chunks.append(" ".join(current_words))
                current_words = [word]
                current_length = len(word)
            else:
                current_words.append(word)
                current_length = projected
        if current_words:
            chunks.append(" ".join(current_words))
        return chunks

    @classmethod
    def _keywords(cls, text: str) -> set:
        """Return the lower-cased words of *text* with edge punctuation removed."""
        trimmed = (word.strip(cls._EDGE_PUNCTUATION) for word in text.lower().split())
        return {word for word in trimmed if word}

    def get_relevant_context(self, query: str, chunks: List[str], max_chunks: int = 3) -> str:
        """Select the chunks that share the most distinct keywords with *query*.

        Args:
            query: The user's question.
            chunks: Candidate text chunks in document order.
            max_chunks: Maximum number of chunks to return.

        Returns:
            The top chunks joined by blank lines. Chunks with equal scores keep
            their document order; chunks with no overlap are still eligible, so
            some context is returned whenever *chunks* is non-empty.
        """
        query_words = self._keywords(query)
        # sorted() is stable, also with reverse=True, so ties keep document order.
        ranked = sorted(
            chunks,
            key=lambda chunk: len(query_words & self._keywords(chunk)),
            reverse=True,
        )
        return "\n\n".join(ranked[:max_chunks])

    def _build_messages(self, user_question: str, relevant_context: str) -> List[Dict[str, str]]:
        """Assemble the chat payload: system prompt, recent history, then the current turn."""
        current_turn = f"""Insurance Document Content:
{relevant_context}
Customer Question: {user_question}
Please provide a helpful response based on the insurance document content above."""
        return [
            {"role": "system", "content": self._SYSTEM_PROMPT},
            # Earlier turns go *before* the new question so the model reads the
            # conversation in chronological order.
            *self.conversation_history[-self._HISTORY_WINDOW:],
            {"role": "user", "content": current_turn},
        ]

    def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
        """Answer *user_question* from *pdf_content* using the chat-completions API.

        Args:
            user_question: The question typed by the user.
            pdf_content: Full extracted text of the policy document.

        Returns:
            The assistant's reply, or a string starting with
            ``"Error generating response:"`` if anything fails. The
            conversation history is only extended on success.
        """
        try:
            chunks = self.chunk_text(pdf_content)
            relevant_context = self.get_relevant_context(user_question, chunks)
            response = self.azure_client.chat.completions.create(
                model=self.model_name,
                messages=self._build_messages(user_question, relevant_context),
                max_tokens=1000,
                temperature=0.7,
            )
            bot_response = response.choices[0].message.content
        except Exception as e:
            # UI boundary: the caller renders the returned string, so report
            # the failure as text rather than raising into Streamlit.
            return f"Error generating response: {str(e)}"
        # Only the raw question is stored; the document context is re-selected per turn.
        self.conversation_history.append({"role": "user", "content": user_question})
        self.conversation_history.append({"role": "assistant", "content": bot_response})
        return bot_response
 def main():
     """Render the Streamlit page for the insurance PDF assistant.

     The sidebar handles PDF upload/processing and clearing the conversation.
     Once a PDF has been processed the main area shows the chat; before that
     it shows a hint, example questions and general tips.
     """
     st.set_page_config(page_title="Insurance PDF Chatbot", page_icon="🛡️", layout="wide")
     st.title("🛡️ Insurance Policy Assistant")
     st.markdown("Upload your insurance policy PDF and ask questions about your coverage, claims, deductibles, and more!")

     state = st.session_state

     # First run of the session: create the chatbot and the UI bookkeeping.
     if 'chatbot' not in state:
         state.chatbot = PDFChatbot()
         state.pdf_processed = False
         state.chat_history = []

     # Sidebar: document upload plus the reset button.
     with st.sidebar:
         st.header("📁 Upload Insurance Document")
         uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
         # The button is only rendered once a file has been chosen.
         if uploaded_file is not None and st.button("Process PDF"):
             with st.spinner("Processing your insurance document..."):
                 text_content = state.chatbot.extract_text_from_pdf(uploaded_file)
                 if not text_content:
                     st.error("Failed to process PDF")
                 else:
                     state.chatbot.pdf_content = text_content
                     state.pdf_processed = True
                     st.success("Insurance document processed successfully!")
                     # Short preview so the user can check the right file was read.
                     st.subheader("Document Preview")
                     if len(text_content) > 500:
                         preview = text_content[:500] + "..."
                     else:
                         preview = text_content
                     st.text_area("First 500 characters:", preview, height=100)

         if st.button("Clear Conversation"):
             state.chatbot.conversation_history = []
             state.chat_history = []
             st.rerun()

     # Nothing processed yet: show guidance instead of the chat.
     if not state.pdf_processed:
         st.info("👆 Please upload and process an insurance PDF document to start chatting!")
         st.subheader("Example questions you can ask:")
         st.markdown("""
        - What is my coverage limit for property damage?
        - What is my deductible amount?
        - What types of incidents are covered under this policy?
        - What is excluded from my coverage?
        - How do I file a claim?
        - What is the process for claim settlement?
        - What are my premium payment options?
        - When does my policy expire?
        - Is flood damage covered?
        - What documentation do I need for a claim?
        """)
         st.subheader("💡 Insurance Tips")
         st.markdown("""
        - Review your policy regularly to understand your coverage
        - Keep your policy documents in a safe place
        - Update your coverage when your circumstances change
        - Document any incidents immediately
        - Contact your insurance agent if you have questions
        """)
         return

     # Chat view: replay earlier exchanges, then handle a new question.
     st.header("💬 Ask About Your Insurance Policy")
     for question, answer in state.chat_history:
         with st.container():
             st.markdown(f"**You:** {question}")
             st.markdown(f"**Insurance Assistant:** {answer}")
             st.divider()

     user_question = st.chat_input("Ask about your insurance coverage, claims, deductibles, or any policy details...")
     if user_question:
         with st.spinner("Analyzing your policy..."):
             response = state.chatbot.chat_with_pdf(user_question, state.chatbot.pdf_content)
             state.chat_history.append((user_question, response))
             # Show the new exchange right away; later reruns replay it from chat_history.
             st.markdown(f"**You:** {user_question}")
             st.markdown(f"**Insurance Assistant:** {response}")
+if __name__ == "__main__":
+   main()