"""DataKlug: a Streamlit app for chatting with an uploaded PDF.

Flow of one script run:
  1. Read the DeepSeek API key from Streamlit secrets (halt if missing).
  2. Let the user upload a PDF; index it into an in-memory FAISS store that is
     kept in ``st.session_state`` so it survives Streamlit reruns.
  3. Answer free-text questions with a RetrievalQA chain backed by DeepSeek.
"""

import hashlib
import os
import tempfile

import streamlit as st
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
DEEPSEEK_API_BASE = "https://api.deepseek.com/v1"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50


def _build_vector_store(pdf_bytes):
    """Index the given PDF bytes and return a FAISS vector store.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.

    Returns:
        A FAISS store built from the PDF's text chunks.

    Raises:
        ValueError: If no text chunks could be produced from the PDF.
        Exception: Whatever the loader, splitter or embedding step raises.
    """
    # PyPDFLoader is given a filesystem path, so the upload is spilled to a
    # named temp file; delete=False lets the loader reopen it after we close it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_file_path = tmp_file.name

    try:
        documents = PyPDFLoader(tmp_file_path).load()
    finally:
        # Always remove the temp file, even when the PDF cannot be parsed.
        os.unlink(tmp_file_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        # NOTE(review): an empty chunk list (e.g. a scanned, image-only PDF) is
        # expected to fail inside the index build with an unhelpful error, so
        # report it explicitly instead.
        raise ValueError("El PDF no contiene texto extraíble.")

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    return FAISS.from_documents(chunks, embeddings)


# Application title and welcome text.
st.title("✨ DataKlug ")
st.markdown(
    """ """,
    unsafe_allow_html=True
)
st.markdown("### Bienvenido a tu asistente DataKlug")
st.markdown("Sube tu PDF y conversa con él utilizando inteligencia artificial. ¡Disfruta de la experiencia!")

# The API key comes from Streamlit secrets; without it the app cannot continue.
api_key = st.secrets.get("DEEPSEEK_API_KEY", None)
if not api_key:
    st.error("No se encontró la variable `DEEPSEEK_API_KEY` en los secretos de Streamlit. Por favor, configúrala antes de continuar.")
    st.stop()

# Also expose the key through the environment (optional).
os.environ["DEEPSEEK_API_KEY"] = api_key

# Step 1: upload the PDF document.
st.markdown("### 1. Sube un documento PDF para analizar")
uploaded_file = st.file_uploader("Arrastra o haz clic para subir un PDF", type=["pdf"])

# Session state keeps the vector store (and the digest of the PDF it was built
# from) across reruns.
if "vector_store" not in st.session_state:
    st.session_state.vector_store = None
if "vector_store_digest" not in st.session_state:
    st.session_state.vector_store_digest = None

if uploaded_file is not None:
    pdf_bytes = uploaded_file.getvalue()
    pdf_digest = hashlib.sha256(pdf_bytes).hexdigest()

    # Re-index when nothing is indexed yet OR when the uploaded content differs
    # from what was indexed; otherwise a second upload would be ignored and
    # answers would keep coming from the first document.
    needs_indexing = (
        st.session_state.vector_store is None
        or st.session_state.vector_store_digest != pdf_digest
    )
    if needs_indexing:
        try:
            with st.spinner("Procesando tu documento, por favor espera..."):
                new_store = _build_vector_store(pdf_bytes)
        except Exception as e:
            # Top-level UI boundary: show the failure and halt this run.
            st.error(f"Error al procesar el documento: {e}")
            st.stop()
        else:
            st.session_state.vector_store = new_store
            st.session_state.vector_store_digest = pdf_digest
            st.success("¡Documento procesado con éxito!")

# Step 2: ask questions about the document.
if st.session_state.vector_store is not None:
    st.markdown("### 2. Chatea con tu documento")
    user_query = st.text_input("Escribe tu pregunta aquí:")

    # Skip empty / whitespace-only input so no LLM call is made for it.
    if user_query and user_query.strip():
        try:
            # Retrieval + generation (RAG) against DeepSeek's OpenAI-compatible
            # endpoint.
            retriever = st.session_state.vector_store.as_retriever()
            llm = ChatOpenAI(
                model="deepseek-chat",
                openai_api_key=api_key,
                openai_api_base=DEEPSEEK_API_BASE,
                temperature=0.85,
                max_tokens=1000,  # Adjust this value to your needs.
            )
            qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

            with st.spinner("Generando respuesta..."):
                # `invoke` replaces the deprecated `Chain.run`; RetrievalQA
                # takes the question under "query" and returns the answer
                # under "result".
                response = qa_chain.invoke({"query": user_query})["result"]

            st.write(f"**Respuesta:** {response}")
        except Exception as e:
            st.error(f"Error al generar la respuesta: {e}")
else:
    st.info("Por favor, sube tu PDF para comenzar.")