Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader, TextLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_openai import OpenAIEmbeddings | |
| from pinecone import Pinecone, ServerlessSpec | |
| from langchain_pinecone import PineconeVectorStore | |
| from typing import List | |
| from langchain_core.documents import Document | |
| import os | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
| # API keys | |
| PINECONE_API_KEY=os.getenv("PINECONE_API_KEY") | |
| OPENAI_API_KEY=os.getenv("OPENAI_API_KEY") | |
| # text splitter and embedding function | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len) | |
| embeddings = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=1024, api_key=OPENAI_API_KEY) | |
| # Pinecone vector store | |
| pc = Pinecone(api_key="pcsk_55CAuv_4KzXk8TYtxsauVoxwBsqqTgxWYmNJEEYrCpVxPqHcYbeS8njTWjZ14xCMEbksPS") | |
| def load_and_split_document(file_path: str) -> List[Document]: | |
| if file_path.endswith('.pdf'): | |
| loader = PyPDFLoader(file_path) | |
| elif file_path.endswith('.txt'): | |
| loader = TextLoader(file_path) | |
| else: | |
| raise ValueError(f"Unsupported file type: {file_path}") | |
| documents = loader.load() | |
| return text_splitter.split_documents(documents) | |
| INDEX_NAME = "smart-research-assistant" | |
| def create_pinecone_vectorstore()-> PineconeVectorStore: | |
| try: | |
| if not pc.has_index(INDEX_NAME): | |
| pc.create_index( | |
| name=INDEX_NAME, | |
| dimension=1024, | |
| metric="cosine", | |
| spec=ServerlessSpec(cloud="aws", region="us-east-1") | |
| ) | |
| index = pc.Index(INDEX_NAME) | |
| return PineconeVectorStore(index=index, embedding=embeddings) | |
| except Exception as e: | |
| print(f"Index initialization failed: {e}") | |
| raise | |
| vectorstore=create_pinecone_vectorstore() | |
| def index_document_to_pinecone(file_path: str, file_id: int) -> bool: | |
| try: | |
| splits = load_and_split_document(file_path) | |
| # Add metadata to each split | |
| for split in splits: | |
| split.metadata['file_id'] = file_id | |
| vectorstore.add_documents(splits) | |
| return True | |
| except Exception as e: | |
| print(f"Error indexing document: {e}") | |
| return False | |
| def delete_doc_from_pinecone(file_id: int): | |
| try: | |
| index = pc.Index(INDEX_NAME) | |
| # Query for all vectors with file_id metadata | |
| query_result = index.query( | |
| vector=[0.0]*1024, | |
| filter={"file_id": {"$eq": str(file_id)}}, | |
| top_k=10000, | |
| include_metadata=True | |
| ) | |
| ids = [match["id"] for match in query_result["matches"]] | |
| if ids: | |
| index.delete(ids=ids) | |
| return True | |
| except Exception as e: | |
| print(f"Error deleting from Pinecone: {str(e)}") | |
| return False | |