# constitutional_bot/load_data.py
from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import glob


def load_and_split():
    """
    Load documents from various sources and split into chunks with metadata preservation.

    Returns:
        list: List of Document objects with text and metadata
    """
    documents = []
    # Load all PDFs in the data folder
    pdf_files = glob.glob("data/*.pdf")
    for file in pdf_files:
        loader = PyPDFLoader(file)
        docs = loader.load()
        # Add file source to metadata
        for doc in docs:
            doc.metadata["source"] = os.path.basename(file)
        documents.extend(docs)

    # Load all text files
    txt_files = glob.glob("data/*.txt")
    for file in txt_files:
        loader = TextLoader(file)
        docs = loader.load()
        # Add metadata
        for doc in docs:
            doc.metadata["source"] = os.path.basename(file)
        documents.extend(docs)

    # Load all CSV files; a malformed CSV is logged and skipped rather than
    # aborting the whole load
    csv_files = glob.glob("data/*.csv")
    for file in csv_files:
        try:
            loader = CSVLoader(file)
            docs = loader.load()
            # Add metadata
            for doc in docs:
                doc.metadata["source"] = os.path.basename(file)
            documents.extend(docs)
        except Exception as e:
            print(f"Error loading CSV file {file}: {e}")
print(f"Loaded {len(documents)} documents from {len(pdf_files)} PDFs, {len(txt_files)} TXTs, and {len(csv_files)} CSVs")
# Split documents into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000, # Increased chunk size for better context
chunk_overlap=200, # Increased overlap
separators=["\n\n", "\n", " ", ""]
)
split_docs = text_splitter.split_documents(documents)
print(f"Split into {len(split_docs)} chunks")
return split_docs
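

# A minimal downstream sketch, not part of the original file: index the chunks
# in a vector store so a retrieval bot can query them. The Chroma store, the
# sentence-transformers model name, and the persist directory below are
# illustrative assumptions, not taken from this repo.
def build_retriever_sketch(split_docs):
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import Chroma

    # Embed every chunk and persist the index locally (model and path are assumptions)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = Chroma.from_documents(split_docs, embeddings, persist_directory="chroma_db")
    # Hand back a retriever that returns the 4 most similar chunks per query
    return vectordb.as_retriever(search_kwargs={"k": 4})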


if __name__ == "__main__":
    documents = load_and_split()
    print(f"Loaded and split {len(documents)} text chunks from all documents!")