# Shedify / pdf_processor.py
import os
import json
import argparse
from pathlib import Path
from typing import List, Dict, Any

try:
    from PyPDF2 import PdfReader
    from tqdm import tqdm
except ImportError:
    print("Installing required dependencies...")
    import subprocess
    import sys

    # Install into the current interpreter (rather than relying on a bare "pip"
    # on PATH) so the re-import below sees the newly installed packages.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2", "tqdm"])
    from PyPDF2 import PdfReader
    from tqdm import tqdm


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF file."""
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            # extract_text() can yield nothing for pages without extractable text
            # (e.g. scanned images), so fall back to an empty string.
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""


def process_pdfs(pdf_dir: str, output_dir: str, chunk_size: int = 1000) -> List[Dict[str, Any]]:
    """Process all PDFs in a directory and save the extracted text."""
    pdf_files = list(Path(pdf_dir).glob("*.pdf"))
    if not pdf_files:
        raise ValueError(f"No PDF files found in {pdf_dir}")

    os.makedirs(output_dir, exist_ok=True)
    all_data = []

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            file_name = pdf_file.stem
            print(f"Processing {file_name}")
            text = extract_text_from_pdf(str(pdf_file))
            if not text.strip():
                print(f"Warning: No text extracted from {file_name}")
                continue

            # Split into word-based chunks to avoid context length issues
            words = text.split()
            for i in range(0, len(words), chunk_size):
                chunk = " ".join(words[i:i + chunk_size])
                if len(chunk.strip()) > 100:  # Ensure chunk has enough content
                    data_point = {
                        "text": chunk,
                        "source": file_name,
                        "chunk_id": i // chunk_size
                    }
                    all_data.append(data_point)
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

    # Save all data to a single JSON file
    with open(os.path.join(output_dir, "pdf_data.json"), "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print(f"Processed {len(pdf_files)} PDFs into {len(all_data)} text chunks")
    return all_data
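

# For reference, each record written to pdf_data.json has the shape sketched below.
# The values here are illustrative only; the real "source" and "text" depend on the
# PDFs that were processed.
#
#   {
#     "text": "first ~chunk_size words of extracted text ...",
#     "source": "example_document",
#     "chunk_id": 0
#   }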


def prepare_training_data(pdf_data: List[Dict[str, Any]], output_dir: str):
    """Prepare data in the format needed for fine-tuning LLMs."""
    training_data = []
    for item in pdf_data:
        # Format for instruction fine-tuning
        train_item = {
            "instruction": "Use the following text from the document to answer questions or generate content about the topics it covers.",
            "input": item["text"][:500],   # Use beginning of text as input
            "output": item["text"][500:],  # Use rest of text as output
        }
        training_data.append(train_item)

    # Create train/validation split (90/10)
    split_idx = int(len(training_data) * 0.9)
    train_data = training_data[:split_idx]
    val_data = training_data[split_idx:]

    # Save splits
    os.makedirs(os.path.join(output_dir, "training_data"), exist_ok=True)
    with open(os.path.join(output_dir, "training_data", "train.json"), "w", encoding="utf-8") as f:
        json.dump(train_data, f, ensure_ascii=False, indent=2)
    with open(os.path.join(output_dir, "training_data", "validation.json"), "w", encoding="utf-8") as f:
        json.dump(val_data, f, ensure_ascii=False, indent=2)

    print(f"Created training dataset: {len(train_data)} train, {len(val_data)} validation examples")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process PDFs and prepare training data")
    parser.add_argument("--pdf_dir", type=str, required=True, help="Directory containing PDF files")
    parser.add_argument("--output_dir", type=str, default="processed_data", help="Output directory for processed data")
    parser.add_argument("--chunk_size", type=int, default=1000, help="Number of words per chunk")
    args = parser.parse_args()

    pdf_data = process_pdfs(args.pdf_dir, args.output_dir, args.chunk_size)
    prepare_training_data(pdf_data, args.output_dir)
    print("PDF processing complete. Data is ready for fine-tuning.")