import os
import json
import argparse
from pathlib import Path
from typing import List, Dict, Any

try:
    from PyPDF2 import PdfReader
    from tqdm import tqdm
except ImportError:
    print("Installing required dependencies...")
    import subprocess
    import sys
    # Install into the interpreter actually running this script.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2", "tqdm"])
    from PyPDF2 import PdfReader
    from tqdm import tqdm


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF file."""
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            # Guard against pages with no extractable text.
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""


def process_pdfs(pdf_dir: str, output_dir: str, chunk_size: int = 1000) -> List[Dict[str, Any]]:
    """Process all PDFs in a directory and save the extracted text."""
    pdf_files = list(Path(pdf_dir).glob("*.pdf"))
    if not pdf_files:
        raise ValueError(f"No PDF files found in {pdf_dir}")

    os.makedirs(output_dir, exist_ok=True)
    all_data = []

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            file_name = pdf_file.stem
            print(f"Processing {file_name}")

            text = extract_text_from_pdf(str(pdf_file))
            if not text.strip():
                print(f"Warning: No text extracted from {file_name}")
                continue

            # Split into word-based chunks to avoid context length issues.
            words = text.split()
            for i in range(0, len(words), chunk_size):
                chunk = " ".join(words[i:i + chunk_size])
                if len(chunk.strip()) > 100:  # Ensure chunk has enough content
                    data_point = {
                        "text": chunk,
                        "source": file_name,
                        "chunk_id": i // chunk_size,
                    }
                    all_data.append(data_point)
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

    # Save all data to a single JSON file.
    with open(os.path.join(output_dir, "pdf_data.json"), "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print(f"Processed {len(pdf_files)} PDFs into {len(all_data)} text chunks")
    return all_data


def prepare_training_data(pdf_data: List[Dict[str, Any]], output_dir: str):
    """Prepare data in the format needed for fine-tuning LLMs."""
    training_data = []
    for item in pdf_data:
        # Format for instruction fine-tuning: the first 500 characters of a chunk
        # become the input, the remainder becomes the expected output.
        train_item = {
            "instruction": "Use the following text from the document to answer questions or generate content about the topics it covers.",
            "input": item["text"][:500],   # Use beginning of text as input
            "output": item["text"][500:],  # Use rest of text as output
        }
        training_data.append(train_item)

    # Create train/validation split (90/10).
    split_idx = int(len(training_data) * 0.9)
    train_data = training_data[:split_idx]
    val_data = training_data[split_idx:]

    # Save splits.
    os.makedirs(os.path.join(output_dir, "training_data"), exist_ok=True)
    with open(os.path.join(output_dir, "training_data", "train.json"), "w", encoding="utf-8") as f:
        json.dump(train_data, f, ensure_ascii=False, indent=2)
    with open(os.path.join(output_dir, "training_data", "validation.json"), "w", encoding="utf-8") as f:
        json.dump(val_data, f, ensure_ascii=False, indent=2)

    print(f"Created training dataset: {len(train_data)} train, {len(val_data)} validation examples")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process PDFs and prepare training data")
    parser.add_argument("--pdf_dir", type=str, required=True, help="Directory containing PDF files")
    parser.add_argument("--output_dir", type=str, default="processed_data", help="Output directory for processed data")
    parser.add_argument("--chunk_size", type=int, default=1000, help="Number of words per chunk")
    args = parser.parse_args()

    pdf_data = process_pdfs(args.pdf_dir, args.output_dir, args.chunk_size)
    prepare_training_data(pdf_data, args.output_dir)
    print("PDF processing complete. Data is ready for fine-tuning.")
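
# Example invocation (a sketch; the script filename and input directory below are
# assumptions, not part of the original):
#
#   python process_pdfs.py --pdf_dir ./my_pdfs --output_dir processed_data --chunk_size 1000
#
# This writes processed_data/pdf_data.json with all extracted chunks, plus
# processed_data/training_data/train.json and validation.json as the 90/10
# instruction-tuning splits produced by prepare_training_data().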