import os
import json
import argparse
from pathlib import Path
from typing import List, Dict, Any

try:
    from PyPDF2 import PdfReader
    from tqdm import tqdm
except ImportError:
    # Fall back to installing the dependencies with the current interpreter's pip
    print("Installing required dependencies...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2", "tqdm"])
    from PyPDF2 import PdfReader
    from tqdm import tqdm


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF file."""
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            # extract_text() may return None for pages with no extractable text
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""


def process_pdfs(pdf_dir: str, output_dir: str, chunk_size: int = 1000) -> List[Dict[str, Any]]:
    """Process all PDFs in a directory and save the extracted text."""
    pdf_files = list(Path(pdf_dir).glob("*.pdf"))
    if not pdf_files:
        raise ValueError(f"No PDF files found in {pdf_dir}")

    os.makedirs(output_dir, exist_ok=True)
    all_data = []

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            file_name = pdf_file.stem
            print(f"Processing {file_name}")
            text = extract_text_from_pdf(str(pdf_file))
            if not text.strip():
                print(f"Warning: No text extracted from {file_name}")
                continue

            # Split into chunks to avoid context length issues
            words = text.split()
            for i in range(0, len(words), chunk_size):
                chunk = " ".join(words[i:i + chunk_size])
                if len(chunk.strip()) > 100:  # Ensure chunk has enough content
                    data_point = {
                        "text": chunk,
                        "source": file_name,
                        "chunk_id": i // chunk_size
                    }
                    all_data.append(data_point)
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

    # Save all data to a single JSON file
    with open(os.path.join(output_dir, "pdf_data.json"), "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print(f"Processed {len(pdf_files)} PDFs into {len(all_data)} text chunks")
    return all_data
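
# Sketch of one record written to pdf_data.json, reconstructed from the code
# above; the values shown are invented placeholders, not output from a real
# document:
#
#   {
#     "text": "first chunk_size words of the document ...",
#     "source": "example_report",
#     "chunk_id": 0
#   }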


def prepare_training_data(pdf_data: List[Dict[str, Any]], output_dir: str):
    """Prepare data in the format needed for fine-tuning LLMs."""
    training_data = []
    for item in pdf_data:
        # Format for instruction fine-tuning
        train_item = {
            "instruction": "Use the following text from the document to answer questions or generate content about the topics it covers.",
            "input": item["text"][:500],   # Use beginning of text as input
            "output": item["text"][500:],  # Use rest of text as output
        }
        training_data.append(train_item)

    # Create train/validation split (90/10)
    split_idx = int(len(training_data) * 0.9)
    train_data = training_data[:split_idx]
    val_data = training_data[split_idx:]

    # Save splits
    os.makedirs(os.path.join(output_dir, "training_data"), exist_ok=True)
    with open(os.path.join(output_dir, "training_data", "train.json"), "w", encoding="utf-8") as f:
        json.dump(train_data, f, ensure_ascii=False, indent=2)
    with open(os.path.join(output_dir, "training_data", "validation.json"), "w", encoding="utf-8") as f:
        json.dump(val_data, f, ensure_ascii=False, indent=2)

    print(f"Created training dataset: {len(train_data)} train, {len(val_data)} validation examples")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process PDFs and prepare training data")
    parser.add_argument("--pdf_dir", type=str, required=True, help="Directory containing PDF files")
    parser.add_argument("--output_dir", type=str, default="processed_data", help="Output directory for processed data")
    parser.add_argument("--chunk_size", type=int, default=1000, help="Number of words per chunk")
    args = parser.parse_args()

    pdf_data = process_pdfs(args.pdf_dir, args.output_dir, args.chunk_size)
    prepare_training_data(pdf_data, args.output_dir)
    print("PDF processing complete. Data is ready for fine-tuning.")