# extract_glossary_from_txt.py
import os
import json
from typing import List

from txt_processor import TextFileProcessor
from chapter_splitter import ChapterSplitter
from bs4 import BeautifulSoup


def extract_chapters_from_txt(txt_path: str) -> List[str]:
    """Extract chapters from a text file for glossary extraction."""
    processor = TextFileProcessor(txt_path, os.path.dirname(txt_path))
    chapters = processor.extract_chapters()

    # Initialize the chapter splitter with the configured model
    model_name = os.getenv("MODEL", "gpt-3.5-turbo")
    chapter_splitter = ChapterSplitter(model_name=model_name)

    # Get the max input token budget from the environment
    max_input_tokens_str = os.getenv("MAX_INPUT_TOKENS", "1000000").strip()
    if not max_input_tokens_str:
        # Token limit disabled - use a very large number
        max_input_tokens = 10000000  # 10M tokens
    else:
        max_input_tokens = int(max_input_tokens_str)

    # Calculate available tokens (leaving room for system prompt and context)
    system_prompt_size = 2000  # Estimate for glossary system prompt
    context_size = 5000        # Estimate for context history
    safety_margin = 1000
    available_tokens = max_input_tokens - system_prompt_size - context_size - safety_margin

    text_chapters = []
    for idx, chapter in enumerate(chapters):
        # Check if the chapter needs splitting
        chapter_tokens = chapter_splitter.count_tokens(chapter['body'])

        if chapter_tokens > available_tokens:
            print(f"Chapter {idx+1} has {chapter_tokens} tokens, splitting into smaller chunks...")
            # Use ChapterSplitter to split the HTML content
            chunks = chapter_splitter.split_chapter(chapter['body'], available_tokens)

            # Extract plain text from each chunk
            for chunk_html, chunk_idx, total_chunks in chunks:
                soup = BeautifulSoup(chunk_html, 'html.parser')
                text = soup.get_text(strip=True)
                if text:
                    text_chapters.append(text)
                    print(f"  Added chunk {chunk_idx}/{total_chunks} ({chapter_splitter.count_tokens(text)} tokens)")
        else:
            # Chapter is small enough, extract text as-is
            soup = BeautifulSoup(chapter['body'], 'html.parser')
            text = soup.get_text(strip=True)
            if text:
                text_chapters.append(text)

    print(f"Total text chunks for glossary extraction: {len(text_chapters)}")
    return text_chapters
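

# Hypothetical usage sketch (not part of the original script): run the module
# directly against a .txt file to inspect the chunks it would hand to the
# glossary extractor. The CLI shape and preview length below are assumptions
# for illustration only.
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python extract_glossary_from_txt.py <book.txt>")
        sys.exit(1)

    chunks = extract_chapters_from_txt(sys.argv[1])
    # Show a short preview of each chunk so the splitting behavior can be checked.
    for i, chunk in enumerate(chunks, 1):
        print(f"[{i}] {chunk[:80]}...")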