# TransateKRtoEN.py # -*- coding: utf-8 -*- import json import logging import shutil import threading import queue import uuid import inspect import os, sys, io, zipfile, time, re, mimetypes, subprocess, tiktoken import builtins import ebooklib from ebooklib import epub from bs4 import BeautifulSoup try: from bs4 import XMLParsedAsHTMLWarning import warnings # Suppress the warning since we handle both HTML and XHTML content warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning) except ImportError: # Older versions of BeautifulSoup might not have this warning pass from collections import Counter from unified_api_client import UnifiedClient, UnifiedClientError import hashlib import tempfile import unicodedata from difflib import SequenceMatcher import unicodedata import re import time from history_manager import HistoryManager from chapter_splitter import ChapterSplitter from image_translator import ImageTranslator from typing import Dict, List, Tuple from txt_processor import TextFileProcessor from ai_hunter_enhanced import ImprovedAIHunterDetection import csv from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed # Module-level functions for ProcessPoolExecutor compatibility def _check_sentence_batch_for_terms(args): """Check a batch of sentences for term matches - used by ProcessPoolExecutor""" batch_sentences, terms = args filtered = [] # Use pre-compiled term list for fast checking for sentence in batch_sentences: # Quick check using any() - stops at first match if any(term in sentence for term in terms): filtered.append(sentence) return filtered def _process_sentence_batch_for_extraction(args): """Process sentences to extract terms - used by ProcessPoolExecutor""" batch_sentences, batch_idx, combined_pattern, exclude_check_data = args from collections import Counter import re local_word_freq = Counter() local_important = [] local_seen = set() # Rebuild the exclusion check function from data honorifics_to_exclude, title_patterns_str, common_words, chinese_nums = exclude_check_data title_patterns = [re.compile(p) for p in title_patterns_str] def should_exclude_term(term): term_lower = term.lower() # Check if it's a common word if term in common_words or term_lower in common_words: return True # Check if it contains honorifics for honorific in honorifics_to_exclude: if honorific in term or (honorific.startswith('-') and term.endswith(honorific[1:])): return True # Check if it matches title patterns for pattern in title_patterns: if pattern.search(term): return True # Check if it's a number if term in chinese_nums or term.isdigit(): return True return False for sentence in batch_sentences: sentence = sentence.strip() if len(sentence) < 10 or len(sentence) > 500: continue # Find all potential terms in this sentence matches = re.findall(combined_pattern, sentence) if matches: # Filter out excluded terms filtered_matches = [] for match in matches: if not should_exclude_term(match): local_word_freq[match] += 1 filtered_matches.append(match) # Keep sentences with valid potential terms if filtered_matches: sentence_key = ' '.join(sorted(filtered_matches)) if sentence_key not in local_seen: local_important.append(sentence) local_seen.add(sentence_key) return local_word_freq, local_important, local_seen, batch_idx from tqdm import tqdm def is_traditional_translation_api(model: str) -> bool: """Check if the model is a traditional translation API""" return model in ['deepl', 'google-translate', 'google-translate-free'] or model.startswith('deepl/') or model.startswith('google-translate/') def get_chapter_terminology(is_text_file, chapter_data=None): """Get appropriate terminology (Chapter/Section) based on source type""" if is_text_file: return "Section" if chapter_data: if chapter_data.get('filename', '').endswith('.txt') or chapter_data.get('is_chunk', False): return "Section" return "Chapter" # ===================================================== # CONFIGURATION AND ENVIRONMENT MANAGEMENT # ===================================================== class TranslationConfig: """Centralized configuration management""" def __init__(self): self.MODEL = os.getenv("MODEL", "gemini-1.5-flash") self.input_path = os.getenv("input_path", "default.epub") self.PROFILE_NAME = os.getenv("PROFILE_NAME", "korean").lower() self.CONTEXTUAL = os.getenv("CONTEXTUAL", "1") == "1" self.DELAY = float(os.getenv("SEND_INTERVAL_SECONDS", "1")) self.SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "").strip() self.REMOVE_AI_ARTIFACTS = os.getenv("REMOVE_AI_ARTIFACTS", "0") == "1" self.TEMP = float(os.getenv("TRANSLATION_TEMPERATURE", "0.3")) self.HIST_LIMIT = int(os.getenv("TRANSLATION_HISTORY_LIMIT", "20")) self.MAX_OUTPUT_TOKENS = int(os.getenv("MAX_OUTPUT_TOKENS", "8192")) self.EMERGENCY_RESTORE = os.getenv("EMERGENCY_PARAGRAPH_RESTORE", "1") == "1" self.BATCH_TRANSLATION = os.getenv("BATCH_TRANSLATION", "0") == "1" self.BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10")) self.ENABLE_IMAGE_TRANSLATION = os.getenv("ENABLE_IMAGE_TRANSLATION", "1") == "1" self.TRANSLATE_BOOK_TITLE = os.getenv("TRANSLATE_BOOK_TITLE", "1") == "1" self.DISABLE_ZERO_DETECTION = os.getenv("DISABLE_ZERO_DETECTION", "0") == "1" self.ENABLE_AUTO_GLOSSARY = os.getenv("ENABLE_AUTO_GLOSSARY", "0") == "1" self.COMPREHENSIVE_EXTRACTION = os.getenv("COMPREHENSIVE_EXTRACTION", "0") == "1" self.MANUAL_GLOSSARY = os.getenv("MANUAL_GLOSSARY") self.RETRY_TRUNCATED = os.getenv("RETRY_TRUNCATED", "0") == "1" self.RETRY_DUPLICATE_BODIES = os.getenv("RETRY_DUPLICATE_BODIES", "1") == "1" self.RETRY_TIMEOUT = os.getenv("RETRY_TIMEOUT", "0") == "1" self.CHUNK_TIMEOUT = int(os.getenv("CHUNK_TIMEOUT", "900")) self.MAX_RETRY_TOKENS = int(os.getenv("MAX_RETRY_TOKENS", "16384")) self.DUPLICATE_LOOKBACK_CHAPTERS = int(os.getenv("DUPLICATE_LOOKBACK_CHAPTERS", "3")) self.USE_ROLLING_SUMMARY = os.getenv("USE_ROLLING_SUMMARY", "0") == "1" self.ROLLING_SUMMARY_EXCHANGES = int(os.getenv("ROLLING_SUMMARY_EXCHANGES", "5")) self.ROLLING_SUMMARY_MODE = os.getenv("ROLLING_SUMMARY_MODE", "replace") # New: maximum number of rolling summary entries to retain when in append mode (0 = unlimited) self.ROLLING_SUMMARY_MAX_ENTRIES = int(os.getenv("ROLLING_SUMMARY_MAX_ENTRIES", "10")) self.DUPLICATE_DETECTION_MODE = os.getenv("DUPLICATE_DETECTION_MODE", "basic") self.AI_HUNTER_THRESHOLD = int(os.getenv("AI_HUNTER_THRESHOLD", "75")) self.TRANSLATION_HISTORY_ROLLING = os.getenv("TRANSLATION_HISTORY_ROLLING", "0") == "1" self.API_KEY = (os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_OR_Gemini_API_KEY") or os.getenv("GEMINI_API_KEY")) # NEW: Simple chapter number offset self.CHAPTER_NUMBER_OFFSET = int(os.getenv("CHAPTER_NUMBER_OFFSET", "0")) self.ENABLE_WATERMARK_REMOVAL = os.getenv("ENABLE_WATERMARK_REMOVAL", "1") == "1" self.SAVE_CLEANED_IMAGES = os.getenv("SAVE_CLEANED_IMAGES", "1") == "1" self.WATERMARK_PATTERN_THRESHOLD = int(os.getenv("WATERMARK_PATTERN_THRESHOLD", "10")) self.WATERMARK_CLAHE_LIMIT = float(os.getenv("WATERMARK_CLAHE_LIMIT", "3.0")) self.COMPRESSION_FACTOR = float(os.getenv("COMPRESSION_FACTOR", "1.0")) # Multi API key support self.use_multi_api_keys = os.environ.get('USE_MULTI_API_KEYS', '0') == '1' self.multi_api_keys = [] if self.use_multi_api_keys: multi_keys_json = os.environ.get('MULTI_API_KEYS', '[]') try: self.multi_api_keys = json.loads(multi_keys_json) print(f"Loaded {len(self.multi_api_keys)} API keys for multi-key mode") except Exception as e: print(f"Failed to load multi API keys: {e}") self.use_multi_api_keys = False # ===================================================== # UNIFIED PATTERNS AND CONSTANTS # ===================================================== class PatternManager: """Centralized pattern management""" CHAPTER_PATTERNS = [ # English patterns (r'chapter[\s_-]*(\d+)', re.IGNORECASE, 'english_chapter'), (r'\bch\.?\s*(\d+)\b', re.IGNORECASE, 'english_ch'), (r'part[\s_-]*(\d+)', re.IGNORECASE, 'english_part'), (r'episode[\s_-]*(\d+)', re.IGNORECASE, 'english_episode'), # Chinese patterns (r'第\s*(\d+)\s*[章节話话回]', 0, 'chinese_chapter'), (r'第\s*([一二三四五六七八九十百千万]+)\s*[章节話话回]', 0, 'chinese_chapter_cn'), (r'(\d+)[章节話话回]', 0, 'chinese_short'), # Japanese patterns (r'第\s*(\d+)\s*話', 0, 'japanese_wa'), (r'第\s*(\d+)\s*章', 0, 'japanese_chapter'), (r'その\s*(\d+)', 0, 'japanese_sono'), (r'(\d+)話目', 0, 'japanese_wame'), # Korean patterns (r'제\s*(\d+)\s*[장화권부편]', 0, 'korean_chapter'), (r'(\d+)\s*[장화권부편]', 0, 'korean_short'), (r'에피소드\s*(\d+)', 0, 'korean_episode'), # Generic numeric patterns (r'^\s*(\d+)\s*[-–—.\:]', re.MULTILINE, 'generic_numbered'), (r'_(\d+)\.x?html?$', re.IGNORECASE, 'filename_number'), (r'/(\d+)\.x?html?$', re.IGNORECASE, 'path_number'), (r'(\d+)', 0, 'any_number'), ] FILENAME_EXTRACT_PATTERNS = [ # IMPORTANT: More specific patterns MUST come first r'^\d{3}(\d)_(\d{2})_\.x?html?$', # Captures both parts for decimal: group1.group2 r'^\d{4}_(\d+)\.x?html?$', # "0000_1.xhtml" - extracts 1, not 0000 r'^\d+_(\d+)[_\.]', # Any digits followed by underscore then capture next digits r'^(\d+)[_\.]', # Standard: "0249_" or "0249." r'response_(\d+)_', # Standard pattern: response_001_ r'response_(\d+)\.', # Pattern: response_001. r'(\d{3,5})[_\.]', # 3-5 digit pattern with padding r'[Cc]hapter[_\s]*(\d+)', # Chapter word pattern r'[Cc]h[_\s]*(\d+)', # Ch abbreviation r'No(\d+)Chapter', # No prefix with Chapter - matches "No00013Chapter.xhtml" r'No(\d+)Section', # No prefix with Section - matches "No00013Section.xhtml" r'No(\d+)(?=\.|_|$)', # No prefix followed by end, dot, or underscore (not followed by text) r'第(\d+)[章话回]', # Chinese chapter markers r'_(\d+)(?:_|\.|$)', # Number between underscores or at end r'^(\d+)(?:_|\.|$)', # Starting with number r'(\d+)', # Any number (fallback) ] CJK_HONORIFICS = { 'korean': [ # Modern honorifics '님', '씨', '선배', '후배', '동기', '형', '누나', '언니', '오빠', '동생', '선생님', '교수님', '박사님', '사장님', '회장님', '부장님', '과장님', '대리님', '팀장님', '실장님', '이사님', '전무님', '상무님', '부사장님', '고문님', # Classical/formal honorifics '공', '옹', '군', '양', '낭', '랑', '생', '자', '부', '모', '시', '제', '족하', # Royal/noble address forms '마마', '마노라', '대감', '영감', '나리', '도령', '낭자', '아씨', '규수', '각하', '전하', '폐하', '저하', '합하', '대비', '대왕', '왕자', '공주', # Buddhist/religious '스님', '사부님', '조사님', '큰스님', '화상', '대덕', '대사', '법사', '선사', '율사', '보살님', '거사님', '신부님', '목사님', '장로님', '집사님', # Confucian/scholarly '부자', '선생', '대인', '어른', '어르신', '존자', '현자', '군자', '대부', '학사', '진사', '문하생', '제자', # Kinship honorifics '어르신', '할아버님', '할머님', '아버님', '어머님', '형님', '누님', '아주버님', '아주머님', '삼촌', '이모님', '고모님', '외삼촌', '장인어른', '장모님', '시아버님', '시어머님', '처남', '처형', '매형', '손님', # Verb-based honorific endings and speech levels '습니다', 'ㅂ니다', '습니까', 'ㅂ니까', '시다', '세요', '셔요', '십시오', '시오', '이에요', '예요', '이예요', '에요', '어요', '아요', '여요', '해요', '이세요', '으세요', '으시', '시', '으십니다', '십니다', '으십니까', '십니까', '으셨', '셨', '드립니다', '드려요', '드릴게요', '드리겠습니다', '올립니다', '올려요', '사옵니다', '사뢰', '여쭙니다', '여쭤요', '아뢰', '뵙니다', '뵈요', '모십니다', '시지요', '시죠', '시네요', '시는군요', '시는구나', '으실', '실', '드시다', '잡수시다', '주무시다', '계시다', '가시다', '오시다', # Common verb endings with 있다/없다/하다 '있어요', '있습니다', '있으세요', '있으십니까', '없어요', '없습니다', '없으세요', '해요', '합니다', '하세요', '하십시오', '하시죠', '하시네요', '했어요', '했습니다', '되세요', '되셨어요', '되십니다', '됩니다', '되요', '돼요', '이야', '이네', '이구나', '이군', '이네요', '인가요', '인가', '일까요', '일까', '거예요', '거에요', '겁니다', '건가요', '게요', '을게요', '을까요', '었어요', '었습니다', '겠습니다', '겠어요', '겠네요', '을겁니다', '을거예요', '을거에요', # Common endings '요', '죠', '네요', '는데요', '거든요', '니까', '으니까', '는걸요', '군요', '구나', '는구나', '는군요', '더라고요', '더군요', '던데요', '나요', '가요', '까요', '라고요', '다고요', '냐고요', '자고요', '란다', '단다', '냔다', '잔다', # Formal archaic endings '나이다', '사옵나이다', '옵니다', '오', '소서', '으오', '으옵소서', '사이다', '으시옵니다', '시옵니다', '으시옵니까', '시옵니까', '나이까', '리이까', '리이다', '옵소서', '으소서', '소이다', '로소이다', '이옵니다', '이올시다', '하옵니다' ], 'japanese': [ # Modern honorifics 'さん', 'ちゃん', '君', 'くん', '様', 'さま', '先生', 'せんせい', '殿', 'どの', '先輩', 'せんぱい', # Classical/historical '氏', 'し', '朝臣', 'あそん', '宿禰', 'すくね', '連', 'むらじ', '臣', 'おみ', '君', 'きみ', '真人', 'まひと', '道師', 'みちのし', '稲置', 'いなぎ', '直', 'あたい', '造', 'みやつこ', # Court titles '卿', 'きょう', '大夫', 'たいふ', '郎', 'ろう', '史', 'し', '主典', 'さかん', # Buddhist titles '和尚', 'おしょう', '禅師', 'ぜんじ', '上人', 'しょうにん', '聖人', 'しょうにん', '法師', 'ほうし', '阿闍梨', 'あじゃり', '大和尚', 'だいおしょう', # Shinto titles '大宮司', 'だいぐうじ', '宮司', 'ぐうじ', '禰宜', 'ねぎ', '祝', 'はふり', # Samurai era '守', 'かみ', '介', 'すけ', '掾', 'じょう', '目', 'さかん', '丞', 'じょう', # Keigo (honorific language) verb forms 'です', 'ます', 'ございます', 'いらっしゃる', 'いらっしゃいます', 'おっしゃる', 'おっしゃいます', 'なさる', 'なさいます', 'くださる', 'くださいます', 'いただく', 'いただきます', 'おります', 'でございます', 'ございません', 'いたします', 'いたしました', '申す', '申します', '申し上げる', '申し上げます', '存じる', '存じます', '存じ上げる', '伺う', '伺います', '参る', '参ります', 'お目にかかる', 'お目にかかります', '拝見', '拝見します', '拝聴', '拝聴します', '承る', '承ります', # Respectful prefixes/suffixes 'お', 'ご', '御', 'み', '美', '貴', '尊' ], 'chinese': [ # Modern forms '先生', '小姐', '夫人', '公子', '大人', '老师', '师父', '师傅', '同志', '同学', # Ancient/classical forms '子', '丈', '翁', '公', '侯', '伯', '叔', '仲', '季', '父', '甫', '卿', '君', '生', # Imperial court '陛下', '殿下', '千岁', '万岁', '圣上', '皇上', '天子', '至尊', '御前', '爷', # Nobility/officials '阁下', '大人', '老爷', '相公', '官人', '郎君', '娘子', '夫子', '足下', # Religious titles '上人', '法师', '禅师', '大师', '高僧', '圣僧', '神僧', '活佛', '仁波切', '真人', '天师', '道长', '道友', '仙长', '上仙', '祖师', '掌教', # Scholarly/Confucian '夫子', '圣人', '贤人', '君子', '大儒', '鸿儒', '宗师', '泰斗', '巨擘', # Martial arts '侠士', '大侠', '少侠', '女侠', '英雄', '豪杰', '壮士', '义士', # Family/kinship '令尊', '令堂', '令郎', '令爱', '贤弟', '贤侄', '愚兄', '小弟', '家父', '家母', # Humble forms '在下', '小人', '鄙人', '不才', '愚', '某', '仆', '妾', '奴', '婢', # Polite verbal markers '请', '请问', '敢问', '恭请', '敬请', '烦请', '有请', '请教', '赐教', '惠顾', '惠赐', '惠存', '笑纳', '雅正', '指正', '斧正', '垂询', '拜', '拜见', '拜访', '拜读', '拜托', '拜谢', '敬上', '谨上', '顿首' ], 'english': [ # Modern Korean romanizations (Revised Romanization of Korean - 2000) '-nim', '-ssi', '-seonbae', '-hubae', '-donggi', '-hyeong', '-nuna', '-eonni', '-oppa', '-dongsaeng', '-seonsaengnim', '-gyosunim', '-baksanim', '-sajangnim', '-hoejangnim', '-bujangnim', '-gwajangnim', '-daerim', '-timjangnim', '-siljangnim', '-isanim', '-jeonmunim', '-sangmunim', '-busajangnim', '-gomunnim', # Classical/formal Korean romanizations '-gong', '-ong', '-gun', '-yang', '-nang', '-rang', '-saeng', '-ja', '-bu', '-mo', '-si', '-je', '-jokha', # Royal/noble Korean romanizations '-mama', '-manora', '-daegam', '-yeonggam', '-nari', '-doryeong', '-nangja', '-assi', '-gyusu', '-gakha', '-jeonha', '-pyeha', '-jeoha', '-hapka', '-daebi', '-daewang', '-wangja', '-gongju', # Buddhist/religious Korean romanizations '-seunim', '-sabunim', '-josanim', '-keunseunim', '-hwasang', '-daedeok', '-daesa', '-beopsa', '-seonsa', '-yulsa', '-bosalnim', '-geosanim', '-sinbunim', '-moksanim', '-jangnonim', '-jipsanim', # Confucian/scholarly Korean romanizations '-buja', '-seonsaeng', '-daein', '-eoreun', '-eoreusin', '-jonja', '-hyeonja', '-gunja', '-daebu', '-haksa', '-jinsa', '-munhasaeng', '-jeja', # Kinship Korean romanizations '-harabeonim', '-halmeonim', '-abeonim', '-eomeonim', '-hyeongnim', '-nunim', '-ajubeonim', '-ajumeonim', '-samchon', '-imonim', '-gomonim', '-oesamchon', '-jangineoreun', '-jangmonim', '-siabeonim', '-sieomeonim', '-cheonam', '-cheohyeong', '-maehyeong', '-sonnim', # Korean verb endings romanized (Revised Romanization) '-seumnida', '-mnida', '-seumnikka', '-mnikka', '-sida', '-seyo', '-syeoyo', '-sipsio', '-sio', '-ieyo', '-yeyo', '-iyeyo', '-eyo', '-eoyo', '-ayo', '-yeoyo', '-haeyo', '-iseyo', '-euseyo', '-eusi', '-si', '-eusimnida', '-simnida', '-eusimnikka', '-simnikka', '-eusyeot', '-syeot', '-deurimnida', '-deuryeoyo', '-deurilgeyo', '-deurigesseumnida', '-ollimnida', '-ollyeoyo', '-saomnida', '-saroe', '-yeojjumnida', '-yeojjwoyo', '-aroe', '-boemnida', '-boeyo', '-mosimnida', '-sijiyo', '-sijyo', '-sineyo', '-sineungunyo', '-sineunguna', '-eusil', '-sil', '-deusida', '-japsusida', '-jumusida', '-gyesida', '-gasida', '-osida', # Common Korean verb endings romanized '-isseoyo', '-isseumnida', '-isseuseyo', '-isseusimnikka', '-eopseoyo', '-eopseumnida', '-eopseuseyo', '-hamnida', '-haseyo', '-hasipsio', '-hasijyo', '-hasineyo', '-haesseoyo', '-haesseumnida', '-doeseyo', '-doesyeosseoyo', '-doesimnida', '-doemnida', '-doeyo', '-dwaeyo', '-iya', '-ine', '-iguna', '-igun', '-ineyo', '-ingayo', '-inga', '-ilkkayo', '-ilkka', '-geoyeyo', '-geoeyo', '-geomnida', '-geongayo', '-geyo', '-eulgeyo', '-eulkkayo', '-eosseoyo', '-eosseumnida', '-gesseumnida', '-gesseoyo', '-genneyo', '-eulgeommida', '-eulgeoyeyo', '-eulgeoeyo', # Common Korean endings romanized '-yo', '-jyo', '-neyo', '-neundeyo', '-geodeunyo', '-nikka', '-eunikka', '-neungeolyo', '-gunyo', '-guna', '-neunguna', '-neungunyo', '-deoragoyo', '-deogunyo', '-deondeyo', '-nayo', '-gayo', '-kkayo', '-ragoyo', '-dagoyo', '-nyagoyo', '-jagoyo', '-randa', '-danda', '-nyanda', '-janda', # Formal archaic Korean romanized '-naida', '-saomnaida', '-omnida', '-o', '-soseo', '-euo', '-euopsoseo', '-saida', '-eusiomnida', '-siomnida', '-eusiomnikka', '-siomnikka', '-naikka', '-riikka', '-riida', '-opsoseo', '-eusoseo', '-soida', '-rosoida', '-iomnida', '-iolsida', '-haomnida', # Japanese keigo romanized (keeping existing) '-san', '-chan', '-kun', '-sama', '-sensei', '-senpai', '-dono', '-shi', '-tan', '-chin', '-desu', '-masu', '-gozaimasu', '-irassharu', '-irasshaimasu', '-ossharu', '-osshaimasu', '-nasaru', '-nasaimasu', '-kudasaru', '-kudasaimasu', '-itadaku', '-itadakimasu', '-orimasu', '-degozaimasu', '-gozaimasen', '-itashimasu', '-itashimashita', '-mousu', '-moushimasu', '-moushiageru', '-moushiagemasu', '-zonjiru', '-zonjimasu', '-ukagau', '-ukagaimasu', '-mairu', '-mairimasu', '-haiken', '-haikenshimasu', # Chinese romanizations (keeping existing) '-xiong', '-di', '-ge', '-gege', '-didi', '-jie', '-jiejie', '-meimei', '-shixiong', '-shidi', '-shijie', '-shimei', '-gongzi', '-guniang', '-xiaojie', '-daren', '-qianbei', '-daoyou', '-zhanglao', '-shibo', '-shishu', '-shifu', '-laoshi', '-xiansheng', '-daxia', '-shaoxia', '-nvxia', '-jushi', '-shanren', '-dazhang', '-zhenren', # Ancient Chinese romanizations '-zi', '-gong', '-hou', '-bo', '-jun', '-qing', '-weng', '-fu', '-sheng', '-lang', '-langjun', '-niangzi', '-furen', '-gege', '-jiejie', '-yeye', '-nainai', # Chinese politeness markers romanized '-qing', '-jing', '-gong', '-hui', '-ci', '-bai', '-gan', '-chui', 'qingwen', 'ganwen', 'gongjing', 'jingjing', 'baijian', 'baifang', 'baituo' ] } TITLE_PATTERNS = { 'korean': [ # Modern titles r'\b(왕|여왕|왕자|공주|황제|황후|대왕|대공|공작|백작|자작|남작|기사|장군|대장|원수|제독|함장|대신|재상|총리|대통령|시장|지사|검사|판사|변호사|의사|박사|교수|신부|목사|스님|도사)\b', r'\b(폐하|전하|각하|예하|님|대감|영감|나리|도련님|아가씨|부인|선생)\b', # Historical/classical titles r'\b(대왕|태왕|왕비|왕후|세자|세자빈|대군|군|옹주|공주|부마|원자|원손)\b', r'\b(영의정|좌의정|우의정|판서|참판|참의|정승|판사|사또|현령|군수|목사|부사)\b', r'\b(대제학|제학|대사간|사간|대사헌|사헌|도승지|승지|한림|사관|내시|환관)\b', r'\b(병조판서|이조판서|호조판서|예조판서|형조판서|공조판서)\b', r'\b(도원수|부원수|병마절도사|수군절도사|첨절제사|만호|천호|백호)\b', r'\b(정일품|종일품|정이품|종이품|정삼품|종삼품|정사품|종사품|정오품|종오품)\b', # Korean honorific verb endings patterns r'(습니다|ㅂ니다|습니까|ㅂ니까|세요|셔요|십시오|시오)$', r'(이에요|예요|이예요|에요|어요|아요|여요|해요)$', r'(으시|시)(었|겠|ㄹ|을|는|던)*(습니다|ㅂ니다|어요|아요|세요)', r'(드립니다|드려요|드릴게요|드리겠습니다|올립니다|올려요)$', r'(사옵니다|여쭙니다|여쭤요|뵙니다|뵈요|모십니다)$', r'(나이다|사옵나이다|옵니다|으오|으옵소서|사이다)$' ], 'japanese': [ # Modern titles r'\b(王|女王|王子|姫|皇帝|皇后|天皇|皇太子|大王|大公|公爵|伯爵|子爵|男爵|騎士|将軍|大将|元帥|提督|艦長|大臣|宰相|総理|大統領|市長|知事|検事|裁判官|弁護士|医者|博士|教授|神父|牧師|僧侶|道士)\b', r'\b(陛下|殿下|閣下|猊下|様|大人|殿|卿|君|氏)\b', # Historical titles r'\b(天皇|皇后|皇太子|親王|内親王|王|女王|太政大臣|左大臣|右大臣|内大臣|大納言|中納言|参議)\b', r'\b(関白|摂政|征夷大将軍|管領|執権|守護|地頭|代官|奉行|与力|同心)\b', r'\b(太政官|神祇官|式部省|治部省|民部省|兵部省|刑部省|大蔵省|宮内省)\b', r'\b(大僧正|僧正|大僧都|僧都|律師|大法師|法師|大禅師|禅師)\b', r'\b(正一位|従一位|正二位|従二位|正三位|従三位|正四位|従四位|正五位|従五位)\b', r'\b(大和守|山城守|摂津守|河内守|和泉守|伊賀守|伊勢守|尾張守|三河守|遠江守)\b', # Japanese keigo (honorific language) patterns r'(です|ます|ございます)$', r'(いらっしゃ|おっしゃ|なさ|くださ)(います|いました|る|った)$', r'(いただ|お|ご|御)(き|きます|きました|く|ける|けます)', r'(申し上げ|申し|存じ上げ|存じ|伺い|参り)(ます|ました|る)$', r'(拝見|拝聴|承り|承)(します|しました|いたします|いたしました)$', r'お[^あ-ん]+[になる|になります|くださる|くださいます]' ], 'chinese': [ # Modern titles r'\b(王|女王|王子|公主|皇帝|皇后|大王|大公|公爵|伯爵|子爵|男爵|骑士|将军|大将|元帅|提督|舰长|大臣|宰相|总理|大总统|市长|知事|检察官|法官|律师|医生|博士|教授|神父|牧师|和尚|道士)\b', r'\b(陛下|殿下|阁下|大人|老爷|夫人|小姐|公子|少爷|姑娘|先生)\b', # Imperial titles r'\b(天子|圣上|皇上|万岁|万岁爷|太上皇|皇太后|太后|皇后|贵妃|妃|嫔|贵人|常在|答应)\b', r'\b(太子|皇子|皇孙|亲王|郡王|贝勒|贝子|公主|格格|郡主|县主|郡君|县君)\b', # Ancient official titles r'\b(丞相|相国|太师|太傅|太保|太尉|司徒|司空|大司马|大司农|大司寇)\b', r'\b(尚书|侍郎|郎中|员外郎|主事|知府|知州|知县|同知|通判|推官|巡抚|总督)\b', r'\b(御史大夫|御史中丞|监察御史|给事中|都察院|翰林院|国子监|钦天监)\b', r'\b(大学士|学士|侍读|侍讲|编修|检讨|庶吉士|举人|进士|状元|榜眼|探花)\b', # Military ranks r'\b(大元帅|元帅|大将军|将军|都督|都指挥使|指挥使|千户|百户|总兵|副将|参将|游击|都司|守备)\b', r'\b(提督|总兵官|副总兵|参将|游击将军|都司|守备|千总|把总|外委)\b', # Religious titles r'\b(国师|帝师|法王|活佛|堪布|仁波切|大和尚|方丈|住持|首座|维那|知客)\b', r'\b(天师|真人|道长|掌教|监院|高功|都讲|总理|提点|知观)\b', # Nobility ranks r'\b(公|侯|伯|子|男|开国公|郡公|国公|郡侯|县侯|郡伯|县伯|县子|县男)\b', r'\b(一品|二品|三品|四品|五品|六品|七品|八品|九品|正一品|从一品|正二品|从二品)\b', # Chinese politeness markers r'(请|敢|恭|敬|烦|有)(问|请|赐|教|告|示)', r'(拜|惠|赐|垂|雅|笑)(见|访|读|托|谢|顾|赐|存|纳|正|询)', r'(敬|谨|顿)(上|呈|启|白|首)' ], 'english': [ # Western titles r'\b(King|Queen|Prince|Princess|Emperor|Empress|Duke|Duchess|Marquis|Marquess|Earl|Count|Countess|Viscount|Viscountess|Baron|Baroness|Knight|Lord|Lady|Sir|Dame|General|Admiral|Captain|Major|Colonel|Commander|Lieutenant|Sergeant|Minister|Chancellor|President|Mayor|Governor|Judge|Doctor|Professor|Father|Reverend|Master|Mistress)\b', r'\b(His|Her|Your|Their)\s+(Majesty|Highness|Grace|Excellency|Honor|Worship|Lordship|Ladyship)\b', # Romanized historical titles r'\b(Tianzi|Huangdi|Huanghou|Taizi|Qinwang|Junwang|Beile|Beizi|Gongzhu|Gege)\b', r'\b(Chengxiang|Zaixiang|Taishi|Taifu|Taibao|Taiwei|Situ|Sikong|Dasima)\b', r'\b(Shogun|Daimyo|Samurai|Ronin|Ninja|Tenno|Mikado|Kampaku|Sessho)\b', r'\b(Taewang|Wangbi|Wanghu|Seja|Daegun|Gun|Ongju|Gongju|Buma)\b' ] } # Expanded Chinese numbers including classical forms CHINESE_NUMS = { # Basic numbers '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10, '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20, '二十一': 21, '二十二': 22, '二十三': 23, '二十四': 24, '二十五': 25, '三十': 30, '四十': 40, '五十': 50, '六十': 60, '七十': 70, '八十': 80, '九十': 90, '百': 100, # Classical/formal numbers '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10, '佰': 100, '仟': 1000, '萬': 10000, '万': 10000, # Ordinal indicators '第一': 1, '第二': 2, '第三': 3, '第四': 4, '第五': 5, '首': 1, '次': 2, '初': 1, '末': -1, } # Common words - keeping the same for filtering COMMON_WORDS = { '이', '그', '저', '우리', '너희', '자기', '당신', '여기', '거기', '저기', '오늘', '내일', '어제', '지금', '아까', '나중', '먼저', '다음', '마지막', '모든', '어떤', '무슨', '이런', '그런', '저런', '같은', '다른', '새로운', '하다', '있다', '없다', '되다', '하는', '있는', '없는', '되는', '것', '수', '때', '년', '월', '일', '시', '분', '초', '은', '는', '이', '가', '을', '를', '에', '의', '와', '과', '도', '만', '에서', '으로', '로', '까지', '부터', '에게', '한테', '께', '께서', 'この', 'その', 'あの', 'どの', 'これ', 'それ', 'あれ', 'どれ', 'わたし', 'あなた', 'かれ', 'かのじょ', 'わたしたち', 'あなたたち', 'きょう', 'あした', 'きのう', 'いま', 'あとで', 'まえ', 'つぎ', 'の', 'は', 'が', 'を', 'に', 'で', 'と', 'も', 'や', 'から', 'まで', '这', '那', '哪', '这个', '那个', '哪个', '这里', '那里', '哪里', '我', '你', '他', '她', '它', '我们', '你们', '他们', '她们', '今天', '明天', '昨天', '现在', '刚才', '以后', '以前', '后来', '的', '了', '在', '是', '有', '和', '与', '或', '但', '因为', '所以', '一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', } # ===================================================== # CHUNK CONTEXT MANAGER (unchanged - already optimal) # ===================================================== class ChunkContextManager: """Manage context within a chapter separate from history""" def __init__(self): self.current_chunks = [] self.chapter_num = None self.chapter_title = None def start_chapter(self, chapter_num, chapter_title): """Start a new chapter context""" self.current_chunks = [] self.chapter_num = chapter_num self.chapter_title = chapter_title def add_chunk(self, user_content, assistant_content, chunk_idx, total_chunks): """Add a chunk to the current chapter context""" self.current_chunks.append({ "user": user_content, "assistant": assistant_content, "chunk_idx": chunk_idx, "total_chunks": total_chunks }) def get_context_messages(self, limit=3): """Get last N chunks as messages for API context""" context = [] for chunk in self.current_chunks[-limit:]: context.extend([ {"role": "user", "content": chunk["user"]}, {"role": "assistant", "content": chunk["assistant"]} ]) return context def get_summary_for_history(self): """Create a summary representation for the history""" if not self.current_chunks: return None, None total_chunks = len(self.current_chunks) user_summary = f"[Chapter {self.chapter_num}: {self.chapter_title}]\n" user_summary += f"[{total_chunks} chunks processed]\n" if self.current_chunks: first_chunk = self.current_chunks[0]['user'] if len(first_chunk) > 500: user_summary += first_chunk[:500] + "..." else: user_summary += first_chunk assistant_summary = f"[Chapter {self.chapter_num} Translation Complete]\n" assistant_summary += f"[Translated in {total_chunks} chunks]\n" if self.current_chunks: samples = [] first_trans = self.current_chunks[0]['assistant'] samples.append(f"Beginning: {first_trans[:200]}..." if len(first_trans) > 200 else f"Beginning: {first_trans}") if total_chunks > 2: mid_idx = total_chunks // 2 mid_trans = self.current_chunks[mid_idx]['assistant'] samples.append(f"Middle: {mid_trans[:200]}..." if len(mid_trans) > 200 else f"Middle: {mid_trans}") if total_chunks > 1: last_trans = self.current_chunks[-1]['assistant'] samples.append(f"End: {last_trans[:200]}..." if len(last_trans) > 200 else f"End: {last_trans}") assistant_summary += "\n".join(samples) return user_summary, assistant_summary def clear(self): """Clear the current chapter context""" self.current_chunks = [] self.chapter_num = None self.chapter_title = None # ===================================================== # UNIFIED UTILITIES # ===================================================== class FileUtilities: """Utilities for file and path operations""" @staticmethod def extract_actual_chapter_number(chapter, patterns=None, config=None): """Extract actual chapter number from filename using improved logic""" # IMPORTANT: Check if this is a pre-split TEXT FILE chunk first if (chapter.get('is_chunk', False) and 'num' in chapter and isinstance(chapter['num'], float) and chapter.get('filename', '').endswith('.txt')): # For text file chunks only, preserve the decimal number return chapter['num'] # This will be 1.1, 1.2, etc. # Get filename for extraction filename = chapter.get('original_basename') or chapter.get('filename', '') # Use our improved extraction function # Note: We don't have opf_spine_position here, so pass None actual_num, method = extract_chapter_number_from_filename(filename, opf_spine_position=None) # If extraction succeeded, return the result if actual_num is not None: #print(f"[DEBUG] Extracted {actual_num} from '{filename}' using method: {method}") return actual_num # Fallback to original complex logic for edge cases actual_num = None if patterns is None: patterns = PatternManager.FILENAME_EXTRACT_PATTERNS # Try to extract from original basename first if chapter.get('original_basename'): basename = chapter['original_basename'] # Check if decimal chapters are enabled for EPUBs enable_decimal = os.getenv('ENABLE_DECIMAL_CHAPTERS', '0') == '1' # For EPUBs, only check decimal patterns if the toggle is enabled if enable_decimal: # Check for standard decimal chapter numbers (e.g., Chapter_1.1, 1.2.html) decimal_match = re.search(r'(\d+)\.(\d+)', basename) if decimal_match: actual_num = float(f"{decimal_match.group(1)}.{decimal_match.group(2)}") return actual_num # Check for the XXXX_YY pattern where it represents X.YY decimal chapters decimal_prefix_match = re.match(r'^(\d{4})_(\d{1,2})(?:_|\.)?(?:x?html?)?$', basename) if decimal_prefix_match: first_part = decimal_prefix_match.group(1) second_part = decimal_prefix_match.group(2) if len(second_part) == 2 and int(second_part) > 9: chapter_num = int(first_part[-1]) decimal_part = second_part actual_num = float(f"{chapter_num}.{decimal_part}") return actual_num # Standard XXXX_Y format handling (existing logic) prefix_suffix_match = re.match(r'^(\d+)_(\d+)', basename) if prefix_suffix_match: second_part = prefix_suffix_match.group(2) if not enable_decimal: actual_num = int(second_part) return actual_num else: if len(second_part) == 1 or (len(second_part) == 2 and int(second_part) <= 9): actual_num = int(second_part) return actual_num # Check other patterns if no match yet for pattern in patterns: if pattern in [r'^(\d+)[_\.]', r'(\d{3,5})[_\.]', r'^(\d+)_']: continue match = re.search(pattern, basename, re.IGNORECASE) if match: actual_num = int(match.group(1)) break # Final fallback to chapter num if actual_num is None: actual_num = chapter.get("num", 0) print(f"[DEBUG] No pattern matched, using chapter num: {actual_num}") return actual_num @staticmethod def create_chapter_filename(chapter, actual_num=None): """Create consistent chapter filename""" # Check if we should use header as output name use_header_output = os.getenv("USE_HEADER_AS_OUTPUT", "0") == "1" # Check if this is for a text file is_text_file = chapter.get('filename', '').endswith('.txt') or chapter.get('is_chunk', False) # Respect toggle: retain source extension and remove 'response_' prefix retain = should_retain_source_extension() # Helper to compute full original extension chain (e.g., '.html.xhtml') def _full_ext_from_original(ch): fn = ch.get('original_filename') if not fn: return '.html' bn = os.path.basename(fn) root, ext = os.path.splitext(bn) if not ext: return '.html' full_ext = '' while ext: full_ext = ext + full_ext root, ext = os.path.splitext(root) return full_ext or '.html' if use_header_output and chapter.get('title'): safe_title = make_safe_filename(chapter['title'], actual_num or chapter.get('num', 0)) if safe_title and safe_title != f"chapter_{actual_num or chapter.get('num', 0):03d}": if is_text_file: return f"{safe_title}.txt" if retain else f"response_{safe_title}.txt" else: # If retaining, use full original ext chain; else default .html if retain: return f"{safe_title}{_full_ext_from_original(chapter)}" return f"response_{safe_title}.html" # Check if decimal chapters are enabled enable_decimal = os.getenv('ENABLE_DECIMAL_CHAPTERS', '0') == '1' # For EPUBs with decimal detection enabled if enable_decimal and 'original_basename' in chapter and chapter['original_basename']: basename = chapter['original_basename'] # Check for standard decimal pattern (e.g., Chapter_1.1) decimal_match = re.search(r'(\d+)\.(\d+)', basename) if decimal_match: # Create a modified basename that preserves the decimal base = os.path.splitext(basename)[0] # Replace dots with underscores for filesystem compatibility base = base.replace('.', '_') # Use .txt extension for text files if is_text_file: return f"{base}.txt" if retain else f"response_{base}.txt" else: if retain: return f"{base}{_full_ext_from_original(chapter)}" return f"response_{base}.html" # Check for the special XXXX_YY decimal pattern decimal_prefix_match = re.match(r'^(\d{4})_(\d{1,2})(?:_|\.)?(?:x?html?)?$', basename) if decimal_prefix_match: first_part = decimal_prefix_match.group(1) second_part = decimal_prefix_match.group(2) # If this matches our decimal pattern (e.g., 0002_33 -> 2.33) if len(second_part) == 2 and int(second_part) > 9: chapter_num = int(first_part[-1]) decimal_part = second_part # Create filename reflecting the decimal interpretation if is_text_file: return f"{chapter_num:04d}_{decimal_part}.txt" if retain else f"response_{chapter_num:04d}_{decimal_part}.txt" else: return f"{chapter_num:04d}_{decimal_part}{_full_ext_from_original(chapter)}" if retain else f"response_{chapter_num:04d}_{decimal_part}.html" # Standard EPUB handling - use original basename if 'original_basename' in chapter and chapter['original_basename']: base = os.path.splitext(chapter['original_basename'])[0] # Use .txt extension for text files if is_text_file: return f"{base}.txt" if retain else f"response_{base}.txt" else: if retain: # Preserve the full original extension chain return f"{base}{_full_ext_from_original(chapter)}" return f"response_{base}.html" else: # Text file handling (no original basename) if actual_num is None: actual_num = chapter.get('actual_chapter_num', chapter.get('num', 0)) # Handle decimal chapter numbers from text file splitting if isinstance(actual_num, float): major = int(actual_num) minor = int(round((actual_num - major) * 10)) if is_text_file: return f"{major:04d}_{minor}.txt" if retain else f"response_{major:04d}_{minor}.txt" else: return f"{major:04d}_{minor}.html" if retain else f"response_{major:04d}_{minor}.html" else: if is_text_file: return f"{actual_num:04d}.txt" if retain else f"response_{actual_num:04d}.txt" else: return f"{actual_num:04d}.html" if retain else f"response_{actual_num:04d}.html" # ===================================================== # UNIFIED PROGRESS MANAGER # ===================================================== class ProgressManager: """Unified progress management""" def __init__(self, payloads_dir): self.payloads_dir = payloads_dir self.PROGRESS_FILE = os.path.join(payloads_dir, "translation_progress.json") self.prog = self._init_or_load() def _init_or_load(self): """Initialize or load progress tracking with improved structure""" if os.path.exists(self.PROGRESS_FILE): try: with open(self.PROGRESS_FILE, "r", encoding="utf-8") as pf: prog = json.load(pf) except json.JSONDecodeError as e: print(f"⚠️ Warning: Progress file is corrupted: {e}") print("🔧 Attempting to fix JSON syntax...") try: with open(self.PROGRESS_FILE, "r", encoding="utf-8") as pf: content = pf.read() content = re.sub(r',\s*\]', ']', content) content = re.sub(r',\s*\}', '}', content) prog = json.loads(content) with open(self.PROGRESS_FILE, "w", encoding="utf-8") as pf: json.dump(prog, pf, ensure_ascii=False, indent=2) print("✅ Successfully fixed and saved progress file") except Exception as fix_error: print(f"❌ Could not fix progress file: {fix_error}") print("🔄 Creating backup and starting fresh...") backup_name = f"translation_progress_backup_{int(time.time())}.json" backup_path = os.path.join(self.payloads_dir, backup_name) try: shutil.copy(self.PROGRESS_FILE, backup_path) print(f"📁 Backup saved to: {backup_name}") except: pass prog = { "chapters": {}, "chapter_chunks": {}, "version": "2.0" } if "chapters" not in prog: prog["chapters"] = {} for idx in prog.get("completed", []): prog["chapters"][str(idx)] = { "status": "completed", "timestamp": None } if "chapter_chunks" not in prog: prog["chapter_chunks"] = {} else: prog = { "chapters": {}, "chapter_chunks": {}, "image_chunks": {}, "version": "2.1" } return prog def save(self): """Save progress to file""" try: self.prog["completed_list"] = [] for chapter_key, chapter_info in self.prog.get("chapters", {}).items(): if chapter_info.get("status") == "completed" and chapter_info.get("output_file"): self.prog["completed_list"].append({ "num": chapter_info.get("chapter_num", 0), "idx": chapter_info.get("chapter_idx", 0), "title": f"Chapter {chapter_info.get('chapter_num', 0)}", "file": chapter_info.get("output_file", ""), "key": chapter_key }) if self.prog.get("completed_list"): self.prog["completed_list"].sort(key=lambda x: x["num"]) temp_file = self.PROGRESS_FILE + '.tmp' with open(temp_file, "w", encoding="utf-8") as pf: json.dump(self.prog, pf, ensure_ascii=False, indent=2) if os.path.exists(self.PROGRESS_FILE): os.remove(self.PROGRESS_FILE) os.rename(temp_file, self.PROGRESS_FILE) except Exception as e: print(f"⚠️ Warning: Failed to save progress: {e}") temp_file = self.PROGRESS_FILE + '.tmp' if os.path.exists(temp_file): try: os.remove(temp_file) except: pass def update(self, idx, actual_num, content_hash, output_file, status="in_progress", ai_features=None, raw_num=None): """Update progress for a chapter""" # CHANGE THIS LINE - Use actual_num instead of idx chapter_key = str(actual_num) # WAS: chapter_key = str(idx) chapter_info = { "actual_num": actual_num, "content_hash": content_hash, "output_file": output_file, "status": status, "last_updated": time.time() } # Add raw number tracking if raw_num is not None: chapter_info["raw_chapter_num"] = raw_num # Check if zero detection was disabled if hasattr(builtins, '_DISABLE_ZERO_DETECTION') and builtins._DISABLE_ZERO_DETECTION: chapter_info["zero_adjusted"] = False else: chapter_info["zero_adjusted"] = (raw_num != actual_num) if raw_num is not None else False # FIXED: Store AI features if provided if ai_features is not None: chapter_info["ai_features"] = ai_features # Preserve existing AI features if not overwriting elif chapter_key in self.prog["chapters"] and "ai_features" in self.prog["chapters"][chapter_key]: chapter_info["ai_features"] = self.prog["chapters"][chapter_key]["ai_features"] self.prog["chapters"][chapter_key] = chapter_info def check_chapter_status(self, chapter_idx, actual_num, content_hash, output_dir, chapter_obj=None): """Check if a chapter needs translation""" chapter_key = str(actual_num) # Check if we have tracking for this chapter if chapter_key in self.prog["chapters"]: chapter_info = self.prog["chapters"][chapter_key] status = chapter_info.get("status") # Failed statuses ALWAYS trigger retranslation if status in ["qa_failed", "failed", "error", "file_missing"]: return True, None, None # Completed - check file exists if status in ["completed", "completed_empty", "completed_image_only"]: output_file = chapter_info.get("output_file") if output_file: output_path = os.path.join(output_dir, output_file) if os.path.exists(output_path): return False, f"Chapter {actual_num} already translated: {output_file}", output_file # File missing - retranslate del self.prog["chapters"][chapter_key] if chapter_key in self.prog.get("chapter_chunks", {}): del self.prog["chapter_chunks"][chapter_key] self.save() return True, None, None # Any other status - retranslate return True, None, None # BEFORE auto-discovery, check if ANY entry exists for this chapter's file if chapter_obj: from TransateKRtoEN import FileUtilities output_filename = FileUtilities.create_chapter_filename(chapter_obj, actual_num) # Check if ANY entry has this output file for key, info in self.prog["chapters"].items(): if info.get("output_file") == output_filename: # Entry exists somewhere else - don't auto-discover return True, None, None # NOW check if file exists for auto-discovery output_path = os.path.join(output_dir, output_filename) if os.path.exists(output_path): print(f"📁 Found existing file for chapter {actual_num}: {output_filename}") self.prog["chapters"][chapter_key] = { "actual_num": actual_num, "content_hash": content_hash, "output_file": output_filename, "status": "completed", "last_updated": os.path.getmtime(output_path), "auto_discovered": True } self.save() return False, f"Chapter {actual_num} already exists: {output_filename}", output_filename # No entry and no file - needs translation return True, None, None def cleanup_missing_files(self, output_dir): """Remove missing files and duplicates - NO RESTORATION BULLSHIT""" cleaned_count = 0 # Remove entries for missing files for chapter_key, chapter_info in list(self.prog["chapters"].items()): output_file = chapter_info.get("output_file") if output_file: output_path = os.path.join(output_dir, output_file) if not os.path.exists(output_path): print(f"🗑️ Removing entry for missing file: {output_file}") # Delete the entry del self.prog["chapters"][chapter_key] # Remove chunk data if chapter_key in self.prog.get("chapter_chunks", {}): del self.prog["chapter_chunks"][chapter_key] cleaned_count += 1 if cleaned_count > 0: print(f"🔄 Removed {cleaned_count} entries - will retranslate") def migrate_to_content_hash(self, chapters): """Change keys to match actual_num values for proper mapping and sort by chapter number""" new_chapters = {} migrated_count = 0 for old_key, chapter_info in self.prog["chapters"].items(): actual_num = chapter_info.get("actual_num") if actual_num is not None: new_key = str(actual_num) # If key needs to change if old_key != new_key: print(f" Migrating: key '{old_key}' → '{new_key}' (actual_num: {actual_num})") migrated_count += 1 # Check for collision if new_key in new_chapters: print(f" ⚠️ Warning: Key '{new_key}' already exists, keeping newer entry") if chapter_info.get("last_updated", 0) > new_chapters[new_key].get("last_updated", 0): new_chapters[new_key] = chapter_info else: new_chapters[new_key] = chapter_info else: # Key already matches actual_num new_chapters[old_key] = chapter_info else: # No actual_num, keep as-is print(f" ⚠️ Warning: No actual_num for key '{old_key}', keeping as-is") new_chapters[old_key] = chapter_info # Sort chapters by actual_num field, then by key as fallback def sort_key(item): key, chapter_info = item actual_num = chapter_info.get("actual_num") if actual_num is not None: return actual_num else: # Fallback to key if no actual_num try: return int(key) except ValueError: # For non-numeric keys, sort them at the end return float('inf') sorted_chapters = dict(sorted(new_chapters.items(), key=sort_key)) if migrated_count > 0: # Also migrate and sort chapter_chunks if they exist if "chapter_chunks" in self.prog: new_chunks = {} for old_key, chunk_data in self.prog["chapter_chunks"].items(): if old_key in self.prog["chapters"] and "actual_num" in self.prog["chapters"][old_key]: new_key = str(self.prog["chapters"][old_key]["actual_num"]) new_chunks[new_key] = chunk_data else: new_chunks[old_key] = chunk_data # Sort chapter_chunks using the same sorting logic sorted_chunks = dict(sorted(new_chunks.items(), key=sort_key)) self.prog["chapter_chunks"] = sorted_chunks self.prog["chapters"] = sorted_chapters self.save() print(f"✅ Migrated {migrated_count} entries to use actual_num as key and sorted by chapter number") else: # Even if no migration occurred, still apply sorting self.prog["chapters"] = sorted_chapters if "chapter_chunks" in self.prog: sorted_chunks = dict(sorted(self.prog["chapter_chunks"].items(), key=sort_key)) self.prog["chapter_chunks"] = sorted_chunks self.save() print("✅ Sorted chapters by chapter number") def get_stats(self, output_dir): """Get statistics about translation progress""" stats = { "total_tracked": len(self.prog["chapters"]), "completed": 0, "missing_files": 0, "in_progress": 0 } for chapter_info in self.prog["chapters"].values(): status = chapter_info.get("status") output_file = chapter_info.get("output_file") if status == "completed" and output_file: output_path = os.path.join(output_dir, output_file) if os.path.exists(output_path): stats["completed"] += 1 else: stats["missing_files"] += 1 elif status == "in_progress": stats["in_progress"] += 1 elif status == "file_missing": stats["missing_files"] += 1 return stats # ===================================================== # UNIFIED CONTENT PROCESSOR # ===================================================== class ContentProcessor: """Unified content processing""" @staticmethod def clean_ai_artifacts(text, remove_artifacts=True): """Remove AI response artifacts from text - but ONLY when enabled""" if not remove_artifacts: return text # First, remove thinking tags if they exist text = ContentProcessor._remove_thinking_tags(text) # After removing thinking tags, re-analyze the text structure # to catch AI artifacts that may now be at the beginning lines = text.split('\n') # Clean up empty lines at the beginning while lines and not lines[0].strip(): lines.pop(0) if not lines: return text # Check the first non-empty line for AI artifacts first_line = lines[0].strip() ai_patterns = [ r'^(?:Sure|Okay|Understood|Of course|Got it|Alright|Certainly|Here\'s|Here is)', r'^(?:I\'ll|I will|Let me) (?:translate|help|assist)', r'^(?:System|Assistant|AI|User|Human|Model)\s*:', r'^\[PART\s+\d+/\d+\]', r'^(?:Translation note|Note|Here\'s the translation|I\'ve translated)', r'^```(?:html|xml|text)?\s*$', # Enhanced code block detection r'^', remaining_text, re.IGNORECASE) or len(remaining_text.strip()) > 50): # Reduced from 100 to 50 print(f"✂️ Removed AI artifact: {first_line[:50]}...") return remaining_text.lstrip() if first_line.lower() in ['html', 'text', 'content', 'translation', 'output']: remaining_lines = lines[1:] remaining_text = '\n'.join(remaining_lines) if remaining_text.strip(): print(f"✂️ Removed single word artifact: {first_line}") return remaining_text.lstrip() return '\n'.join(lines) @staticmethod def _remove_thinking_tags(text): """Remove thinking tags that some AI models produce""" if not text: return text # Common thinking tag patterns used by various AI models thinking_patterns = [ # XML-style thinking tags (r'.*?', 'thinking'), (r'.*?', 'think'), (r'.*?', 'thoughts'), (r'.*?', 'reasoning'), (r'.*?', 'analysis'), (r'.*?', 'reflection'), # OpenAI o1-style reasoning blocks - fix the regex escaping (r'<\|thinking\|>.*?', 'o1-thinking'), # Claude-style thinking blocks (r'\[thinking\].*?\[/thinking\]', 'claude-thinking'), # Generic bracketed thinking patterns (r'\[THINKING\].*?\[/THINKING\]', 'bracketed-thinking'), (r'\[ANALYSIS\].*?\[/ANALYSIS\]', 'bracketed-analysis'), ] original_text = text removed_count = 0 for pattern, tag_type in thinking_patterns: # Use DOTALL flag to match across newlines matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE) if matches: text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE) removed_count += len(matches) # Also remove standalone code block markers that might be artifacts # But preserve all actual content - only remove the ``` markers themselves code_block_removed = 0 code_block_patterns = [ (r'^```\w*\s*\n', '\n'), # Opening code blocks - replace with newline (r'\n```\s*$', ''), # Closing code blocks at end - remove entirely (r'^```\w*\s*$', ''), # Standalone ``` on its own line - remove entirely ] for pattern, replacement in code_block_patterns: matches = re.findall(pattern, text, re.MULTILINE) if matches: text = re.sub(pattern, replacement, text, flags=re.MULTILINE) code_block_removed += len(matches) # Clean up any extra whitespace or empty lines left after removing thinking tags total_removed = removed_count + code_block_removed if total_removed > 0: # Remove multiple consecutive newlines text = re.sub(r'\n\s*\n\s*\n', '\n\n', text) # Remove leading/trailing whitespace text = text.strip() if removed_count > 0 and code_block_removed > 0: print(f"🧠 Removed {removed_count} thinking tag(s) and {code_block_removed} code block marker(s)") elif removed_count > 0: print(f"🧠 Removed {removed_count} thinking tag(s)") elif code_block_removed > 0: print(f"📝 Removed {code_block_removed} code block marker(s)") return text @staticmethod def clean_memory_artifacts(text): """Remove any memory/summary artifacts that leaked into the translation""" text = re.sub(r'\[MEMORY\].*?\[END MEMORY\]', '', text, flags=re.DOTALL) lines = text.split('\n') cleaned_lines = [] skip_next = False for line in lines: if any(marker in line for marker in ['[MEMORY]', '[END MEMORY]', 'Previous context summary:', 'memory summary', 'context summary', '[Context]']): skip_next = True continue if skip_next and line.strip() == '': skip_next = False continue skip_next = False cleaned_lines.append(line) return '\n'.join(cleaned_lines) @staticmethod def emergency_restore_paragraphs(text, original_html=None, verbose=True): """Emergency restoration when AI returns wall of text without proper paragraph tags""" def log(message): if verbose: print(message) if text.count('

') >= 3: return text if original_html: original_para_count = original_html.count('

') current_para_count = text.count('

') if current_para_count < original_para_count / 2: log(f"⚠️ Paragraph mismatch! Original: {original_para_count}, Current: {current_para_count}") log("🔧 Attempting emergency paragraph restoration...") if '

' not in text and len(text) > 300: log("❌ No paragraph tags found - applying emergency restoration") if '\n\n' in text: parts = text.split('\n\n') paragraphs = ['

' + part.strip() + '

' for part in parts if part.strip()] return '\n'.join(paragraphs) dialogue_pattern = r'(?<=[.!?])\s+(?=[""\u201c\u201d])' if re.search(dialogue_pattern, text): parts = re.split(dialogue_pattern, text) paragraphs = [] for part in parts: part = part.strip() if part: if not part.startswith('

'): part = '

' + part if not part.endswith('

'): part = part + '

' paragraphs.append(part) return '\n'.join(paragraphs) sentence_boundary = r'(?<=[.!?])\s+(?=[A-Z\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af])' sentences = re.split(sentence_boundary, text) if len(sentences) > 1: paragraphs = [] current_para = [] for sentence in sentences: sentence = sentence.strip() if not sentence: continue current_para.append(sentence) should_break = ( len(current_para) >= 3 or sentence.rstrip().endswith(('"', '"', '"')) or '* * *' in sentence or '***' in sentence or '---' in sentence ) if should_break: para_text = ' '.join(current_para) if not para_text.startswith('

'): para_text = '

' + para_text if not para_text.endswith('

'): para_text = para_text + '

' paragraphs.append(para_text) current_para = [] if current_para: para_text = ' '.join(current_para) if not para_text.startswith('

'): para_text = '

' + para_text if not para_text.endswith('

'): para_text = para_text + '

' paragraphs.append(para_text) result = '\n'.join(paragraphs) log(f"✅ Restored {len(paragraphs)} paragraphs from wall of text") return result words = text.split() if len(words) > 100: paragraphs = [] words_per_para = max(100, len(words) // 10) for i in range(0, len(words), words_per_para): chunk = ' '.join(words[i:i + words_per_para]) if chunk.strip(): paragraphs.append('

' + chunk.strip() + '

') return '\n'.join(paragraphs) elif '

' in text and text.count('

') < 3 and len(text) > 1000: log("⚠️ Very few paragraphs for long text - checking if more breaks needed") soup = BeautifulSoup(text, 'html.parser') existing_paras = soup.find_all('p') new_paragraphs = [] for para in existing_paras: para_text = para.get_text() if len(para_text) > 500: sentences = re.split(r'(?<=[.!?])\s+', para_text) if len(sentences) > 5: chunks = [] current = [] for sent in sentences: current.append(sent) if len(current) >= 3: chunks.append('

' + ' '.join(current) + '

') current = [] if current: chunks.append('

' + ' '.join(current) + '

') new_paragraphs.extend(chunks) else: new_paragraphs.append(str(para)) else: new_paragraphs.append(str(para)) return '\n'.join(new_paragraphs) return text @staticmethod def get_content_hash(html_content): """Create a stable hash of content""" try: soup = BeautifulSoup(html_content, 'html.parser') for tag in soup(['script', 'style', 'meta', 'link']): tag.decompose() text_content = soup.get_text(separator=' ', strip=True) text_content = ' '.join(text_content.split()) return hashlib.md5(text_content.encode('utf-8')).hexdigest() except Exception as e: print(f"[WARNING] Failed to create hash: {e}") return hashlib.md5(html_content.encode('utf-8')).hexdigest() @staticmethod def is_meaningful_text_content(html_content): """Check if chapter has meaningful text beyond just structure""" try: # Check if this is plain text from enhanced extraction (html2text output) # html2text output characteristics: # - Often starts with # for headers # - Contains markdown-style formatting # - Doesn't have HTML tags content_stripped = html_content.strip() # Quick check for plain text/markdown content is_plain_text = False if content_stripped and ( not content_stripped.startswith('<') or # Doesn't start with HTML tag content_stripped.startswith('#') or # Markdown header '\n\n' in content_stripped[:500] or # Markdown paragraphs not '

' in content_stripped[:500] and not '

' in content_stripped[:500] # No common HTML tags ): # This looks like plain text or markdown from html2text is_plain_text = True if is_plain_text: # For plain text, just check the length text_length = len(content_stripped) # Be more lenient with plain text since it's already extracted return text_length > 50 # Much lower threshold for plain text # Original HTML parsing logic soup = BeautifulSoup(html_content, 'html.parser') soup_copy = BeautifulSoup(str(soup), 'html.parser') for img in soup_copy.find_all('img'): img.decompose() text_elements = soup_copy.find_all(['p', 'div', 'span']) text_content = ' '.join(elem.get_text(strip=True) for elem in text_elements) headers = soup_copy.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) header_text = ' '.join(h.get_text(strip=True) for h in headers) if headers and len(text_content.strip()) > 1: return True if len(text_content.strip()) > 200: return True if len(header_text.strip()) > 100: return True return False except Exception as e: print(f"Warning: Error checking text content: {e}") return True # ===================================================== # UNIFIED CHAPTER EXTRACTOR # ===================================================== class ChapterExtractor: """Unified chapter extraction with three modes: Smart, Comprehensive, and Full""" def __init__(self, progress_callback=None): self.pattern_manager = PatternManager() self.progress_callback = progress_callback # Add progress callback self.parser = self._get_best_parser() # Determine best parser on init def _get_best_parser(self): """Determine the best parser available, preferring lxml for CJK text""" try: import lxml return 'lxml' except ImportError: return 'html.parser' def _sort_by_opf_spine(self, chapters, opf_path): """Sort chapters according to OPF spine order""" try: import xml.etree.ElementTree as ET # Read OPF file with open(opf_path, 'r', encoding='utf-8') as f: opf_content = f.read() # Parse OPF root = ET.fromstring(opf_content) # Find namespaces ns = {'opf': 'http://www.idpf.org/2007/opf'} if root.tag.startswith('{'): default_ns = root.tag[1:root.tag.index('}')] ns = {'opf': default_ns} # Build manifest map (id -> href) manifest = {} for item in root.findall('.//opf:manifest/opf:item', ns): item_id = item.get('id') href = item.get('href') if item_id and href: manifest[item_id] = href # Get spine order spine_order = [] spine = root.find('.//opf:spine', ns) if spine is not None: for itemref in spine.findall('opf:itemref', ns): idref = itemref.get('idref') if idref and idref in manifest: href = manifest[idref] spine_order.append(href) if not spine_order: print("⚠️ No spine order found in OPF, keeping original order") return chapters # Create a mapping of filenames to spine position spine_map = {} for idx, href in enumerate(spine_order): # Try different matching strategies basename = os.path.basename(href) spine_map[basename] = idx spine_map[href] = idx # Also store without extension for flexible matching name_no_ext = os.path.splitext(basename)[0] spine_map[name_no_ext] = idx print(f"📋 OPF spine contains {len(spine_order)} items") # Sort chapters based on spine order def get_spine_position(chapter): # Try to match chapter to spine filename = chapter.get('filename', '') basename = chapter.get('original_basename', '') # Try exact filename match if filename in spine_map: return spine_map[filename] # Try basename match if basename in spine_map: return spine_map[basename] # Try basename of filename if filename: fname_base = os.path.basename(filename) if fname_base in spine_map: return spine_map[fname_base] # Try without extension if basename: if basename + '.html' in spine_map: return spine_map[basename + '.html'] if basename + '.xhtml' in spine_map: return spine_map[basename + '.xhtml'] # Fallback to chapter number * 1000 (to sort after spine items) return 1000000 + chapter.get('num', 0) # Sort chapters sorted_chapters = sorted(chapters, key=get_spine_position) # Renumber chapters based on new order for idx, chapter in enumerate(sorted_chapters, 1): chapter['spine_order'] = idx # Optionally update chapter numbers to match spine order # chapter['num'] = idx # Uncomment if you want to renumber # Log reordering info reordered_count = 0 for idx, chapter in enumerate(sorted_chapters): original_idx = chapters.index(chapter) if original_idx != idx: reordered_count += 1 if reordered_count > 0: print(f"🔄 Reordered {reordered_count} chapters to match OPF spine") else: print(f"✅ Chapter order already matches OPF spine") return sorted_chapters except Exception as e: print(f"⚠️ Could not sort by OPF spine: {e}") import traceback traceback.print_exc() return chapters def protect_angle_brackets_with_korean(self, text: str) -> str: """Protect CJK text in angle brackets from HTML parsing""" if text is None: return "" import re # Extended pattern to include Korean, Chinese, and Japanese characters cjk_pattern = r'[가-힣ㄱ-ㅎㅏ-ㅣ一-龿ぁ-ゟァ-ヿ]' bracket_pattern = rf'<([^<>]*{cjk_pattern}[^<>]*)>' def replace_brackets(match): content = match.group(1) return f'<{content}>' return re.sub(bracket_pattern, replace_brackets, text) def ensure_all_opf_chapters_extracted(zf, chapters, out): """Ensure ALL chapters from OPF spine are extracted, not just what ChapterExtractor found""" # Parse OPF to get ALL chapters in spine opf_chapters = [] try: # Find content.opf opf_content = None for name in zf.namelist(): if name.endswith('content.opf'): opf_content = zf.read(name) break if not opf_content: return chapters # No OPF, return original import xml.etree.ElementTree as ET root = ET.fromstring(opf_content) # Handle namespaces ns = {'opf': 'http://www.idpf.org/2007/opf'} if root.tag.startswith('{'): default_ns = root.tag[1:root.tag.index('}')] ns = {'opf': default_ns} # Get manifest manifest = {} for item in root.findall('.//opf:manifest/opf:item', ns): item_id = item.get('id') href = item.get('href') media_type = item.get('media-type', '') if item_id and href and ('html' in media_type.lower() or href.endswith(('.html', '.xhtml', '.htm'))): manifest[item_id] = href # Get spine order spine = root.find('.//opf:spine', ns) if spine: for itemref in spine.findall('opf:itemref', ns): idref = itemref.get('idref') if idref and idref in manifest: href = manifest[idref] filename = os.path.basename(href) # Skip nav, toc, cover if any(skip in filename.lower() for skip in ['nav', 'toc', 'cover']): continue opf_chapters.append(href) print(f"📚 OPF spine contains {len(opf_chapters)} chapters") # Check which OPF chapters are missing from extraction extracted_files = set() for c in chapters: if 'filename' in c: extracted_files.add(c['filename']) if 'original_basename' in c: extracted_files.add(c['original_basename']) missing_chapters = [] for opf_chapter in opf_chapters: basename = os.path.basename(opf_chapter) if basename not in extracted_files and opf_chapter not in extracted_files: missing_chapters.append(opf_chapter) if missing_chapters: print(f"⚠️ {len(missing_chapters)} chapters in OPF but not extracted!") print(f" Missing: {missing_chapters[:5]}{'...' if len(missing_chapters) > 5 else ''}") # Extract the missing chapters for href in missing_chapters: try: # Read the chapter content content = zf.read(href).decode('utf-8') # Extract chapter number import re basename = os.path.basename(href) matches = re.findall(r'(\d+)', basename) if matches: chapter_num = int(matches[-1]) else: chapter_num = len(chapters) + 1 # Create chapter entry from bs4 import BeautifulSoup parser = 'lxml' if 'lxml' in sys.modules else 'html.parser' soup = BeautifulSoup(content, parser) # Get title title = "Chapter " + str(chapter_num) title_tag = soup.find('title') if title_tag: title = title_tag.get_text().strip() or title else: for tag in ['h1', 'h2', 'h3']: header = soup.find(tag) if header: title = header.get_text().strip() or title break # Save the chapter file output_filename = f"chapter_{chapter_num:04d}_{basename}" output_path = os.path.join(out, output_filename) with open(output_path, 'w', encoding='utf-8') as f: f.write(content) # Add to chapters list new_chapter = { 'num': chapter_num, 'title': title, 'body': content, 'filename': href, 'original_basename': basename, 'file_size': len(content), 'has_images': bool(soup.find_all('img')), 'detection_method': 'opf_recovery', 'content_hash': None # Will be calculated later } chapters.append(new_chapter) print(f" ✅ Recovered chapter {chapter_num}: {basename}") except Exception as e: print(f" ❌ Failed to extract {href}: {e}") # Re-sort chapters by number chapters.sort(key=lambda x: x['num']) print(f"✅ Total chapters after OPF recovery: {len(chapters)}") except Exception as e: print(f"⚠️ Error checking OPF chapters: {e}") import traceback traceback.print_exc() return chapters def extract_chapters(self, zf, output_dir): """Extract chapters and all resources from EPUB using ThreadPoolExecutor""" import time # Check stop at the very beginning if is_stop_requested(): print("❌ Extraction stopped by user") return [] print("🚀 Starting EPUB extraction with ThreadPoolExecutor...") print(f"📄 Using parser: {self.parser} {'(optimized for CJK)' if self.parser == 'lxml' else '(standard)'}") # Initial progress if self.progress_callback: self.progress_callback("Starting EPUB extraction...") # First, extract and save content.opf for reference for name in zf.namelist(): if name.endswith('.opf'): try: opf_content = zf.read(name).decode('utf-8', errors='ignore') opf_output_path = os.path.join(output_dir, 'content.opf') with open(opf_output_path, 'w', encoding='utf-8') as f: f.write(opf_content) print(f"📋 Saved OPF file: {name} → content.opf") break except Exception as e: print(f"⚠️ Could not save OPF file: {e}") # Get extraction mode from environment extraction_mode = os.getenv("EXTRACTION_MODE", "smart").lower() print(f"✅ Using {extraction_mode.capitalize()} extraction mode") # Get number of workers from environment or use default max_workers = int(os.getenv("EXTRACTION_WORKERS", "2")) print(f"🔧 Using {max_workers} workers for parallel processing") extracted_resources = self._extract_all_resources(zf, output_dir) # Check stop after resource extraction if is_stop_requested(): print("❌ Extraction stopped by user") return [] metadata_path = os.path.join(output_dir, 'metadata.json') if os.path.exists(metadata_path): print("📋 Loading existing metadata...") with open(metadata_path, 'r', encoding='utf-8') as f: metadata = json.load(f) else: print("📋 Extracting fresh metadata...") metadata = self._extract_epub_metadata(zf) print(f"📋 Extracted metadata: {list(metadata.keys())}") chapters, detected_language = self._extract_chapters_universal(zf, extraction_mode) # Sort chapters according to OPF spine order if available opf_path = os.path.join(output_dir, 'content.opf') if os.path.exists(opf_path) and chapters: print("📋 Sorting chapters according to OPF spine order...") chapters = self._sort_by_opf_spine(chapters, opf_path) print(f"✅ Chapters sorted according to OPF reading order") # Check stop after chapter extraction if is_stop_requested(): print("❌ Extraction stopped by user") return [] if not chapters: print("❌ No chapters could be extracted!") return [] chapters_info_path = os.path.join(output_dir, 'chapters_info.json') chapters_info = [] chapters_info_lock = threading.Lock() def process_chapter(chapter): """Process a single chapter""" # Check stop in worker if is_stop_requested(): return None info = { 'num': chapter['num'], 'title': chapter['title'], 'original_filename': chapter.get('filename', ''), 'has_images': chapter.get('has_images', False), 'image_count': chapter.get('image_count', 0), 'text_length': chapter.get('file_size', len(chapter.get('body', ''))), 'detection_method': chapter.get('detection_method', 'unknown'), 'content_hash': chapter.get('content_hash', '') } if chapter.get('has_images'): try: soup = BeautifulSoup(chapter.get('body', ''), self.parser) images = soup.find_all('img') info['images'] = [img.get('src', '') for img in images] except: info['images'] = [] return info # Process chapters in parallel print(f"🔄 Processing {len(chapters)} chapters in parallel...") if self.progress_callback: self.progress_callback(f"Processing {len(chapters)} chapters...") with ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit all tasks future_to_chapter = { executor.submit(process_chapter, chapter): chapter for chapter in chapters } # Process completed tasks completed = 0 for future in as_completed(future_to_chapter): if is_stop_requested(): print("❌ Extraction stopped by user") # Cancel remaining futures for f in future_to_chapter: f.cancel() return [] try: result = future.result() if result: with chapters_info_lock: chapters_info.append(result) completed += 1 # Yield to GUI periodically (can be disabled for max speed) if completed % 5 == 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1": time.sleep(0.001) # Progress updates if completed % 10 == 0 or completed == len(chapters): progress_msg = f"Processed {completed}/{len(chapters)} chapters" print(f" 📊 {progress_msg}") if self.progress_callback: self.progress_callback(progress_msg) except Exception as e: chapter = future_to_chapter[future] print(f" ❌ Error processing chapter {chapter['num']}: {e}") # Sort chapters_info by chapter number to maintain order chapters_info.sort(key=lambda x: x['num']) print(f"✅ Successfully processed {len(chapters_info)} chapters") with open(chapters_info_path, 'w', encoding='utf-8') as f: json.dump(chapters_info, f, ensure_ascii=False, indent=2) print(f"💾 Saved detailed chapter info to: chapters_info.json") metadata.update({ 'chapter_count': len(chapters), 'detected_language': detected_language, 'extracted_resources': extracted_resources, 'extraction_mode': extraction_mode, 'extraction_summary': { 'total_chapters': len(chapters), 'chapter_range': f"{chapters[0]['num']}-{chapters[-1]['num']}", 'resources_extracted': sum(len(files) for files in extracted_resources.values()) } }) metadata['chapter_titles'] = { str(c['num']): c['title'] for c in chapters } with open(metadata_path, 'w', encoding='utf-8') as f: json.dump(metadata, f, ensure_ascii=False, indent=2) print(f"💾 Saved comprehensive metadata to: {metadata_path}") self._create_extraction_report(output_dir, metadata, chapters, extracted_resources) self._log_extraction_summary(chapters, extracted_resources, detected_language) print(f"🔍 VERIFICATION: {extraction_mode.capitalize()} chapter extraction completed successfully") print(f"⚡ Used {max_workers} workers for parallel processing") return chapters def _extract_all_resources(self, zf, output_dir): """Extract all resources with parallel processing""" import time extracted_resources = { 'css': [], 'fonts': [], 'images': [], 'epub_structure': [], 'other': [] } # Check if already extracted extraction_marker = os.path.join(output_dir, '.resources_extracted') if os.path.exists(extraction_marker): print("📦 Resources already extracted, skipping...") return self._count_existing_resources(output_dir, extracted_resources) self._cleanup_old_resources(output_dir) # Create directories for resource_type in ['css', 'fonts', 'images']: os.makedirs(os.path.join(output_dir, resource_type), exist_ok=True) print(f"📦 Extracting resources in parallel...") # Get list of files to process file_list = [f for f in zf.namelist() if not f.endswith('/') and os.path.basename(f)] # Thread-safe lock for extracted_resources resource_lock = threading.Lock() def extract_single_resource(file_path): if is_stop_requested(): return None try: file_data = zf.read(file_path) resource_info = self._categorize_resource(file_path, os.path.basename(file_path)) if resource_info: resource_type, target_dir, safe_filename = resource_info target_path = os.path.join(output_dir, target_dir, safe_filename) if target_dir else os.path.join(output_dir, safe_filename) with open(target_path, 'wb') as f: f.write(file_data) # Thread-safe update with resource_lock: extracted_resources[resource_type].append(safe_filename) return (resource_type, safe_filename) except Exception as e: print(f"[WARNING] Failed to extract {file_path}: {e}") return None # Process files in parallel total_resources = len(file_list) extracted_count = 0 # Use same worker count as chapter processing resource_workers = int(os.getenv("EXTRACTION_WORKERS", "2")) with ThreadPoolExecutor(max_workers=resource_workers) as executor: futures = {executor.submit(extract_single_resource, file_path): file_path for file_path in file_list} for future in as_completed(futures): if is_stop_requested(): executor.shutdown(wait=False) break extracted_count += 1 # Progress update every 20 files if extracted_count % 20 == 0 and self.progress_callback: self.progress_callback(f"Extracting resources: {extracted_count}/{total_resources}") # Yield to GUI periodically (can be disabled for max speed) if extracted_count % 10 == 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1": time.sleep(0.001) result = future.result() if result: resource_type, filename = result # Only print for important resources if extracted_count < 10 or resource_type in ['css', 'fonts']: print(f" 📄 Extracted {resource_type}: {filename}") # Mark as complete with open(extraction_marker, 'w') as f: f.write(f"Resources extracted at {time.time()}") self._validate_critical_files(output_dir, extracted_resources) return extracted_resources def _extract_chapters_universal(self, zf, extraction_mode="smart"): """Universal chapter extraction with four modes: smart, comprehensive, full, enhanced All modes now properly merge Section/Chapter pairs Enhanced mode uses html2text for superior text processing Now with parallel processing for improved performance """ # Check stop at the beginning if is_stop_requested(): print("❌ Chapter extraction stopped by user") return [], 'unknown' # Import time for yielding import time # Initialize enhanced extractor if using enhanced mode enhanced_extractor = None enhanced_filtering = extraction_mode # Default fallback preserve_structure = True # Independent control: translate cover.html when requested translate_cover_html = os.getenv("TRANSLATE_COVER_HTML", "0") == "1" if extraction_mode == "enhanced": print("🚀 Initializing Enhanced extraction mode with html2text...") # Get enhanced mode configuration from environment enhanced_filtering = os.getenv("ENHANCED_FILTERING", "smart") # Avoid 'full' with html2text to prevent XML declaration artifacts; use 'comprehensive' instead if str(enhanced_filtering).lower() == 'full': enhanced_filtering = 'comprehensive' preserve_structure = os.getenv("ENHANCED_PRESERVE_STRUCTURE", "1") == "1" print(f" • Enhanced filtering level: {enhanced_filtering}") print(f" • Preserve structure: {preserve_structure}") # Try to initialize enhanced extractor try: # Import our enhanced extractor (assume it's in the same directory or importable) from enhanced_text_extractor import EnhancedTextExtractor enhanced_extractor = EnhancedTextExtractor( filtering_mode=enhanced_filtering, preserve_structure=preserve_structure ) print("✅ Enhanced text extractor initialized successfully") except ImportError as e: print(f"❌ Enhanced text extractor module not found: {e}") print(f"❌ Cannot use enhanced extraction mode. Please install enhanced_text_extractor or select a different extraction mode.") raise e except Exception as e: print(f"❌ Enhanced extractor initialization failed: {e}") print(f"❌ Cannot use enhanced extraction mode. Please select a different extraction mode.") raise e chapters = [] sample_texts = [] # First phase: Collect HTML files html_files = [] file_list = zf.namelist() total_files = len(file_list) # Update progress for file collection if self.progress_callback and total_files > 100: self.progress_callback(f"Scanning {total_files} files in EPUB...") for idx, name in enumerate(file_list): # Check stop while collecting files if is_stop_requested(): print("❌ Chapter extraction stopped by user") return [], 'unknown' # Yield to GUI every 50 files (can be disabled for max speed) if idx % 50 == 0 and idx > 0: if os.getenv("ENABLE_GUI_YIELD", "1") == "1": time.sleep(0.001) # Brief yield to GUI if self.progress_callback and total_files > 100: self.progress_callback(f"Scanning files: {idx}/{total_files}") if name.lower().endswith(('.xhtml', '.html', '.htm')): # Skip cover files by default unless override is enabled basename = os.path.basename(name).lower() if basename in ['cover.html', 'cover.xhtml', 'cover.htm'] and not translate_cover_html: print(f"[SKIP] Cover file excluded from all modes: {name}") continue # Apply filtering based on the actual extraction mode (or enhanced_filtering for enhanced mode) current_filtering = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode if current_filtering == "smart": # Smart mode: aggressive filtering lower_name = name.lower() if any(skip in lower_name for skip in [ 'nav', 'toc', 'contents', 'title', 'index', 'copyright', 'acknowledgment', 'dedication' ]): continue elif current_filtering == "comprehensive": # Comprehensive mode: moderate filtering skip_keywords = ['nav.', 'toc.', 'contents.', 'copyright.'] basename = os.path.basename(name.lower()) should_skip = False for skip in skip_keywords: if basename == skip + 'xhtml' or basename == skip + 'html' or basename == skip + 'htm': should_skip = True break if should_skip: print(f"[SKIP] Navigation/TOC file: {name}") continue # else: full mode - no filtering at all (except cover which is filtered above) html_files.append(name) # Update mode description to include enhanced mode mode_description = { "smart": "potential content files", "comprehensive": "HTML files", "full": "ALL HTML/XHTML files (no filtering)", "enhanced": f"files (enhanced with {enhanced_filtering} filtering)" } print(f"📚 Found {len(html_files)} {mode_description.get(extraction_mode, 'files')} in EPUB") # Sort files to ensure proper order html_files.sort() # Check if merging is disabled via environment variable disable_merging = os.getenv("DISABLE_CHAPTER_MERGING", "0") == "1" processed_files = set() merge_candidates = {} # Store potential merges without reading files yet if disable_merging: print("📌 Chapter merging is DISABLED - processing all files independently") else: print("📌 Chapter merging is ENABLED") # Only do merging logic if not disabled file_groups = {} # Group files by their base number to detect Section/Chapter pairs for file_path in html_files: filename = os.path.basename(file_path) # Try different patterns to extract base number base_num = None # Pattern 1: "No00014" from "No00014Section.xhtml" match = re.match(r'(No\d+)', filename) if match: base_num = match.group(1) else: # Pattern 2: "0014" from "0014_section.html" or "0014_chapter.html" match = re.match(r'^(\d+)[_\-]', filename) if match: base_num = match.group(1) else: # Pattern 3: Just numbers at the start match = re.match(r'^(\d+)', filename) if match: base_num = match.group(1) if base_num: if base_num not in file_groups: file_groups[base_num] = [] file_groups[base_num].append(file_path) # Identify merge candidates WITHOUT reading files yet for base_num, group_files in sorted(file_groups.items()): if len(group_files) == 2: # Check if we have a Section/Chapter pair based on filenames only section_file = None chapter_file = None for file_path in group_files: basename = os.path.basename(file_path) # More strict detection - must have 'section' or 'chapter' in the filename if 'section' in basename.lower() and 'chapter' not in basename.lower(): section_file = file_path elif 'chapter' in basename.lower() and 'section' not in basename.lower(): chapter_file = file_path if section_file and chapter_file: # Store as potential merge candidate merge_candidates[chapter_file] = section_file processed_files.add(section_file) print(f"[DEBUG] Potential merge candidate: {base_num}") print(f" Section: {os.path.basename(section_file)}") print(f" Chapter: {os.path.basename(chapter_file)}") # Filter out section files that were marked for merging files_to_process = [] for file_path in html_files: if not disable_merging and file_path in processed_files: print(f"[DEBUG] Skipping section file: {file_path}") continue files_to_process.append(file_path) print(f"📚 Processing {len(files_to_process)} files after merge analysis") # Thread-safe collections sample_texts_lock = threading.Lock() file_size_groups_lock = threading.Lock() h1_count_lock = threading.Lock() h2_count_lock = threading.Lock() # Initialize counters file_size_groups = {} h1_count = 0 h2_count = 0 processed_count = 0 processed_count_lock = threading.Lock() # Progress tracking total_files = len(files_to_process) # Function to process a single HTML file def process_single_html_file(file_path, file_index): nonlocal h1_count, h2_count, processed_count # Check stop if is_stop_requested(): return None # Update progress with processed_count_lock: processed_count += 1 current_count = processed_count if self.progress_callback and current_count % 5 == 0: progress_msg = f"Processing chapters: {current_count}/{total_files} ({current_count*100//total_files}%)" self.progress_callback(progress_msg) try: # Read file data file_data = zf.read(file_path) # Decode the file data html_content = None detected_encoding = None for encoding in ['utf-8', 'utf-16', 'gb18030', 'shift_jis', 'euc-kr', 'gbk', 'big5']: try: html_content = file_data.decode(encoding) detected_encoding = encoding break except UnicodeDecodeError: continue if not html_content: print(f"[WARNING] Could not decode {file_path}") return None # Check if this file needs merging if not disable_merging and file_path in merge_candidates: section_file = merge_candidates[file_path] print(f"[DEBUG] Processing merge for: {file_path}") try: # Read section file section_data = zf.read(section_file) section_html = None for encoding in ['utf-8', 'utf-16', 'gb18030', 'shift_jis', 'euc-kr', 'gbk', 'big5']: try: section_html = section_data.decode(encoding) break except UnicodeDecodeError: continue if section_html: # Quick check if section is small enough to merge section_soup = BeautifulSoup(section_html, self.parser) section_text = section_soup.get_text(strip=True) if len(section_text) < 200: # Merge if section is small # Extract body content chapter_soup = BeautifulSoup(html_content, self.parser) if section_soup.body: section_body_content = ''.join(str(child) for child in section_soup.body.children) else: section_body_content = section_html if chapter_soup.body: chapter_body_content = ''.join(str(child) for child in chapter_soup.body.children) else: chapter_body_content = html_content # Merge content html_content = section_body_content + "\n
\n" + chapter_body_content print(f" → MERGED: Section ({len(section_text)} chars) + Chapter") else: print(f" → NOT MERGED: Section too large ({len(section_text)} chars)") # Remove from processed files so it gets processed separately processed_files.discard(section_file) except Exception as e: print(f"[WARNING] Failed to merge {file_path}: {e}") # === ENHANCED EXTRACTION POINT === # Initialize variables that will be set by extraction content_html = None content_text = None chapter_title = None enhanced_extraction_used = False # Determine whether to use enhanced extractor based on toggle and provider use_enhanced = enhanced_extractor and extraction_mode == "enhanced" force_bs_traditional = False try: force_bs = os.getenv('FORCE_BS_FOR_TRADITIONAL', '0') == '1' model_env = os.getenv('MODEL', '') if force_bs and is_traditional_translation_api(model_env): use_enhanced = False force_bs_traditional = True except Exception: pass # Use enhanced extractor if available and allowed if use_enhanced: print(f"🚀 Using enhanced extraction for: {os.path.basename(file_path)}") # Get clean text from html2text clean_content, _, chapter_title = enhanced_extractor.extract_chapter_content( html_content, enhanced_filtering ) enhanced_extraction_used = True print(f"✅ Enhanced extraction complete: {len(clean_content)} chars") # For enhanced mode, store the markdown/plain text # This will be sent to the translation API as-is content_html = clean_content # This is MARKDOWN/PLAIN TEXT from html2text content_text = clean_content # Same clean text for analysis # BeautifulSoup method (only for non-enhanced modes) if not enhanced_extraction_used: if extraction_mode == "enhanced" and not force_bs_traditional: # Enhanced mode failed - skip this file print(f"❌ Skipping {file_path} - enhanced extraction required but not available") return None # Parse the (possibly merged) content protected_html = self.protect_angle_brackets_with_korean(html_content) # Use lxml parser which handles both HTML and XHTML well soup = BeautifulSoup(protected_html, self.parser) # Get effective mode for filtering effective_filtering = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode # In full mode, keep the entire HTML structure if effective_filtering == "full": content_html = html_content # Keep EVERYTHING content_text = soup.get_text(strip=True) else: # Smart and comprehensive modes extract body content if soup.body: content_html = str(soup.body) content_text = soup.body.get_text(strip=True) else: content_html = html_content content_text = soup.get_text(strip=True) # Extract title (with ignore settings support) chapter_title = None # Check ignore settings for batch translation batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active # Extract from title tag if not ignored if not ignore_title_tag and soup.title and soup.title.string: chapter_title = soup.title.string.strip() # Extract from header tags if not ignored and no title found if not chapter_title and not ignore_header_tags: for header_tag in ['h1', 'h2', 'h3']: header = soup.find(header_tag) if header: chapter_title = header.get_text(strip=True) break # Fallback to filename if nothing found if not chapter_title: chapter_title = os.path.splitext(os.path.basename(file_path))[0] # Get the effective extraction mode for processing logic effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode # Skip truly empty files in smart mode # BUT: Never skip anything when merging is disabled (to ensure section files are processed) if effective_mode == "smart" and not disable_merging and len(content_text.strip()) < 10: print(f"[SKIP] Nearly empty file: {file_path} ({len(content_text)} chars)") return None # Get actual chapter number based on original position actual_chapter_num = files_to_process.index(file_path) + 1 # Mode-specific logic if effective_mode == "comprehensive" or effective_mode == "full": # For comprehensive/full mode, use sequential numbering chapter_num = actual_chapter_num if not chapter_title: chapter_title = os.path.splitext(os.path.basename(file_path))[0] detection_method = f"{extraction_mode}_sequential" if extraction_mode == "enhanced" else f"{effective_mode}_sequential" elif effective_mode == "smart": # For smart mode, when merging is disabled, use sequential numbering if disable_merging: chapter_num = actual_chapter_num if not chapter_title: chapter_title = os.path.splitext(os.path.basename(file_path))[0] detection_method = f"{extraction_mode}_sequential_no_merge" if extraction_mode == "enhanced" else "sequential_no_merge" else: # When merging is enabled, try to extract chapter info protected_html = self.protect_angle_brackets_with_korean(html_content) soup = BeautifulSoup(protected_html, self.parser) # Count headers (thread-safe) h1_tags = soup.find_all('h1') h2_tags = soup.find_all('h2') if h1_tags: with h1_count_lock: h1_count += 1 if h2_tags: with h2_count_lock: h2_count += 1 # Try to extract chapter number and title chapter_num, extracted_title, detection_method = self._extract_chapter_info( soup, file_path, content_text, html_content ) # Use extracted title if we don't have one if extracted_title and not chapter_title: chapter_title = extracted_title # For hash-based filenames, chapter_num might be None if chapter_num is None: chapter_num = actual_chapter_num # Use actual chapter count detection_method = f"{extraction_mode}_sequential_fallback" if extraction_mode == "enhanced" else "sequential_fallback" print(f"[DEBUG] No chapter number found in {file_path}, assigning: {chapter_num}") # Filter content_html for ignore settings (before processing) batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active if (ignore_title_tag or ignore_header_tags) and content_html and not enhanced_extraction_used: # Parse the content HTML to remove ignored tags content_soup = BeautifulSoup(content_html, self.parser) # Remove title tags if ignored if ignore_title_tag: for title_tag in content_soup.find_all('title'): title_tag.decompose() # Remove header tags if ignored if ignore_header_tags: for header_tag in content_soup.find_all(['h1', 'h2', 'h3']): header_tag.decompose() # Update content_html with filtered version content_html = str(content_soup) # Process images and metadata (same for all modes) protected_html = self.protect_angle_brackets_with_korean(html_content) soup = BeautifulSoup(protected_html, self.parser) images = soup.find_all('img') has_images = len(images) > 0 is_image_only_chapter = has_images and len(content_text.strip()) < 500 if is_image_only_chapter: print(f"[DEBUG] Image-only chapter detected: {file_path} ({len(images)} images, {len(content_text)} chars)") content_hash = ContentProcessor.get_content_hash(content_html) # Collect file size groups for smart mode (thread-safe) if effective_mode == "smart": file_size = len(content_text) with file_size_groups_lock: if file_size not in file_size_groups: file_size_groups[file_size] = [] file_size_groups[file_size].append(file_path) # Collect sample texts (thread-safe) with sample_texts_lock: if len(sample_texts) < 5: sample_texts.append(content_text[:1000]) # Ensure chapter_num is always an integer if isinstance(chapter_num, float): chapter_num = int(chapter_num) # Create chapter info chapter_info = { "num": chapter_num, # Now guaranteed to have a value "title": chapter_title or f"Chapter {chapter_num}", "body": content_html, "filename": file_path, "original_filename": os.path.basename(file_path), "original_basename": os.path.splitext(os.path.basename(file_path))[0], "content_hash": content_hash, "detection_method": detection_method if detection_method else "pending", "file_size": len(content_text), "has_images": has_images, "image_count": len(images), "is_empty": len(content_text.strip()) == 0, "is_image_only": is_image_only_chapter, "extraction_mode": extraction_mode, "file_index": file_index # Store original file index for sorting } # Add enhanced extraction info if used if enhanced_extraction_used: chapter_info["enhanced_extraction"] = True chapter_info["enhanced_filtering"] = enhanced_filtering chapter_info["preserve_structure"] = preserve_structure # Add merge info if applicable if not disable_merging and file_path in merge_candidates: chapter_info["was_merged"] = True chapter_info["merged_with"] = merge_candidates[file_path] if effective_mode == "smart": chapter_info["language_sample"] = content_text[:500] # Debug for section files if 'section' in chapter_info['original_basename'].lower(): print(f"[DEBUG] Added section file to candidates: {chapter_info['original_basename']} (size: {chapter_info['file_size']})") return chapter_info except Exception as e: print(f"[ERROR] Failed to process {file_path}: {e}") import traceback traceback.print_exc() return None # Process files in parallel or sequentially based on file count print(f"🚀 Processing {len(files_to_process)} HTML files...") # Initial progress if self.progress_callback: self.progress_callback(f"Processing {len(files_to_process)} chapters...") candidate_chapters = [] # For smart mode chapters_direct = [] # For other modes # Decide whether to use parallel processing use_parallel = len(files_to_process) > 10 if use_parallel: # Get worker count from environment variable max_workers = int(os.getenv("EXTRACTION_WORKERS", "2")) print(f"📦 Using parallel processing with {max_workers} workers...") # Process files in parallel with ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit all files for processing future_to_file = { executor.submit(process_single_html_file, file_path, idx): (file_path, idx) for idx, file_path in enumerate(files_to_process) } # Collect results as they complete for future in as_completed(future_to_file): if is_stop_requested(): print("❌ Chapter processing stopped by user") executor.shutdown(wait=False) return [], 'unknown' try: chapter_info = future.result() if chapter_info: effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode # For smart mode when merging is enabled, collect candidates # Otherwise, add directly to chapters if effective_mode == "smart" and not disable_merging: candidate_chapters.append(chapter_info) else: chapters_direct.append(chapter_info) except Exception as e: file_path, idx = future_to_file[future] print(f"[ERROR] Thread error processing {file_path}: {e}") else: print("📦 Using sequential processing (small file count)...") # Process files sequentially for small EPUBs for idx, file_path in enumerate(files_to_process): if is_stop_requested(): print("❌ Chapter processing stopped by user") return [], 'unknown' chapter_info = process_single_html_file(file_path, idx) if chapter_info: effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode # For smart mode when merging is enabled, collect candidates # Otherwise, add directly to chapters if effective_mode == "smart" and not disable_merging: candidate_chapters.append(chapter_info) else: chapters_direct.append(chapter_info) # Final progress update if self.progress_callback: self.progress_callback(f"Chapter processing complete: {len(candidate_chapters) + len(chapters_direct)} chapters") # Sort direct chapters by file index to maintain order chapters_direct.sort(key=lambda x: x["file_index"]) # Post-process smart mode candidates (only when merging is enabled) effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode if effective_mode == "smart" and candidate_chapters and not disable_merging: # Check stop before post-processing if is_stop_requested(): print("❌ Chapter post-processing stopped by user") return chapters, 'unknown' print(f"\n[SMART MODE] Processing {len(candidate_chapters)} candidate files...") # Sort candidates by file index to maintain order candidate_chapters.sort(key=lambda x: x["file_index"]) # Debug: Show what files we have section_files = [c for c in candidate_chapters if 'section' in c['original_basename'].lower()] chapter_files = [c for c in candidate_chapters if 'chapter' in c['original_basename'].lower() and 'section' not in c['original_basename'].lower()] other_files = [c for c in candidate_chapters if c not in section_files and c not in chapter_files] print(f" 📊 File breakdown:") print(f" • Section files: {len(section_files)}") print(f" • Chapter files: {len(chapter_files)}") print(f" • Other files: {len(other_files)}") # Original smart mode logic when merging is enabled # First, separate files with detected chapter numbers from those without numbered_chapters = [] unnumbered_chapters = [] for idx, chapter in enumerate(candidate_chapters): # Yield periodically during categorization (can be disabled for max speed) if idx % 10 == 0 and idx > 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1": time.sleep(0.001) if chapter["num"] is not None: numbered_chapters.append(chapter) else: unnumbered_chapters.append(chapter) print(f" • Files with chapter numbers: {len(numbered_chapters)}") print(f" • Files without chapter numbers: {len(unnumbered_chapters)}") # Check if we have hash-based filenames (no numbered chapters found) if not numbered_chapters and unnumbered_chapters: print(" ⚠️ No chapter numbers found - likely hash-based filenames") print(" → Using file order as chapter sequence") # Sort by file index to maintain order unnumbered_chapters.sort(key=lambda x: x["file_index"]) # Assign sequential numbers for i, chapter in enumerate(unnumbered_chapters, 1): chapter["num"] = i chapter["detection_method"] = f"{extraction_mode}_hash_filename_sequential" if extraction_mode == "enhanced" else "hash_filename_sequential" if not chapter["title"] or chapter["title"] == chapter["original_basename"]: chapter["title"] = f"Chapter {i}" chapters = unnumbered_chapters else: # We have some numbered chapters chapters = numbered_chapters # For unnumbered files, check if they might be duplicates or appendices if unnumbered_chapters: print(f" → Analyzing {len(unnumbered_chapters)} unnumbered files...") # Get the max chapter number max_num = max(c["num"] for c in numbered_chapters) # Check each unnumbered file for chapter in unnumbered_chapters: # Check stop in post-processing loop if is_stop_requested(): print("❌ Chapter post-processing stopped by user") return chapters, 'unknown' # Check if it's very small (might be a separator or note) if chapter["file_size"] < 200: print(f" [SKIP] Very small file: {chapter['filename']} ({chapter['file_size']} chars)") continue # Check if it has similar size to existing chapters (might be duplicate) size = chapter["file_size"] similar_chapters = [c for c in numbered_chapters if abs(c["file_size"] - size) < 50] if similar_chapters: # Might be a duplicate, skip it print(f" [SKIP] Possible duplicate: {chapter['filename']} (similar size to {len(similar_chapters)} chapters)") continue # Otherwise, add as appendix max_num += 1 chapter["num"] = max_num chapter["detection_method"] = f"{extraction_mode}_appendix_sequential" if extraction_mode == "enhanced" else "appendix_sequential" if not chapter["title"] or chapter["title"] == chapter["original_basename"]: chapter["title"] = f"Appendix {max_num}" chapters.append(chapter) print(f" [ADD] Added as chapter {max_num}: {chapter['filename']}") else: # For other modes or smart mode with merging disabled chapters = chapters_direct # Sort chapters by number chapters.sort(key=lambda x: x["num"]) # Ensure chapter numbers are integers # When merging is disabled, all chapters should have integer numbers anyway for chapter in chapters: if isinstance(chapter["num"], float): chapter["num"] = int(chapter["num"]) # Final validation if chapters: print(f"\n✅ Final chapter count: {len(chapters)}") print(f" • Chapter range: {chapters[0]['num']} - {chapters[-1]['num']}") # Enhanced mode summary if extraction_mode == "enhanced": enhanced_count = sum(1 for c in chapters if c.get('enhanced_extraction', False)) print(f" 🚀 Enhanced extraction used: {enhanced_count}/{len(chapters)} chapters") # Check for gaps chapter_nums = [c["num"] for c in chapters] expected_nums = list(range(min(chapter_nums), max(chapter_nums) + 1)) missing = set(expected_nums) - set(chapter_nums) if missing: print(f" ⚠️ Missing chapter numbers: {sorted(missing)}") # Language detection combined_sample = ' '.join(sample_texts) if effective_mode == "smart" else '' detected_language = self._detect_content_language(combined_sample) if combined_sample else 'unknown' if chapters: self._print_extraction_summary(chapters, detected_language, extraction_mode, h1_count if effective_mode == "smart" else 0, h2_count if effective_mode == "smart" else 0, file_size_groups if effective_mode == "smart" else {}) return chapters, detected_language def _extract_chapter_info(self, soup, file_path, content_text, html_content): """Extract chapter number and title from various sources with parallel pattern matching""" chapter_num = None chapter_title = None detection_method = None # SPECIAL HANDLING: When we have Section/Chapter pairs, differentiate them filename = os.path.basename(file_path) # Handle different naming patterns for Section/Chapter files if ('section' in filename.lower() or '_section' in filename.lower()) and 'chapter' not in filename.lower(): # For Section files, add 0.1 to the base number # Try different patterns match = re.search(r'No(\d+)', filename) if not match: match = re.search(r'^(\d+)[_\-]', filename) if not match: match = re.search(r'^(\d+)', filename) if match: base_num = int(match.group(1)) chapter_num = base_num + 0.1 # Section gets .1 detection_method = "filename_section_special" elif ('chapter' in filename.lower() or '_chapter' in filename.lower()) and 'section' not in filename.lower(): # For Chapter files, use the base number # Try different patterns match = re.search(r'No(\d+)', filename) if not match: match = re.search(r'^(\d+)[_\-]', filename) if not match: match = re.search(r'^(\d+)', filename) if match: chapter_num = int(match.group(1)) detection_method = "filename_chapter_special" # If not handled by special logic, continue with normal extraction if not chapter_num: # Try filename first - use parallel pattern matching for better performance chapter_patterns = [(pattern, flags, method) for pattern, flags, method in self.pattern_manager.CHAPTER_PATTERNS if method.endswith('_number')] if len(chapter_patterns) > 3: # Only parallelize if we have enough patterns # Parallel pattern matching for filename with ThreadPoolExecutor(max_workers=min(4, len(chapter_patterns))) as executor: def try_pattern(pattern_info): pattern, flags, method = pattern_info match = re.search(pattern, file_path, flags) if match: try: num_str = match.group(1) if num_str.isdigit(): return int(num_str), f"filename_{method}" elif method == 'chinese_chapter_cn': converted = self._convert_chinese_number(num_str) if converted: return converted, f"filename_{method}" except (ValueError, IndexError): pass return None, None # Submit all patterns futures = [executor.submit(try_pattern, pattern_info) for pattern_info in chapter_patterns] # Check results as they complete for future in as_completed(futures): try: num, method = future.result() if num: chapter_num = num detection_method = method # Cancel remaining futures for f in futures: f.cancel() break except Exception: continue else: # Sequential processing for small pattern sets for pattern, flags, method in chapter_patterns: match = re.search(pattern, file_path, flags) if match: try: num_str = match.group(1) if num_str.isdigit(): chapter_num = int(num_str) detection_method = f"filename_{method}" break elif method == 'chinese_chapter_cn': converted = self._convert_chinese_number(num_str) if converted: chapter_num = converted detection_method = f"filename_{method}" break except (ValueError, IndexError): continue # Try content if not found in filename if not chapter_num: # Check ignore settings for batch translation batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active # Prepare all text sources to check in parallel text_sources = [] # Add title tag if not ignored if not ignore_title_tag and soup.title and soup.title.string: title_text = soup.title.string.strip() text_sources.append(("title", title_text, True)) # True means this can be chapter_title # Add headers if not ignored if not ignore_header_tags: for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: headers = soup.find_all(header_tag) for header in headers[:3]: # Limit to first 3 of each type header_text = header.get_text(strip=True) if header_text: text_sources.append((f"header_{header_tag}", header_text, True)) # Add first paragraphs first_elements = soup.find_all(['p', 'div'])[:5] for elem in first_elements: elem_text = elem.get_text(strip=True) if elem_text: text_sources.append(("content", elem_text, False)) # False means don't use as chapter_title # Process text sources in parallel if we have many if len(text_sources) > 5: with ThreadPoolExecutor(max_workers=min(6, len(text_sources))) as executor: def extract_from_source(source_info): source_type, text, can_be_title = source_info num, method = self._extract_from_text(text, source_type) return num, method, text if (num and can_be_title) else None # Submit all text sources future_to_source = {executor.submit(extract_from_source, source): source for source in text_sources} # Process results as they complete for future in as_completed(future_to_source): try: num, method, title = future.result() if num: chapter_num = num detection_method = method if title and not chapter_title: chapter_title = title # Cancel remaining futures for f in future_to_source: f.cancel() break except Exception: continue else: # Sequential processing for small text sets for source_type, text, can_be_title in text_sources: num, method = self._extract_from_text(text, source_type) if num: chapter_num = num detection_method = method if can_be_title and not chapter_title: chapter_title = text break # Final fallback to filename patterns if not chapter_num: filename_base = os.path.basename(file_path) # Parallel pattern matching for filename extraction if len(self.pattern_manager.FILENAME_EXTRACT_PATTERNS) > 3: with ThreadPoolExecutor(max_workers=min(4, len(self.pattern_manager.FILENAME_EXTRACT_PATTERNS))) as executor: def try_filename_pattern(pattern): match = re.search(pattern, filename_base, re.IGNORECASE) if match: try: return int(match.group(1)) except (ValueError, IndexError): pass return None futures = [executor.submit(try_filename_pattern, pattern) for pattern in self.pattern_manager.FILENAME_EXTRACT_PATTERNS] for future in as_completed(futures): try: num = future.result() if num: chapter_num = num detection_method = "filename_number" for f in futures: f.cancel() break except Exception: continue else: # Sequential for small pattern sets for pattern in self.pattern_manager.FILENAME_EXTRACT_PATTERNS: match = re.search(pattern, filename_base, re.IGNORECASE) if match: chapter_num = int(match.group(1)) detection_method = "filename_number" break # Extract title if not already found (with ignore settings support) if not chapter_title: # Check ignore settings for batch translation batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active # Try title tag if not ignored if not ignore_title_tag and soup.title and soup.title.string: chapter_title = soup.title.string.strip() # Try header tags if not ignored and no title found if not chapter_title and not ignore_header_tags: for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: header = soup.find(header_tag) if header: chapter_title = header.get_text(strip=True) break # Final fallback if not chapter_title: chapter_title = f"Chapter {chapter_num}" if chapter_num else None chapter_title = re.sub(r'\s+', ' ', chapter_title).strip() if chapter_title else None return chapter_num, chapter_title, detection_method def _extract_from_text(self, text, source_type): """Extract chapter number from text using patterns with parallel matching for large pattern sets""" # Get patterns that don't end with '_number' text_patterns = [(pattern, flags, method) for pattern, flags, method in self.pattern_manager.CHAPTER_PATTERNS if not method.endswith('_number')] # Only use parallel processing if we have many patterns if len(text_patterns) > 5: with ThreadPoolExecutor(max_workers=min(4, len(text_patterns))) as executor: def try_text_pattern(pattern_info): pattern, flags, method = pattern_info match = re.search(pattern, text, flags) if match: try: num_str = match.group(1) if num_str.isdigit(): return int(num_str), f"{source_type}_{method}" elif method == 'chinese_chapter_cn': converted = self._convert_chinese_number(num_str) if converted: return converted, f"{source_type}_{method}" except (ValueError, IndexError): pass return None, None # Submit all patterns futures = [executor.submit(try_text_pattern, pattern_info) for pattern_info in text_patterns] # Check results as they complete for future in as_completed(futures): try: num, method = future.result() if num: # Cancel remaining futures for f in futures: f.cancel() return num, method except Exception: continue else: # Sequential processing for small pattern sets for pattern, flags, method in text_patterns: match = re.search(pattern, text, flags) if match: try: num_str = match.group(1) if num_str.isdigit(): return int(num_str), f"{source_type}_{method}" elif method == 'chinese_chapter_cn': converted = self._convert_chinese_number(num_str) if converted: return converted, f"{source_type}_{method}" except (ValueError, IndexError): continue return None, None def _convert_chinese_number(self, cn_num): """Convert Chinese number to integer""" if cn_num in self.pattern_manager.CHINESE_NUMS: return self.pattern_manager.CHINESE_NUMS[cn_num] if '十' in cn_num: parts = cn_num.split('十') if len(parts) == 2: tens = self.pattern_manager.CHINESE_NUMS.get(parts[0], 1) if parts[0] else 1 ones = self.pattern_manager.CHINESE_NUMS.get(parts[1], 0) if parts[1] else 0 return tens * 10 + ones return None def _detect_content_language(self, text_sample): """Detect the primary language of content with parallel processing for large texts""" # For very short texts, use sequential processing if len(text_sample) < 1000: scripts = { 'korean': 0, 'japanese_hiragana': 0, 'japanese_katakana': 0, 'chinese': 0, 'latin': 0 } for char in text_sample: code = ord(char) if 0xAC00 <= code <= 0xD7AF: scripts['korean'] += 1 elif 0x3040 <= code <= 0x309F: scripts['japanese_hiragana'] += 1 elif 0x30A0 <= code <= 0x30FF: scripts['japanese_katakana'] += 1 elif 0x4E00 <= code <= 0x9FFF: scripts['chinese'] += 1 elif 0x0020 <= code <= 0x007F: scripts['latin'] += 1 else: # For longer texts, use parallel processing # Split text into chunks for parallel processing chunk_size = max(500, len(text_sample) // (os.cpu_count() or 4)) chunks = [text_sample[i:i + chunk_size] for i in range(0, len(text_sample), chunk_size)] # Thread-safe accumulator scripts_lock = threading.Lock() scripts = { 'korean': 0, 'japanese_hiragana': 0, 'japanese_katakana': 0, 'chinese': 0, 'latin': 0 } def process_chunk(text_chunk): """Process a chunk of text and return script counts""" local_scripts = { 'korean': 0, 'japanese_hiragana': 0, 'japanese_katakana': 0, 'chinese': 0, 'latin': 0 } for char in text_chunk: code = ord(char) if 0xAC00 <= code <= 0xD7AF: local_scripts['korean'] += 1 elif 0x3040 <= code <= 0x309F: local_scripts['japanese_hiragana'] += 1 elif 0x30A0 <= code <= 0x30FF: local_scripts['japanese_katakana'] += 1 elif 0x4E00 <= code <= 0x9FFF: local_scripts['chinese'] += 1 elif 0x0020 <= code <= 0x007F: local_scripts['latin'] += 1 return local_scripts # Process chunks in parallel with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 4, len(chunks))) as executor: # Submit all chunks futures = [executor.submit(process_chunk, chunk) for chunk in chunks] # Collect results for future in as_completed(futures): try: chunk_scripts = future.result() # Thread-safe accumulation with scripts_lock: for script, count in chunk_scripts.items(): scripts[script] += count except Exception as e: print(f"[WARNING] Error processing chunk in language detection: {e}") # Language determination logic (same as original) total_cjk = scripts['korean'] + scripts['japanese_hiragana'] + scripts['japanese_katakana'] + scripts['chinese'] if scripts['korean'] > total_cjk * 0.3: return 'korean' elif scripts['japanese_hiragana'] + scripts['japanese_katakana'] > total_cjk * 0.2: return 'japanese' elif scripts['chinese'] > total_cjk * 0.3: return 'chinese' elif scripts['latin'] > len(text_sample) * 0.7: return 'english' else: return 'unknown' def _print_extraction_summary(self, chapters, detected_language, extraction_mode, h1_count, h2_count, file_size_groups): """Print extraction summary""" print(f"\n📊 Chapter Extraction Summary ({extraction_mode.capitalize()} Mode):") print(f" • Total chapters extracted: {len(chapters)}") # Format chapter range handling both int and float first_num = chapters[0]['num'] last_num = chapters[-1]['num'] print(f" • Chapter range: {first_num} to {last_num}") print(f" • Detected language: {detected_language}") if extraction_mode == "smart": print(f" • Primary header type: {'

' if h2_count > h1_count else '

'}") image_only_count = sum(1 for c in chapters if c.get('is_image_only', False)) text_only_count = sum(1 for c in chapters if not c.get('has_images', False) and c.get('file_size', 0) >= 500) mixed_count = sum(1 for c in chapters if c.get('has_images', False) and c.get('file_size', 0) >= 500) empty_count = sum(1 for c in chapters if c.get('file_size', 0) < 50) print(f" • Text-only chapters: {text_only_count}") print(f" • Image-only chapters: {image_only_count}") print(f" • Mixed content chapters: {mixed_count}") print(f" • Empty/minimal content: {empty_count}") # Check for merged chapters merged_count = sum(1 for c in chapters if c.get('was_merged', False)) if merged_count > 0: print(f" • Merged chapters: {merged_count}") # Check for missing chapters (only for integer sequences) expected_chapters = set(range(chapters[0]['num'], chapters[-1]['num'] + 1)) actual_chapters = set(c['num'] for c in chapters) missing = expected_chapters - actual_chapters if missing: print(f" ⚠️ Missing chapter numbers: {sorted(missing)}") if extraction_mode == "smart": method_stats = Counter(c['detection_method'] for c in chapters) print(f" 📈 Detection methods used:") for method, count in method_stats.most_common(): print(f" • {method}: {count} chapters") large_groups = [size for size, files in file_size_groups.items() if len(files) > 1] if large_groups: print(f" ⚠️ Found {len(large_groups)} file size groups with potential duplicates") else: print(f" • Empty/placeholder: {empty_count}") if extraction_mode == "full": print(f" 🔍 Full extraction preserved all HTML structure and tags") def _extract_epub_metadata(self, zf): """Extract comprehensive metadata from EPUB file including all custom fields""" meta = {} # Use lxml for XML if available xml_parser = 'lxml-xml' if self.parser == 'lxml' else 'xml' try: for name in zf.namelist(): if name.lower().endswith('.opf'): opf_content = zf.read(name) soup = BeautifulSoup(opf_content, xml_parser) # Extract ALL Dublin Core elements (expanded list) dc_elements = ['title', 'creator', 'subject', 'description', 'publisher', 'contributor', 'date', 'type', 'format', 'identifier', 'source', 'language', 'relation', 'coverage', 'rights'] for element in dc_elements: tag = soup.find(element) if tag and tag.get_text(strip=True): meta[element] = tag.get_text(strip=True) # Extract ALL meta tags (not just series) meta_tags = soup.find_all('meta') for meta_tag in meta_tags: # Try different attribute names for the metadata name name = meta_tag.get('name') or meta_tag.get('property', '') content = meta_tag.get('content', '') if name and content: # Store original name for debugging original_name = name # Clean up common prefixes if name.startswith('calibre:'): name = name[8:] # Remove 'calibre:' prefix elif name.startswith('dc:'): name = name[3:] # Remove 'dc:' prefix elif name.startswith('opf:'): name = name[4:] # Remove 'opf:' prefix # Normalize the field name - replace hyphens with underscores name = name.replace('-', '_') # Don't overwrite if already exists (prefer direct tags over meta tags) if name not in meta: meta[name] = content # Debug output for custom fields if original_name != name: print(f" • Found custom field: {original_name} → {name}") # Special handling for series information (maintain compatibility) if 'series' not in meta: series_tags = soup.find_all('meta', attrs={'name': lambda x: x and 'series' in x.lower()}) for series_tag in series_tags: series_name = series_tag.get('content', '') if series_name: meta['series'] = series_name break # Extract refines metadata (used by some EPUB creators) refines_metas = soup.find_all('meta', attrs={'refines': True}) for refine in refines_metas: property_name = refine.get('property', '') content = refine.get_text(strip=True) or refine.get('content', '') if property_name and content: # Clean property name if ':' in property_name: property_name = property_name.split(':')[-1] property_name = property_name.replace('-', '_') if property_name not in meta: meta[property_name] = content # Log extraction summary print(f"📋 Extracted {len(meta)} metadata fields") # Show standard vs custom fields standard_keys = {'title', 'creator', 'language', 'subject', 'description', 'publisher', 'date', 'identifier', 'source', 'rights', 'contributor', 'type', 'format', 'relation', 'coverage'} custom_keys = set(meta.keys()) - standard_keys if custom_keys: print(f"📋 Standard fields: {len(standard_keys & set(meta.keys()))}") print(f"📋 Custom fields found: {sorted(custom_keys)}") # Show sample values for custom fields (truncated) for key in sorted(custom_keys)[:5]: # Show first 5 custom fields value = str(meta[key]) if len(value) > 50: value = value[:47] + "..." print(f" • {key}: {value}") if len(custom_keys) > 5: print(f" • ... and {len(custom_keys) - 5} more custom fields") break except Exception as e: print(f"[WARNING] Failed to extract metadata: {e}") import traceback traceback.print_exc() return meta def _categorize_resource(self, file_path, file_name): """Categorize a file and return (resource_type, target_dir, safe_filename)""" file_path_lower = file_path.lower() file_name_lower = file_name.lower() if file_path_lower.endswith('.css'): return 'css', 'css', sanitize_resource_filename(file_name) elif file_path_lower.endswith(('.ttf', '.otf', '.woff', '.woff2', '.eot')): return 'fonts', 'fonts', sanitize_resource_filename(file_name) elif file_path_lower.endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg', '.bmp', '.webp')): return 'images', 'images', sanitize_resource_filename(file_name) elif (file_path_lower.endswith(('.opf', '.ncx')) or file_name_lower == 'container.xml' or 'container.xml' in file_path_lower): if 'container.xml' in file_path_lower: safe_filename = 'container.xml' else: safe_filename = file_name return 'epub_structure', None, safe_filename elif file_path_lower.endswith(('.js', '.xml', '.txt')): return 'other', None, sanitize_resource_filename(file_name) return None def _cleanup_old_resources(self, output_dir): """Clean up old resource directories and EPUB structure files""" print("🧹 Cleaning up any existing resource directories...") cleanup_success = True for resource_type in ['css', 'fonts', 'images']: resource_dir = os.path.join(output_dir, resource_type) if os.path.exists(resource_dir): try: shutil.rmtree(resource_dir) print(f" 🗑️ Removed old {resource_type} directory") except PermissionError as e: print(f" ⚠️ Cannot remove {resource_type} directory (permission denied) - will merge with existing files") cleanup_success = False except Exception as e: print(f" ⚠️ Error removing {resource_type} directory: {e} - will merge with existing files") cleanup_success = False epub_structure_files = ['container.xml', 'content.opf', 'toc.ncx'] for epub_file in epub_structure_files: input_path = os.path.join(output_dir, epub_file) if os.path.exists(input_path): try: os.remove(input_path) print(f" 🗑️ Removed old {epub_file}") except PermissionError: print(f" ⚠️ Cannot remove {epub_file} (permission denied) - will use existing file") except Exception as e: print(f" ⚠️ Error removing {epub_file}: {e}") try: for file in os.listdir(output_dir): if file.lower().endswith(('.opf', '.ncx')): file_path = os.path.join(output_dir, file) try: os.remove(file_path) print(f" 🗑️ Removed old EPUB file: {file}") except PermissionError: print(f" ⚠️ Cannot remove {file} (permission denied)") except Exception as e: print(f" ⚠️ Error removing {file}: {e}") except Exception as e: print(f"⚠️ Error scanning for EPUB files: {e}") if not cleanup_success: print("⚠️ Some cleanup operations failed due to file permissions") print(" The program will continue and merge with existing files") return cleanup_success def _count_existing_resources(self, output_dir, extracted_resources): """Count existing resources when skipping extraction""" for resource_type in ['css', 'fonts', 'images', 'epub_structure']: if resource_type == 'epub_structure': epub_files = [] for file in ['container.xml', 'content.opf', 'toc.ncx']: if os.path.exists(os.path.join(output_dir, file)): epub_files.append(file) try: for file in os.listdir(output_dir): if file.lower().endswith(('.opf', '.ncx')) and file not in epub_files: epub_files.append(file) except: pass extracted_resources[resource_type] = epub_files else: resource_dir = os.path.join(output_dir, resource_type) if os.path.exists(resource_dir): try: files = [f for f in os.listdir(resource_dir) if os.path.isfile(os.path.join(resource_dir, f))] extracted_resources[resource_type] = files except: extracted_resources[resource_type] = [] total_existing = sum(len(files) for files in extracted_resources.values()) print(f"✅ Found {total_existing} existing resource files") return extracted_resources def _validate_critical_files(self, output_dir, extracted_resources): """Validate that critical EPUB files were extracted""" total_extracted = sum(len(files) for files in extracted_resources.values()) print(f"✅ Extracted {total_extracted} resource files:") for resource_type, files in extracted_resources.items(): if files: if resource_type == 'epub_structure': print(f" • EPUB Structure: {len(files)} files") for file in files: print(f" - {file}") else: print(f" • {resource_type.title()}: {len(files)} files") critical_files = ['container.xml'] missing_critical = [f for f in critical_files if not os.path.exists(os.path.join(output_dir, f))] if missing_critical: print(f"⚠️ WARNING: Missing critical EPUB files: {missing_critical}") print(" This may prevent proper EPUB reconstruction!") else: print("✅ All critical EPUB structure files extracted successfully") opf_files = [f for f in extracted_resources['epub_structure'] if f.lower().endswith('.opf')] if not opf_files: print("⚠️ WARNING: No OPF file found! This will prevent EPUB reconstruction.") else: print(f"✅ Found OPF file(s): {opf_files}") def _create_extraction_report(self, output_dir, metadata, chapters, extracted_resources): """Create comprehensive extraction report with HTML file tracking""" report_path = os.path.join(output_dir, 'extraction_report.txt') with open(report_path, 'w', encoding='utf-8') as f: f.write("EPUB Extraction Report\n") f.write("=" * 50 + "\n\n") f.write(f"EXTRACTION MODE: {metadata.get('extraction_mode', 'unknown').upper()}\n\n") f.write("METADATA:\n") for key, value in metadata.items(): if key not in ['chapter_titles', 'extracted_resources', 'extraction_mode']: f.write(f" {key}: {value}\n") f.write(f"\nCHAPTERS ({len(chapters)}):\n") text_chapters = [] image_only_chapters = [] mixed_chapters = [] for chapter in chapters: if chapter.get('has_images') and chapter.get('file_size', 0) < 500: image_only_chapters.append(chapter) elif chapter.get('has_images') and chapter.get('file_size', 0) >= 500: mixed_chapters.append(chapter) else: text_chapters.append(chapter) if text_chapters: f.write(f"\n TEXT CHAPTERS ({len(text_chapters)}):\n") for c in text_chapters: f.write(f" {c['num']:3d}. {c['title']} ({c['detection_method']})\n") if c.get('original_html_file'): f.write(f" → {c['original_html_file']}\n") if image_only_chapters: f.write(f"\n IMAGE-ONLY CHAPTERS ({len(image_only_chapters)}):\n") for c in image_only_chapters: f.write(f" {c['num']:3d}. {c['title']} (images: {c.get('image_count', 0)})\n") if c.get('original_html_file'): f.write(f" → {c['original_html_file']}\n") if 'body' in c: try: soup = BeautifulSoup(c['body'], 'html.parser') images = soup.find_all('img') for img in images[:3]: src = img.get('src', 'unknown') f.write(f" • Image: {src}\n") if len(images) > 3: f.write(f" • ... and {len(images) - 3} more images\n") except: pass if mixed_chapters: f.write(f"\n MIXED CONTENT CHAPTERS ({len(mixed_chapters)}):\n") for c in mixed_chapters: f.write(f" {c['num']:3d}. {c['title']} (text: {c.get('file_size', 0)} chars, images: {c.get('image_count', 0)})\n") if c.get('original_html_file'): f.write(f" → {c['original_html_file']}\n") f.write(f"\nRESOURCES EXTRACTED:\n") for resource_type, files in extracted_resources.items(): if files: if resource_type == 'epub_structure': f.write(f" EPUB Structure: {len(files)} files\n") for file in files: f.write(f" - {file}\n") else: f.write(f" {resource_type.title()}: {len(files)} files\n") for file in files[:5]: f.write(f" - {file}\n") if len(files) > 5: f.write(f" ... and {len(files) - 5} more\n") f.write(f"\nHTML FILES WRITTEN:\n") html_files_written = metadata.get('html_files_written', 0) f.write(f" Total: {html_files_written} files\n") f.write(f" Location: Main directory and 'originals' subdirectory\n") f.write(f"\nPOTENTIAL ISSUES:\n") issues = [] if image_only_chapters: issues.append(f" • {len(image_only_chapters)} chapters contain only images (may need OCR)") missing_html = sum(1 for c in chapters if not c.get('original_html_file')) if missing_html > 0: issues.append(f" • {missing_html} chapters failed to write HTML files") if not extracted_resources.get('epub_structure'): issues.append(" • No EPUB structure files found (may affect reconstruction)") if not issues: f.write(" None detected - extraction appears successful!\n") else: for issue in issues: f.write(issue + "\n") print(f"📄 Saved extraction report to: {report_path}") def _log_extraction_summary(self, chapters, extracted_resources, detected_language, html_files_written=0): """Log final extraction summary with HTML file information""" extraction_mode = chapters[0].get('extraction_mode', 'unknown') if chapters else 'unknown' print(f"\n✅ {extraction_mode.capitalize()} extraction complete!") print(f" 📚 Chapters: {len(chapters)}") print(f" 📄 HTML files written: {html_files_written}") print(f" 🎨 Resources: {sum(len(files) for files in extracted_resources.values())}") print(f" 🌍 Language: {detected_language}") image_only_count = sum(1 for c in chapters if c.get('has_images') and c.get('file_size', 0) < 500) if image_only_count > 0: print(f" 📸 Image-only chapters: {image_only_count}") epub_files = extracted_resources.get('epub_structure', []) if epub_files: print(f" 📋 EPUB Structure: {len(epub_files)} files ({', '.join(epub_files)})") else: print(f" ⚠️ No EPUB structure files extracted!") print(f"\n🔍 Pre-flight check readiness:") print(f" ✅ HTML files: {'READY' if html_files_written > 0 else 'NOT READY'}") print(f" ✅ Metadata: READY") print(f" ✅ Resources: READY") # ===================================================== # UNIFIED TRANSLATION PROCESSOR # ===================================================== class TranslationProcessor: """Handles the translation of individual chapters""" def __init__(self, config, client, out_dir, log_callback=None, stop_callback=None, uses_zero_based=False, is_text_file=False): self.config = config self.client = client self.out_dir = out_dir self.log_callback = log_callback self.stop_callback = stop_callback self.chapter_splitter = ChapterSplitter(model_name=config.MODEL) self.uses_zero_based = uses_zero_based self.is_text_file = is_text_file # Check and log multi-key status if hasattr(self.client, 'use_multi_keys') and self.client.use_multi_keys: stats = self.client.get_stats() self._log(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys") self._log(f" Active keys: {stats.get('active_keys', 0)}") def _log(self, message): """Log a message""" if self.log_callback: self.log_callback(message) else: print(message) def report_key_status(self): """Report multi-key status if available""" if hasattr(self.client, 'get_stats'): stats = self.client.get_stats() if stats.get('multi_key_mode', False): self._log(f"\n📊 API Key Status:") self._log(f" Active Keys: {stats.get('active_keys', 0)}/{stats.get('total_keys', 0)}") self._log(f" Success Rate: {stats.get('success_rate', 0):.1%}") self._log(f" Total Requests: {stats.get('total_requests', 0)}\n") def check_stop(self): """Check if translation should stop""" if self.stop_callback and self.stop_callback(): print("❌ Translation stopped by user request.") return True def check_duplicate_content(self, result, idx, prog, out, actual_num=None): """Check if translated content is duplicate - with mode selection""" # Get detection mode from config detection_mode = getattr(self.config, 'DUPLICATE_DETECTION_MODE', 'basic') print(f" 🔍 DEBUG: Detection mode = '{detection_mode}'") print(f" 🔍 DEBUG: Lookback chapters = {self.config.DUPLICATE_LOOKBACK_CHAPTERS}") # Extract content_hash if available from progress content_hash = None if detection_mode == 'ai-hunter': # Try to get content_hash from the current chapter info # Use actual_num if provided, otherwise fallback to idx+1 if actual_num is not None: chapter_key = str(actual_num) else: chapter_key = str(idx + 1) if chapter_key in prog.get("chapters", {}): chapter_info = prog["chapters"][chapter_key] content_hash = chapter_info.get("content_hash") print(f" 🔍 DEBUG: Found content_hash for chapter {idx}: {content_hash}") if detection_mode == 'ai-hunter': print(" 🤖 DEBUG: Routing to AI Hunter detection...") # Check if AI Hunter method is available (injected by the wrapper) if hasattr(self, '_check_duplicate_ai_hunter'): return self._check_duplicate_ai_hunter(result, idx, prog, out, content_hash) else: print(" ⚠️ AI Hunter method not available, falling back to basic detection") return self._check_duplicate_basic(result, idx, prog, out) elif detection_mode == 'cascading': print(" 🔄 DEBUG: Routing to Cascading detection...") return self._check_duplicate_cascading(result, idx, prog, out) else: print(" 📋 DEBUG: Routing to Basic detection...") return self._check_duplicate_basic(result, idx, prog, out) def _check_duplicate_basic(self, result, idx, prog, out): """Original basic duplicate detection""" try: result_clean = re.sub(r'<[^>]+>', '', result).strip().lower() result_sample = result_clean[:1000] lookback_chapters = self.config.DUPLICATE_LOOKBACK_CHAPTERS for prev_idx in range(max(0, idx - lookback_chapters), idx): prev_key = str(prev_idx) if prev_key in prog["chapters"] and prog["chapters"][prev_key].get("output_file"): prev_file = prog["chapters"][prev_key]["output_file"] prev_path = os.path.join(out, prev_file) if os.path.exists(prev_path): try: with open(prev_path, 'r', encoding='utf-8') as f: prev_content = f.read() prev_clean = re.sub(r'<[^>]+>', '', prev_content).strip().lower() prev_sample = prev_clean[:1000] # Use SequenceMatcher for similarity comparison similarity = SequenceMatcher(None, result_sample, prev_sample).ratio() if similarity >= 0.85: # 85% threshold print(f" 🚀 Basic detection: Duplicate found ({int(similarity*100)}%)") return True, int(similarity * 100) except Exception as e: print(f" Warning: Failed to read {prev_path}: {e}") continue return False, 0 except Exception as e: print(f" Warning: Failed to check duplicate content: {e}") return False, 0 def _check_duplicate_cascading(self, result, idx, prog, out): """Cascading detection - basic first, then AI Hunter for borderline cases""" # Step 1: Basic is_duplicate_basic, similarity_basic = self._check_duplicate_basic(result, idx, prog, out) if is_duplicate_basic: return True, similarity_basic # Step 2: If basic detection finds moderate similarity, use AI Hunter if similarity_basic >= 60: # Configurable threshold print(f" 🤖 Moderate similarity ({similarity_basic}%) - running AI Hunter analysis...") if hasattr(self, '_check_duplicate_ai_hunter'): is_duplicate_ai, similarity_ai = self._check_duplicate_ai_hunter(result, idx, prog, out) if is_duplicate_ai: return True, similarity_ai else: print(" ⚠️ AI Hunter method not available for cascading analysis") return False, max(similarity_basic, 0) def _extract_text_features(self, text): """Extract multiple features from text for AI Hunter analysis""" features = { 'semantic': {}, 'structural': {}, 'characters': [], 'patterns': {} } # Semantic fingerprint lines = text.split('\n') # Character extraction (names that appear 3+ times) words = re.findall(r'\b[A-Z][a-z]+\b', text) word_freq = Counter(words) features['characters'] = [name for name, count in word_freq.items() if count >= 3] # Dialogue patterns dialogue_patterns = re.findall(r'"([^"]+)"', text) features['semantic']['dialogue_count'] = len(dialogue_patterns) features['semantic']['dialogue_lengths'] = [len(d) for d in dialogue_patterns[:10]] # Speaker patterns speaker_patterns = re.findall(r'(\w+)\s+(?:said|asked|replied|shouted|whispered)', text.lower()) features['semantic']['speakers'] = list(set(speaker_patterns[:20])) # Number extraction numbers = re.findall(r'\b\d+\b', text) features['patterns']['numbers'] = numbers[:20] # Structural signature para_lengths = [] dialogue_count = 0 for para in text.split('\n\n'): if para.strip(): para_lengths.append(len(para)) if '"' in para: dialogue_count += 1 features['structural']['para_count'] = len(para_lengths) features['structural']['avg_para_length'] = sum(para_lengths) / max(1, len(para_lengths)) features['structural']['dialogue_ratio'] = dialogue_count / max(1, len(para_lengths)) # Create structural pattern string pattern = [] for para in text.split('\n\n')[:20]: # First 20 paragraphs if para.strip(): if '"' in para: pattern.append('D') # Dialogue elif len(para) > 300: pattern.append('L') # Long elif len(para) < 100: pattern.append('S') # Short else: pattern.append('M') # Medium features['structural']['pattern'] = ''.join(pattern) return features def _calculate_exact_similarity(self, text1, text2): """Calculate exact text similarity""" return SequenceMatcher(None, text1.lower(), text2.lower()).ratio() def _calculate_smart_similarity(self, text1, text2): """Smart similarity with length-aware sampling""" # Check length ratio first len_ratio = len(text1) / max(1, len(text2)) if len_ratio < 0.7 or len_ratio > 1.3: return 0.0 # Smart sampling for large texts if len(text1) > 10000: sample_size = 3000 samples1 = [ text1[:sample_size], text1[len(text1)//2 - sample_size//2:len(text1)//2 + sample_size//2], text1[-sample_size:] ] samples2 = [ text2[:sample_size], text2[len(text2)//2 - sample_size//2:len(text2)//2 + sample_size//2], text2[-sample_size:] ] similarities = [SequenceMatcher(None, s1.lower(), s2.lower()).ratio() for s1, s2 in zip(samples1, samples2)] return sum(similarities) / len(similarities) else: # Use first 2000 chars for smaller texts return SequenceMatcher(None, text1[:2000].lower(), text2[:2000].lower()).ratio() def _calculate_semantic_similarity(self, sem1, sem2): """Calculate semantic fingerprint similarity""" score = 0.0 max_score = 0.0 # Compare dialogue counts if 'dialogue_count' in sem1 and 'dialogue_count' in sem2: max_score += 1.0 ratio = min(sem1['dialogue_count'], sem2['dialogue_count']) / max(1, max(sem1['dialogue_count'], sem2['dialogue_count'])) score += ratio * 0.3 # Compare speakers if 'speakers' in sem1 and 'speakers' in sem2: max_score += 1.0 if sem1['speakers'] and sem2['speakers']: overlap = len(set(sem1['speakers']) & set(sem2['speakers'])) total = len(set(sem1['speakers']) | set(sem2['speakers'])) score += (overlap / max(1, total)) * 0.4 # Compare dialogue lengths pattern if 'dialogue_lengths' in sem1 and 'dialogue_lengths' in sem2: max_score += 1.0 if sem1['dialogue_lengths'] and sem2['dialogue_lengths']: # Compare dialogue length patterns len1 = sem1['dialogue_lengths'][:10] len2 = sem2['dialogue_lengths'][:10] if len1 and len2: avg1 = sum(len1) / len(len1) avg2 = sum(len2) / len(len2) ratio = min(avg1, avg2) / max(1, max(avg1, avg2)) score += ratio * 0.3 return score / max(1, max_score) def _calculate_structural_similarity(self, struct1, struct2): """Calculate structural signature similarity""" score = 0.0 # Compare paragraph patterns if 'pattern' in struct1 and 'pattern' in struct2: pattern_sim = SequenceMatcher(None, struct1['pattern'], struct2['pattern']).ratio() score += pattern_sim * 0.4 # Compare paragraph statistics if all(k in struct1 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']) and \ all(k in struct2 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']): # Paragraph count ratio para_ratio = min(struct1['para_count'], struct2['para_count']) / max(1, max(struct1['para_count'], struct2['para_count'])) score += para_ratio * 0.2 # Average length ratio avg_ratio = min(struct1['avg_para_length'], struct2['avg_para_length']) / max(1, max(struct1['avg_para_length'], struct2['avg_para_length'])) score += avg_ratio * 0.2 # Dialogue ratio similarity dialogue_diff = abs(struct1['dialogue_ratio'] - struct2['dialogue_ratio']) score += (1 - dialogue_diff) * 0.2 return score def _calculate_character_similarity(self, chars1, chars2): """Calculate character name similarity""" if not chars1 or not chars2: return 0.0 # Find overlapping characters set1 = set(chars1) set2 = set(chars2) overlap = len(set1 & set2) total = len(set1 | set2) return overlap / max(1, total) def _calculate_pattern_similarity(self, pat1, pat2): """Calculate pattern-based similarity""" score = 0.0 # Compare numbers (they rarely change in translations) if 'numbers' in pat1 and 'numbers' in pat2: nums1 = set(pat1['numbers']) nums2 = set(pat2['numbers']) if nums1 and nums2: overlap = len(nums1 & nums2) total = len(nums1 | nums2) score = overlap / max(1, total) return score def generate_rolling_summary(self, history_manager, chapter_num, base_system_content=None, source_text=None): """Generate rolling summary after a chapter for context continuity. Uses a dedicated summary system prompt (with glossary) distinct from translation. Writes the summary to rolling_summary.txt and returns the summary string. """ if not self.config.USE_ROLLING_SUMMARY: return None current_history = history_manager.load_history() messages_to_include = self.config.ROLLING_SUMMARY_EXCHANGES * 2 # Prefer directly provided source text (e.g., just-translated chapter) when available assistant_responses = [] if source_text and isinstance(source_text, str) and source_text.strip(): assistant_responses = [source_text] else: if len(current_history) >= 2: recent_messages = current_history[-messages_to_include:] if messages_to_include > 0 else current_history for h in recent_messages: if h.get("role") == "assistant": assistant_responses.append(h["content"]) # If still empty, skip quietly if not assistant_responses: return None # Build a dedicated summary system prompt (do NOT reuse main translation system prompt) # Append glossary to keep terminology consistent summary_system_template = os.getenv("ROLLING_SUMMARY_SYSTEM_PROMPT", "You create concise summaries for continuity.").strip() try: glossary_path = find_glossary_file(self.out_dir) except Exception: glossary_path = None system_prompt = build_system_prompt(summary_system_template, glossary_path) # Add explicit instruction for clarity system_prompt += "\n\n[Instruction: Generate a concise rolling summary of the previous chapter. Use glossary terms consistently. Do not include warnings or explanations.]" user_prompt_template = os.getenv( "ROLLING_SUMMARY_USER_PROMPT", "Summarize the key events, characters, tone, and important details from these translations. " "Focus on: character names/relationships, plot developments, and any special terminology used.\n\n" "{translations}" ) translations_text = "\n---\n".join(assistant_responses) user_prompt = user_prompt_template.replace("{translations}", translations_text) summary_msgs = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": f"[Rolling Summary of Chapter {chapter_num}]\n" + user_prompt} ] try: summary_resp, _ = send_with_interrupt( summary_msgs, self.client, self.config.TEMP, min(2000, self.config.MAX_OUTPUT_TOKENS), self.check_stop, context='summary' ) # Save the summary to the output folder summary_file = os.path.join(self.out_dir, "rolling_summary.txt") header = f"=== Rolling Summary of Chapter {chapter_num} ===\n(This is a summary of the previous chapter for context)\n" mode = "a" if self.config.ROLLING_SUMMARY_MODE == "append" else "w" with open(summary_file, mode, encoding="utf-8") as sf: if mode == "a": sf.write("\n\n") sf.write(header) sf.write(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}]\n") sf.write(summary_resp.strip()) # If in append mode, trim to retain only the last N entries if configured try: if self.config.ROLLING_SUMMARY_MODE == "append": max_entries = int(getattr(self.config, "ROLLING_SUMMARY_MAX_ENTRIES", 0) or 0) if max_entries > 0: with open(summary_file, 'r', encoding='utf-8') as rf: content = rf.read() # Find the start of each summary block by header line headers = [m.start() for m in re.finditer(r"(?m)^===\s*Rolling Summary.*$", content)] if len(headers) > max_entries: # Keep only the last max_entries blocks keep_starts = headers[-max_entries:] blocks = [] for i, s in enumerate(keep_starts): e = keep_starts[i + 1] if i + 1 < len(keep_starts) else len(content) block = content[s:e].strip() if block: blocks.append(block) trimmed_content = ("\n\n".join(blocks) + "\n") if blocks else "" with open(summary_file, 'w', encoding='utf-8') as wf: wf.write(trimmed_content) # Optional log showing retained count try: self._log(f"📚 Total summaries in memory: {len(blocks)} (trimmed to last {max_entries})") except Exception: pass except Exception as _trim_err: try: self._log(f"⚠️ Failed to trim rolling summaries: {_trim_err}") except Exception: pass # Log to GUI if available, otherwise console try: self._log(f"📝 Generated rolling summary for Chapter {chapter_num} ({'append' if mode=='a' else 'replace'} mode)") self._log(f" ➜ Saved to: {summary_file} ({len(summary_resp.strip())} chars)") except Exception: print(f"📝 Generated rolling summary for Chapter {chapter_num} ({'append' if mode=='a' else 'replace'} mode)") print(f" ➜ Saved to: {summary_file} ({len(summary_resp.strip())} chars)") return summary_resp.strip() except Exception as e: try: self._log(f"⚠️ Failed to generate rolling summary: {e}") except Exception: print(f"⚠️ Failed to generate rolling summary: {e}") return None def translate_with_retry(self, msgs, chunk_html, c, chunk_idx, total_chunks): """Handle translation with retry logic""" # CRITICAL FIX: Reset client state for each chunk if hasattr(self.client, 'reset_cleanup_state'): self.client.reset_cleanup_state() # Also ensure we're not in cleanup mode from previous operations if hasattr(self.client, '_in_cleanup'): self.client._in_cleanup = False if hasattr(self.client, '_cancelled'): self.client._cancelled = False retry_count = 0 # Get retry attempts from AI Hunter config if available ai_config = {} try: # Try to get AI Hunter config from environment variable first ai_hunter_config_str = os.getenv('AI_HUNTER_CONFIG') if ai_hunter_config_str: ai_config = json.loads(ai_hunter_config_str) else: # Fallback to config attribute ai_config = getattr(self.config, 'ai_hunter_config', {}) except (json.JSONDecodeError, AttributeError): ai_config = {} if isinstance(ai_config, dict): max_retries = ai_config.get('retry_attempts', 3) max_duplicate_retries = ai_config.get('retry_attempts', 6) # Use same setting for duplicate retries else: max_retries = 3 max_duplicate_retries = 6 duplicate_retry_count = 0 timeout_retry_count = 0 max_timeout_retries = 2 history_purged = False original_max_tokens = self.config.MAX_OUTPUT_TOKENS original_temp = self.config.TEMP original_user_prompt = msgs[-1]["content"] chunk_timeout = None if self.config.RETRY_TIMEOUT: chunk_timeout = self.config.CHUNK_TIMEOUT result = None finish_reason = None while True: if self.check_stop(): return None, None try: current_max_tokens = self.config.MAX_OUTPUT_TOKENS current_temp = self.config.TEMP total_tokens = sum(self.chapter_splitter.count_tokens(m["content"]) for m in msgs) # Determine file reference if c.get('is_chunk', False): file_ref = f"Section_{c['num']}" else: # Check if this is a text file - need to access from self is_text_source = self.is_text_file or c.get('filename', '').endswith('.txt') terminology = "Section" if is_text_source else "Chapter" file_ref = c.get('original_basename', f'{terminology}_{c["num"]}') print(f"[DEBUG] Chunk {chunk_idx}/{total_chunks} tokens = {total_tokens:,} / {self.get_token_budget_str()} [File: {file_ref}]") self.client.context = 'translation' # Generate filename for chunks if chunk_idx and total_chunks > 1: # This is a chunk - use chunk naming format fname = f"response_{c['num']:03d}_chunk_{chunk_idx}.html" else: # Not a chunk - use regular naming fname = FileUtilities.create_chapter_filename(c, c.get('actual_chapter_num', c['num'])) # Set output filename BEFORE the API call if hasattr(self.client, 'set_output_filename'): self.client.set_output_filename(fname) # Track the filename so truncation logs know which file this is if hasattr(self.client, '_current_output_file'): self.client._current_output_file = fname # Generate unique request ID for this chunk #request_id = f"{c['num']:03d}_chunk{chunk_idx}_{uuid.uuid4().hex[:8]}" result, finish_reason = send_with_interrupt( msgs, self.client, current_temp, current_max_tokens, self.check_stop, chunk_timeout ) # Enhanced mode workflow: # 1. Original HTML -> html2text -> Markdown/plain text (during extraction) # 2. Markdown sent to translation API (better for translation quality) # 3. Translated markdown -> HTML conversion (here) if result and c.get("enhanced_extraction", False): print(f"🔄 Converting translated markdown back to HTML...") result = convert_enhanced_text_to_html(result, c) retry_needed = False retry_reason = "" is_duplicate_retry = False # ENHANCED: Force re-read environment variable for latest setting retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1" # Debug logging to verify the toggle state #print(f" DEBUG: finish_reason='{finish_reason}', RETRY_TRUNCATED={retry_truncated_enabled}, config.RETRY_TRUNCATED={self.config.RETRY_TRUNCATED}") #print(f" DEBUG: Current tokens={self.config.MAX_OUTPUT_TOKENS}, Min retry tokens={self.config.MAX_RETRY_TOKENS}, retry_count={retry_count}") if finish_reason == "length" and (retry_truncated_enabled or self.config.RETRY_TRUNCATED): if retry_count < max_retries: # For truncated responses, ensure we never go below the minimum retry tokens proposed_limit = self.config.MAX_OUTPUT_TOKENS * 2 # Always enforce minimum - never retry with tokens below the constraint new_token_limit = max(proposed_limit, self.config.MAX_RETRY_TOKENS) if new_token_limit != self.config.MAX_OUTPUT_TOKENS: retry_needed = True retry_reason = "truncated output" old_limit = self.config.MAX_OUTPUT_TOKENS self.config.MAX_OUTPUT_TOKENS = new_token_limit retry_count += 1 if old_limit < self.config.MAX_RETRY_TOKENS: print(f" 🔄 TRUNCATION RETRY: Boosting tokens {old_limit} → {new_token_limit} (enforcing minimum: {self.config.MAX_RETRY_TOKENS})") else: print(f" 🔄 TRUNCATION RETRY: Doubling tokens {old_limit} → {new_token_limit} (above minimum: {self.config.MAX_RETRY_TOKENS})") else: print(f" ⚠️ TRUNCATION DETECTED: Token adjustment not needed - already at maximum {self.config.MAX_OUTPUT_TOKENS}") else: print(f" ⚠️ TRUNCATION DETECTED: Max retries ({max_retries}) reached - accepting truncated response") elif finish_reason == "length" and not (retry_truncated_enabled or self.config.RETRY_TRUNCATED): print(f" ⏭️ TRUNCATION DETECTED: Auto-retry is DISABLED - accepting truncated response") elif finish_reason == "length": print(f" ⚠️ TRUNCATION DETECTED: Unexpected condition - check logic") if not retry_needed: # Force re-read the environment variable to ensure we have current setting duplicate_enabled = os.getenv("RETRY_DUPLICATE_BODIES", "0") == "1" if duplicate_enabled and duplicate_retry_count < max_duplicate_retries: idx = c.get('__index', 0) prog = c.get('__progress', {}) print(f" 🔍 Checking for duplicate content...") # Get actual chapter number for duplicate detection actual_num = c.get('actual_chapter_num', c.get('num', idx + 1)) is_duplicate, similarity = self.check_duplicate_content(result, idx, prog, self.out_dir, actual_num) if is_duplicate: retry_needed = True is_duplicate_retry = True retry_reason = f"duplicate content (similarity: {similarity}%)" duplicate_retry_count += 1 # Check if temperature change is disabled disable_temp_change = ai_config.get('disable_temperature_change', False) if isinstance(ai_config, dict) else False if duplicate_retry_count >= 3 and not history_purged: print(f" 🧹 Clearing history after 3 attempts...") if 'history_manager' in c: c['history_manager'].save_history([]) history_purged = True if not disable_temp_change: self.config.TEMP = original_temp else: print(f" 🌡️ Temperature change disabled - keeping current temp: {self.config.TEMP}") elif duplicate_retry_count == 1: if disable_temp_change: print(f" 🔄 First duplicate retry - temperature change disabled") else: print(f" 🔄 First duplicate retry - same temperature") elif history_purged: if not disable_temp_change: attempts_since_purge = duplicate_retry_count - 3 self.config.TEMP = min(original_temp + (0.1 * attempts_since_purge), 1.0) print(f" 🌡️ Post-purge temp: {self.config.TEMP}") else: print(f" 🌡️ Temperature change disabled - keeping temp: {self.config.TEMP}") else: if not disable_temp_change: self.config.TEMP = min(original_temp + (0.1 * (duplicate_retry_count - 1)), 1.0) print(f" 🌡️ Gradual temp increase: {self.config.TEMP}") else: print(f" 🌡️ Temperature change disabled - keeping temp: {self.config.TEMP}") if duplicate_retry_count == 1: user_prompt = f"[RETRY] Chapter {c['num']}: Ensure unique translation.\n{chunk_html}" elif duplicate_retry_count <= 3: user_prompt = f"[ATTEMPT {duplicate_retry_count}] Translate uniquely:\n{chunk_html}" else: user_prompt = f"Chapter {c['num']}:\n{chunk_html}" msgs[-1] = {"role": "user", "content": user_prompt} elif not duplicate_enabled: print(f" ⏭️ Duplicate detection is DISABLED - skipping check") if retry_needed: if is_duplicate_retry: print(f" 🔄 Duplicate retry {duplicate_retry_count}/{max_duplicate_retries}") else: print(f" 🔄 Retry {retry_count}/{max_retries}: {retry_reason}") time.sleep(2) continue break except UnifiedClientError as e: error_msg = str(e) if "stopped by user" in error_msg: print("❌ Translation stopped by user during API call") return None, None if "took" in error_msg and "timeout:" in error_msg: if timeout_retry_count < max_timeout_retries: timeout_retry_count += 1 print(f" ⏱️ Chunk took too long, retry {timeout_retry_count}/{max_timeout_retries}") print(f" 🔄 Retrying") time.sleep(2) continue else: print(f" ❌ Max timeout retries reached") raise UnifiedClientError("Translation failed after timeout retries") elif "timed out" in error_msg and "timeout:" not in error_msg: print(f"⚠️ {error_msg}, retrying...") time.sleep(5) continue elif getattr(e, "error_type", None) == "rate_limit" or getattr(e, "http_status", None) == 429: # Rate limit errors - clean handling without traceback print("⚠️ Rate limited, sleeping 60s…") for i in range(60): if self.check_stop(): print("❌ Translation stopped during rate limit wait") return None, None time.sleep(1) continue else: # For unexpected errors, show the error message but suppress traceback in most cases if getattr(e, "error_type", None) in ["api_error", "validation", "prohibited_content"]: print(f"❌ API Error: {error_msg}") raise UnifiedClientError(f"API Error: {error_msg}") else: raise except Exception as e: print(f"❌ Unexpected error during API call: {e}") raise self.config.MAX_OUTPUT_TOKENS = original_max_tokens self.config.TEMP = original_temp if retry_count > 0 or duplicate_retry_count > 0 or timeout_retry_count > 0: if duplicate_retry_count > 0: print(f" 🔄 Restored original temperature: {self.config.TEMP} (after {duplicate_retry_count} duplicate retries)") elif timeout_retry_count > 0: print(f" 🔄 Restored original settings after {timeout_retry_count} timeout retries") elif retry_count > 0: print(f" 🔄 Restored original settings after {retry_count} retries") if duplicate_retry_count >= max_duplicate_retries: print(f" ⚠️ WARNING: Duplicate content issue persists after {max_duplicate_retries} attempts") return result, finish_reason def get_token_budget_str(self): """Get token budget as string""" _tok_env = os.getenv("MAX_INPUT_TOKENS", "1000000").strip() max_tokens_limit, budget_str = parse_token_limit(_tok_env) return budget_str # ===================================================== # BATCH TRANSLATION PROCESSOR # ===================================================== class BatchTranslationProcessor: """Handles batch/parallel translation processing""" def __init__(self, config, client, base_msg, out_dir, progress_lock, save_progress_fn, update_progress_fn, check_stop_fn, image_translator=None, is_text_file=False): self.config = config self.client = client self.base_msg = base_msg self.out_dir = out_dir self.progress_lock = progress_lock self.save_progress_fn = save_progress_fn self.update_progress_fn = update_progress_fn self.check_stop_fn = check_stop_fn self.image_translator = image_translator self.chapters_completed = 0 self.chunks_completed = 0 self.is_text_file = is_text_file # Optionally log multi-key status if hasattr(self.client, 'use_multi_keys') and self.client.use_multi_keys: stats = self.client.get_stats() print(f"🔑 Batch processor using multi-key mode: {stats.get('total_keys', 0)} keys") def process_single_chapter(self, chapter_data): """Process a single chapter (runs in thread)""" # APPLY INTERRUPTIBLE THREADING DELAY FIRST thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) if thread_delay > 0: # Check if we need to wait (same logic as unified_api_client) if hasattr(self.client, '_thread_submission_lock') and hasattr(self.client, '_last_thread_submission_time'): with self.client._thread_submission_lock: current_time = time.time() time_since_last = current_time - self.client._last_thread_submission_time if time_since_last < thread_delay: sleep_time = thread_delay - time_since_last thread_name = threading.current_thread().name # PRINT BEFORE THE DELAY STARTS idx, chapter = chapter_data # Extract chapter info for better logging print(f"🧵 [{thread_name}] Applying thread delay: {sleep_time:.1f}s for Chapter {idx+1}") # Interruptible sleep - check stop flag every 0.1 seconds elapsed = 0 check_interval = 0.1 while elapsed < sleep_time: if self.check_stop_fn(): print(f"🛑 Threading delay interrupted by stop flag") raise Exception("Translation stopped by user during threading delay") sleep_chunk = min(check_interval, sleep_time - elapsed) time.sleep(sleep_chunk) elapsed += sleep_chunk self.client._last_thread_submission_time = time.time() if not hasattr(self.client, '_thread_submission_count'): self.client._thread_submission_count = 0 self.client._thread_submission_count += 1 idx, chapter = chapter_data chap_num = chapter["num"] # Use the pre-calculated actual_chapter_num from the main loop actual_num = chapter.get('actual_chapter_num') # Fallback if not set (common in batch mode where first pass might be skipped) if actual_num is None: # Try to extract it using the same logic as non-batch mode raw_num = FileUtilities.extract_actual_chapter_number(chapter, patterns=None, config=self.config) # Apply offset if configured offset = self.config.CHAPTER_NUMBER_OFFSET if hasattr(self.config, 'CHAPTER_NUMBER_OFFSET') else 0 raw_num += offset # Check if zero detection is disabled if hasattr(self.config, 'DISABLE_ZERO_DETECTION') and self.config.DISABLE_ZERO_DETECTION: actual_num = raw_num elif hasattr(self.config, '_uses_zero_based') and self.config._uses_zero_based: # This is a 0-based novel, adjust the number actual_num = raw_num + 1 else: # Default to raw number (1-based or unknown) actual_num = raw_num print(f" 📖 Extracted actual chapter number: {actual_num} (from raw: {raw_num})") try: # Check if this is from a text file ai_features = None is_text_source = self.is_text_file or chapter.get('filename', '').endswith('.txt') or chapter.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" print(f"🔄 Starting #{idx+1} (Internal: {terminology} {chap_num}, Actual: {terminology} {actual_num}) (thread: {threading.current_thread().name}) [File: {chapter.get('original_basename', f'{terminology}_{chap_num}')}]") content_hash = chapter.get("content_hash") or ContentProcessor.get_content_hash(chapter["body"]) with self.progress_lock: self.update_progress_fn(idx, actual_num, content_hash, None, status="in_progress") self.save_progress_fn() chapter_body = chapter["body"] if chapter.get('has_images') and self.image_translator and self.config.ENABLE_IMAGE_TRANSLATION: print(f"🖼️ Processing images for Chapter {actual_num}...") self.image_translator.set_current_chapter(actual_num) chapter_body, image_translations = process_chapter_images( chapter_body, actual_num, self.image_translator, self.check_stop_fn ) if image_translations: # Create a copy of the processed body from bs4 import BeautifulSoup c = chapter soup_for_text = BeautifulSoup(c["body"], 'html.parser') # Remove all translated content for trans_div in soup_for_text.find_all('div', class_='translated-text-only'): trans_div.decompose() # Use this cleaned version for text translation text_to_translate = str(soup_for_text) final_body_with_images = c["body"] else: text_to_translate = c["body"] image_translations = {} print(f"✅ Processed {len(image_translations)} images for Chapter {actual_num}") chapter_msgs = self.base_msg + [{"role": "user", "content": chapter_body}] # Generate filename before API call fname = FileUtilities.create_chapter_filename(chapter, actual_num) self.client.set_output_filename(fname) if hasattr(self.client, '_current_output_file'): self.client._current_output_file = fname print(f"📤 Sending Chapter {actual_num} to API...") result, finish_reason = send_with_interrupt( chapter_msgs, self.client, self.config.TEMP, self.config.MAX_OUTPUT_TOKENS, self.check_stop_fn ) print(f"📥 Received Chapter {actual_num} response, finish_reason: {finish_reason}") # Enhanced mode workflow (same as non-batch): # 1. Original HTML -> html2text -> Markdown/plain text (during extraction) # 2. Markdown sent to translation API (better for translation quality) # 3. Translated markdown -> HTML conversion (here) if result and chapter.get("enhanced_extraction", False): print(f"🔄 Converting translated markdown back to HTML...") result = convert_enhanced_text_to_html(result, chapter) if finish_reason in ["length", "max_tokens"]: print(f"⚠️ Chapter {actual_num} response was TRUNCATED!") if self.config.REMOVE_AI_ARTIFACTS: result = ContentProcessor.clean_ai_artifacts(result, True) result = ContentProcessor.clean_memory_artifacts(result) cleaned = re.sub(r"^```(?:html)?\s*\n?", "", result, count=1, flags=re.MULTILINE) cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE) cleaned = ContentProcessor.clean_ai_artifacts(cleaned, remove_artifacts=self.config.REMOVE_AI_ARTIFACTS) fname = FileUtilities.create_chapter_filename(chapter, actual_num) if self.is_text_file: # For text files, save as plain text fname_txt = fname.replace('.html', '.txt') if fname.endswith('.html') else fname # Extract text from HTML from bs4 import BeautifulSoup soup = BeautifulSoup(cleaned, 'html.parser') text_content = soup.get_text(strip=True) # Merge image translations back with text translation if 'final_body_with_images' in locals() and image_translations: # Parse both versions soup_with_images = BeautifulSoup(final_body_with_images, 'html.parser') soup_with_text = BeautifulSoup(cleaned, 'html.parser') # Get the translated text content (without images) body_content = soup_with_text.body # Add image translations to the translated content for trans_div in soup_with_images.find_all('div', class_='translated-text-only'): body_content.insert(0, trans_div) final_html = str(soup_with_text) cleaned = final_html with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: f.write(cleaned) # Update with .txt filename with self.progress_lock: self.update_progress_fn(idx, actual_num, content_hash, fname_txt, status="completed", ai_features=ai_features) self.save_progress_fn() else: # Original code for EPUB files with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: f.write(cleaned) print(f"💾 Saved Chapter {actual_num}: {fname} ({len(cleaned)} chars)") # Initialize ai_features at the beginning to ensure it's always defined if ai_features is None: ai_features = None # Extract and save AI features for future duplicate detection if (self.config.RETRY_DUPLICATE_BODIES and hasattr(self.config, 'DUPLICATE_DETECTION_MODE') and self.config.DUPLICATE_DETECTION_MODE in ['ai-hunter', 'cascading']): try: # Extract features from the translated content cleaned_text = re.sub(r'<[^>]+>', '', cleaned).strip() # Note: self.translator doesn't exist, so we can't extract features here # The features will need to be extracted during regular processing print(f" ⚠️ AI features extraction not available in batch mode") except Exception as e: print(f" ⚠️ Failed to extract AI features: {e}") with self.progress_lock: # Check for QA failures with comprehensive detection if is_qa_failed_response(cleaned): chapter_status = "qa_failed" failure_reason = get_failure_reason(cleaned) print(f"⚠️ Batch: Chapter {actual_num} marked as qa_failed: {failure_reason}") # Update progress to qa_failed status self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features) self.save_progress_fn() # DO NOT increment chapters_completed for qa_failed # Return False to indicate failure return False, actual_num else: chapter_status = "completed" # Update progress to completed status self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features) self.save_progress_fn() # Only increment chapters_completed for successful chapters self.chapters_completed += 1 self.chunks_completed += 1 print(f"✅ Chapter {actual_num} completed successfully") return True, actual_num except Exception as e: print(f"❌ Chapter {actual_num} failed: {e}") with self.progress_lock: self.update_progress_fn(idx, actual_num, content_hash, None, status="failed") self.save_progress_fn() return False, actual_num # ===================================================== # GLOSSARY MANAGER - TRUE CSV FORMAT WITH FUZZY MATCHING # ===================================================== class GlossaryManager: """Unified glossary management with true CSV format, fuzzy matching, and parallel processing""" # Class-level shared lock for API submission timing _api_submission_lock = threading.Lock() _last_api_submission_time = 0 def __init__(self): self.pattern_manager = PatternManager() self._results_lock = threading.Lock() # Thread lock for collecting results self._file_write_lock = threading.Lock() # Thread lock for file operations def _atomic_write_file(self, filepath, content, encoding='utf-8'): """Atomically write to a file to prevent corruption from concurrent writes""" # Create temp file in same directory to ensure same filesystem dir_path = os.path.dirname(filepath) with self._file_write_lock: try: # Write to temporary file first with tempfile.NamedTemporaryFile(mode='w', encoding=encoding, dir=dir_path, delete=False) as tmp_file: tmp_file.write(content) tmp_path = tmp_file.name # Atomic rename (on same filesystem) if os.name == 'nt': # Windows # Windows doesn't support atomic rename if target exists if os.path.exists(filepath): os.remove(filepath) os.rename(tmp_path, filepath) else: # Unix/Linux/Mac os.rename(tmp_path, filepath) return True except Exception as e: print(f"⚠️ Atomic write failed: {e}") # Cleanup temp file if it exists if 'tmp_path' in locals() and os.path.exists(tmp_path): try: os.remove(tmp_path) except: pass # Fallback to direct write with lock try: with open(filepath, 'w', encoding=encoding) as f: f.write(content) return True except Exception as e2: print(f"⚠️ Fallback write also failed: {e2}") return False def save_glossary(self, output_dir, chapters, instructions, language="korean"): """Targeted glossary generator with true CSV format output and parallel processing""" print("📑 Targeted Glossary Generator v6.0 (CSV Format + Parallel)") # Check stop flag at start # Ensure output directory exists try: os.makedirs(output_dir, exist_ok=True) except Exception as _e: print(f"⚠️ Could not ensure output directory exists: {output_dir} ({_e})") if is_stop_requested(): print("📑 ❌ Glossary generation stopped by user") return {} # Check if glossary already exists; if so, we'll MERGE it later (do not return early) glossary_path = os.path.join(output_dir, "glossary.csv") existing_glossary_content = None if os.path.exists(glossary_path): print(f"📑 Existing glossary detected (will merge): {glossary_path}") try: with open(glossary_path, 'r', encoding='utf-8') as f: existing_glossary_content = f.read() except Exception as e: print(f"⚠️ Could not read existing glossary: {e}") # Rest of the method continues as before... print("📑 Extracting names and terms with configurable options") # Check stop flag before processing if is_stop_requested(): print("📑 ❌ Glossary generation stopped by user") return {} # Check for manual glossary first (CSV only) manual_glossary_path = os.getenv("MANUAL_GLOSSARY") existing_glossary = None if manual_glossary_path and os.path.exists(manual_glossary_path): print(f"📑 Manual glossary detected: {os.path.basename(manual_glossary_path)}") try: with open(manual_glossary_path, 'r', encoding='utf-8') as f: content = f.read() # Treat as CSV text and stage it for merge; also copy to output for visibility target_path = os.path.join(output_dir, "glossary.csv") with open(target_path, 'w', encoding='utf-8') as f: f.write(content) print(f"📑 ✅ Manual CSV glossary copied to: {target_path}") existing_glossary = content except Exception as e: print(f"⚠️ Could not copy manual glossary: {e}") print(f"📑 Proceeding with automatic generation...") # Check for existing glossary from manual extraction glossary_folder_path = os.path.join(output_dir, "Glossary") # existing_glossary may already be set by MANUAL_GLOSSARY above if os.path.exists(glossary_folder_path): for file in os.listdir(glossary_folder_path): if file.endswith("_glossary.json"): existing_path = os.path.join(glossary_folder_path, file) try: with open(existing_path, 'r', encoding='utf-8') as f: existing_content = f.read() existing_glossary = existing_content print(f"📑 Found existing glossary from manual extraction: {file}") break except Exception as e: print(f"⚠️ Could not load existing glossary: {e}") # Get configuration from environment variables min_frequency = int(os.getenv("GLOSSARY_MIN_FREQUENCY", "2")) max_names = int(os.getenv("GLOSSARY_MAX_NAMES", "50")) max_titles = int(os.getenv("GLOSSARY_MAX_TITLES", "30")) batch_size = int(os.getenv("GLOSSARY_BATCH_SIZE", "50")) strip_honorifics = os.getenv("GLOSSARY_STRIP_HONORIFICS", "1") == "1" fuzzy_threshold = float(os.getenv("GLOSSARY_FUZZY_THRESHOLD", "0.90")) max_text_size = int(os.getenv("GLOSSARY_MAX_TEXT_SIZE", "50000")) print(f"📑 Settings: Min frequency: {min_frequency}, Max names: {max_names}, Max titles: {max_titles}") print(f"📑 Strip honorifics: {'✅ Yes' if strip_honorifics else '❌ No'}") print(f"📑 Fuzzy matching threshold: {fuzzy_threshold}") # Get custom prompt from environment custom_prompt = os.getenv("AUTO_GLOSSARY_PROMPT", "").strip() def clean_html(html_text): """Remove HTML tags to get clean text""" soup = BeautifulSoup(html_text, 'html.parser') return soup.get_text() # Check stop before processing chapters if is_stop_requested(): print("📑 ❌ Glossary generation stopped by user") return {} # Get chapter split threshold and filter mode chapter_split_threshold = int(os.getenv("GLOSSARY_CHAPTER_SPLIT_THRESHOLD", "100000")) filter_mode = os.getenv("GLOSSARY_FILTER_MODE", "all") # all, only_with_honorifics, only_without_honorifics # Check if parallel extraction is enabled for automatic glossary extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) batch_translation = os.getenv("BATCH_TRANSLATION", "0") == "1" api_batch_size = int(os.getenv("BATCH_SIZE", "5")) # Log the settings print(f"📑 Filter mode: {filter_mode}") if extraction_workers > 1: print(f"📑 Parallel extraction enabled: {extraction_workers} workers") if batch_translation: print(f"📑 Batch API calls enabled: {api_batch_size} chunks per batch") all_text = ' '.join(clean_html(chapter["body"]) for chapter in chapters) print(f"📑 Processing {len(all_text):,} characters of text") # Apply smart filtering FIRST to check actual size needed use_smart_filter = os.getenv("GLOSSARY_USE_SMART_FILTER", "1") == "1" effective_text_size = len(all_text) filtered_text_cache = None if use_smart_filter and custom_prompt: # Only apply for AI extraction print(f"📑 Smart filtering enabled - checking effective text size after filtering...") # Perform filtering ONCE and reuse for chunking filtered_sample, _ = self._filter_text_for_glossary(all_text, min_frequency) filtered_text_cache = filtered_sample effective_text_size = len(filtered_sample) print(f"📑 Effective text size after filtering: {effective_text_size:,} chars (from {len(all_text):,})") # Check if we need to split into chunks based on EFFECTIVE size after filtering if chapter_split_threshold > 0 and effective_text_size > chapter_split_threshold: print(f"📑 Effective text exceeds {chapter_split_threshold:,} chars, will process in chunks...") # If using smart filter, we need to split the FILTERED text, not raw text if use_smart_filter and custom_prompt: # Split the filtered text into chunks (reuse cached filtered text) filtered_text = filtered_text_cache if filtered_text_cache is not None else self._filter_text_for_glossary(all_text, min_frequency)[0] chunks_to_process = [] # Split filtered text into chunks of appropriate size chunk_size = chapter_split_threshold for i in range(0, len(filtered_text), chunk_size): chunk_text = filtered_text[i:i + chunk_size] chunks_to_process.append((len(chunks_to_process) + 1, chunk_text)) print(f"📑 Split filtered text into {len(chunks_to_process)} chunks") all_glossary_entries = [] else: # Original logic for unfiltered text all_glossary_entries = [] chunk_size = 0 chunk_chapters = [] chunks_to_process = [] for idx, chapter in enumerate(chapters): if is_stop_requested(): print("📑 ❌ Glossary generation stopped by user") return all_glossary_entries chapter_text = clean_html(chapter["body"]) chunk_size += len(chapter_text) chunk_chapters.append(chapter) # Process chunk when it reaches threshold or last chapter if chunk_size >= chapter_split_threshold or idx == len(chapters) - 1: chunk_text = ' '.join(clean_html(ch["body"]) for ch in chunk_chapters) chunks_to_process.append((len(chunks_to_process) + 1, chunk_text)) # Reset for next chunk chunk_size = 0 chunk_chapters = [] print(f"📑 Split into {len(chunks_to_process)} chunks for processing") # Batch toggle decides concurrency: ON => parallel API calls; OFF => strict sequential if batch_translation and custom_prompt and len(chunks_to_process) > 1: print(f"📑 Processing chunks in batch mode with {api_batch_size} chunks per batch...") # Set fast mode for batch processing os.environ["GLOSSARY_SKIP_ALL_VALIDATION"] = "1" # Use batch API calls for AI extraction all_csv_lines = self._process_chunks_batch_api( chunks_to_process, custom_prompt, language, min_frequency, max_names, max_titles, output_dir, strip_honorifics, fuzzy_threshold, filter_mode, api_batch_size, extraction_workers ) # Reset validation mode os.environ["GLOSSARY_SKIP_ALL_VALIDATION"] = "0" print(f"📑 All chunks completed. Aggregated raw lines: {len(all_csv_lines)}") # Process all collected entries at once (even if empty) # Add header so downstream steps can work uniformly all_csv_lines.insert(0, "type,raw_name,translated_name") # Merge with any on-disk glossary first (to avoid overwriting user edits) on_disk_path = os.path.join(output_dir, "glossary.csv") if os.path.exists(on_disk_path): try: with open(on_disk_path, 'r', encoding='utf-8') as f: on_disk_content = f.read() all_csv_lines = self._merge_csv_entries(all_csv_lines, on_disk_content, strip_honorifics, language) print("📑 Merged with existing on-disk glossary") except Exception as e: print(f"⚠️ Failed to merge with existing on-disk glossary: {e}") # Apply filter mode if needed if filter_mode == "only_with_honorifics": filtered = [all_csv_lines[0]] # Keep header for line in all_csv_lines[1:]: parts = line.split(',', 2) if len(parts) >= 3 and parts[0] == "character": filtered.append(line) all_csv_lines = filtered print(f"📑 Filter applied: {len(all_csv_lines)-1} character entries with honorifics kept") # Apply fuzzy deduplication (deferred until after all chunks) try: print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...") all_csv_lines = self._deduplicate_glossary_with_fuzzy(all_csv_lines, fuzzy_threshold) except Exception as e: print(f"⚠️ Deduplication error: {e} — continuing without dedup") # Sort by type and name print(f"📑 Sorting glossary by type and name...") header = all_csv_lines[0] entries = all_csv_lines[1:] if entries: entries.sort(key=lambda x: (0 if x.startswith('character,') else 1, x.split(',')[1].lower())) all_csv_lines = [header] + entries # Save # Check format preference use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' if not use_legacy_format: # Convert to token-efficient format all_csv_lines = self._convert_to_token_efficient_format(all_csv_lines) # Final sanitize to prevent stray headers all_csv_lines = self._sanitize_final_glossary_lines(all_csv_lines, use_legacy_format) # Save csv_content = '\n'.join(all_csv_lines) glossary_path = os.path.join(output_dir, "glossary.csv") self._atomic_write_file(glossary_path, csv_content) # Verify file exists; fallback direct write if needed if not os.path.exists(glossary_path): try: with open(glossary_path, 'w', encoding='utf-8') as f: f.write(csv_content) print("📑 Fallback write succeeded for glossary.csv") except Exception as e: print(f"❌ Failed to write glossary.csv: {e}") print(f"\n📑 ✅ GLOSSARY SAVED!") print(f"📑 ✅ AI GLOSSARY SAVED!") c_count, t_count, total = self._count_glossary_entries(all_csv_lines, use_legacy_format) print(f"📑 Character entries: {c_count}") print(f"📑 Term entries: {t_count}") print(f"📑 Total entries: {total}") return self._parse_csv_to_dict(csv_content) else: # Strict sequential processing (one API call at a time) _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") _prev_filtered = os.getenv("_CHUNK_ALREADY_FILTERED") _prev_force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER") os.environ["GLOSSARY_DEFER_SAVE"] = "1" # Tell the extractor each chunk is already filtered to avoid re-running smart filter per chunk os.environ["_CHUNK_ALREADY_FILTERED"] = "1" os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = "1" try: for chunk_idx, chunk_text in chunks_to_process: if is_stop_requested(): break print(f"📑 Processing chunk {chunk_idx}/{len(chunks_to_process)} ({len(chunk_text):,} chars)...") if custom_prompt: chunk_glossary = self._extract_with_custom_prompt( custom_prompt, chunk_text, language, min_frequency, max_names, max_titles, None, output_dir, # Don't pass existing glossary to chunks strip_honorifics, fuzzy_threshold, filter_mode ) else: chunk_glossary = self._extract_with_patterns( chunk_text, language, min_frequency, max_names, max_titles, batch_size, None, output_dir, # Don't pass existing glossary to chunks strip_honorifics, fuzzy_threshold, filter_mode ) # Normalize to CSV lines and aggregate chunk_lines = [] if isinstance(chunk_glossary, list): for line in chunk_glossary: if line and not line.startswith('type,'): all_glossary_entries.append(line) chunk_lines.append(line) else: for raw_name, translated_name in chunk_glossary.items(): entry_type = "character" if self._has_honorific(raw_name) else "term" line = f"{entry_type},{raw_name},{translated_name}" all_glossary_entries.append(line) chunk_lines.append(line) # Incremental update try: self._incremental_update_glossary(output_dir, chunk_lines, strip_honorifics, language, filter_mode) print(f"📑 Incremental write: +{len(chunk_lines)} entries") except Exception as e2: print(f"⚠️ Incremental write failed: {e2}") finally: if _prev_defer is None: if "GLOSSARY_DEFER_SAVE" in os.environ: del os.environ["GLOSSARY_DEFER_SAVE"] else: os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer if _prev_filtered is None: os.environ.pop("_CHUNK_ALREADY_FILTERED", None) else: os.environ["_CHUNK_ALREADY_FILTERED"] = _prev_filtered if _prev_force_disable is None: os.environ.pop("GLOSSARY_FORCE_DISABLE_SMART_FILTER", None) else: os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = _prev_force_disable # Build CSV from aggregated entries csv_lines = ["type,raw_name,translated_name"] + all_glossary_entries # Merge with any provided existing glossary AND on-disk glossary to avoid overwriting on_disk_path = os.path.join(output_dir, "glossary.csv") merge_sources = [] if existing_glossary: merge_sources.append(existing_glossary) if os.path.exists(on_disk_path): try: with open(on_disk_path, 'r', encoding='utf-8') as f: merge_sources.append(f.read()) print("📑 Found existing on-disk glossary to merge") except Exception as e: print(f"⚠️ Failed to read on-disk glossary for merging: {e}") # Also merge the main on-disk glossary if it was present at start if existing_glossary_content: csv_lines = self._merge_csv_entries(csv_lines, existing_glossary_content, strip_honorifics, language) for src in merge_sources: csv_lines = self._merge_csv_entries(csv_lines, src, strip_honorifics, language) # Apply filter mode to final results csv_lines = self._filter_csv_by_mode(csv_lines, filter_mode) # Apply fuzzy deduplication (deferred until after all chunks) print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...") original_count = len(csv_lines) - 1 csv_lines = self._deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold) deduped_count = len(csv_lines) - 1 if original_count > deduped_count: print(f"📑 Removed {original_count - deduped_count} duplicate entries") # Sort by type and name print(f"📑 Sorting glossary by type and name...") header = csv_lines[0] entries = csv_lines[1:] entries.sort(key=lambda x: (0 if x.startswith('character,') else 1, x.split(',')[1].lower() if ',' in x else x.lower())) csv_lines = [header] + entries # Token-efficient format if enabled use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' if not use_legacy_format: csv_lines = self._convert_to_token_efficient_format(csv_lines) # Final sanitize to prevent stray headers and section titles at end csv_lines = self._sanitize_final_glossary_lines(csv_lines, use_legacy_format) try: # Save csv_content = '\n'.join(csv_lines) glossary_path = os.path.join(output_dir, "glossary.csv") self._atomic_write_file(glossary_path, csv_content) # Verify file exists; fallback direct write if needed if not os.path.exists(glossary_path): try: with open(glossary_path, 'w', encoding='utf-8') as f: f.write(csv_content) print("📑 Fallback write succeeded for glossary.csv") except Exception as e: print(f"❌ Failed to write glossary.csv: {e}") finally: print(f"\n📑 ✅ CHUNKED GLOSSARY SAVED!") print(f"📑 ✅ AI GLOSSARY SAVED!") print(f"📑 File: {glossary_path}") c_count, t_count, total = self._count_glossary_entries(csv_lines, use_legacy_format) print(f"📑 Character entries: {c_count}") print(f"📑 Term entries: {t_count}") print(f"📑 Total entries: {total}") return self._parse_csv_to_dict(csv_content) # Original single-text processing if custom_prompt: return self._extract_with_custom_prompt(custom_prompt, all_text, language, min_frequency, max_names, max_titles, existing_glossary, output_dir, strip_honorifics, fuzzy_threshold, filter_mode) else: return self._extract_with_patterns(all_text, language, min_frequency, max_names, max_titles, batch_size, existing_glossary, output_dir, strip_honorifics, fuzzy_threshold, filter_mode) total_time = time.time() - total_start_time print(f"\n📑 ========== GLOSSARY GENERATION COMPLETE ==========") print(f"📑 Total time: {total_time:.1f}s") print(f"📑 Performance breakdown:") print(f"📑 - Extraction: {getattr(self, '_extraction_time', 0):.1f}s") print(f"📑 - API calls: {getattr(self, '_api_time', 0):.1f}s") print(f"📑 - Frequency checking: {getattr(self, '_freq_check_time', 0):.1f}s") print(f"📑 - Deduplication: {getattr(self, '_dedup_time', 0):.1f}s") print(f"📑 - File I/O: {getattr(self, '_io_time', 0):.1f}s") print(f"📑 ================================================") return result # This is the existing return statement def _convert_to_token_efficient_format(self, csv_lines): """Convert CSV lines to token-efficient format with sections and asterisks""" if len(csv_lines) <= 1: return csv_lines header = csv_lines[0] entries = csv_lines[1:] # Group by type (only from valid CSV lines) import re as _re grouped = {} for line in entries: if not line.strip(): continue # Only accept proper CSV rows: at least 3 fields and a sane type token parts_full = [p.strip() for p in line.split(',')] if len(parts_full) < 3: continue entry_type = parts_full[0].lower() if not _re.match(r'^[a-z_]+$', entry_type): continue if entry_type not in grouped: grouped[entry_type] = [] grouped[entry_type].append(line) # Rebuild with token-efficient format result = [] result.append("Glossary: Characters, Terms, and Important Elements\n") # Process in order: character first, then term, then others type_order = ['character', 'term'] + [t for t in grouped.keys() if t not in ['character', 'term']] for entry_type in type_order: if entry_type not in grouped: continue entries = grouped[entry_type] # Add section header section_name = entry_type.upper() + 'S' if not entry_type.upper().endswith('S') else entry_type.upper() result.append(f"=== {section_name} ===") # Add entries in new format for line in entries: parts = [p.strip() for p in line.split(',')] if len(parts) >= 3: raw_name = parts[1] translated_name = parts[2] # Format: * TranslatedName (RawName) entry_line = f"* {translated_name} ({raw_name})" # Add gender if present and not Unknown if len(parts) > 3 and parts[3] and parts[3] != 'Unknown': entry_line += f" [{parts[3]}]" # Add any additional fields as description if len(parts) > 4: description = ', '.join(parts[4:]) if description.strip(): entry_line += f": {description}" result.append(entry_line) result.append("") # Blank line between sections return result def _count_glossary_entries(self, lines, use_legacy_format=False): """Return (char_count, term_count, total_count) for either format.""" if not lines: return 0, 0, 0 if use_legacy_format: data = lines[1:] if lines and lines[0].lower().startswith('type,raw_name') else lines char_count = sum(1 for ln in data if ln.startswith('character,')) term_count = sum(1 for ln in data if ln.startswith('term,')) total = sum(1 for ln in data if ln and ',' in ln) return char_count, term_count, total # token-efficient current = None char_count = term_count = total = 0 for ln in lines: s = ln.strip() if s.startswith('=== ') and 'CHARACTER' in s.upper(): current = 'character' continue if s.startswith('=== ') and 'TERM' in s.upper(): current = 'term' continue if s.startswith('* '): total += 1 if current == 'character': char_count += 1 elif current == 'term': term_count += 1 return char_count, term_count, total def _sanitize_final_glossary_lines(self, lines, use_legacy_format=False): """Remove stray CSV headers and normalize header placement before saving. - In legacy CSV mode, ensure exactly one header at the very top. - In token-efficient mode, remove any CSV header lines entirely. """ header_norm = "type,raw_name,translated_name" if not lines: return lines if use_legacy_format: sanitized = [] header_seen = False for ln in lines: txt = ln.strip() if txt.lower().startswith("type,raw_name"): if not header_seen: sanitized.append(header_norm) header_seen = True # skip duplicates else: sanitized.append(ln) # ensure header at top if sanitized and not sanitized[0].strip().lower().startswith("type,raw_name"): sanitized.insert(0, header_norm) return sanitized else: # remove any CSV header lines anywhere and duplicate top headers/sections cleaned = [] glossary_header_seen = False for i, ln in enumerate(lines): txt = ln.strip() low = txt.lower() # Drop CSV headers if low.startswith("type,raw_name"): continue # Keep only the first main glossary header if low.startswith("glossary:"): if glossary_header_seen: continue glossary_header_seen = True cleaned.append(ln) continue # Remove bogus section like '=== GLOSSARY: ... ===' if low.startswith("=== glossary:"): continue cleaned.append(ln) return cleaned def _process_chunks_batch_api(self, chunks_to_process, custom_prompt, language, min_frequency, max_names, max_titles, output_dir, strip_honorifics, fuzzy_threshold, filter_mode, api_batch_size, extraction_workers): """Process chunks using batch API calls for AI extraction with thread delay""" print(f"📑 Using batch API mode with {api_batch_size} chunks per batch") # Ensure we defer saving and heavy merging when processing chunks _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") os.environ["GLOSSARY_DEFER_SAVE"] = "1" # Get thread submission delay thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) if thread_delay > 0: print(f"📑 Thread submission delay: {thread_delay}s between parallel calls") # CHANGE: Collect raw CSV lines instead of dictionary all_csv_lines = [] # Collect all entries as CSV lines total_chunks = len(chunks_to_process) completed_chunks = 0 # Ensure per-chunk smart filtering is disabled globally during batch processing _prev_filtered = os.getenv("_CHUNK_ALREADY_FILTERED") _prev_force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER") os.environ["_CHUNK_ALREADY_FILTERED"] = "1" os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = "1" # Process in API batches for batch_start in range(0, len(chunks_to_process), api_batch_size): if is_stop_requested(): break batch_end = min(batch_start + api_batch_size, len(chunks_to_process)) batch_chunks = chunks_to_process[batch_start:batch_end] print(f"📑 Processing API batch {batch_start//api_batch_size + 1}: chunks {batch_start+1}-{batch_end}") # Use ThreadPoolExecutor for parallel API calls within batch # Batch mode: issue multiple API calls in parallel within each batch (one worker per chunk) with ThreadPoolExecutor(max_workers=len(batch_chunks)) as executor: futures = {} last_submission_time = 0 for chunk_idx, chunk_text in batch_chunks: if is_stop_requested(): break # Apply thread submission delay if thread_delay > 0 and last_submission_time > 0: time_since_last = time.time() - last_submission_time if time_since_last < thread_delay: sleep_time = thread_delay - time_since_last print(f"🧵 Thread delay: {sleep_time:.1f}s for chunk {chunk_idx}") time.sleep(sleep_time) future = executor.submit( self._extract_with_custom_prompt, custom_prompt, chunk_text, language, min_frequency, max_names, max_titles, None, output_dir, strip_honorifics, fuzzy_threshold, filter_mode ) futures[future] = chunk_idx last_submission_time = time.time() # Collect results for future in as_completed(futures): if is_stop_requested(): break try: chunk_glossary = future.result() print(f"📑 DEBUG: Chunk {futures[future]} returned type={type(chunk_glossary)}, len={len(chunk_glossary)}") # Normalize to CSV lines (without header) chunk_lines = [] if isinstance(chunk_glossary, dict): for raw_name, translated_name in chunk_glossary.items(): entry_type = "character" if self._has_honorific(raw_name) else "term" chunk_lines.append(f"{entry_type},{raw_name},{translated_name}") elif isinstance(chunk_glossary, list): for line in chunk_glossary: if line and not line.startswith('type,'): chunk_lines.append(line) # Aggregate for end-of-run all_csv_lines.extend(chunk_lines) # Incremental update of glossary.csv in token-efficient format try: self._incremental_update_glossary(output_dir, chunk_lines, strip_honorifics, language, filter_mode) print(f"📑 Incremental write: +{len(chunk_lines)} entries") except Exception as e2: print(f"⚠️ Incremental write failed: {e2}") completed_chunks += 1 # Print progress for GUI progress_percent = (completed_chunks / total_chunks) * 100 print(f"📑 Progress: {completed_chunks}/{total_chunks} chunks ({progress_percent:.0f}%)") print(f"📑 Chunk {futures[future]} completed and aggregated") except Exception as e: print(f"⚠️ API call for chunk {futures[future]} failed: {e}") completed_chunks += 1 progress_percent = (completed_chunks / total_chunks) * 100 print(f"📑 Progress: {completed_chunks}/{total_chunks} chunks ({progress_percent:.0f}%)") # Add delay between API batches if batch_end < len(chunks_to_process): api_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) print(f"⏱️ Waiting {api_delay}s before next API batch...") time.sleep(api_delay) # CHANGE: Return CSV lines instead of dictionary # Restore per-chunk filter disabling envs if _prev_filtered is None: os.environ.pop("_CHUNK_ALREADY_FILTERED", None) else: os.environ["_CHUNK_ALREADY_FILTERED"] = _prev_filtered if _prev_force_disable is None: os.environ.pop("GLOSSARY_FORCE_DISABLE_SMART_FILTER", None) else: os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = _prev_force_disable # Restore previous defer setting if _prev_defer is None: # Default back to not deferring if it wasn't set if "GLOSSARY_DEFER_SAVE" in os.environ: del os.environ["GLOSSARY_DEFER_SAVE"] else: os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer return all_csv_lines def _incremental_update_glossary(self, output_dir, chunk_lines, strip_honorifics, language, filter_mode): """Incrementally update glossary.csv (token-efficient) using an on-disk CSV aggregator. This keeps glossary.csv present and growing after each chunk while preserving token-efficient format for the visible file. """ if not chunk_lines: return # Paths agg_path = os.path.join(output_dir, "glossary.incremental.csv") vis_path = os.path.join(output_dir, "glossary.csv") # Ensure output dir os.makedirs(output_dir, exist_ok=True) # Compose CSV with header for merging new_csv_lines = ["type,raw_name,translated_name"] + chunk_lines # Load existing aggregator content, if any existing_csv = None if os.path.exists(agg_path): try: with open(agg_path, 'r', encoding='utf-8') as f: existing_csv = f.read() except Exception as e: print(f"⚠️ Incremental: cannot read aggregator: {e}") # Merge (exact merge, no fuzzy to keep this fast) merged_csv_lines = self._merge_csv_entries(new_csv_lines, existing_csv or "", strip_honorifics, language) # Optional filter mode merged_csv_lines = self._filter_csv_by_mode(merged_csv_lines, filter_mode) # Save aggregator (CSV) self._atomic_write_file(agg_path, "\n".join(merged_csv_lines)) # Convert to token-efficient format for visible glossary.csv token_lines = self._convert_to_token_efficient_format(merged_csv_lines) token_lines = self._sanitize_final_glossary_lines(token_lines, use_legacy_format=False) self._atomic_write_file(vis_path, "\n".join(token_lines)) if not os.path.exists(vis_path): with open(vis_path, 'w', encoding='utf-8') as f: f.write("\n".join(token_lines)) def _process_single_chunk(self, chunk_idx, chunk_text, custom_prompt, language, min_frequency, max_names, max_titles, batch_size, output_dir, strip_honorifics, fuzzy_threshold, filter_mode, already_filtered=False): """Process a single chunk - wrapper for parallel execution""" print(f"📑 Worker processing chunk {chunk_idx} ({len(chunk_text):,} chars)...") if custom_prompt: # Pass flag to indicate if text is already filtered os.environ["_CHUNK_ALREADY_FILTERED"] = "1" if already_filtered else "0" _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") os.environ["GLOSSARY_DEFER_SAVE"] = "1" try: result = self._extract_with_custom_prompt( custom_prompt, chunk_text, language, min_frequency, max_names, max_titles, None, output_dir, strip_honorifics, fuzzy_threshold, filter_mode ) finally: os.environ["_CHUNK_ALREADY_FILTERED"] = "0" # Reset if _prev_defer is None: if "GLOSSARY_DEFER_SAVE" in os.environ: del os.environ["GLOSSARY_DEFER_SAVE"] else: os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer return result else: return self._extract_with_patterns( chunk_text, language, min_frequency, max_names, max_titles, batch_size, None, output_dir, strip_honorifics, fuzzy_threshold, filter_mode ) def _apply_final_filter(self, entries, filter_mode): """Apply final filtering based on mode to ensure only requested types are included""" if filter_mode == "only_with_honorifics": # Filter to keep only entries that look like they have honorifics filtered = {} for key, value in entries.items(): # Check if the key contains known honorific patterns if self._has_honorific(key): filtered[key] = value print(f"📑 Final filter: Kept {len(filtered)} entries with honorifics (from {len(entries)} total)") return filtered elif filter_mode == "only_without_honorifics": # Filter to keep only entries without honorifics filtered = {} for key, value in entries.items(): if not self._has_honorific(key): filtered[key] = value print(f"📑 Final filter: Kept {len(filtered)} entries without honorifics (from {len(entries)} total)") return filtered else: return entries def _looks_like_name(self, text): """Check if text looks like a character name""" if not text: return False # Check for various name patterns # Korean names (2-4 hangul characters) if all(0xAC00 <= ord(char) <= 0xD7AF for char in text) and 2 <= len(text) <= 4: return True # Japanese names (mix of kanji/kana, 2-6 chars) has_kanji = any(0x4E00 <= ord(char) <= 0x9FFF for char in text) has_kana = any((0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF) for char in text) if (has_kanji or has_kana) and 2 <= len(text) <= 6: return True # Chinese names (2-4 Chinese characters) if all(0x4E00 <= ord(char) <= 0x9FFF for char in text) and 2 <= len(text) <= 4: return True # English names (starts with capital, mostly letters) if text[0].isupper() and sum(1 for c in text if c.isalpha()) >= len(text) * 0.8: return True return False def _has_honorific(self, term): """Check if a term contains an honorific using PatternManager's comprehensive list""" if not term: return False term_lower = term.lower() # Check all language honorifics from PatternManager for language, honorifics_list in self.pattern_manager.CJK_HONORIFICS.items(): for honorific in honorifics_list: # For romanized/English honorifics with spaces or dashes if honorific.startswith(' ') or honorific.startswith('-'): if term_lower.endswith(honorific.lower()): return True # For CJK honorifics (no separator) else: if honorific in term: return True return False def _strip_all_honorifics(self, term, language='korean'): """Strip all honorifics from a term using PatternManager's lists""" if not term: return term result = term # Get honorifics for the specific language and English romanizations honorifics_to_strip = [] if language in self.pattern_manager.CJK_HONORIFICS: honorifics_to_strip.extend(self.pattern_manager.CJK_HONORIFICS[language]) honorifics_to_strip.extend(self.pattern_manager.CJK_HONORIFICS.get('english', [])) # Sort by length (longest first) to avoid partial matches honorifics_to_strip.sort(key=len, reverse=True) # Strip honorifics for honorific in honorifics_to_strip: if honorific.startswith(' ') or honorific.startswith('-'): # For romanized honorifics with separators if result.lower().endswith(honorific.lower()): result = result[:-len(honorific)] else: # For CJK honorifics (no separator) if result.endswith(honorific): result = result[:-len(honorific)] return result.strip() def _convert_to_csv_format(self, data): """Convert various glossary formats to CSV string format with enforced 3 columns""" csv_lines = ["type,raw_name,translated_name"] if isinstance(data, str): # Already CSV string if data.strip().startswith('type,raw_name'): return data # Try to parse as JSON try: data = json.loads(data) except: return data if isinstance(data, list): for item in data: if isinstance(item, dict): if 'type' in item and 'raw_name' in item: # Already in correct format line = f"{item['type']},{item['raw_name']},{item.get('translated_name', item['raw_name'])}" csv_lines.append(line) else: # Old format - default to 'term' type entry_type = 'term' raw_name = item.get('original_name', '') translated_name = item.get('name', raw_name) if raw_name and translated_name: csv_lines.append(f"{entry_type},{raw_name},{translated_name}") elif isinstance(data, dict): if 'entries' in data: # Has metadata wrapper, extract entries for original, translated in data['entries'].items(): csv_lines.append(f"term,{original},{translated}") else: # Plain dictionary - default to 'term' type for original, translated in data.items(): csv_lines.append(f"term,{original},{translated}") return '\n'.join(csv_lines) def _parse_csv_to_dict(self, csv_content): """Parse CSV content to dictionary for backward compatibility""" result = {} lines = csv_content.strip().split('\n') for line in lines[1:]: # Skip header if not line.strip(): continue parts = [p.strip() for p in line.split(',')] if len(parts) >= 3: result[parts[1]] = parts[2] # raw_name -> translated_name return result def _fuzzy_match(self, term1, term2, threshold=0.90): """Check if two terms match using fuzzy matching""" ratio = SequenceMatcher(None, term1.lower(), term2.lower()).ratio() return ratio >= threshold def _fuzzy_match_rapidfuzz(self, term_lower, text_lower, threshold, term_len): """Use rapidfuzz library for MUCH faster fuzzy matching""" from rapidfuzz import fuzz print(f"📑 Using RapidFuzz (C++ speed)...") start_time = time.time() matches_count = 0 threshold_percent = threshold * 100 # rapidfuzz uses 0-100 scale # Can use smaller step because rapidfuzz is so fast step = 1 # Check every position - rapidfuzz can handle it # Process text for i in range(0, len(text_lower) - term_len + 1, step): # Check stop flag every 10000 positions if i > 0 and i % 10000 == 0: if is_stop_requested(): print(f"📑 RapidFuzz stopped at position {i}") return matches_count window = text_lower[i:i + term_len] # rapidfuzz is fast enough we can check every position if fuzz.ratio(term_lower, window) >= threshold_percent: matches_count += 1 elapsed = time.time() - start_time print(f"📑 RapidFuzz found {matches_count} matches in {elapsed:.2f}s") return matches_count def _batch_compute_frequencies(self, terms, all_text, fuzzy_threshold=0.90, min_frequency=2): """Compute frequencies for all terms at once - MUCH faster than individual checking""" print(f"📑 Computing frequencies for {len(terms)} terms in batch mode...") start_time = time.time() # Result dictionary term_frequencies = {} # First pass: exact matching (very fast) print(f"📑 Phase 1: Exact matching...") text_lower = all_text.lower() for term in terms: if is_stop_requested(): return term_frequencies term_lower = term.lower() count = text_lower.count(term_lower) term_frequencies[term] = count exact_time = time.time() - start_time high_freq_terms = sum(1 for count in term_frequencies.values() if count >= min_frequency) print(f"📑 Exact matching complete: {high_freq_terms}/{len(terms)} terms meet threshold ({exact_time:.1f}s)") # If fuzzy matching is disabled, we're done if fuzzy_threshold >= 1.0: return term_frequencies # Second pass: fuzzy matching ONLY for low-frequency terms low_freq_terms = [term for term, count in term_frequencies.items() if count < min_frequency] if low_freq_terms: print(f"📑 Phase 2: Fuzzy matching for {len(low_freq_terms)} low-frequency terms...") # Try to use RapidFuzz batch processing try: from rapidfuzz import process, fuzz # For very large texts, sample it for fuzzy matching if len(text_lower) > 500000: print(f"📑 Text too large ({len(text_lower):,} chars), sampling for fuzzy matching...") # Sample every Nth character to reduce size sample_rate = max(1, len(text_lower) // 100000) sampled_text = text_lower[::sample_rate] else: sampled_text = text_lower # Create chunks of text for fuzzy matching chunk_size = 1000 # Process text in chunks text_chunks = [sampled_text[i:i+chunk_size] for i in range(0, len(sampled_text), chunk_size//2)] # Overlapping chunks print(f"📑 Processing {len(text_chunks)} text chunks...") threshold_percent = fuzzy_threshold * 100 # Process in batches to avoid memory issues batch_size = 100 # Process 100 terms at a time for batch_start in range(0, len(low_freq_terms), batch_size): if is_stop_requested(): break batch_end = min(batch_start + batch_size, len(low_freq_terms)) batch_terms = low_freq_terms[batch_start:batch_end] for term in batch_terms: if is_stop_requested(): break # Quick fuzzy search in chunks fuzzy_count = 0 for chunk in text_chunks[:50]: # Limit to first 50 chunks for speed if fuzz.partial_ratio(term.lower(), chunk) >= threshold_percent: fuzzy_count += 1 if fuzzy_count > 0: # Scale up based on sampling if len(text_lower) > 500000: fuzzy_count *= (len(text_lower) // len(sampled_text)) term_frequencies[term] += fuzzy_count if (batch_end % 500 == 0) or (batch_end == len(low_freq_terms)): elapsed = time.time() - start_time print(f"📑 Processed {batch_end}/{len(low_freq_terms)} terms ({elapsed:.1f}s)") except ImportError: print("📑 RapidFuzz not available, skipping fuzzy matching") total_time = time.time() - start_time final_high_freq = sum(1 for count in term_frequencies.values() if count >= min_frequency) print(f"📑 Batch frequency computation complete: {final_high_freq}/{len(terms)} terms accepted ({total_time:.1f}s)") return term_frequencies def _find_fuzzy_matches(self, term, text, threshold=0.90): """Find fuzzy matches of a term in text using efficient method with parallel processing""" start_time = time.time() term_lower = term.lower() text_lower = text.lower() term_len = len(term) # Only log for debugging if explicitly enabled debug_search = os.getenv("GLOSSARY_DEBUG_SEARCH", "0") == "1" if debug_search and len(text) > 100000: print(f"📑 Searching for '{term}' in {len(text):,} chars (threshold: {threshold})") # Strategy 1: Use exact matching first for efficiency exact_start = time.time() matches_count = text_lower.count(term_lower) exact_time = time.time() - exact_start if matches_count > 0: if debug_search and len(text) > 100000: print(f"📑 Found {matches_count} exact matches in {exact_time:.3f}s") return matches_count # Strategy 2: Try rapidfuzz if available (much faster) if matches_count == 0 and threshold < 1.0: try: from rapidfuzz import fuzz return self._fuzzy_match_rapidfuzz(term_lower, text_lower, threshold, term_len) except ImportError: pass # Fall back to parallel/sequential # Strategy 3: Fall back to parallel/sequential if rapidfuzz not available # Check if parallel processing is enabled extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) if extraction_workers > 1 and len(text) > 50000: # Use parallel for large texts return self._parallel_fuzzy_search(term_lower, text_lower, threshold, term_len, extraction_workers) else: return self._sequential_fuzzy_search(term_lower, text_lower, threshold, term_len) # Check if parallel processing is enabled extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) if extraction_workers > 1 and len(text) > 50000: # Use parallel for large texts return self._parallel_fuzzy_search(term_lower, text_lower, threshold, term_len, extraction_workers) else: return self._sequential_fuzzy_search(term_lower, text_lower, threshold, term_len) return matches_count def _parallel_fuzzy_search(self, term_lower, text_lower, threshold, term_len, num_workers): """Parallel fuzzy search using ThreadPoolExecutor""" print(f"📑 Starting parallel fuzzy search with {num_workers} workers...") text_len = len(text_lower) matches_count = 0 # Split text into overlapping chunks for parallel processing chunk_size = max(text_len // num_workers, term_len * 100) chunks = [] for i in range(0, text_len, chunk_size): # Add overlap to avoid missing matches at boundaries end = min(i + chunk_size + term_len - 1, text_len) chunks.append((i, text_lower[i:end])) print(f"📑 Split into {len(chunks)} chunks of ~{chunk_size:,} chars each") # Process chunks in parallel with ThreadPoolExecutor(max_workers=num_workers) as executor: futures = [] for chunk_idx, (start_pos, chunk_text) in enumerate(chunks): if is_stop_requested(): return matches_count future = executor.submit( self._fuzzy_search_chunk, term_lower, chunk_text, threshold, term_len, chunk_idx, len(chunks) ) futures.append(future) # Collect results for future in as_completed(futures): if is_stop_requested(): executor.shutdown(wait=False) return matches_count try: chunk_matches = future.result() matches_count += chunk_matches except Exception as e: print(f"📑 ⚠️ Chunk processing error: {e}") print(f"📑 Parallel fuzzy search found {matches_count} matches") return matches_count def _fuzzy_search_chunk(self, term_lower, chunk_text, threshold, term_len, chunk_idx, total_chunks): """Process a single chunk for fuzzy matches""" chunk_matches = 0 # Use a more efficient step size - no need to check every position step = max(1, term_len // 3) # Check every third of term length for i in range(0, len(chunk_text) - term_len + 1, step): # Check stop flag periodically if i > 0 and i % 1000 == 0: if is_stop_requested(): return chunk_matches window = chunk_text[i:i + term_len] # Use SequenceMatcher for fuzzy matching if SequenceMatcher(None, term_lower, window).ratio() >= threshold: chunk_matches += 1 # Log progress for this chunk if total_chunks > 1: print(f"📑 Chunk {chunk_idx + 1}/{total_chunks} completed: {chunk_matches} matches") return chunk_matches def _sequential_fuzzy_search(self, term_lower, text_lower, threshold, term_len): """Sequential fuzzy search (fallback for small texts or single worker)""" print(f"📑 Starting sequential fuzzy search...") fuzzy_start = time.time() matches_count = 0 # More efficient step size step = max(1, term_len // 3) total_windows = (len(text_lower) - term_len + 1) // step print(f"📑 Checking ~{total_windows:,} windows with step size {step}") windows_checked = 0 for i in range(0, len(text_lower) - term_len + 1, step): # Check stop flag frequently if i > 0 and i % (step * 100) == 0: if is_stop_requested(): return matches_count # Progress log for very long operations if windows_checked % 1000 == 0 and windows_checked > 0: elapsed = time.time() - fuzzy_start rate = windows_checked / elapsed if elapsed > 0 else 0 eta = (total_windows - windows_checked) / rate if rate > 0 else 0 print(f"📑 Progress: {windows_checked}/{total_windows} windows, {rate:.0f} w/s, ETA: {eta:.1f}s") window = text_lower[i:i + term_len] if SequenceMatcher(None, term_lower, window).ratio() >= threshold: matches_count += 1 windows_checked += 1 fuzzy_time = time.time() - fuzzy_start print(f"📑 Sequential fuzzy search completed in {fuzzy_time:.2f}s, found {matches_count} matches") return matches_count def _fuzzy_match(self, term1, term2, threshold=0.90): """Check if two terms match using fuzzy matching (unchanged)""" ratio = SequenceMatcher(None, term1.lower(), term2.lower()).ratio() return ratio >= threshold def _strip_honorific(self, term, language_hint='unknown'): """Strip honorific from a term if present""" if not term: return term # Get honorifics for the detected language honorifics_to_check = [] if language_hint in self.pattern_manager.CJK_HONORIFICS: honorifics_to_check.extend(self.pattern_manager.CJK_HONORIFICS[language_hint]) honorifics_to_check.extend(self.pattern_manager.CJK_HONORIFICS.get('english', [])) # Check and remove honorifics for honorific in honorifics_to_check: if honorific.startswith('-') or honorific.startswith(' '): # English-style suffix if term.endswith(honorific): return term[:-len(honorific)].strip() else: # CJK-style suffix (no separator) if term.endswith(honorific): return term[:-len(honorific)] return term def _translate_chunk_traditional(self, chunk_text, chunk_index, total_chunks, chapter_title=""): """Simplified translation for traditional APIs (DeepL, Google Translate)""" print(f"📝 Using traditional translation API for chunk {chunk_index}/{total_chunks}") # Traditional APIs don't use complex prompts, just need the text messages = [] # Add minimal system context for language detection profile = self.active_profile if profile == 'korean': lang_hint = "Translating from Korean to English" elif profile == 'japanese': lang_hint = "Translating from Japanese to English" elif profile == 'chinese': lang_hint = "Translating from Chinese to English" else: lang_hint = "Translating to English" messages.append({ "role": "system", "content": lang_hint }) # For traditional APIs, we need to handle glossary differently # Apply glossary terms as preprocessing if available processed_text = chunk_text if hasattr(self, 'glossary_manager') and self.glossary_manager and self.glossary_manager.entries: # Pre-process: Mark glossary terms with placeholders glossary_placeholders = {} placeholder_index = 0 for entry in self.glossary_manager.entries: source = entry.get('source', '') target = entry.get('target', '') if source and target and source in processed_text: # Create unique placeholder placeholder = f"[[GLOSS_{placeholder_index}]]" glossary_placeholders[placeholder] = target processed_text = processed_text.replace(source, placeholder) placeholder_index += 1 print(f"📚 Applied {len(glossary_placeholders)} glossary placeholders") # Add the text to translate messages.append({ "role": "user", "content": processed_text }) # Send to API try: response = self.client.send(messages) if response and response.content: translated_text = response.content # Post-process: Replace placeholders with glossary terms if 'glossary_placeholders' in locals(): for placeholder, target in glossary_placeholders.items(): translated_text = translated_text.replace(placeholder, target) print(f"✅ Restored {len(glossary_placeholders)} glossary terms") # Log detected language if available if hasattr(response, 'usage') and response.usage: detected_lang = response.usage.get('detected_source_lang') if detected_lang: print(f"🔍 Detected source language: {detected_lang}") return translated_text else: print("❌ No translation received from traditional API") return None except Exception as e: print(f"❌ Traditional API translation error: {e}") return None def _filter_text_for_glossary(self, text, min_frequency=2): """Filter text to extract only meaningful content for glossary extraction""" import re from collections import Counter from concurrent.futures import ThreadPoolExecutor, as_completed import time filter_start_time = time.time() print(f"📑 Starting smart text filtering...") print(f"📑 Input text size: {len(text):,} characters") # Clean HTML if present print(f"📑 Step 1/7: Cleaning HTML tags...") from bs4 import BeautifulSoup soup = BeautifulSoup(text, 'html.parser') clean_text = soup.get_text() print(f"📑 Clean text size: {len(clean_text):,} characters") # Detect primary language for better filtering print(f"📑 Step 2/7: Detecting primary language...") def detect_primary_language(text_sample): sample = text_sample[:1000] korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF) japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF)) chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF) if korean_chars > 50: return 'korean' elif japanese_kana > 20: return 'japanese' elif chinese_chars > 50 and japanese_kana < 10: return 'chinese' else: return 'english' primary_lang = detect_primary_language(clean_text) print(f"📑 Detected primary language: {primary_lang}") # Split into sentences for better context print(f"📑 Step 3/7: Splitting text into sentences...") sentences = re.split(r'[.!?。!?]+', clean_text) print(f"📑 Found {len(sentences):,} sentences") # Extract potential terms (words/phrases that appear multiple times) print(f"📑 Step 4/7: Setting up extraction patterns and exclusion rules...") word_freq = Counter() # Pattern for detecting potential names/terms based on capitalization or special characters # Korean names: 2-4 hangul characters WITHOUT honorifics korean_pattern = r'[가-힣]{2,4}' # Japanese names: kanji/hiragana/katakana combinations japanese_pattern = r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]{2,6}' # Chinese names: 2-4 Chinese characters chinese_pattern = r'[\u4e00-\u9fff]{2,4}' # English proper nouns: Capitalized words english_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b' # Combine patterns combined_pattern = f'({korean_pattern}|{japanese_pattern}|{chinese_pattern}|{english_pattern})' print(f"📑 Using combined regex pattern for {primary_lang} text") # Get honorifics and title patterns for the detected language honorifics_to_exclude = set() if primary_lang in self.pattern_manager.CJK_HONORIFICS: honorifics_to_exclude.update(self.pattern_manager.CJK_HONORIFICS[primary_lang]) # Also add English romanizations honorifics_to_exclude.update(self.pattern_manager.CJK_HONORIFICS.get('english', [])) # Compile title patterns for the language title_patterns = [] if primary_lang in self.pattern_manager.TITLE_PATTERNS: for pattern in self.pattern_manager.TITLE_PATTERNS[primary_lang]: title_patterns.append(re.compile(pattern)) # Function to check if a term should be excluded def should_exclude_term(term): term_lower = term.lower() # Check if it's a common word if term in self.pattern_manager.COMMON_WORDS or term_lower in self.pattern_manager.COMMON_WORDS: return True # Check if it contains honorifics for honorific in honorifics_to_exclude: if honorific in term or (honorific.startswith('-') and term.endswith(honorific[1:])): return True # Check if it matches title patterns for pattern in title_patterns: if pattern.search(term): return True # Check if it's a number (including Chinese numbers) if term in self.pattern_manager.CHINESE_NUMS: return True # Check if it's just digits if term.isdigit(): return True return False # Extract potential terms from each sentence print(f"📑 Step 5/7: Extracting and filtering terms from sentences...") # Check if we should use parallel processing extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) # Auto-detect optimal workers if not set if extraction_workers == 1 and len(sentences) > 1000: # Use more cores for better parallelization cpu_count = os.cpu_count() or 4 extraction_workers = min(cpu_count, 12) # Use up to 12 cores print(f"📑 Auto-detected {cpu_count} CPU cores, using {extraction_workers} workers") use_parallel = extraction_workers > 1 and len(sentences) > 100 if use_parallel: print(f"📑 Using parallel processing with {extraction_workers} workers") print(f"📑 Estimated speedup: {extraction_workers}x faster") important_sentences = [] seen_contexts = set() processed_count = 0 total_sentences = len(sentences) last_progress_time = time.time() def process_sentence_batch(batch_sentences, batch_idx): """Process a batch of sentences""" local_word_freq = Counter() local_important = [] local_seen = set() for sentence in batch_sentences: sentence = sentence.strip() if len(sentence) < 10 or len(sentence) > 500: continue # Find all potential terms in this sentence matches = re.findall(combined_pattern, sentence) if matches: # Filter out excluded terms filtered_matches = [] for match in matches: if not should_exclude_term(match): local_word_freq[match] += 1 filtered_matches.append(match) # Keep sentences with valid potential terms if filtered_matches: sentence_key = ' '.join(sorted(filtered_matches)) if sentence_key not in local_seen: local_important.append(sentence) local_seen.add(sentence_key) return local_word_freq, local_important, local_seen, batch_idx if use_parallel: # Force SMALL batches for real parallelization # We want MANY small batches, not few large ones! # Calculate based on total sentences total_sentences = len(sentences) if total_sentences < 1000: # Small dataset: 50-100 sentences per batch optimal_batch_size = 100 elif total_sentences < 10000: # Medium dataset: 200 sentences per batch optimal_batch_size = 200 elif total_sentences < 50000: # Large dataset: 300 sentences per batch optimal_batch_size = 300 else: # Very large dataset: 400 sentences per batch max optimal_batch_size = 400 # Ensure we have enough batches for all workers min_batches = extraction_workers * 3 # At least 3 batches per worker max_batch_size = max(50, total_sentences // min_batches) optimal_batch_size = min(optimal_batch_size, max_batch_size) print(f"📑 Total sentences: {total_sentences:,}") print(f"📑 Target batch size: {optimal_batch_size} sentences") # Calculate expected number of batches expected_batches = (total_sentences + optimal_batch_size - 1) // optimal_batch_size print(f"📑 Expected batches: {expected_batches} (for {extraction_workers} workers)") print(f"📑 Batches per worker: ~{expected_batches // extraction_workers} batches") batches = [sentences[i:i + optimal_batch_size] for i in range(0, len(sentences), optimal_batch_size)] print(f"📑 Processing {len(batches)} batches of ~{optimal_batch_size} sentences each") print(f"📑 Expected speedup: {min(extraction_workers, len(batches))}x (using {extraction_workers} workers)") # Decide between ThreadPoolExecutor and ProcessPoolExecutor import multiprocessing in_subprocess = multiprocessing.current_process().name != 'MainProcess' # Use ProcessPoolExecutor for better parallelism on larger datasets use_process_pool = (not in_subprocess and len(sentences) > 5000) if use_process_pool: print(f"📑 Using ProcessPoolExecutor for maximum performance (true parallelism)") executor_class = ProcessPoolExecutor else: print(f"📑 Using ThreadPoolExecutor for sentence processing") executor_class = ThreadPoolExecutor with executor_class(max_workers=extraction_workers) as executor: futures = [] # Prepare data for ProcessPoolExecutor if needed if use_process_pool: # Serialize exclusion check data for process pool exclude_check_data = ( list(honorifics_to_exclude), [p.pattern for p in title_patterns], # Convert regex to strings self.pattern_manager.COMMON_WORDS, self.pattern_manager.CHINESE_NUMS ) for idx, batch in enumerate(batches): if use_process_pool: # Use module-level function for ProcessPoolExecutor future = executor.submit(_process_sentence_batch_for_extraction, (batch, idx, combined_pattern, exclude_check_data)) else: # Use local function for ThreadPoolExecutor future = executor.submit(process_sentence_batch, batch, idx) futures.append(future) # Yield to GUI when submitting futures if idx % 10 == 0: time.sleep(0.001) # Collect results with progress completed_batches = 0 batch_start_time = time.time() for future in as_completed(futures): # Get result without timeout - as_completed already handles waiting local_word_freq, local_important, local_seen, batch_idx = future.result() # Merge results word_freq.update(local_word_freq) for sentence in local_important: sentence_key = ' '.join(sorted(re.findall(combined_pattern, sentence))) if sentence_key not in seen_contexts: important_sentences.append(sentence) seen_contexts.add(sentence_key) processed_count += len(batches[batch_idx]) completed_batches += 1 # Show progress every 10 batches or at key milestones if completed_batches % 10 == 0 or completed_batches == len(batches): progress = (processed_count / total_sentences) * 100 elapsed = time.time() - batch_start_time rate = (processed_count / elapsed) if elapsed > 0 else 0 print(f"📑 Progress: {processed_count:,}/{total_sentences:,} sentences ({progress:.1f}%) | Batch {completed_batches}/{len(batches)} | {rate:.0f} sent/sec") # Yield to GUI after each batch completes time.sleep(0.001) else: # Sequential processing with progress for idx, sentence in enumerate(sentences): sentence = sentence.strip() if len(sentence) < 10 or len(sentence) > 500: continue # Find all potential terms in this sentence matches = re.findall(combined_pattern, sentence) if matches: # Filter out excluded terms filtered_matches = [] for match in matches: if not should_exclude_term(match): word_freq[match] += 1 filtered_matches.append(match) # Keep sentences with valid potential terms if filtered_matches: sentence_key = ' '.join(sorted(filtered_matches)) if sentence_key not in seen_contexts: important_sentences.append(sentence) seen_contexts.add(sentence_key) # Show progress every 1000 sentences or 2 seconds if idx % 1000 == 0 or (time.time() - last_progress_time > 2): progress = ((idx + 1) / total_sentences) * 100 print(f"📑 Processing sentences: {idx + 1:,}/{total_sentences:,} ({progress:.1f}%)") last_progress_time = time.time() # Yield to GUI thread every 1000 sentences time.sleep(0.001) # Tiny sleep to let GUI update # Yield to GUI thread every 1000 sentences time.sleep(0.001) # Tiny sleep to let GUI update print(f"📑 Found {len(important_sentences):,} sentences with potential glossary terms") # Step 6/7: Deduplicate and normalize terms print(f"📑 Step 6/7: Normalizing and deduplicating {len(word_freq):,} unique terms...") # Since should_exclude_term already filters honorifics, we just need to deduplicate # based on normalized forms (lowercase, etc.) combined_freq = Counter() term_count = 0 for term, count in word_freq.items(): # Normalize term for deduplication (but keep original form) normalized = term.lower().strip() # Keep the version with highest count if normalized in combined_freq: # If we already have this normalized form, keep the one with higher count if count > combined_freq[normalized]: # Remove old entry and add new one del combined_freq[normalized] combined_freq[term] = count else: combined_freq[term] = count term_count += 1 # Yield to GUI every 1000 terms if term_count % 1000 == 0: time.sleep(0.001) print(f"📑 Deduplicated to {len(combined_freq):,} unique terms") # Filter to keep only terms that appear at least min_frequency times frequent_terms = {term: count for term, count in combined_freq.items() if count >= min_frequency} # Build filtered text focusing on sentences containing frequent terms print(f"📑 Step 7/7: Building filtered text from relevant sentences...") # OPTIMIZATION: Skip sentences that already passed filtering in step 5 # These sentences already contain glossary terms, no need to check again! # We just need to limit the sample size filtered_sentences = important_sentences # Already filtered! print(f"📑 Using {len(filtered_sentences):,} pre-filtered sentences (already contain glossary terms)") # For extremely large datasets, we can optionally do additional filtering if len(filtered_sentences) > 10000 and len(frequent_terms) > 1000: print(f"📑 Large dataset detected - applying frequency-based filtering...") print(f"📑 Filtering {len(filtered_sentences):,} sentences for top frequent terms...") # Sort terms by frequency to prioritize high-frequency ones sorted_terms = sorted(frequent_terms.items(), key=lambda x: x[1], reverse=True) top_terms = dict(sorted_terms[:1000]) # Focus on top 1000 most frequent terms print(f"📑 Using top {len(top_terms):,} most frequent terms for final filtering") # Use parallel processing only if really needed if use_parallel and len(filtered_sentences) > 5000: import multiprocessing in_subprocess = multiprocessing.current_process().name != 'MainProcess' # Create a simple set of terms for fast lookup (no variations needed) term_set = set(top_terms.keys()) print(f"📑 Using parallel filtering with {extraction_workers} workers...") # Optimize batch size check_batch_size = 500 # Larger batches since we're doing simpler checks check_batches = [filtered_sentences[i:i + check_batch_size] for i in range(0, len(filtered_sentences), check_batch_size)] print(f"📑 Processing {len(check_batches)} batches of ~{check_batch_size} sentences") # Simple function to check if sentence contains any top term def check_batch_simple(batch): result = [] for sentence in batch: # Simple substring check - much faster than regex for term in term_set: if term in sentence: result.append(sentence) break return result new_filtered = [] with ThreadPoolExecutor(max_workers=extraction_workers) as executor: futures = [executor.submit(check_batch_simple, batch) for batch in check_batches] for future in as_completed(futures): new_filtered.extend(future.result()) filtered_sentences = new_filtered print(f"📑 Filtered to {len(filtered_sentences):,} sentences containing top terms") else: # For smaller datasets, simple sequential filtering print(f"📑 Using sequential filtering...") new_filtered = [] for i, sentence in enumerate(filtered_sentences): for term in top_terms: if term in sentence: new_filtered.append(sentence) break if i % 1000 == 0: print(f"📑 Progress: {i:,}/{len(filtered_sentences):,} sentences") time.sleep(0.001) filtered_sentences = new_filtered print(f"📑 Filtered to {len(filtered_sentences):,} sentences containing top terms") print(f"📑 Selected {len(filtered_sentences):,} sentences containing frequent terms") # Limit the number of sentences to reduce token usage max_sentences = int(os.getenv("GLOSSARY_MAX_SENTENCES", "200")) if len(filtered_sentences) > max_sentences: print(f"📑 Limiting to {max_sentences} representative sentences (from {len(filtered_sentences):,})") # Take a representative sample step = len(filtered_sentences) // max_sentences filtered_sentences = filtered_sentences[::step][:max_sentences] filtered_text = ' '.join(filtered_sentences) # Calculate and display filtering statistics filter_end_time = time.time() filter_duration = filter_end_time - filter_start_time original_length = len(clean_text) filtered_length = len(filtered_text) reduction_percent = ((original_length - filtered_length) / original_length * 100) if original_length > 0 else 0 print(f"\n📑 === FILTERING COMPLETE ===") print(f"📑 Duration: {filter_duration:.1f} seconds") print(f"📑 Text reduction: {original_length:,} → {filtered_length:,} chars ({reduction_percent:.1f}% reduction)") print(f"📑 Terms found: {len(frequent_terms):,} unique terms (min frequency: {min_frequency})") print(f"📑 Final output: {len(filtered_sentences)} sentences, {filtered_length:,} characters") print(f"📑 Performance: {(original_length / filter_duration / 1000):.1f}K chars/second") print(f"📑 ========================\n") return filtered_text, frequent_terms def _extract_with_custom_prompt(self, custom_prompt, all_text, language, min_frequency, max_names, max_titles, existing_glossary, output_dir, strip_honorifics=True, fuzzy_threshold=0.90, filter_mode='all'): """Extract glossary using custom AI prompt with proper filtering""" print("📑 Using custom automatic glossary prompt") extraction_start = time.time() # Check stop flag if is_stop_requested(): print("📑 ❌ Glossary extraction stopped by user") return {} # Note: Filter mode can be controlled via the configurable prompt environment variable # No hardcoded filter instructions are added here try: MODEL = os.getenv("MODEL", "gemini-2.0-flash") API_KEY = (os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_OR_Gemini_API_KEY") or os.getenv("GEMINI_API_KEY")) if is_traditional_translation_api(MODEL): return self._translate_chunk_traditional(chunk_text, chunk_index, total_chunks, chapter_title) elif not API_KEY: print(f"📑 No API key found, falling back to pattern-based extraction") return self._extract_with_patterns(all_text, language, min_frequency, max_names, max_titles, 50, existing_glossary, output_dir, strip_honorifics, fuzzy_threshold, filter_mode) else: print(f"📑 Using AI-assisted extraction with custom prompt") from unified_api_client import UnifiedClient, UnifiedClientError client = UnifiedClient(model=MODEL, api_key=API_KEY, output_dir=output_dir) if hasattr(client, 'reset_cleanup_state'): client.reset_cleanup_state() # Apply thread submission delay using the client's method thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) if thread_delay > 0: client._apply_thread_submission_delay() # Check if cancelled during delay if hasattr(client, '_cancelled') and client._cancelled: print("📑 ❌ Glossary extraction stopped during delay") return {} # Check if text is already filtered (from chunking) already_filtered = os.getenv("_CHUNK_ALREADY_FILTERED", "0") == "1" if already_filtered: print("📑 Text already filtered during chunking, skipping re-filtering") text_sample = all_text # Use as-is since it's already filtered detected_terms = {} else: # Apply smart filtering to reduce noise and focus on meaningful content force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER", "0") == "1" use_smart_filter = (os.getenv("GLOSSARY_USE_SMART_FILTER", "1") == "1") and not force_disable if use_smart_filter: print("📑 Applying smart text filtering to reduce noise...") text_sample, detected_terms = self._filter_text_for_glossary(all_text, min_frequency) else: print("📑 Smart filter disabled - using raw text sample") # Fallback to simple truncation max_text_size = int(os.getenv("GLOSSARY_MAX_TEXT_SIZE", "50000")) text_sample = all_text[:max_text_size] if len(all_text) > max_text_size and max_text_size > 0 else all_text detected_terms = {} # Replace placeholders in prompt prompt = custom_prompt.replace('{language}', language) prompt = prompt.replace('{min_frequency}', str(min_frequency)) prompt = prompt.replace('{max_names}', str(max_names)) prompt = prompt.replace('{max_titles}', str(max_titles)) # Get the format instructions from environment variable format_instructions = os.getenv("GLOSSARY_FORMAT_INSTRUCTIONS", "") # If no format instructions are provided, use a default if not format_instructions: format_instructions = """ Return the results in EXACT CSV format with this header: type,raw_name,translated_name For example: character,김상현,Kim Sang-hyu character,갈편제,Gale Hardest character,디히릿 아데,Dihirit Ade Only include entries that actually appear in the text. Do not use quotes around values unless they contain commas. Text to analyze: {text_sample}""" # Replace placeholders in format instructions format_instructions = format_instructions.replace('{text_sample}', text_sample) # Combine the user's prompt with format instructions enhanced_prompt = f"{prompt}\n\n{format_instructions}" messages = [ {"role": "system", "content": "You are a glossary extraction assistant. Return ONLY CSV format with exactly 3 columns: type,raw_name,translated_name. The 'type' column should classify entries (e.g., character, term, location, etc.)."}, {"role": "user", "content": enhanced_prompt} ] # Check stop before API call if is_stop_requested(): print("📑 ❌ Glossary extraction stopped before API call") return {} try: temperature = float(os.getenv("TEMPERATURE", "0.3")) max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "4096")) # Use send_with_interrupt for interruptible API call chunk_timeout = int(os.getenv("CHUNK_TIMEOUT", "900")) # 15 minute default for glossary print(f"📑 Sending AI extraction request (timeout: {chunk_timeout}s, interruptible)...") # Before API call api_start = time.time() print(f"📑 Preparing API request (text size: {len(text_sample):,} chars)...") print(f"📑 ⏳ Processing {len(text_sample):,} characters... Please wait, this may take 5-10 minutes") response = send_with_interrupt( messages=messages, client=client, temperature=temperature, max_tokens=max_tokens, stop_check_fn=is_stop_requested, chunk_timeout=chunk_timeout ) api_time = time.time() - api_start print(f"📑 API call completed in {api_time:.1f}s") # Get the actual text from the response if hasattr(response, 'content'): response_text = response.content else: response_text = str(response) # Before processing response process_start = time.time() print(f"📑 Processing AI response...") # Process response and build CSV csv_lines = self._process_ai_response(response_text, all_text, min_frequency, strip_honorifics, fuzzy_threshold, language, filter_mode) print(f"📑 AI extracted {len(csv_lines) - 1} valid terms (header excluded)") process_time = time.time() - process_start print(f"📑 Response processing took {process_time:.1f}s") # If we're running per-chunk, defer all heavy work and saving if os.getenv("GLOSSARY_DEFER_SAVE", "0") == "1": return csv_lines # Check stop before merging if is_stop_requested(): print("📑 ❌ Glossary generation stopped before merging") return {} # Merge with existing glossary if present if existing_glossary: csv_lines = self._merge_csv_entries(csv_lines, existing_glossary, strip_honorifics, language) # Fuzzy matching deduplication skip_frequency_check = os.getenv("GLOSSARY_SKIP_FREQUENCY_CHECK", "0") == "1" if not skip_frequency_check: # Only dedupe if we're checking frequencies # Time the deduplication dedup_start = time.time() original_count = len(csv_lines) - 1 # Exclude header csv_lines = self._deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold) dedup_time = time.time() - dedup_start final_count = len(csv_lines) - 1 # Exclude header removed_count = original_count - final_count print(f"📑 Deduplication completed in {dedup_time:.1f}s") print(f"📑 - Original entries: {original_count}") print(f"📑 - Duplicates removed: {removed_count}") print(f"📑 - Final entries: {final_count}") # Store for summary statistics self._dedup_time = getattr(self, '_dedup_time', 0) + dedup_time else: print(f"📑 Skipping deduplication (frequency check disabled)") # Apply filter mode to final results csv_lines = self._filter_csv_by_mode(csv_lines, filter_mode) # Check if we should use token-efficient format use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' if not use_legacy_format: # Convert to token-efficient format csv_lines = self._convert_to_token_efficient_format(csv_lines) # Final sanitize to prevent stray headers csv_lines = self._sanitize_final_glossary_lines(csv_lines, use_legacy_format) # Create final CSV content csv_content = '\n'.join(csv_lines) # Save glossary as CSV with proper extension glossary_path = os.path.join(output_dir, "glossary.csv") self._atomic_write_file(glossary_path, csv_content) print(f"\n📑 ✅ AI-ASSISTED GLOSSARY SAVED!") print(f"📑 File: {glossary_path}") c_count, t_count, total = self._count_glossary_entries(csv_lines, use_legacy_format) print(f"📑 Character entries: {c_count}") print(f"📑 Term entries: {t_count}") print(f"📑 Total entries: {total}") total_time = time.time() - extraction_start print(f"📑 Total extraction time: {total_time:.1f}s") return self._parse_csv_to_dict(csv_content) except UnifiedClientError as e: if "stopped by user" in str(e).lower(): print(f"📑 ❌ AI extraction interrupted by user") return {} else: print(f"⚠️ AI extraction failed: {e}") print("📑 Falling back to pattern-based extraction") return self._extract_with_patterns(all_text, language, min_frequency, max_names, max_titles, 50, existing_glossary, output_dir, strip_honorifics, fuzzy_threshold, filter_mode) except Exception as e: print(f"⚠️ AI extraction failed: {e}") import traceback traceback.print_exc() print("📑 Falling back to pattern-based extraction") return self._extract_with_patterns(all_text, language, min_frequency, max_names, max_titles, 50, existing_glossary, output_dir, strip_honorifics, fuzzy_threshold, filter_mode) except Exception as e: print(f"⚠️ Custom prompt processing failed: {e}") import traceback traceback.print_exc() return self._extract_with_patterns(all_text, language, min_frequency, max_names, max_titles, 50, existing_glossary, output_dir, strip_honorifics, fuzzy_threshold, filter_mode) def _filter_csv_by_mode(self, csv_lines, filter_mode): """Filter CSV lines based on the filter mode""" if filter_mode == "all": return csv_lines filtered = [csv_lines[0]] # Keep header for line in csv_lines[1:]: if not line.strip(): continue parts = [p.strip() for p in line.split(',')] if len(parts) < 3: continue entry_type = parts[0].lower() raw_name = parts[1] if filter_mode == "only_with_honorifics": # Only keep character entries with honorifics if entry_type == "character" and self._has_honorific(raw_name): filtered.append(line) elif filter_mode == "only_without_honorifics": # Keep terms and characters without honorifics if entry_type == "term" or (entry_type == "character" and not self._has_honorific(raw_name)): filtered.append(line) print(f"📑 Filter '{filter_mode}': {len(filtered)-1} entries kept from {len(csv_lines)-1}") return filtered def _process_ai_response(self, response_text, all_text, min_frequency, strip_honorifics, fuzzy_threshold, language, filter_mode): """Process AI response and return CSV lines""" # option to completely skip frequency validation for speed skip_all_validation = os.getenv("GLOSSARY_SKIP_ALL_VALIDATION", "0") == "1" if skip_all_validation: print("📑 ⚡ FAST MODE: Skipping all frequency validation (accepting all AI results)") # Clean response text response_text = response_text.strip() # Remove string representation artifacts if they wrap the entire response if response_text.startswith('("') and response_text.endswith('")'): response_text = response_text[2:-2] elif response_text.startswith('"') and response_text.endswith('"'): response_text = response_text[1:-1] elif response_text.startswith('(') and response_text.endswith(')'): response_text = response_text[1:-1] # Unescape the string response_text = response_text.replace('\\n', '\n') response_text = response_text.replace('\\r', '') response_text = response_text.replace('\\t', '\t') response_text = response_text.replace('\\"', '"') response_text = response_text.replace("\\'", "'") response_text = response_text.replace('\\\\', '\\') # Clean up markdown code blocks if present if '```' in response_text: parts = response_text.split('```') for part in parts: if 'csv' in part[:10].lower(): response_text = part[part.find('\n')+1:] break elif part.strip() and ('type,raw_name' in part or 'character,' in part or 'term,' in part): response_text = part break # Normalize line endings response_text = response_text.replace('\r\n', '\n').replace('\r', '\n') lines = [line.strip() for line in response_text.strip().split('\n') if line.strip()] csv_lines = [] header_found = False # Check if we should skip frequency check skip_frequency_check = os.getenv("GLOSSARY_SKIP_FREQUENCY_CHECK", "0") == "1" # Add option to completely skip ALL validation for maximum speed skip_all_validation = os.getenv("GLOSSARY_SKIP_ALL_VALIDATION", "0") == "1" if skip_all_validation: print("📑 ⚡ FAST MODE: Skipping all frequency validation (accepting all AI results)") # Always use the enforced 3-column header csv_lines.append("type,raw_name,translated_name") # Process the AI response for line in lines: # Skip header lines if 'type' in line.lower() and 'raw_name' in line.lower(): continue # Parse CSV line parts = [p.strip().strip('"\"') for p in line.split(',')] if len(parts) >= 3: # Has all 3 columns entry_type = parts[0] raw_name = parts[1] translated_name = parts[2] if raw_name and translated_name: csv_lines.append(f"{entry_type},{raw_name},{translated_name}") elif len(parts) == 2: # Missing type, default to 'term' raw_name = parts[0] translated_name = parts[1] if raw_name and translated_name: csv_lines.append(f"term,{raw_name},{translated_name}") print(f"📑 Fast mode: Accepted {len(csv_lines) - 1} entries without validation") return csv_lines # For "only_with_honorifics" mode, ALWAYS skip frequency check if filter_mode == "only_with_honorifics": skip_frequency_check = True print("📑 Filter mode 'only_with_honorifics': Bypassing frequency checks") print(f"📑 Processing {len(lines)} lines from AI response...") print(f"📑 Text corpus size: {len(all_text):,} chars") print(f"📑 Frequency checking: {'DISABLED' if skip_frequency_check else f'ENABLED (min: {min_frequency})'}") print(f"📑 Fuzzy threshold: {fuzzy_threshold}") # Collect all terms first for batch processing all_terms_to_check = [] term_info_map = {} # Map term to its full info if not skip_frequency_check: # First pass: collect all terms that need frequency checking for line in lines: if 'type' in line.lower() and 'raw_name' in line.lower(): continue # Skip header parts = [p.strip().strip('"\"') for p in line.split(',')] if len(parts) >= 3: entry_type = parts[0].lower() raw_name = parts[1] translated_name = parts[2] elif len(parts) == 2: entry_type = 'term' raw_name = parts[0] translated_name = parts[1] else: continue if raw_name and translated_name: # Store for batch processing original_raw = raw_name if strip_honorifics: raw_name = self._strip_honorific(raw_name, language) all_terms_to_check.append(raw_name) term_info_map[raw_name] = { 'entry_type': entry_type, 'original_raw': original_raw, 'translated_name': translated_name, 'line': line } # Batch compute all frequencies at once if all_terms_to_check: print(f"📑 Computing frequencies for {len(all_terms_to_check)} terms...") term_frequencies = self._batch_compute_frequencies( all_terms_to_check, all_text, fuzzy_threshold, min_frequency ) else: term_frequencies = {} # Now process the results using pre-computed frequencies entries_processed = 0 entries_accepted = 0 # Process based on mode if filter_mode == "only_with_honorifics" or skip_frequency_check: # For these modes, accept all entries csv_lines.append("type,raw_name,translated_name") # Header for line in lines: if 'type' in line.lower() and 'raw_name' in line.lower(): continue # Skip header parts = [p.strip().strip('"\"') for p in line.split(',')] if len(parts) >= 3: entry_type = parts[0].lower() raw_name = parts[1] translated_name = parts[2] elif len(parts) == 2: entry_type = 'term' raw_name = parts[0] translated_name = parts[1] else: continue if raw_name and translated_name: csv_line = f"{entry_type},{raw_name},{translated_name}" csv_lines.append(csv_line) entries_accepted += 1 print(f"📑 Accepted {entries_accepted} entries (frequency check disabled)") else: # Use pre-computed frequencies csv_lines.append("type,raw_name,translated_name") # Header for term, info in term_info_map.items(): count = term_frequencies.get(term, 0) # Also check original form if it was stripped if info['original_raw'] != term: count += term_frequencies.get(info['original_raw'], 0) if count >= min_frequency: csv_line = f"{info['entry_type']},{term},{info['translated_name']}" csv_lines.append(csv_line) entries_accepted += 1 # Log first few examples if entries_accepted <= 5: print(f"📑 ✓ Example: {term} -> {info['translated_name']} (freq: {count})") print(f"📑 Frequency filtering complete: {entries_accepted}/{len(term_info_map)} terms accepted") # Ensure we have at least the header if len(csv_lines) == 0: csv_lines.append("type,raw_name,translated_name") # Print final summary print(f"📑 Processing complete: {entries_accepted} terms accepted") return csv_lines def _deduplicate_glossary_with_fuzzy(self, csv_lines, fuzzy_threshold): """Apply fuzzy matching to remove duplicate entries from the glossary with stop flag checks""" from difflib import SequenceMatcher print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...") # Check stop flag at start if is_stop_requested(): print(f"📑 ❌ Deduplication stopped by user") return csv_lines header_line = csv_lines[0] # Keep header entry_lines = csv_lines[1:] # Data lines deduplicated = [header_line] seen_entries = {} # Use dict for O(1) lookups instead of list seen_names_lower = set() # Quick exact match check removed_count = 0 total_entries = len(entry_lines) # Pre-process all entries for faster comparison print(f"📑 Processing {total_entries} entries for deduplication...") for idx, line in enumerate(entry_lines): # Check stop flag every 100 entries if idx > 0 and idx % 100 == 0: if is_stop_requested(): print(f"📑 ❌ Deduplication stopped at entry {idx}/{total_entries}") return deduplicated # Show progress for large glossaries if total_entries > 500 and idx % 200 == 0: progress = (idx / total_entries) * 100 print(f"📑 Deduplication progress: {progress:.1f}% ({idx}/{total_entries})") if not line.strip(): continue parts = [p.strip() for p in line.split(',')] if len(parts) < 3: continue entry_type = parts[0] raw_name = parts[1] translated_name = parts[2] raw_name_lower = raw_name.lower() # Fast exact duplicate check first if raw_name_lower in seen_names_lower: removed_count += 1 continue # For fuzzy matching, only check if threshold is less than 1.0 is_duplicate = False if fuzzy_threshold < 1.0: # Use a more efficient approach: only check similar length strings name_len = len(raw_name) min_len = int(name_len * 0.7) max_len = int(name_len * 1.3) # Only compare with entries of similar length candidates = [] for seen_name, (seen_type, seen_trans) in seen_entries.items(): if min_len <= len(seen_name) <= max_len: candidates.append(seen_name) # Check fuzzy similarity with candidates for seen_name in candidates: # Quick character overlap check before expensive SequenceMatcher char_overlap = len(set(raw_name_lower) & set(seen_name.lower())) if char_overlap < len(raw_name_lower) * 0.5: continue # Too different, skip raw_similarity = SequenceMatcher(None, raw_name_lower, seen_name.lower()).ratio() if raw_similarity >= fuzzy_threshold: if removed_count < 10: # Only log first few print(f"📑 Removing duplicate: '{raw_name}' ~= '{seen_name}' (similarity: {raw_similarity:.2%})") removed_count += 1 is_duplicate = True break if not is_duplicate: seen_entries[raw_name] = (entry_type, translated_name) seen_names_lower.add(raw_name_lower) deduplicated.append(line) print(f"📑 ✅ Removed {removed_count} duplicates from glossary") print(f"📑 Final glossary size: {len(deduplicated) - 1} unique entries") return deduplicated def _merge_csv_entries(self, new_csv_lines, existing_glossary, strip_honorifics, language): """Merge CSV entries with existing glossary with stop flag checks""" # Check stop flag at start if is_stop_requested(): print(f"📑 ❌ Glossary merge stopped by user") return new_csv_lines # Parse existing glossary existing_lines = [] existing_names = set() if isinstance(existing_glossary, str): # Already CSV format lines = existing_glossary.strip().split('\n') total_lines = len(lines) for idx, line in enumerate(lines): # Check stop flag every 50 lines if idx > 0 and idx % 50 == 0: if is_stop_requested(): print(f"📑 ❌ Merge stopped while processing existing glossary at line {idx}/{total_lines}") return new_csv_lines if total_lines > 200: progress = (idx / total_lines) * 100 print(f"📑 Processing existing glossary: {progress:.1f}%") if 'type,raw_name' in line.lower(): continue # Skip header line_stripped = line.strip() # Skip token-efficient lines and section/bullet markers if not line_stripped or line_stripped.startswith('===') or line_stripped.startswith('*') or line_stripped.lower().startswith('glossary:'): continue parts = [p.strip() for p in line.split(',')] # Require at least 3 fields (type, raw_name, translated_name) if len(parts) < 3: continue entry_type = parts[0].strip().lower() # Only accept reasonable type tokens (letters/underscores only) import re as _re if not _re.match(r'^[a-z_]+$', entry_type): continue raw_name = parts[1] if strip_honorifics: raw_name = self._strip_honorific(raw_name, language) parts[1] = raw_name if raw_name not in existing_names: existing_lines.append(','.join(parts)) existing_names.add(raw_name) # Check stop flag before processing new names if is_stop_requested(): print(f"📑 ❌ Merge stopped before processing new entries") return new_csv_lines # Get new names new_names = set() final_lines = [] for idx, line in enumerate(new_csv_lines): # Check stop flag every 50 lines if idx > 0 and idx % 50 == 0: if is_stop_requested(): print(f"📑 ❌ Merge stopped while processing new entries at line {idx}") return final_lines if final_lines else new_csv_lines if 'type,raw_name' in line.lower(): final_lines.append(line) # Keep header continue parts = [p.strip() for p in line.split(',')] if len(parts) >= 2: new_names.add(parts[1]) final_lines.append(line) # Check stop flag before adding existing entries if is_stop_requested(): print(f"📑 ❌ Merge stopped before combining entries") return final_lines # Add non-duplicate existing entries added_count = 0 for idx, line in enumerate(existing_lines): # Check stop flag every 50 additions if idx > 0 and idx % 50 == 0: if is_stop_requested(): print(f"📑 ❌ Merge stopped while adding existing entries ({added_count} added)") return final_lines parts = [p.strip() for p in line.split(',')] if len(parts) >= 2 and parts[1] not in new_names: final_lines.append(line) added_count += 1 print(f"📑 Merged {added_count} entries from existing glossary") return final_lines def _extract_with_patterns(self, all_text, language, min_frequency, max_names, max_titles, batch_size, existing_glossary, output_dir, strip_honorifics=True, fuzzy_threshold=0.90, filter_mode='all'): """Extract glossary using pattern matching with true CSV format output and stop flag checks""" print("📑 Using pattern-based extraction") # Check stop flag at start if is_stop_requested(): print("📑 ❌ Pattern-based extraction stopped by user") return {} def is_valid_name(name, language_hint='unknown'): """Strict validation for proper names only""" if not name or len(name.strip()) < 1: return False name = name.strip() if name.lower() in self.pattern_manager.COMMON_WORDS or name in self.pattern_manager.COMMON_WORDS: return False if language_hint == 'korean': if not (2 <= len(name) <= 4): return False if not all(0xAC00 <= ord(char) <= 0xD7AF for char in name): return False if len(set(name)) == 1: return False elif language_hint == 'japanese': if not (2 <= len(name) <= 6): return False has_kanji = any(0x4E00 <= ord(char) <= 0x9FFF for char in name) has_kana = any((0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF) for char in name) if not (has_kanji or has_kana): return False elif language_hint == 'chinese': if not (2 <= len(name) <= 4): return False if not all(0x4E00 <= ord(char) <= 0x9FFF for char in name): return False elif language_hint == 'english': if not name[0].isupper(): return False if sum(1 for c in name if c.isalpha()) < len(name) * 0.8: return False if not (2 <= len(name) <= 20): return False return True def detect_language_hint(text_sample): """Quick language detection for validation purposes""" sample = text_sample[:1000] korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF) japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF)) chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF) latin_chars = sum(1 for char in sample if 0x0041 <= ord(char) <= 0x007A) if korean_chars > 50: return 'korean' elif japanese_kana > 20: return 'japanese' elif chinese_chars > 50 and japanese_kana < 10: return 'chinese' elif latin_chars > 100: return 'english' else: return 'unknown' language_hint = detect_language_hint(all_text) print(f"📑 Detected primary language: {language_hint}") # Check stop flag after language detection if is_stop_requested(): print("📑 ❌ Extraction stopped after language detection") return {} honorifics_to_use = [] if language_hint in self.pattern_manager.CJK_HONORIFICS: honorifics_to_use.extend(self.pattern_manager.CJK_HONORIFICS[language_hint]) honorifics_to_use.extend(self.pattern_manager.CJK_HONORIFICS.get('english', [])) print(f"📑 Using {len(honorifics_to_use)} honorifics for {language_hint}") names_with_honorifics = {} standalone_names = {} # Check if parallel processing is enabled extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) # PARALLEL HONORIFIC PROCESSING if extraction_workers > 1 and len(honorifics_to_use) > 3: print(f"📑 Scanning for names with honorifics (parallel with {extraction_workers} workers)...") # Create a wrapper function that can be called in parallel def process_honorific(args): """Process a single honorific in a worker thread""" honorific, idx, total = args # Check stop flag if is_stop_requested(): return None, None print(f"📑 Worker processing honorific {idx}/{total}: '{honorific}'") # Local dictionaries for this worker local_names_with = {} local_standalone = {} # Call the extraction method self._extract_names_for_honorific( honorific, all_text, language_hint, min_frequency, local_names_with, local_standalone, is_valid_name, fuzzy_threshold ) return local_names_with, local_standalone # Prepare arguments for parallel processing honorific_args = [ (honorific, idx + 1, len(honorifics_to_use)) for idx, honorific in enumerate(honorifics_to_use) ] # Process honorifics in parallel with ThreadPoolExecutor(max_workers=min(extraction_workers, len(honorifics_to_use))) as executor: futures = [] for args in honorific_args: if is_stop_requested(): executor.shutdown(wait=False) return {} future = executor.submit(process_honorific, args) futures.append(future) # Collect results as they complete completed = 0 for future in as_completed(futures): if is_stop_requested(): executor.shutdown(wait=False) return {} try: result = future.result() if result and result[0] is not None: local_names_with, local_standalone = result # Merge results (thread-safe since we're in main thread) for name, count in local_names_with.items(): if name not in names_with_honorifics: names_with_honorifics[name] = count else: names_with_honorifics[name] = max(names_with_honorifics[name], count) for name, count in local_standalone.items(): if name not in standalone_names: standalone_names[name] = count else: standalone_names[name] = max(standalone_names[name], count) completed += 1 if completed % 5 == 0 or completed == len(honorifics_to_use): print(f"📑 Honorific processing: {completed}/{len(honorifics_to_use)} completed") except Exception as e: print(f"⚠️ Failed to process honorific: {e}") completed += 1 print(f"📑 Parallel honorific processing completed: found {len(names_with_honorifics)} names") else: # SEQUENTIAL PROCESSING (fallback) print("📑 Scanning for names with honorifics...") # Extract names with honorifics total_honorifics = len(honorifics_to_use) for idx, honorific in enumerate(honorifics_to_use): # Check stop flag before each honorific if is_stop_requested(): print(f"📑 ❌ Extraction stopped at honorific {idx}/{total_honorifics}") return {} print(f"📑 Processing honorific {idx + 1}/{total_honorifics}: '{honorific}'") self._extract_names_for_honorific(honorific, all_text, language_hint, min_frequency, names_with_honorifics, standalone_names, is_valid_name, fuzzy_threshold) # Check stop flag before processing terms if is_stop_requested(): print("📑 ❌ Extraction stopped before processing terms") return {} # Apply filter mode filtered_names = {} if filter_mode == 'only_with_honorifics': # Only keep names that have honorifics (no standalone names) filtered_names = names_with_honorifics.copy() print(f"📑 Filter: Keeping only names with honorifics ({len(filtered_names)} names)") elif filter_mode == 'only_without_honorifics': # Keep standalone names that were NOT found with honorifics for name, count in standalone_names.items(): # Check if this name also appears with honorifics appears_with_honorific = False for honorific_name in names_with_honorifics.keys(): if self._strip_honorific(honorific_name, language_hint) == name: appears_with_honorific = True break # Only add if it doesn't appear with honorifics if not appears_with_honorific: filtered_names[name] = count print(f"📑 Filter: Keeping only names without honorifics ({len(filtered_names)} names)") else: # 'all' mode # Keep all names (both with and without honorifics) filtered_names = names_with_honorifics.copy() # Also add standalone names for name, count in standalone_names.items(): if name not in filtered_names and not any( self._strip_honorific(n, language_hint) == name for n in filtered_names.keys() ): filtered_names[name] = count print(f"📑 Filter: Keeping all names ({len(filtered_names)} names)") # Process extracted terms final_terms = {} term_count = 0 total_terms = len(filtered_names) for term, count in filtered_names.items(): term_count += 1 # Check stop flag every 20 terms if term_count % 20 == 0: if is_stop_requested(): print(f"📑 ❌ Term processing stopped at {term_count}/{total_terms}") return {} if strip_honorifics: clean_term = self._strip_honorific(term, language_hint) if clean_term in final_terms: final_terms[clean_term] = final_terms[clean_term] + count else: final_terms[clean_term] = count else: final_terms[term] = count # Check stop flag before finding titles if is_stop_requested(): print("📑 ❌ Extraction stopped before finding titles") return {} # Find titles (but respect filter mode) print("📑 Scanning for titles...") found_titles = {} # Extract titles for all modes EXCEPT "only_with_honorifics" # (titles are included in "only_without_honorifics" since titles typically don't have honorifics) if filter_mode != 'only_with_honorifics': title_patterns_to_use = [] if language_hint in self.pattern_manager.TITLE_PATTERNS: title_patterns_to_use.extend(self.pattern_manager.TITLE_PATTERNS[language_hint]) title_patterns_to_use.extend(self.pattern_manager.TITLE_PATTERNS.get('english', [])) total_patterns = len(title_patterns_to_use) for pattern_idx, pattern in enumerate(title_patterns_to_use): # Check stop flag before each pattern if is_stop_requested(): print(f"📑 ❌ Title extraction stopped at pattern {pattern_idx}/{total_patterns}") return {} print(f"📑 Processing title pattern {pattern_idx + 1}/{total_patterns}") matches = list(re.finditer(pattern, all_text, re.IGNORECASE if 'english' in pattern else 0)) for match_idx, match in enumerate(matches): # Check stop flag every 50 matches if match_idx > 0 and match_idx % 50 == 0: if is_stop_requested(): print(f"📑 ❌ Title extraction stopped at match {match_idx}") return {} title = match.group(0) # Skip if this title is already in names if title in filtered_names or title in names_with_honorifics: continue count = self._find_fuzzy_matches(title, all_text, fuzzy_threshold) # Check if stopped during fuzzy matching if is_stop_requested(): print(f"📑 ❌ Title extraction stopped during fuzzy matching") return {} if count >= min_frequency: if re.match(r'[A-Za-z]', title): title = title.title() if strip_honorifics: title = self._strip_honorific(title, language_hint) if title not in found_titles: found_titles[title] = count if filter_mode == 'only_without_honorifics': print(f"📑 Found {len(found_titles)} titles (included in 'without honorifics' mode)") else: print(f"📑 Found {len(found_titles)} unique titles") else: print(f"📑 Skipping title extraction (filter mode: only_with_honorifics)") # Check stop flag before sorting and translation if is_stop_requested(): print("📑 ❌ Extraction stopped before sorting terms") return {} # Combine and sort sorted_names = sorted(final_terms.items(), key=lambda x: x[1], reverse=True)[:max_names] sorted_titles = sorted(found_titles.items(), key=lambda x: x[1], reverse=True)[:max_titles] all_terms = [] for name, count in sorted_names: all_terms.append(name) for title, count in sorted_titles: all_terms.append(title) print(f"📑 Total terms to translate: {len(all_terms)}") # Check stop flag before translation if is_stop_requested(): print("📑 ❌ Extraction stopped before translation") return {} # Translate terms if os.getenv("DISABLE_GLOSSARY_TRANSLATION", "0") == "1": print("📑 Translation disabled - keeping original terms") translations = {term: term for term in all_terms} else: print(f"📑 Translating {len(all_terms)} terms...") translations = self._translate_terms_batch(all_terms, language_hint, batch_size, output_dir) # Check if translation was stopped if is_stop_requested(): print("📑 ❌ Extraction stopped after translation") return translations # Return partial results # Build CSV lines csv_lines = ["type,raw_name,translated_name"] for name, _ in sorted_names: if name in translations: csv_lines.append(f"character,{name},{translations[name]}") for title, _ in sorted_titles: if title in translations: csv_lines.append(f"term,{title},{translations[title]}") # Check stop flag before merging if is_stop_requested(): print("📑 ❌ Extraction stopped before merging with existing glossary") # Still save what we have csv_content = '\n'.join(csv_lines) glossary_path = os.path.join(output_dir, "glossary.json") self._atomic_write_file(glossary_path, csv_content) return self._parse_csv_to_dict(csv_content) # Merge with existing glossary if existing_glossary: csv_lines = self._merge_csv_entries(csv_lines, existing_glossary, strip_honorifics, language_hint) # Check stop flag before deduplication if is_stop_requested(): print("📑 ❌ Extraction stopped before deduplication") csv_content = '\n'.join(csv_lines) glossary_path = os.path.join(output_dir, "glossary.json") self._atomic_write_file(glossary_path, csv_content) return self._parse_csv_to_dict(csv_content) # Fuzzy matching deduplication csv_lines = self._deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold) # Create CSV content csv_content = '\n'.join(csv_lines) # Save glossary as CSV glossary_path = os.path.join(output_dir, "glossary.csv") self._atomic_write_file(glossary_path, csv_content) print(f"\n📑 ✅ TARGETED GLOSSARY SAVED!") print(f"📑 File: {glossary_path}") print(f"📑 Total entries: {len(csv_lines) - 1}") # Exclude header return self._parse_csv_to_dict(csv_content) def _translate_terms_batch(self, term_list, profile_name, batch_size=50, output_dir=None): """Use fully configurable prompts for translation with interrupt support""" if not term_list or os.getenv("DISABLE_GLOSSARY_TRANSLATION", "0") == "1": print(f"📑 Glossary translation disabled or no terms to translate") return {term: term for term in term_list} # Check stop flag if is_stop_requested(): print("📑 ❌ Glossary translation stopped by user") return {term: term for term in term_list} try: MODEL = os.getenv("MODEL", "gemini-1.5-flash") API_KEY = (os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_OR_Gemini_API_KEY") or os.getenv("GEMINI_API_KEY")) if is_traditional_translation_api(MODEL): return if not API_KEY: print(f"📑 No API key found, skipping translation") return {term: term for term in term_list} print(f"📑 Translating {len(term_list)} {profile_name} terms to English using batch size {batch_size}...") from unified_api_client import UnifiedClient, UnifiedClientError client = UnifiedClient(model=MODEL, api_key=API_KEY, output_dir=output_dir) if hasattr(client, 'reset_cleanup_state'): client.reset_cleanup_state() # Get custom translation prompt from environment translation_prompt_template = os.getenv("GLOSSARY_TRANSLATION_PROMPT", "") if not translation_prompt_template: translation_prompt_template = """You are translating {language} character names and important terms to English. For character names, provide English transliterations or keep as romanized. Keep honorifics/suffixes only if they are integral to the name. Respond with the same numbered format. Terms to translate: {terms_list} Provide translations in the same numbered format.""" all_translations = {} chunk_timeout = int(os.getenv("CHUNK_TIMEOUT", "300")) # 5 minute default for i in range(0, len(term_list), batch_size): # Check stop flag before each batch if is_stop_requested(): print(f"📑 ❌ Translation stopped at batch {(i // batch_size) + 1}") # Return partial translations for term in term_list: if term not in all_translations: all_translations[term] = term return all_translations batch = term_list[i:i + batch_size] batch_num = (i // batch_size) + 1 total_batches = (len(term_list) + batch_size - 1) // batch_size print(f"📑 Processing batch {batch_num}/{total_batches} ({len(batch)} terms)...") # Format terms list terms_text = "" for idx, term in enumerate(batch, 1): terms_text += f"{idx}. {term}\n" # Replace placeholders in prompt prompt = translation_prompt_template.replace('{language}', profile_name) prompt = prompt.replace('{terms_list}', terms_text.strip()) prompt = prompt.replace('{batch_size}', str(len(batch))) messages = [ {"role": "user", "content": prompt} ] try: temperature = float(os.getenv("TEMPERATURE", "0.3")) max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "4096")) # Use send_with_interrupt for interruptible API call print(f"📑 Sending translation request for batch {batch_num} (interruptible)...") response = send_with_interrupt( messages=messages, client=client, temperature=temperature, max_tokens=max_tokens, stop_check_fn=is_stop_requested, chunk_timeout=chunk_timeout ) # Handle response properly if hasattr(response, 'content'): response_text = response.content else: response_text = str(response) batch_translations = self._parse_translation_response(response_text, batch) all_translations.update(batch_translations) print(f"📑 Batch {batch_num} completed: {len(batch_translations)} translations") # Small delay between batches to avoid rate limiting (configurable) if i + batch_size < len(term_list): # Check stop before sleep if is_stop_requested(): print(f"📑 ❌ Translation stopped after batch {batch_num}") # Fill in missing translations for term in term_list: if term not in all_translations: all_translations[term] = term return all_translations # Use configurable batch delay or default to 0.1s (much faster than 0.5s) batch_delay = float(os.getenv("GLOSSARY_BATCH_DELAY", "0.001")) if batch_delay > 0: time.sleep(batch_delay) except UnifiedClientError as e: if "stopped by user" in str(e).lower(): print(f"📑 ❌ Translation interrupted by user at batch {batch_num}") # Fill in remaining terms with originals for term in term_list: if term not in all_translations: all_translations[term] = term return all_translations else: print(f"⚠️ Translation failed for batch {batch_num}: {e}") for term in batch: all_translations[term] = term except Exception as e: print(f"⚠️ Translation failed for batch {batch_num}: {e}") for term in batch: all_translations[term] = term # Ensure all terms have translations for term in term_list: if term not in all_translations: all_translations[term] = term translated_count = sum(1 for term, translation in all_translations.items() if translation != term and translation.strip()) print(f"📑 Successfully translated {translated_count}/{len(term_list)} terms") return all_translations except Exception as e: print(f"⚠️ Glossary translation failed: {e}") return {term: term for term in term_list} def _extract_names_for_honorific(self, honorific, all_text, language_hint, min_frequency, names_with_honorifics, standalone_names, is_valid_name, fuzzy_threshold=0.90): """Extract names for a specific honorific with fuzzy matching and stop flag checks""" # Check stop flag at start if is_stop_requested(): print(f"📑 ❌ Name extraction for '{honorific}' stopped by user") return if language_hint == 'korean' and not honorific.startswith('-'): pattern = r'([\uac00-\ud7af]{2,4})(?=' + re.escape(honorific) + r'(?:\s|[,.\!?]|$))' matches = list(re.finditer(pattern, all_text)) total_matches = len(matches) for idx, match in enumerate(matches): # Check stop flag every 50 matches if idx > 0 and idx % 50 == 0: if is_stop_requested(): print(f"📑 ❌ Korean name extraction stopped at {idx}/{total_matches}") return # Show progress for large sets if total_matches > 500: progress = (idx / total_matches) * 100 print(f"📑 Processing Korean names: {progress:.1f}% ({idx}/{total_matches})") potential_name = match.group(1) if is_valid_name(potential_name, 'korean'): full_form = potential_name + honorific # Use fuzzy matching for counting with stop check count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold) # Check if stopped during fuzzy matching if is_stop_requested(): print(f"📑 ❌ Name extraction stopped during fuzzy matching") return if count >= min_frequency: context_patterns = [ full_form + r'[은는이가]', full_form + r'[을를]', full_form + r'[에게한테]', r'["]' + full_form, full_form + r'[,]', ] context_count = 0 for ctx_pattern in context_patterns: context_count += len(re.findall(ctx_pattern, all_text)) if context_count > 0: names_with_honorifics[full_form] = count standalone_names[potential_name] = count elif language_hint == 'japanese' and not honorific.startswith('-'): pattern = r'([\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]{2,5})(?=' + re.escape(honorific) + r'(?:\s|[、。!?]|$))' matches = list(re.finditer(pattern, all_text)) total_matches = len(matches) for idx, match in enumerate(matches): # Check stop flag every 50 matches if idx > 0 and idx % 50 == 0: if is_stop_requested(): print(f"📑 ❌ Japanese name extraction stopped at {idx}/{total_matches}") return if total_matches > 500: progress = (idx / total_matches) * 100 print(f"📑 Processing Japanese names: {progress:.1f}% ({idx}/{total_matches})") potential_name = match.group(1) if is_valid_name(potential_name, 'japanese'): full_form = potential_name + honorific count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold) if is_stop_requested(): print(f"📑 ❌ Name extraction stopped during fuzzy matching") return if count >= min_frequency: names_with_honorifics[full_form] = count standalone_names[potential_name] = count elif language_hint == 'chinese' and not honorific.startswith('-'): pattern = r'([\u4e00-\u9fff]{2,4})(?=' + re.escape(honorific) + r'(?:\s|[,。!?]|$))' matches = list(re.finditer(pattern, all_text)) total_matches = len(matches) for idx, match in enumerate(matches): # Check stop flag every 50 matches if idx > 0 and idx % 50 == 0: if is_stop_requested(): print(f"📑 ❌ Chinese name extraction stopped at {idx}/{total_matches}") return if total_matches > 500: progress = (idx / total_matches) * 100 print(f"📑 Processing Chinese names: {progress:.1f}% ({idx}/{total_matches})") potential_name = match.group(1) if is_valid_name(potential_name, 'chinese'): full_form = potential_name + honorific count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold) if is_stop_requested(): print(f"📑 ❌ Name extraction stopped during fuzzy matching") return if count >= min_frequency: names_with_honorifics[full_form] = count standalone_names[potential_name] = count elif honorific.startswith('-') or honorific.startswith(' '): is_space_separated = honorific.startswith(' ') if is_space_separated: pattern_english = r'\b([A-Z][a-zA-Z]+)' + re.escape(honorific) + r'(?=\s|[,.\!?]|$)' else: pattern_english = r'\b([A-Z][a-zA-Z]+)' + re.escape(honorific) + r'\b' matches = list(re.finditer(pattern_english, all_text)) total_matches = len(matches) for idx, match in enumerate(matches): # Check stop flag every 50 matches if idx > 0 and idx % 50 == 0: if is_stop_requested(): print(f"📑 ❌ English name extraction stopped at {idx}/{total_matches}") return if total_matches > 500: progress = (idx / total_matches) * 100 print(f"📑 Processing English names: {progress:.1f}% ({idx}/{total_matches})") potential_name = match.group(1) if is_valid_name(potential_name, 'english'): full_form = potential_name + honorific count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold) if is_stop_requested(): print(f"📑 ❌ Name extraction stopped during fuzzy matching") return if count >= min_frequency: names_with_honorifics[full_form] = count standalone_names[potential_name] = count def _parse_translation_response(self, response, original_terms): """Parse translation response - handles numbered format""" translations = {} # Handle UnifiedResponse object if hasattr(response, 'content'): response_text = response.content else: response_text = str(response) lines = response_text.strip().split('\n') for line in lines: line = line.strip() if not line or not line[0].isdigit(): continue try: number_match = re.match(r'^(\d+)\.?\s*(.+)', line) if number_match: num = int(number_match.group(1)) - 1 content = number_match.group(2).strip() if 0 <= num < len(original_terms): original_term = original_terms[num] for separator in ['->', '→', ':', '-', '—', '=']: if separator in content: parts = content.split(separator, 1) if len(parts) == 2: translation = parts[1].strip() translation = translation.strip('"\'()[]') if translation and translation != original_term: translations[original_term] = translation break else: if content != original_term: translations[original_term] = content except (ValueError, IndexError): continue return translations # ===================================================== # UNIFIED UTILITIES # ===================================================== def sanitize_resource_filename(filename): """Sanitize resource filenames for filesystem compatibility""" filename = unicodedata.normalize('NFC', filename) replacements = { '/': '_', '\\': '_', ':': '_', '*': '_', '?': '_', '"': '_', '<': '_', '>': '_', '|': '_', '\0': '', '\n': '_', '\r': '_' } for old, new in replacements.items(): filename = filename.replace(old, new) filename = ''.join(char for char in filename if ord(char) >= 32) name, ext = os.path.splitext(filename) if not name: name = 'resource' return name + ext def should_retain_source_extension(): """Read GUI toggle for retaining original extension and no 'response_' prefix. This is stored in config or env by the GUI; we read env as bridge. """ return os.getenv('RETAIN_SOURCE_EXTENSION', os.getenv('retain_source_extension', '0')) in ('1', 'true', 'True') def make_safe_filename(title, actual_num): """Create a safe filename that works across different filesystems""" if not title: return f"chapter_{actual_num:03d}" title = unicodedata.normalize('NFC', str(title)) dangerous_chars = { '/': '_', '\\': '_', ':': '_', '*': '_', '?': '_', '"': '_', '<': '_', '>': '_', '|': '_', '\0': '', '\n': ' ', '\r': ' ', '\t': ' ' } for old, new in dangerous_chars.items(): title = title.replace(old, new) title = ''.join(char for char in title if ord(char) >= 32) title = re.sub(r'\s+', '_', title) title = title.strip('_.• \t') if not title or title == '_' * len(title): title = f"chapter_{actual_num:03d}" return title def get_content_hash(html_content): """Create a stable hash of content""" return ContentProcessor.get_content_hash(html_content) def clean_ai_artifacts(text, remove_artifacts=True): """Remove AI response artifacts from text""" return ContentProcessor.clean_ai_artifacts(text, remove_artifacts) def find_glossary_file(output_dir): """Return path to glossary file preferring CSV over JSON, or None if not found""" candidates = [ os.path.join(output_dir, "glossary.csv"), os.path.join(output_dir, "glossary.json"), ] for p in candidates: if os.path.exists(p): return p return None def clean_memory_artifacts(text): """Remove any memory/summary artifacts""" return ContentProcessor.clean_memory_artifacts(text) def emergency_restore_paragraphs(text, original_html=None, verbose=True): """Emergency restoration when AI returns wall of text""" return ContentProcessor.emergency_restore_paragraphs(text, original_html, verbose) def is_meaningful_text_content(html_content): """Check if chapter has meaningful text beyond just structure""" return ContentProcessor.is_meaningful_text_content(html_content) # ===================================================== # GLOBAL SETTINGS AND FLAGS # ===================================================== logging.basicConfig(level=logging.DEBUG) try: if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='ignore') except AttributeError: if sys.stdout is None: devnull = open(os.devnull, "wb") sys.stdout = io.TextIOWrapper(devnull, encoding='utf-8', errors='ignore') elif hasattr(sys.stdout, 'buffer'): try: sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore') except: pass _stop_requested = False def set_stop_flag(value): """Set the global stop flag""" global _stop_requested _stop_requested = value def is_stop_requested(): """Check if stop was requested""" global _stop_requested return _stop_requested def set_output_redirect(log_callback=None): """Redirect print statements to a callback function for GUI integration""" if log_callback: class CallbackWriter: def __init__(self, callback): self.callback = callback def write(self, text): if text.strip(): self.callback(text.strip()) def flush(self): pass sys.stdout = CallbackWriter(log_callback) # ===================================================== # EPUB AND FILE PROCESSING # ===================================================== def extract_chapter_number_from_filename(filename, opf_spine_position=None, opf_spine_data=None): """Extract chapter number from filename, prioritizing OPF spine order""" # Priority 1: Use OPF spine position if available if opf_spine_position is not None: # Handle special non-chapter files (always chapter 0) filename_lower = filename.lower() name_without_ext = os.path.splitext(filename)[0].lower() # Check for special keywords OR no numbers present special_keywords = ['title', 'toc', 'cover', 'index', 'copyright', 'preface', 'nav'] has_special_keyword = any(name in filename_lower for name in special_keywords) has_no_numbers = not re.search(r'\d', name_without_ext) if has_special_keyword or has_no_numbers: return 0, 'opf_special_file' # Use spine position for regular chapters (0, 1, 2, 3...) return opf_spine_position, 'opf_spine_order' # Priority 2: Check if this looks like a special file (even without OPF) name_without_ext = os.path.splitext(filename)[0].lower() special_keywords = ['title', 'toc', 'cover', 'index', 'copyright', 'preface'] has_special_keyword = any(name in name_without_ext for name in special_keywords) has_no_numbers = not re.search(r'\d', name_without_ext) if has_special_keyword or has_no_numbers: return 0, 'special_file' # Priority 3: Try to extract sequential numbers (000, 001, 002...) name_without_ext = os.path.splitext(filename)[0] # Look for simple sequential patterns first # Priority 3: Try to extract sequential numbers and decimals sequential_patterns = [ (r'^(\d+)\.(\d+)$', 'decimal_number'), # 1.5, 2.3 (NEW!) (r'^(\d{3,4})$', 'sequential_number'), # 000, 001, 0001 (r'^(\d+)$', 'direct_number'), # 0, 1, 2 ] for pattern, method in sequential_patterns: match = re.search(pattern, name_without_ext) if match: if method == 'decimal_number': # Return as float for decimal chapters return float(f"{match.group(1)}.{match.group(2)}"), method else: return int(match.group(1)), method # Priority 4: Fall back to existing filename parsing patterns fallback_patterns = [ (r'^response_(\d+)[_\.]', 'response_prefix'), (r'[Cc]hapter[_\s]*(\d+)', 'chapter_word'), (r'[Cc]h[_\s]*(\d+)', 'ch_abbreviation'), (r'No(\d+)', 'no_prefix'), (r'第(\d+)[章话回]', 'chinese_chapter'), (r'-h-(\d+)', 'h_suffix'), # For your -h-16 pattern (r'_(\d+)', 'underscore_suffix'), (r'-(\d+)', 'dash_suffix'), (r'(\d+)', 'trailing_number'), ] for pattern, method in fallback_patterns: match = re.search(pattern, name_without_ext, re.IGNORECASE) if match: return int(match.group(1)), method return None, None def process_chapter_images(chapter_html: str, actual_num: int, image_translator: ImageTranslator, check_stop_fn=None) -> Tuple[str, Dict[str, str]]: """Process and translate images in a chapter""" from bs4 import BeautifulSoup images = image_translator.extract_images_from_chapter(chapter_html) if not images: return chapter_html, {} print(f"🖼️ Found {len(images)} images in chapter {actual_num}") soup = BeautifulSoup(chapter_html, 'html.parser') image_translations = {} translated_count = 0 max_images_per_chapter = int(os.getenv('MAX_IMAGES_PER_CHAPTER', '10')) if len(images) > max_images_per_chapter: print(f" ⚠️ Chapter has {len(images)} images - processing first {max_images_per_chapter} only") images = images[:max_images_per_chapter] for idx, img_info in enumerate(images, 1): if check_stop_fn and check_stop_fn(): print("❌ Image translation stopped by user") break img_src = img_info['src'] if img_src.startswith('../'): img_path = os.path.join(image_translator.output_dir, img_src[3:]) elif img_src.startswith('./'): img_path = os.path.join(image_translator.output_dir, img_src[2:]) elif img_src.startswith('/'): img_path = os.path.join(image_translator.output_dir, img_src[1:]) else: possible_paths = [ os.path.join(image_translator.images_dir, os.path.basename(img_src)), os.path.join(image_translator.output_dir, img_src), os.path.join(image_translator.output_dir, 'images', os.path.basename(img_src)), os.path.join(image_translator.output_dir, os.path.basename(img_src)), os.path.join(image_translator.output_dir, os.path.dirname(img_src), os.path.basename(img_src)) ] img_path = None for path in possible_paths: if os.path.exists(path): img_path = path print(f" ✅ Found image at: {path}") break if not img_path: print(f" ❌ Image not found in any location for: {img_src}") print(f" Tried: {possible_paths}") continue img_path = os.path.normpath(img_path) if not os.path.exists(img_path): print(f" ⚠️ Image not found: {img_path}") print(f" 📁 Images directory: {image_translator.images_dir}") print(f" 📁 Output directory: {image_translator.output_dir}") print(f" 📁 Working directory: {os.getcwd()}") if os.path.exists(image_translator.images_dir): files = os.listdir(image_translator.images_dir) print(f" 📁 Files in images dir: {files[:5]}...") continue print(f" 🔍 Processing image {idx}/{len(images)}: {os.path.basename(img_path)}") context = "" if img_info.get('alt'): context += f", Alt text: {img_info['alt']}" if translated_count > 0: delay = float(os.getenv('IMAGE_API_DELAY', '1.0')) time.sleep(delay) translation_result = image_translator.translate_image(img_path, context, check_stop_fn) print(f"\n🔍 DEBUG: Image {idx}/{len(images)}") print(f" Translation result: {'Success' if translation_result and '[Image Translation Error:' not in translation_result else 'Failed'}") if translation_result and "[Image Translation Error:" in translation_result: print(f" Error message: {translation_result}") if translation_result: img_tag = None for img in soup.find_all('img'): if img.get('src') == img_src: img_tag = img break if img_tag: hide_label = os.getenv("HIDE_IMAGE_TRANSLATION_LABEL", "0") == "1" print(f" 🔍 DEBUG: Integration Phase") print(f" 🏷️ Hide label mode: {hide_label}") print(f" 📍 Found img tag: {img_tag.get('src')}") # Store the translation result in the dictionary FIRST image_translations[img_path] = translation_result # Parse the translation result to integrate into the chapter HTML if '
' in translation_result: trans_soup = BeautifulSoup(translation_result, 'html.parser') # Try to get the full container first full_container = trans_soup.find('div', class_=['translated-text-only', 'image-with-translation']) if full_container: # Clone the container to avoid issues new_container = BeautifulSoup(str(full_container), 'html.parser').find('div') img_tag.replace_with(new_container) print(f" ✅ Replaced image with full translation container") else: # Fallback: manually build the structure trans_div = trans_soup.find('div', class_='image-translation') if trans_div: container = soup.new_tag('div', **{'class': 'translated-text-only' if hide_label else 'image-with-translation'}) img_tag.replace_with(container) if not hide_label: new_img = soup.new_tag('img', src=img_src) if img_info.get('alt'): new_img['alt'] = img_info.get('alt') container.append(new_img) # Clone the translation div content new_trans_div = soup.new_tag('div', **{'class': 'image-translation'}) # Copy all children from trans_div to new_trans_div for child in trans_div.children: if hasattr(child, 'name'): new_trans_div.append(BeautifulSoup(str(child), 'html.parser')) else: new_trans_div.append(str(child)) container.append(new_trans_div) print(f" ✅ Built container with translation div") else: print(f" ⚠️ No translation div found in result") continue else: # Plain text translation - build structure manually container = soup.new_tag('div', **{'class': 'translated-text-only' if hide_label else 'image-with-translation'}) img_tag.replace_with(container) if not hide_label: new_img = soup.new_tag('img', src=img_src) if img_info.get('alt'): new_img['alt'] = img_info.get('alt') container.append(new_img) # Create translation div with content translation_div = soup.new_tag('div', **{'class': 'image-translation'}) if not hide_label: label_p = soup.new_tag('p') label_em = soup.new_tag('em') #label_em.string = "[Image text translation:]" label_p.append(label_em) translation_div.append(label_p) trans_p = soup.new_tag('p') trans_p.string = translation_result translation_div.append(trans_p) container.append(translation_div) print(f" ✅ Created plain text translation structure") translated_count += 1 # Save to translated_images folder trans_filename = f"ch{actual_num:03d}_img{idx:02d}_translation.html" trans_filepath = os.path.join(image_translator.translated_images_dir, trans_filename) # Extract just the translation content for saving save_soup = BeautifulSoup(translation_result, 'html.parser') save_div = save_soup.find('div', class_='image-translation') if not save_div: # Create a simple div for plain text save_div = f'

{translation_result}

' with open(trans_filepath, 'w', encoding='utf-8') as f: f.write(f""" Chapter {actual_num} - Image {idx} Translation

Chapter {actual_num} - Image {idx}

Original: {os.path.basename(img_path)}


{save_div} """) print(f" ✅ Saved translation to: {trans_filename}") else: print(f" ⚠️ Could not find image tag in HTML for: {img_src}") if translated_count > 0: print(f" 🖼️ Successfully translated {translated_count} images") # Debug output final_html = str(soup) trans_count = final_html.count('
') print(f" 📊 Final HTML has {trans_count} translation divs") print(f" 📊 image_translations dict has {len(image_translations)} entries") prog = image_translator.load_progress() if "image_chunks" in prog: completed_images = [] for img_key, img_data in prog["image_chunks"].items(): if len(img_data["completed"]) == img_data["total"]: completed_images.append(img_key) for img_key in completed_images: del prog["image_chunks"][img_key] if completed_images: image_translator.save_progress(prog) print(f" 🧹 Cleaned up progress for {len(completed_images)} completed images") image_translator.save_translation_log(actual_num, image_translations) return str(soup), image_translations else: print(f" ℹ️ No images were successfully translated") return chapter_html, {} def detect_novel_numbering(chapters): """Detect if the novel uses 0-based or 1-based chapter numbering with improved accuracy""" print("[DEBUG] Detecting novel numbering system...") if not chapters: return False if isinstance(chapters[0], str): print("[DEBUG] Text file detected, skipping numbering detection") return False patterns = PatternManager.FILENAME_EXTRACT_PATTERNS # Special check for prefix_suffix pattern like "0000_1.xhtml" prefix_suffix_pattern = r'^(\d+)_(\d+)[_\.]' # Track chapter numbers from different sources filename_numbers = [] content_numbers = [] has_prefix_suffix = False prefix_suffix_numbers = [] for idx, chapter in enumerate(chapters): extracted_num = None # Check filename patterns if 'original_basename' in chapter and chapter['original_basename']: filename = chapter['original_basename'] elif 'filename' in chapter: filename = os.path.basename(chapter['filename']) else: continue # First check for prefix_suffix pattern prefix_match = re.search(prefix_suffix_pattern, filename, re.IGNORECASE) if prefix_match: has_prefix_suffix = True # Use the SECOND number (after underscore) suffix_num = int(prefix_match.group(2)) prefix_suffix_numbers.append(suffix_num) extracted_num = suffix_num print(f"[DEBUG] Prefix_suffix pattern matched: {filename} -> Chapter {suffix_num}") else: # Try other patterns for pattern in patterns: match = re.search(pattern, filename) if match: extracted_num = int(match.group(1)) #print(f"[DEBUG] Pattern '{pattern}' matched: {filename} -> Chapter {extracted_num}") break if extracted_num is not None: filename_numbers.append(extracted_num) # Also check chapter content for chapter declarations if 'body' in chapter: # Look for "Chapter N" in the first 1000 characters content_preview = chapter['body'][:1000] content_match = re.search(r'Chapter\s+(\d+)', content_preview, re.IGNORECASE) if content_match: content_num = int(content_match.group(1)) content_numbers.append(content_num) print(f"[DEBUG] Found 'Chapter {content_num}' in content") # Decision logic with improved heuristics # 1. If using prefix_suffix pattern, trust those numbers exclusively if has_prefix_suffix and prefix_suffix_numbers: min_suffix = min(prefix_suffix_numbers) if min_suffix >= 1: print(f"[DEBUG] ✅ 1-based novel detected (prefix_suffix pattern starts at {min_suffix})") return False else: print(f"[DEBUG] ✅ 0-based novel detected (prefix_suffix pattern starts at {min_suffix})") return True # 2. If we have content numbers, prefer those over filename numbers if content_numbers: min_content = min(content_numbers) # Check if we have a good sequence starting from 0 or 1 if 0 in content_numbers and 1 in content_numbers: print(f"[DEBUG] ✅ 0-based novel detected (found both Chapter 0 and Chapter 1 in content)") return True elif min_content == 1: print(f"[DEBUG] ✅ 1-based novel detected (content chapters start at 1)") return False # 3. Fall back to filename numbers if filename_numbers: min_filename = min(filename_numbers) max_filename = max(filename_numbers) # Check for a proper sequence # If we have 0,1,2,3... it's likely 0-based # If we have 1,2,3,4... it's likely 1-based # Count how many chapters we have in sequence starting from 0 zero_sequence_count = 0 for i in range(len(chapters)): if i in filename_numbers: zero_sequence_count += 1 else: break # Count how many chapters we have in sequence starting from 1 one_sequence_count = 0 for i in range(1, len(chapters) + 1): if i in filename_numbers: one_sequence_count += 1 else: break print(f"[DEBUG] Zero-based sequence length: {zero_sequence_count}") print(f"[DEBUG] One-based sequence length: {one_sequence_count}") # If we have a better sequence starting from 1, it's 1-based if one_sequence_count > zero_sequence_count and min_filename >= 1: print(f"[DEBUG] ✅ 1-based novel detected (better sequence match starting from 1)") return False # If we have any 0 in filenames and it's part of a sequence if 0 in filename_numbers and zero_sequence_count >= 3: print(f"[DEBUG] ✅ 0-based novel detected (found 0 in sequence)") return True # 4. Default to 1-based if uncertain print(f"[DEBUG] ✅ Defaulting to 1-based novel (insufficient evidence for 0-based)") return False def validate_chapter_continuity(chapters): """Validate chapter continuity and warn about issues""" if not chapters: print("No chapters to translate") return issues = [] # Get all chapter numbers chapter_nums = [c['num'] for c in chapters] actual_nums = [c.get('actual_chapter_num', c['num']) for c in chapters] # Check for duplicates duplicates = [num for num in chapter_nums if chapter_nums.count(num) > 1] if duplicates: issues.append(f"Duplicate chapter numbers found: {set(duplicates)}") # Check for gaps in sequence min_num = min(chapter_nums) max_num = max(chapter_nums) expected = set(range(min_num, max_num + 1)) actual = set(chapter_nums) missing = expected - actual if missing: issues.append(f"Missing chapter numbers: {sorted(missing)}") # Show gaps more clearly gaps = [] sorted_missing = sorted(missing) if sorted_missing: start = sorted_missing[0] end = sorted_missing[0] for num in sorted_missing[1:]: if num == end + 1: end = num else: gaps.append(f"{start}-{end}" if start != end else str(start)) start = end = num gaps.append(f"{start}-{end}" if start != end else str(start)) issues.append(f"Gap ranges: {', '.join(gaps)}") # Check for duplicate titles title_map = {} for c in chapters: title_lower = c['title'].lower().strip() if title_lower in title_map: title_map[title_lower].append(c['num']) else: title_map[title_lower] = [c['num']] for title, nums in title_map.items(): if len(nums) > 1: issues.append(f"Duplicate title '{title}' in chapters: {nums}") # Print summary print("\n" + "="*60) print("📚 CHAPTER VALIDATION SUMMARY") print("="*60) print(f"Total chapters: {len(chapters)}") print(f"Chapter range: {min_num} to {max_num}") print(f"Expected count: {max_num - min_num + 1}") print(f"Actual count: {len(chapters)}") if len(chapters) != (max_num - min_num + 1): print(f"⚠️ Chapter count mismatch - missing {(max_num - min_num + 1) - len(chapters)} chapters") if issues: print("\n⚠️ Issues found:") for issue in issues: print(f" - {issue}") else: print("✅ No continuity issues detected") print("="*60 + "\n") def validate_epub_structure(output_dir): """Validate that all necessary EPUB structure files are present""" print("🔍 Validating EPUB structure...") required_files = { 'container.xml': 'META-INF container file (critical)', '*.opf': 'OPF package file (critical)', '*.ncx': 'Navigation file (recommended)' } found_files = {} missing_files = [] container_path = os.path.join(output_dir, 'container.xml') if os.path.exists(container_path): found_files['container.xml'] = 'Found' print(" ✅ container.xml - Found") else: missing_files.append('container.xml') print(" ❌ container.xml - Missing (CRITICAL)") opf_files = [] ncx_files = [] for file in os.listdir(output_dir): if file.lower().endswith('.opf'): opf_files.append(file) elif file.lower().endswith('.ncx'): ncx_files.append(file) if opf_files: found_files['opf'] = opf_files print(f" ✅ OPF file(s) - Found: {', '.join(opf_files)}") else: missing_files.append('*.opf') print(" ❌ OPF file - Missing (CRITICAL)") if ncx_files: found_files['ncx'] = ncx_files print(f" ✅ NCX file(s) - Found: {', '.join(ncx_files)}") else: missing_files.append('*.ncx') print(" ⚠️ NCX file - Missing (navigation may not work)") html_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.html') and f.startswith('response_')] if html_files: print(f" ✅ Translated chapters - Found: {len(html_files)} files") else: print(" ⚠️ No translated chapter files found") critical_missing = [f for f in missing_files if f in ['container.xml', '*.opf']] if not critical_missing: print("✅ EPUB structure validation PASSED") print(" All critical files present for EPUB reconstruction") return True else: print("❌ EPUB structure validation FAILED") print(f" Missing critical files: {', '.join(critical_missing)}") print(" EPUB reconstruction may fail without these files") return False def check_epub_readiness(output_dir): """Check if the output directory is ready for EPUB compilation""" print("📋 Checking EPUB compilation readiness...") issues = [] if not validate_epub_structure(output_dir): issues.append("Missing critical EPUB structure files") html_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.html') and f.startswith('response_')] if not html_files: issues.append("No translated chapter files found") else: print(f" ✅ Found {len(html_files)} translated chapters") metadata_path = os.path.join(output_dir, 'metadata.json') if os.path.exists(metadata_path): print(" ✅ Metadata file present") try: with open(metadata_path, 'r', encoding='utf-8') as f: metadata = json.load(f) if 'title' not in metadata: issues.append("Metadata missing title") except Exception as e: issues.append(f"Metadata file corrupted: {e}") else: issues.append("Missing metadata.json file") resource_dirs = ['css', 'fonts', 'images'] found_resources = 0 for res_dir in resource_dirs: res_path = os.path.join(output_dir, res_dir) if os.path.exists(res_path): files = [f for f in os.listdir(res_path) if os.path.isfile(os.path.join(res_path, f))] if files: found_resources += len(files) print(f" ✅ Found {len(files)} {res_dir} files") if found_resources > 0: print(f" ✅ Total resources: {found_resources} files") else: print(" ⚠️ No resource files found (this may be normal)") if not issues: print("🎉 EPUB compilation readiness: READY") print(" All necessary files present for EPUB creation") return True else: print("⚠️ EPUB compilation readiness: ISSUES FOUND") for issue in issues: print(f" • {issue}") return False def cleanup_previous_extraction(output_dir): """Clean up any files from previous extraction runs (preserves CSS files)""" # Remove 'css' from cleanup_items to preserve CSS files cleanup_items = [ 'images', # Removed 'css' from this list '.resources_extracted' ] epub_structure_files = [ 'container.xml', 'content.opf', 'toc.ncx' ] cleaned_count = 0 # Clean up directories (except CSS) for item in cleanup_items: if item.startswith('.'): continue item_path = os.path.join(output_dir, item) try: if os.path.isdir(item_path): shutil.rmtree(item_path) print(f"🧹 Removed directory: {item}") cleaned_count += 1 except Exception as e: print(f"⚠️ Could not remove directory {item}: {e}") # Clean up EPUB structure files for epub_file in epub_structure_files: file_path = os.path.join(output_dir, epub_file) try: if os.path.isfile(file_path): os.remove(file_path) print(f"🧹 Removed EPUB file: {epub_file}") cleaned_count += 1 except Exception as e: print(f"⚠️ Could not remove {epub_file}: {e}") # Clean up any loose .opf and .ncx files try: for file in os.listdir(output_dir): if file.lower().endswith(('.opf', '.ncx')): file_path = os.path.join(output_dir, file) if os.path.isfile(file_path): os.remove(file_path) print(f"🧹 Removed EPUB file: {file}") cleaned_count += 1 except Exception as e: print(f"⚠️ Error scanning for EPUB files: {e}") # Remove extraction marker marker_path = os.path.join(output_dir, '.resources_extracted') try: if os.path.isfile(marker_path): os.remove(marker_path) print(f"🧹 Removed extraction marker") cleaned_count += 1 except Exception as e: print(f"⚠️ Could not remove extraction marker: {e}") # Check if CSS files exist and inform user they're being preserved css_path = os.path.join(output_dir, 'css') if os.path.exists(css_path): try: css_files = [f for f in os.listdir(css_path) if os.path.isfile(os.path.join(css_path, f))] if css_files: print(f"📚 Preserving {len(css_files)} CSS files") except Exception: pass if cleaned_count > 0: print(f"🧹 Cleaned up {cleaned_count} items from previous runs (CSS files preserved)") return cleaned_count # ===================================================== # API AND TRANSLATION UTILITIES # ===================================================== def send_with_interrupt(messages, client, temperature, max_tokens, stop_check_fn, chunk_timeout=None, request_id=None, context=None): """Send API request with interrupt capability and optional timeout retry. Optional context parameter is passed through to the client to improve payload labeling. """ # Import UnifiedClientError at function level to avoid scoping issues from unified_api_client import UnifiedClientError # The client.send() call will handle multi-key rotation automatically # Generate request_id if not provided #if request_id is None: # request_id = str(uuid.uuid4())[:8] result_queue = queue.Queue() def api_call(): try: start_time = time.time() # Check if client.send accepts request_id parameter send_params = { 'messages': messages, 'temperature': temperature, 'max_tokens': max_tokens } # Add context if supported sig = inspect.signature(client.send) if 'context' in sig.parameters and context is not None: send_params['context'] = context # Add request_id if the client supports it sig = inspect.signature(client.send) #if 'request_id' in sig.parameters: # send_params['request_id'] = request_id result = client.send(**send_params) elapsed = time.time() - start_time result_queue.put((result, elapsed)) except Exception as e: result_queue.put(e) api_thread = threading.Thread(target=api_call) api_thread.daemon = True api_thread.start() timeout = chunk_timeout if chunk_timeout is not None else 86400 check_interval = 0.5 elapsed = 0 while elapsed < timeout: try: result = result_queue.get(timeout=check_interval) if isinstance(result, Exception): # For expected errors like rate limits, preserve the error type without extra traceback if hasattr(result, 'error_type') and result.error_type == "rate_limit": raise result elif "429" in str(result) or "rate limit" in str(result).lower(): # Convert generic exceptions to UnifiedClientError for rate limits raise UnifiedClientError(str(result), error_type="rate_limit") else: raise result if isinstance(result, tuple): api_result, api_time = result if chunk_timeout and api_time > chunk_timeout: # Set cleanup flag when chunk timeout occurs if hasattr(client, '_in_cleanup'): client._in_cleanup = True if hasattr(client, 'cancel_current_operation'): client.cancel_current_operation() raise UnifiedClientError(f"API call took {api_time:.1f}s (timeout: {chunk_timeout}s)") return api_result return result except queue.Empty: if stop_check_fn(): # Set cleanup flag when user stops if hasattr(client, '_in_cleanup'): client._in_cleanup = True if hasattr(client, 'cancel_current_operation'): client.cancel_current_operation() raise UnifiedClientError("Translation stopped by user") elapsed += check_interval # Set cleanup flag when timeout occurs if hasattr(client, '_in_cleanup'): client._in_cleanup = True if hasattr(client, 'cancel_current_operation'): client.cancel_current_operation() raise UnifiedClientError(f"API call timed out after {timeout} seconds") def handle_api_error(processor, error, chunk_info=""): """Handle API errors with multi-key support""" error_str = str(error) # Check for rate limit if "429" in error_str or "rate limit" in error_str.lower(): if processor.config.use_multi_api_keys: print(f"⚠️ Rate limit hit {chunk_info}, client should rotate to next key") stats = processor.client.get_stats() print(f"📊 API Stats - Active keys: {stats.get('active_keys', 0)}/{stats.get('total_keys', 0)}") if stats.get('active_keys', 0) == 0: print("⏳ All API keys are cooling down - will wait and retry") print(f"🔄 Multi-key error handling: Rate limit processed, preparing for key rotation...") time.sleep(0.1) # Brief pause after rate limit detection for stability return True # Always retry else: print(f"⚠️ Rate limit hit {chunk_info}, waiting before retry...") time.sleep(60) print(f"🔄 Single-key error handling: Rate limit wait completed, ready for retry...") time.sleep(0.1) # Brief pause after rate limit wait for stability return True # Always retry # Other errors print(f"❌ API Error {chunk_info}: {error_str}") return False def parse_token_limit(env_value): """Parse token limit from environment variable""" if not env_value or env_value.strip() == "": return None, "unlimited" env_value = env_value.strip() if env_value.lower() == "unlimited": return None, "unlimited" if env_value.isdigit() and int(env_value) > 0: limit = int(env_value) return limit, str(limit) return 1000000, "1000000 (default)" def build_system_prompt(user_prompt, glossary_path=None): """Build the system prompt with glossary - TRUE BRUTE FORCE VERSION""" append_glossary = os.getenv("APPEND_GLOSSARY", "1") == "1" actual_glossary_path = glossary_path system = user_prompt if user_prompt else "" if append_glossary and actual_glossary_path and os.path.exists(actual_glossary_path): try: print(f"[DEBUG] ✅ Loading glossary from: {os.path.abspath(actual_glossary_path)}") # Try to load as JSON first try: with open(actual_glossary_path, "r", encoding="utf-8") as gf: glossary_data = json.load(gf) glossary_text = json.dumps(glossary_data, ensure_ascii=False, indent=2) print(f"[DEBUG] Loaded as JSON") except json.JSONDecodeError: # If JSON fails, just read as raw text #print(f"[DEBUG] JSON parse failed, reading as raw text") with open(actual_glossary_path, "r", encoding="utf-8") as gf: glossary_text = gf.read() if system: system += "\n\n" custom_prompt = os.getenv("APPEND_GLOSSARY_PROMPT", "Character/Term Glossary (use these translations consistently):").strip() if not custom_prompt: custom_prompt = "Character/Term Glossary (use these translations consistently):" system += f"{custom_prompt}\n{glossary_text}" print(f"[DEBUG] ✅ Entire glossary appended!") print(f"[DEBUG] Glossary text length: {len(glossary_text)} characters") except Exception as e: print(f"[ERROR] Could not load glossary: {e}") import traceback print(f"[ERROR] Full traceback: {traceback.format_exc()}") else: if not append_glossary: #print(f"[DEBUG] ❌ Glossary append disabled") pass elif not actual_glossary_path: print(f"[DEBUG] ❌ No glossary path provided") elif not os.path.exists(actual_glossary_path): print(f"[DEBUG] ❌ Glossary file does not exist: {actual_glossary_path}") print(f"🎯 Final system prompt length: {len(system)} characters") return system def translate_title(title, client, system_prompt, user_prompt, temperature=0.3): """Translate the book title using the configured settings""" if not title or not title.strip(): return title print(f"📚 Processing book title: {title}") try: if os.getenv("TRANSLATE_BOOK_TITLE", "1") == "0": print(f"📚 Book title translation disabled - keeping original") return title # Check if we're using a translation service (not AI) client_type = getattr(client, 'client_type', '') is_translation_service = client_type in ['deepl', 'google_translate'] if is_translation_service: # For translation services, send only the text without AI prompts print(f"📚 Using translation service ({client_type}) - sending text directly") messages = [ {"role": "user", "content": title} ] max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192")) translated_title, _ = client.send(messages, temperature=temperature, max_tokens=max_tokens) else: # For AI services, use prompts as before book_title_prompt = os.getenv("BOOK_TITLE_PROMPT", "Translate this book title to English while retaining any acronyms:") # Get the system prompt for book titles, with fallback to default book_title_system_prompt = os.getenv("BOOK_TITLE_SYSTEM_PROMPT", "You are a translator. Respond with only the translated text, nothing else. Do not add any explanation or additional content.") messages = [ {"role": "system", "content": book_title_system_prompt}, {"role": "user", "content": f"{book_title_prompt}\n\n{title}"} ] max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192")) translated_title, _ = client.send(messages, temperature=temperature, max_tokens=max_tokens) print(f"[DEBUG] Raw API response: '{translated_title}'") print(f"[DEBUG] Response length: {len(translated_title)} (original: {len(title)})") newline = '\n' print(f"[DEBUG] Has newlines: {repr(translated_title) if newline in translated_title else 'No'}") translated_title = translated_title.strip() if ((translated_title.startswith('"') and translated_title.endswith('"')) or (translated_title.startswith("'") and translated_title.endswith("'"))): translated_title = translated_title[1:-1].strip() if '\n' in translated_title: print(f"⚠️ API returned multi-line content, keeping original title") return title # Check for JSON-like structured content, but allow simple brackets like [END] if (any(char in translated_title for char in ['{', '}']) or '"role":' in translated_title or '"content":' in translated_title or ('[[' in translated_title and ']]' in translated_title)): # Only flag double brackets print(f"⚠️ API returned structured content, keeping original title") return title if any(tag in translated_title.lower() for tag in ['

', '

', '

', '

', '= 2: return True # Single strong error indicator in very short response if len(content_str) < 50 and error_count >= 1: return True return False # Additional helper function for debugging def get_failure_reason(content): """ Returns the specific reason why content was marked as qa_failed Useful for debugging and logging """ if not content: return "Empty content" content_str = str(content).strip() content_lower = content_str.lower() # Check each category and return the first match failure_categories = { "Explicit Failure Marker": [ "[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]", "[IMAGE TRANSLATION FAILED]", "API response unavailable", "[]" ], "HTTP Error": [ "authentication_error", "rate_limit_error", "api_error" ], "Content Filter": [ "content_filter", "safety filter", "blocked by safety" ], "Timeout": [ "timeout", "timed out", "apitimeouterror" ], "Rate Limit": [ "rate limit exceeded", "quota exceeded", "too many requests" ], "Refusal Pattern": [ "i cannot", "i can't", "unable to process" ], "Empty Response": [ '"text": ""', "choices: [ { text: ''" ] } for category, markers in failure_categories.items(): for marker in markers: if marker in content_str or marker in content_lower: return f"{category}: {marker}" if len(content_str) < 50: return f"Short response with error indicators: {content_str[:30]}..." return "Unknown failure pattern" def convert_enhanced_text_to_html(plain_text, chapter_info=None): """Convert markdown/plain text back to HTML after translation (for enhanced mode) This function handles the conversion of translated markdown back to HTML. The input is the TRANSLATED text that was originally extracted using html2text. """ import re preserve_structure = chapter_info.get('preserve_structure', False) if chapter_info else False # First, try to use markdown2 for proper markdown conversion try: import markdown2 # Check if the text contains markdown patterns has_markdown = any([ '##' in plain_text, # Headers '**' in plain_text, # Bold '*' in plain_text and not '**' in plain_text, # Italic '[' in plain_text and '](' in plain_text, # Links '```' in plain_text, # Code blocks '> ' in plain_text, # Blockquotes '- ' in plain_text or '* ' in plain_text or '1. ' in plain_text # Lists ]) if has_markdown or preserve_structure: # Use markdown2 for proper conversion html = markdown2.markdown(plain_text, extras=[ 'cuddled-lists', # Lists without blank lines 'fenced-code-blocks', # Code blocks with ``` 'break-on-newline', # Treat single newlines as
'smarty-pants', # Smart quotes and dashes 'tables', # Markdown tables ]) # Post-process to ensure proper paragraph structure if not '

' in html: # If markdown2 didn't create paragraphs, wrap content lines = html.split('\n') processed_lines = [] for line in lines: line = line.strip() if line and not line.startswith('<') and not line.endswith('>'): processed_lines.append(f'

{line}

') elif line: processed_lines.append(line) html = '\n'.join(processed_lines) return html except ImportError: print("⚠️ markdown2 not available, using fallback HTML conversion") # Fallback: Manual markdown-to-HTML conversion lines = plain_text.strip().split('\n') html_parts = [] in_code_block = False code_block_content = [] for line in lines: # Handle code blocks if line.strip().startswith('```'): if in_code_block: # End code block html_parts.append('
' + '\n'.join(code_block_content) + '
') code_block_content = [] in_code_block = False else: # Start code block in_code_block = True continue if in_code_block: code_block_content.append(line) continue line = line.strip() if not line: # Preserve empty lines as paragraph breaks if html_parts and not html_parts[-1].endswith('

'): # Only add break if not already after a closing tag html_parts.append('
') continue # Check for markdown headers if line.startswith('#'): match = re.match(r'^(#+)\s*(.+)$', line) if match: level = min(len(match.group(1)), 6) header_text = match.group(2).strip() html_parts.append(f'{header_text}') continue # Check for blockquotes if line.startswith('> '): quote_text = line[2:].strip() html_parts.append(f'
{quote_text}
') continue # Check for lists if re.match(r'^[*\-+]\s+', line): list_text = re.sub(r'^[*\-+]\s+', '', line) html_parts.append(f'
  • {list_text}
  • ') continue if re.match(r'^\d+\.\s+', line): list_text = re.sub(r'^\d+\.\s+', '', line) html_parts.append(f'
  • {list_text}
  • ') continue # Convert inline markdown # Bold line = re.sub(r'\*\*(.+?)\*\*', r'\1', line) line = re.sub(r'__(.+?)__', r'\1', line) # Italic line = re.sub(r'\*(.+?)\*', r'\1', line) line = re.sub(r'_(.+?)_', r'\1', line) # Links line = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', line) # Code inline line = re.sub(r'`([^`]+)`', r'\1', line) # Regular paragraph html_parts.append(f'

    {line}

    ') # Post-process lists to wrap in ul/ol tags final_html = [] in_list = False list_type = None for part in html_parts: if part.startswith('
  • '): if not in_list: # Determine list type based on context (simplified) list_type = 'ul' # Default to unordered final_html.append(f'<{list_type}>') in_list = True final_html.append(part) else: if in_list: final_html.append(f'') in_list = False final_html.append(part) # Close any open list if in_list: final_html.append(f'') return '\n'.join(final_html) # ===================================================== # MAIN TRANSLATION FUNCTION # ===================================================== def main(log_callback=None, stop_callback=None): """Main translation function with enhanced duplicate detection and progress tracking""" config = TranslationConfig() builtins._DISABLE_ZERO_DETECTION = config.DISABLE_ZERO_DETECTION if config.DISABLE_ZERO_DETECTION: print("=" * 60) print("⚠️ 0-BASED DETECTION DISABLED BY USER") print("⚠️ All chapter numbers will be used exactly as found") print("=" * 60) args = None chapters_completed = 0 chunks_completed = 0 args = None chapters_completed = 0 chunks_completed = 0 input_path = config.input_path if not input_path and len(sys.argv) > 1: input_path = sys.argv[1] is_text_file = input_path.lower().endswith('.txt') if is_text_file: os.environ["IS_TEXT_FILE_TRANSLATION"] = "1" import json as _json _original_load = _json.load def debug_json_load(fp, *args, **kwargs): result = _original_load(fp, *args, **kwargs) if isinstance(result, list) and len(result) > 0: if isinstance(result[0], dict) and 'original_name' in result[0]: print(f"[DEBUG] Loaded glossary list with {len(result)} items from {fp.name if hasattr(fp, 'name') else 'unknown'}") return result _json.load = debug_json_load if log_callback: set_output_redirect(log_callback) def check_stop(): if stop_callback and stop_callback(): print("❌ Translation stopped by user request.") return True return is_stop_requested() if config.EMERGENCY_RESTORE: print("✅ Emergency paragraph restoration is ENABLED") else: print("⚠️ Emergency paragraph restoration is DISABLED") print(f"[DEBUG] REMOVE_AI_ARTIFACTS environment variable: {os.getenv('REMOVE_AI_ARTIFACTS', 'NOT SET')}") print(f"[DEBUG] REMOVE_AI_ARTIFACTS parsed value: {config.REMOVE_AI_ARTIFACTS}") if config.REMOVE_AI_ARTIFACTS: print("⚠️ AI artifact removal is ENABLED - will clean AI response artifacts") else: print("✅ AI artifact removal is DISABLED - preserving all content as-is") if '--epub' in sys.argv or (len(sys.argv) > 1 and sys.argv[1].endswith(('.epub', '.txt'))): import argparse parser = argparse.ArgumentParser() parser.add_argument('epub', help='Input EPUB or text file') args = parser.parse_args() input_path = args.epub is_text_file = input_path.lower().endswith('.txt') if is_text_file: file_base = os.path.splitext(os.path.basename(input_path))[0] else: epub_base = os.path.splitext(os.path.basename(input_path))[0] file_base = epub_base out = file_base os.makedirs(out, exist_ok=True) print(f"[DEBUG] Created output folder → {out}") cleanup_previous_extraction(out) os.environ["EPUB_OUTPUT_DIR"] = out payloads_dir = out # clear history if CONTEXTUAL is disabled if not config.CONTEXTUAL: history_file = os.path.join(payloads_dir, "translation_history.json") if os.path.exists(history_file): os.remove(history_file) print("[DEBUG] CONTEXTUAL disabled - cleared translation history") history_manager = HistoryManager(payloads_dir) chapter_splitter = ChapterSplitter(model_name=config.MODEL) chunk_context_manager = ChunkContextManager() progress_manager = ProgressManager(payloads_dir) # Create ChapterExtractor with progress callback if available chapter_progress_callback = None if log_callback: # Create a wrapper that formats progress messages for the log def chapter_progress_callback(msg): log_callback(f"📊 {msg}") chapter_extractor = ChapterExtractor(progress_callback=chapter_progress_callback) glossary_manager = GlossaryManager() history_file = os.path.join(payloads_dir, "translation_history.json") if os.path.exists(history_file): os.remove(history_file) print(f"[DEBUG] Purged translation history → {history_file}") print("🔍 Checking for deleted output files...") progress_manager.cleanup_missing_files(out) progress_manager.save() if check_stop(): return if not config.API_KEY: print("❌ Error: Set API_KEY, OPENAI_API_KEY, or OPENAI_OR_Gemini_API_KEY in your environment.") return #print(f"[DEBUG] Found API key: {config.API_KEY[:10]}...") print(f"[DEBUG] Using model = {config.MODEL}") print(f"[DEBUG] Max output tokens = {config.MAX_OUTPUT_TOKENS}") client = UnifiedClient(model=config.MODEL, api_key=config.API_KEY, output_dir=out) if hasattr(client, 'use_multi_keys') and client.use_multi_keys: stats = client.get_stats() print(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys loaded") print(f" Active keys: {stats.get('active_keys', 0)}") else: print(f"🔑 Single-key mode: Using {config.MODEL}") # Reset cleanup state when starting new translation if hasattr(client, 'reset_cleanup_state'): client.reset_cleanup_state() if is_text_file: print("📄 Processing text file...") try: txt_processor = TextFileProcessor(input_path, out) chapters = txt_processor.extract_chapters() txt_processor.save_original_structure() metadata = { "title": os.path.splitext(os.path.basename(input_path))[0], "type": "text", "chapter_count": len(chapters) } except ImportError as e: print(f"❌ Error: Text file processor not available: {e}") if log_callback: log_callback(f"❌ Error: Text file processor not available: {e}") return except Exception as e: print(f"❌ Error processing text file: {e}") if log_callback: log_callback(f"❌ Error processing text file: {e}") return else: # Check if we should use async extraction (for GUI mode) use_async_extraction = os.getenv("USE_ASYNC_CHAPTER_EXTRACTION", "0") == "1" if use_async_extraction and log_callback: print("🚀 Using async chapter extraction (subprocess mode)...") from chapter_extraction_manager import ChapterExtractionManager # Create manager with log callback extraction_manager = ChapterExtractionManager(log_callback=log_callback) # Get extraction mode extraction_mode = os.getenv("EXTRACTION_MODE", "smart").lower() # Define completion callback extraction_result = {"completed": False, "result": None} def on_extraction_complete(result): extraction_result["completed"] = True extraction_result["result"] = result # Safety check for None result if result is None: log_callback("❌ Chapter extraction failed: No result returned") return if result.get("success"): log_callback(f"✅ Chapter extraction completed: {result.get('chapters', 0)} chapters") else: log_callback(f"❌ Chapter extraction failed: {result.get('error', 'Unknown error')}") # Start async extraction extraction_manager.extract_chapters_async( input_path, out, extraction_mode=extraction_mode, progress_callback=lambda msg: log_callback(f"📊 {msg}"), completion_callback=on_extraction_complete ) # Wait for completion (with timeout) timeout = 300 # 5 minutes timeout start_time = time.time() while not extraction_result["completed"]: if check_stop(): extraction_manager.stop_extraction() return if time.time() - start_time > timeout: log_callback("⚠️ Chapter extraction timeout") extraction_manager.stop_extraction() return time.sleep(0.1) # Check every 100ms # Check if extraction was successful if not extraction_result["result"] or not extraction_result["result"].get("success"): log_callback("❌ Chapter extraction failed") return # Load the extracted data metadata_path = os.path.join(out, "metadata.json") if os.path.exists(metadata_path): with open(metadata_path, 'r', encoding='utf-8') as f: metadata = json.load(f) else: metadata = extraction_result["result"].get("metadata", {}) # The async extraction should have saved chapters directly, similar to the sync version # We need to reconstruct the chapters list with body content # Check if the extraction actually created a chapters.json file with full content chapters_full_path = os.path.join(out, "chapters_full.json") chapters_info_path = os.path.join(out, "chapters_info.json") chapters = [] # First try to load full chapters if saved if os.path.exists(chapters_full_path): log_callback("Loading full chapters data...") with open(chapters_full_path, 'r', encoding='utf-8') as f: chapters = json.load(f) log_callback(f"✅ Loaded {len(chapters)} chapters with content") elif os.path.exists(chapters_info_path): # Fall back to loading from individual files log_callback("Loading chapter info and searching for content files...") with open(chapters_info_path, 'r', encoding='utf-8') as f: chapters_info = json.load(f) # List all files in the output directory all_files = os.listdir(out) log_callback(f"Found {len(all_files)} files in output directory") # Try to match chapter files for info in chapters_info: chapter_num = info['num'] found = False # Try different naming patterns patterns = [ f"chapter_{chapter_num:04d}_", # With leading zeros f"chapter_{chapter_num}_", # Without leading zeros f"ch{chapter_num:04d}_", # Shortened with zeros f"ch{chapter_num}_", # Shortened without zeros f"{chapter_num:04d}_", # Just number with zeros f"{chapter_num}_" # Just number ] for pattern in patterns: # Find files matching this pattern (any extension) matching_files = [f for f in all_files if f.startswith(pattern)] if matching_files: # Prefer HTML/XHTML files html_files = [f for f in matching_files if f.endswith(('.html', '.xhtml', '.htm'))] if html_files: chapter_file = html_files[0] else: chapter_file = matching_files[0] chapter_path = os.path.join(out, chapter_file) try: with open(chapter_path, 'r', encoding='utf-8') as f: content = f.read() chapters.append({ "num": chapter_num, "title": info.get("title", f"Chapter {chapter_num}"), "body": content, "filename": info.get("original_filename", ""), "has_images": info.get("has_images", False), "file_size": len(content), "content_hash": info.get("content_hash", "") }) found = True break except Exception as e: log_callback(f"⚠️ Error reading {chapter_file}: {e}") if not found: log_callback(f"⚠️ No file found for Chapter {chapter_num}") # Log available files for debugging if len(all_files) < 50: similar_files = [f for f in all_files if str(chapter_num) in f] if similar_files: log_callback(f" Similar files: {similar_files[:3]}") if not chapters: log_callback("❌ No chapters could be loaded!") log_callback(f"❌ Output directory: {out}") log_callback(f"❌ Files in directory: {len(os.listdir(out))} files") # Show first few files for debugging sample_files = os.listdir(out)[:10] log_callback(f"❌ Sample files: {sample_files}") return # Sort chapters by OPF spine order if available opf_path = os.path.join(out, 'content.opf') if os.path.exists(opf_path) and chapters: log_callback("📋 Sorting chapters according to OPF spine order...") # Use the existing chapter_extractor instance to sort chapters = chapter_extractor._sort_by_opf_spine(chapters, opf_path) log_callback("✅ Chapters sorted according to OPF reading order") else: print("🚀 Using comprehensive chapter extraction with resource handling...") with zipfile.ZipFile(input_path, 'r') as zf: metadata = chapter_extractor._extract_epub_metadata(zf) chapters = chapter_extractor.extract_chapters(zf, out) print(f"\n📚 Extraction Summary:") print(f" Total chapters extracted: {len(chapters)}") if chapters: nums = [c.get('num', 0) for c in chapters] print(f" Chapter range: {min(nums)} to {max(nums)}") # Check for gaps in the sequence expected_count = max(nums) - min(nums) + 1 if len(chapters) < expected_count: print(f"\n⚠️ Potential missing chapters detected:") print(f" Expected {expected_count} chapters (from {min(nums)} to {max(nums)})") print(f" Actually found: {len(chapters)} chapters") print(f" Potentially missing: {expected_count - len(chapters)} chapters") validate_chapter_continuity(chapters) print("\n" + "="*50) validate_epub_structure(out) print("="*50 + "\n") progress_manager.migrate_to_content_hash(chapters) progress_manager.save() if check_stop(): return metadata_path = os.path.join(out, "metadata.json") if os.path.exists(metadata_path): with open(metadata_path, 'r', encoding='utf-8') as mf: metadata = json.load(mf) metadata["chapter_count"] = len(chapters) metadata["chapter_titles"] = {str(c["num"]): c["title"] for c in chapters} print(f"[DEBUG] Initializing client with model = {config.MODEL}") client = UnifiedClient(api_key=config.API_KEY, model=config.MODEL, output_dir=out) if hasattr(client, 'use_multi_keys') and client.use_multi_keys: stats = client.get_stats() print(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys loaded") print(f" Active keys: {stats.get('active_keys', 0)}") else: print(f"🔑 Single-key mode: Using {config.MODEL}") # Reset cleanup state when starting new translation if hasattr(client, 'reset_cleanup_state'): client.reset_cleanup_state() if "title" in metadata and config.TRANSLATE_BOOK_TITLE and not metadata.get("title_translated", False): original_title = metadata["title"] print(f"📚 Original title: {original_title}") if not check_stop(): translated_title = translate_title( original_title, client, None, None, config.TEMP ) metadata["original_title"] = original_title metadata["title"] = translated_title metadata["title_translated"] = True print(f"📚 Translated title: {translated_title}") else: print("❌ Title translation skipped due to stop request") # Translate other metadata fields if configured translate_metadata_fields_str = os.getenv('TRANSLATE_METADATA_FIELDS', '{}') metadata_translation_mode = os.getenv('METADATA_TRANSLATION_MODE', 'together') try: translate_metadata_fields = json.loads(translate_metadata_fields_str) if translate_metadata_fields and any(translate_metadata_fields.values()): # Filter out fields that should be translated (excluding already translated fields) fields_to_translate = {} skipped_fields = [] for field_name, should_translate in translate_metadata_fields.items(): if should_translate and field_name != 'title' and field_name in metadata: # Check if already translated if metadata.get(f"{field_name}_translated", False): skipped_fields.append(field_name) print(f"✓ Skipping {field_name} - already translated") else: fields_to_translate[field_name] = should_translate if fields_to_translate: print("\n" + "="*50) print("📋 METADATA TRANSLATION PHASE") print("="*50) print(f"🌐 Translating {len(fields_to_translate)} metadata fields...") # Get ALL configuration from environment - NO DEFAULTS system_prompt = os.getenv('BOOK_TITLE_SYSTEM_PROMPT', '') if not system_prompt: print("❌ No system prompt configured, skipping metadata translation") else: # Get field-specific prompts field_prompts_str = os.getenv('METADATA_FIELD_PROMPTS', '{}') try: field_prompts = json.loads(field_prompts_str) except: field_prompts = {} if not field_prompts and not field_prompts.get('_default'): print("❌ No field prompts configured, skipping metadata translation") else: # Get language configuration lang_behavior = os.getenv('LANG_PROMPT_BEHAVIOR', 'auto') forced_source_lang = os.getenv('FORCED_SOURCE_LANG', 'Korean') output_language = os.getenv('OUTPUT_LANGUAGE', 'English') # Determine source language source_lang = metadata.get('language', '').lower() if lang_behavior == 'never': lang_str = "" elif lang_behavior == 'always': lang_str = forced_source_lang else: # auto if 'zh' in source_lang or 'chinese' in source_lang: lang_str = 'Chinese' elif 'ja' in source_lang or 'japanese' in source_lang: lang_str = 'Japanese' elif 'ko' in source_lang or 'korean' in source_lang: lang_str = 'Korean' else: lang_str = '' # Check if batch translation is enabled for parallel processing batch_translate_enabled = os.getenv('BATCH_TRANSLATION', '0') == '1' batch_size = int(os.getenv('BATCH_SIZE', '50')) # Default batch size if batch_translate_enabled and len(fields_to_translate) > 1: print(f"⚡ Using parallel metadata translation mode ({len(fields_to_translate)} fields, batch size: {batch_size})...") # Import ThreadPoolExecutor for parallel processing from concurrent.futures import ThreadPoolExecutor, as_completed import threading # Thread-safe results storage translation_results = {} results_lock = threading.Lock() def translate_metadata_field(field_name, original_value): """Translate a single metadata field""" try: print(f"\n📋 Translating {field_name}: {original_value[:100]}..." if len(str(original_value)) > 100 else f"\n📋 Translating {field_name}: {original_value}") # Get field-specific prompt prompt_template = field_prompts.get(field_name, field_prompts.get('_default', '')) if not prompt_template: print(f"⚠️ No prompt configured for field '{field_name}', skipping") return None # Replace variables in prompt field_prompt = prompt_template.replace('{source_lang}', lang_str) field_prompt = field_prompt.replace('{output_lang}', output_language) field_prompt = field_prompt.replace('English', output_language) field_prompt = field_prompt.replace('{field_value}', str(original_value)) # Check if we're using a translation service (not AI) client_type = getattr(client, 'client_type', '') is_translation_service = client_type in ['deepl', 'google_translate'] if is_translation_service: # For translation services, send only the field value without AI prompts print(f"🌐 Using translation service ({client_type}) - sending field directly") messages = [ {"role": "user", "content": str(original_value)} ] else: # For AI services, use prompts as before messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": f"{field_prompt}\n\n{original_value}"} ] # Add delay for rate limiting if config.DELAY > 0: time.sleep(config.DELAY) # Make API call content, finish_reason = client.send( messages, temperature=config.TEMP, max_tokens=config.MAX_OUTPUT_TOKENS ) translated_value = content.strip() # Store result thread-safely with results_lock: translation_results[field_name] = { 'original': original_value, 'translated': translated_value, 'success': True } print(f"✅ Translated {field_name}: {translated_value}") return translated_value except Exception as e: print(f"❌ Failed to translate {field_name}: {e}") with results_lock: translation_results[field_name] = { 'original': original_value, 'translated': None, 'success': False, 'error': str(e) } return None # Execute parallel translations with limited workers max_workers = min(len(fields_to_translate), batch_size) with ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit all translation tasks futures = {} for field_name in fields_to_translate: if field_name in metadata and not check_stop(): original_value = metadata[field_name] future = executor.submit(translate_metadata_field, field_name, original_value) futures[future] = field_name # Wait for completion for future in as_completed(futures): if check_stop(): print("❌ Metadata translation stopped by user") break # Apply results to metadata for field_name, result in translation_results.items(): if result['success'] and result['translated']: metadata[f"original_{field_name}"] = result['original'] metadata[field_name] = result['translated'] metadata[f"{field_name}_translated"] = True else: # Sequential translation mode (individual translation) mode_desc = "sequential" if not batch_translate_enabled else "sequential (single field)" print(f"📝 Using {mode_desc} translation mode...") for field_name in fields_to_translate: if not check_stop() and field_name in metadata: original_value = metadata[field_name] print(f"\n📋 Translating {field_name}: {original_value[:100]}..." if len(str(original_value)) > 100 else f"\n📋 Translating {field_name}: {original_value}") # Get field-specific prompt prompt_template = field_prompts.get(field_name, field_prompts.get('_default', '')) if not prompt_template: print(f"⚠️ No prompt configured for field '{field_name}', skipping") continue # Replace variables in prompt field_prompt = prompt_template.replace('{source_lang}', lang_str) field_prompt = field_prompt.replace('{output_lang}', output_language) field_prompt = field_prompt.replace('English', output_language) field_prompt = field_prompt.replace('{field_value}', str(original_value)) # Check if we're using a translation service (not AI) client_type = getattr(client, 'client_type', '') is_translation_service = client_type in ['deepl', 'google_translate'] if is_translation_service: # For translation services, send only the field value without AI prompts print(f"🌐 Using translation service ({client_type}) - sending field directly") messages = [ {"role": "user", "content": str(original_value)} ] else: # For AI services, use prompts as before messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": f"{field_prompt}\n\n{original_value}"} ] try: # Add delay using the config instance from main() if config.DELAY > 0: # ✅ FIXED - use config.DELAY instead of config.SEND_INTERVAL time.sleep(config.DELAY) # Use the same client instance from main() # ✅ FIXED - Properly unpack tuple response and provide max_tokens content, finish_reason = client.send( messages, temperature=config.TEMP, max_tokens=config.MAX_OUTPUT_TOKENS # ✅ FIXED - provide max_tokens to avoid NoneType error ) translated_value = content.strip() # ✅ FIXED - use content from unpacked tuple metadata[f"original_{field_name}"] = original_value metadata[field_name] = translated_value metadata[f"{field_name}_translated"] = True print(f"✅ Translated {field_name}: {translated_value}") except Exception as e: print(f"❌ Failed to translate {field_name}: {e}") else: if check_stop(): print("❌ Metadata translation stopped by user") break else: print("📋 No additional metadata fields to translate") except Exception as e: print(f"⚠️ Error processing metadata translation settings: {e}") import traceback traceback.print_exc() with open(metadata_path, 'w', encoding='utf-8') as mf: json.dump(metadata, mf, ensure_ascii=False, indent=2) print(f"💾 Saved metadata with {'translated' if metadata.get('title_translated', False) else 'original'} title") print("\n" + "="*50) print("📑 GLOSSARY GENERATION PHASE") print("="*50) print(f"📑 DEBUG: ENABLE_AUTO_GLOSSARY = '{os.getenv('ENABLE_AUTO_GLOSSARY', 'NOT SET')}'") print(f"📑 DEBUG: MANUAL_GLOSSARY = '{config.MANUAL_GLOSSARY}'") print(f"📑 DEBUG: Manual glossary exists? {os.path.isfile(config.MANUAL_GLOSSARY) if config.MANUAL_GLOSSARY else False}") # Check if glossary.csv already exists in the source folder existing_glossary_csv = os.path.join(out, "glossary.csv") existing_glossary_json = os.path.join(out, "glossary.json") print(f"📑 DEBUG: Existing glossary.csv? {os.path.exists(existing_glossary_csv)}") print(f"📑 DEBUG: Existing glossary.json? {os.path.exists(existing_glossary_json)}") if config.MANUAL_GLOSSARY and os.path.isfile(config.MANUAL_GLOSSARY): ext = os.path.splitext(config.MANUAL_GLOSSARY)[1].lower() target_name = "glossary.csv" if ext == ".csv" else "glossary.json" target_path = os.path.join(out, target_name) if os.path.abspath(config.MANUAL_GLOSSARY) != os.path.abspath(target_path): shutil.copy(config.MANUAL_GLOSSARY, target_path) print("📑 Using manual glossary from:", config.MANUAL_GLOSSARY) else: print("📑 Using existing glossary:", config.MANUAL_GLOSSARY) elif os.path.exists(existing_glossary_csv) or os.path.exists(existing_glossary_json): print("📑 Existing glossary file detected in source folder - skipping automatic generation") if os.path.exists(existing_glossary_csv): print(f"📑 Using existing glossary.csv: {existing_glossary_csv}") elif os.path.exists(existing_glossary_json): print(f"📑 Using existing glossary.json: {existing_glossary_json}") elif os.getenv("ENABLE_AUTO_GLOSSARY", "0") == "1": model = os.getenv("MODEL", "gpt-4") if is_traditional_translation_api(model): print("📑 Automatic glossary generation disabled") print(f" {model} does not support glossary extraction") print(" Traditional translation APIs cannot identify character names/terms") else: print("📑 Starting automatic glossary generation...") try: # Use the new process-safe glossary worker from glossary_process_worker import generate_glossary_in_process import concurrent.futures import multiprocessing instructions = "" # Get extraction workers setting extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) if extraction_workers == 1: # Auto-detect for better performance extraction_workers = min(os.cpu_count() or 4, 4) print(f"📑 Using {extraction_workers} CPU cores for glossary generation") # Collect environment variables to pass to subprocess env_vars = {} important_vars = [ 'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES', 'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS', 'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'AUTO_GLOSSARY_PROMPT', 'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV', 'GLOSSARY_PARALLEL_ENABLED', 'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK', 'GLOSSARY_SKIP_ALL_VALIDATION', 'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY', 'MAX_OUTPUT_TOKENS', 'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY' ] for var in important_vars: if var in os.environ: env_vars[var] = os.environ[var] # Create a Queue for real-time log streaming manager = multiprocessing.Manager() log_queue = manager.Queue() # Use ProcessPoolExecutor for true parallelism (completely bypasses GIL) print("📑 Starting glossary generation in separate process...") with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor: # Submit to separate process WITH log queue future = executor.submit( generate_glossary_in_process, out, chapters, instructions, env_vars, log_queue # Pass the queue for real-time logs ) # Poll for completion and stream logs in real-time poll_count = 0 while not future.done(): poll_count += 1 # Check for logs from subprocess and print them immediately try: while not log_queue.empty(): log_line = log_queue.get_nowait() print(log_line) # Print to GUI except: pass # Super short sleep to yield to GUI time.sleep(0.001) # Check for stop every 100 polls if poll_count % 100 == 0: if check_stop(): print("📑 ❌ Glossary generation cancelled") executor.shutdown(wait=False, cancel_futures=True) return # Get any remaining logs from queue try: while not log_queue.empty(): log_line = log_queue.get_nowait() print(log_line) except: pass # Get result if future.done(): try: result = future.result(timeout=0.1) if isinstance(result, dict): if result.get('success'): print(f"📑 ✅ Glossary generation completed successfully") else: print(f"📑 ❌ Glossary generation failed: {result.get('error')}") if result.get('traceback'): print(f"📑 Error details:\n{result.get('traceback')}") except Exception as e: print(f"📑 ❌ Error retrieving glossary result: {e}") print("✅ Automatic glossary generation COMPLETED") # Handle deferred glossary appending if os.getenv('DEFER_GLOSSARY_APPEND') == '1': print("📑 Processing deferred glossary append to system prompt...") glossary_path = find_glossary_file(out) if glossary_path and os.path.exists(glossary_path): try: glossary_block = None if glossary_path.lower().endswith('.csv'): with open(glossary_path, 'r', encoding='utf-8') as f: glossary_block = f.read() else: with open(glossary_path, 'r', encoding='utf-8') as f: glossary_data = json.load(f) formatted_entries = {} if isinstance(glossary_data, dict) and 'entries' in glossary_data: formatted_entries = glossary_data['entries'] elif isinstance(glossary_data, dict): formatted_entries = {k: v for k, v in glossary_data.items() if k != "metadata"} if formatted_entries: glossary_block = json.dumps(formatted_entries, ensure_ascii=False, indent=2) else: glossary_block = None if glossary_block: glossary_prompt = os.getenv('GLOSSARY_APPEND_PROMPT', "Character/Term Glossary (use these translations consistently):") current_prompt = config.PROMPT if current_prompt: current_prompt += "\n\n" current_prompt += f"{glossary_prompt}\n{glossary_block}" config.PROMPT = current_prompt print(f"✅ Added auto-generated glossary to system prompt ({os.path.basename(glossary_path)})") if 'DEFER_GLOSSARY_APPEND' in os.environ: del os.environ['DEFER_GLOSSARY_APPEND'] if 'GLOSSARY_APPEND_PROMPT' in os.environ: del os.environ['GLOSSARY_APPEND_PROMPT'] else: print("⚠️ Auto-generated glossary has no entries - skipping append") if 'DEFER_GLOSSARY_APPEND' in os.environ: del os.environ['DEFER_GLOSSARY_APPEND'] if 'GLOSSARY_APPEND_PROMPT' in os.environ: del os.environ['GLOSSARY_APPEND_PROMPT'] except Exception as e: print(f"⚠️ Failed to append auto-generated glossary: {e}") else: print("⚠️ No glossary file found after automatic generation") except Exception as e: print(f"❌ Glossary generation failed: {e}") else: print("📑 Automatic glossary generation disabled") # Don't create an empty glossary - let any existing manual glossary remain glossary_file = find_glossary_file(out) if glossary_file and os.path.exists(glossary_file): try: if glossary_file.lower().endswith('.csv'): # Quick CSV stats with open(glossary_file, 'r', encoding='utf-8') as f: lines = [ln.strip() for ln in f.readlines() if ln.strip()] entry_count = max(0, len(lines) - 1) if lines and ',' in lines[0] else len(lines) print(f"📑 Glossary ready (CSV) with {entry_count} entries") print("📑 Sample glossary lines:") for ln in lines[1:4]: print(f" • {ln}") else: with open(glossary_file, 'r', encoding='utf-8') as f: glossary_data = json.load(f) if isinstance(glossary_data, dict): if 'entries' in glossary_data and isinstance(glossary_data['entries'], dict): entry_count = len(glossary_data['entries']) sample_items = list(glossary_data['entries'].items())[:3] else: entry_count = len(glossary_data) sample_items = list(glossary_data.items())[:3] print(f"📑 Glossary ready with {entry_count} entries") print("📑 Sample glossary entries:") for key, value in sample_items: print(f" • {key} → {value}") elif isinstance(glossary_data, list): print(f"📑 Glossary ready with {len(glossary_data)} entries") print("📑 Sample glossary entries:") for i, entry in enumerate(glossary_data[:3]): if isinstance(entry, dict): original = entry.get('original_name', '?') translated = entry.get('name', original) print(f" • {original} → {translated}") else: print(f"⚠️ Unexpected glossary format: {type(glossary_data)}") except Exception as e: print(f"⚠️ Failed to inspect glossary file: {e}") else: print("📑 No glossary file found") print("="*50) print("🚀 STARTING MAIN TRANSLATION PHASE") print("="*50 + "\n") glossary_path = find_glossary_file(out) if glossary_path and os.path.exists(glossary_path) and glossary_path.lower().endswith('.json'): try: with open(glossary_path, 'r', encoding='utf-8') as f: g_data = json.load(f) print(f"[DEBUG] Glossary type before translation: {type(g_data)}") if isinstance(g_data, list): print(f"[DEBUG] Glossary is a list") except Exception as e: print(f"[DEBUG] Error checking glossary: {e}") glossary_path = find_glossary_file(out) system = build_system_prompt(config.SYSTEM_PROMPT, glossary_path) base_msg = [{"role": "system", "content": system}] # Preserve the original system prompt to avoid in-place mutations original_system_prompt = system last_summary_block_text = None # Will hold the last rolling summary text for the NEXT chapter only image_translator = None if config.ENABLE_IMAGE_TRANSLATION: print(f"🖼️ Image translation enabled for model: {config.MODEL}") print("🖼️ Image translation will use your custom system prompt and glossary") image_translator = ImageTranslator( client, out, config.PROFILE_NAME, system, config.TEMP, log_callback , progress_manager, history_manager, chunk_context_manager ) known_vision_models = [ 'gemini-1.5-pro', 'gemini-1.5-flash', 'gemini-2.0-flash', 'gemini-2.5-flash', 'gemini-2.5-pro', 'gpt-4-turbo', 'gpt-4o', 'gpt-4.1-mini', 'gpt-4.1-nano', 'o4-mini', 'gpt-4.1-mini' ] if config.MODEL.lower() not in known_vision_models: print(f"⚠️ Note: {config.MODEL} may not have vision capabilities. Image translation will be attempted anyway.") else: print("ℹ️ Image translation disabled by user") total_chapters = len(chapters) # Only detect numbering if the toggle is not disabled if config.DISABLE_ZERO_DETECTION: print(f"📊 0-based detection disabled by user setting") uses_zero_based = False # Important: Set a flag that can be checked throughout the codebase config._force_disable_zero_detection = True else: if chapters: uses_zero_based = detect_novel_numbering(chapters) print(f"📊 Novel numbering detected: {'0-based' if uses_zero_based else '1-based'}") else: uses_zero_based = False config._force_disable_zero_detection = False # Store this for later use config._uses_zero_based = uses_zero_based rng = os.getenv("CHAPTER_RANGE", "") start = None end = None if rng and re.match(r"^\d+\s*-\s*\d+$", rng): start, end = map(int, rng.split("-", 1)) if config.DISABLE_ZERO_DETECTION: print(f"📊 0-based detection disabled - using range as specified: {start}-{end}") elif uses_zero_based: print(f"📊 0-based novel detected") print(f"📊 User range {start}-{end} will be used as-is (chapters are already adjusted)") else: print(f"📊 1-based novel detected") print(f"📊 Using range as specified: {start}-{end}") print("📊 Calculating total chunks needed...") total_chunks_needed = 0 chunks_per_chapter = {} chapters_to_process = 0 # When setting actual chapter numbers (in the main function) for idx, c in enumerate(chapters): chap_num = c["num"] content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) # Extract the raw chapter number from the file raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config) #print(f"[DEBUG] Extracted raw_num={raw_num} from {c.get('original_basename', 'unknown')}") # Apply the offset offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0 raw_num += offset # When toggle is disabled, use raw numbers without any 0-based adjustment if config.DISABLE_ZERO_DETECTION: c['actual_chapter_num'] = raw_num # Store raw number for consistency c['raw_chapter_num'] = raw_num c['zero_adjusted'] = False else: # Store raw number c['raw_chapter_num'] = raw_num # Apply adjustment only if this is a 0-based novel if uses_zero_based: c['actual_chapter_num'] = raw_num + 1 c['zero_adjusted'] = True else: c['actual_chapter_num'] = raw_num c['zero_adjusted'] = False # Now we can safely use actual_num actual_num = c['actual_chapter_num'] if start is not None: if not (start <= c['actual_chapter_num'] <= end): #print(f"[SKIP] Chapter {c['actual_chapter_num']} outside range {start}-{end}") continue needs_translation, skip_reason, _ = progress_manager.check_chapter_status( idx, actual_num, content_hash, out ) if not needs_translation: chunks_per_chapter[idx] = 0 continue chapters_to_process += 1 chapter_key = str(actual_num) if chapter_key in progress_manager.prog["chapters"] and progress_manager.prog["chapters"][chapter_key].get("status") == "in_progress": pass # Calculate based on OUTPUT limit only max_output_tokens = config.MAX_OUTPUT_TOKENS safety_margin_output = 500 # Korean to English typically compresses to 0.7-0.9x compression_factor = config.COMPRESSION_FACTOR available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor) # Ensure minimum available_tokens = max(available_tokens, 1000) #print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})") # For mixed content chapters, calculate on clean text # For mixed content chapters, calculate on clean text if c.get('has_images', False) and ContentProcessor.is_meaningful_text_content(c["body"]): # Don't modify c["body"] at all during chunk calculation # Just pass the body as-is, the chunking will be slightly off but that's OK chunks = chapter_splitter.split_chapter(c["body"], available_tokens) else: chunks = chapter_splitter.split_chapter(c["body"], available_tokens) chapter_key_str = content_hash old_key_str = str(idx) if chapter_key_str not in progress_manager.prog.get("chapter_chunks", {}) and old_key_str in progress_manager.prog.get("chapter_chunks", {}): progress_manager.prog["chapter_chunks"][chapter_key_str] = progress_manager.prog["chapter_chunks"][old_key_str] del progress_manager.prog["chapter_chunks"][old_key_str] #print(f"[PROGRESS] Migrated chunks for chapter {actual_num} to new tracking system") # Always count actual chunks - ignore "completed" tracking chunks_per_chapter[idx] = len(chunks) total_chunks_needed += chunks_per_chapter[idx] terminology = "Sections" if is_text_file else "Chapters" print(f"📊 Total chunks to translate: {total_chunks_needed}") print(f"📚 {terminology} to process: {chapters_to_process}") multi_chunk_chapters = [(idx, count) for idx, count in chunks_per_chapter.items() if count > 1] if multi_chunk_chapters: # Determine terminology based on file type terminology = "Sections" if is_text_file else "Chapters" print(f"📄 {terminology} requiring multiple chunks:") for idx, chunk_count in multi_chunk_chapters: chap = chapters[idx] section_term = "Section" if is_text_file else "Chapter" print(f" • {section_term} {idx+1} ({chap['title'][:30]}...): {chunk_count} chunks") translation_start_time = time.time() chunks_completed = 0 chapters_completed = 0 current_chunk_number = 0 if config.BATCH_TRANSLATION: print(f"\n📦 PARALLEL TRANSLATION MODE ENABLED") print(f"📦 Processing chapters with up to {config.BATCH_SIZE} concurrent API calls") import concurrent.futures from threading import Lock progress_lock = Lock() chapters_to_translate = [] # FIX: First pass to set actual chapter numbers for ALL chapters # This ensures batch mode has the same chapter numbering as non-batch mode print("📊 Setting chapter numbers...") for idx, c in enumerate(chapters): raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config) # Apply offset if configured offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0 raw_num += offset if config.DISABLE_ZERO_DETECTION: # Use raw numbers without adjustment c['actual_chapter_num'] = raw_num c['raw_chapter_num'] = raw_num c['zero_adjusted'] = False else: # Store raw number c['raw_chapter_num'] = raw_num # Apply 0-based adjustment if detected if uses_zero_based: c['actual_chapter_num'] = raw_num + 1 c['zero_adjusted'] = True else: c['actual_chapter_num'] = raw_num c['zero_adjusted'] = False for idx, c in enumerate(chapters): chap_num = c["num"] content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) # Check if this is a pre-split text chunk with decimal number if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): actual_num = c['num'] # Preserve the decimal for text files only else: actual_num = c.get('actual_chapter_num', c['num']) # Now this will exist! # Skip chapters outside the range if start is not None and not (start <= actual_num <= end): continue # Check if chapter needs translation needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status( idx, actual_num, content_hash, out, c # Pass the chapter object ) # Add explicit file check for supposedly completed chapters if not needs_translation and existing_file: file_path = os.path.join(out, existing_file) if not os.path.exists(file_path): print(f"⚠️ Output file missing for chapter {actual_num}: {existing_file}") needs_translation = True skip_reason = None # Update status to file_missing progress_manager.update(idx, actual_num, content_hash, None, status="file_missing") progress_manager.save() if not needs_translation: # Modify skip_reason to use appropriate terminology is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" # Replace "Chapter" with appropriate terminology in skip_reason skip_reason_modified = skip_reason.replace("Chapter", terminology) print(f"[SKIP] {skip_reason_modified}") chapters_completed += 1 continue # Check for empty or image-only chapters has_images = c.get('has_images', False) has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"]) text_size = c.get('file_size', 0) is_empty_chapter = (not has_images and text_size < 10) is_image_only_chapter = (has_images and not has_meaningful_text) # Handle empty chapters if is_empty_chapter: print(f"📄 Empty chapter {chap_num} - will process individually") safe_title = make_safe_filename(c['title'], c['num']) if isinstance(c['num'], float): fname = FileUtilities.create_chapter_filename(c, c['num']) else: fname = FileUtilities.create_chapter_filename(c, c['num']) with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: f.write(c["body"]) progress_manager.update(idx, actual_num, content_hash, fname, status="completed_empty") progress_manager.save() chapters_completed += 1 continue # Add to chapters to translate chapters_to_translate.append((idx, c)) print(f"📊 Found {len(chapters_to_translate)} chapters to translate in parallel") # Continue with the rest of the existing batch processing code... batch_processor = BatchTranslationProcessor( config, client, base_msg, out, progress_lock, progress_manager.save, lambda idx, actual_num, content_hash, output_file=None, status="completed", **kwargs: progress_manager.update(idx, actual_num, content_hash, output_file, status, **kwargs), check_stop, image_translator, is_text_file=is_text_file ) total_to_process = len(chapters_to_translate) processed = 0 # Apply conservative batching setting batch_multiplier = 3 if os.getenv('CONSERVATIVE_BATCHING', '0') == '1' else 1 batch_group_size = config.BATCH_SIZE * batch_multiplier if batch_multiplier > 1: print(f"📦 Using conservative batching: {batch_group_size} chapters per group, {config.BATCH_SIZE} parallel") else: print(f"📦 Using direct batching (default): {batch_group_size} chapters per group, {config.BATCH_SIZE} parallel") with concurrent.futures.ThreadPoolExecutor(max_workers=config.BATCH_SIZE) as executor: for batch_start in range(0, total_to_process, batch_group_size): if check_stop(): print("❌ Translation stopped during parallel processing") executor.shutdown(wait=False) return batch_end = min(batch_start + batch_group_size, total_to_process) current_batch = chapters_to_translate[batch_start:batch_end] batch_number = (batch_start // batch_group_size) + 1 print(f"\n📦 Submitting batch {batch_number}: {len(current_batch)} chapters") future_to_chapter = { executor.submit(batch_processor.process_single_chapter, chapter_data): chapter_data for chapter_data in current_batch } active_count = 0 completed_in_batch = 0 failed_in_batch = 0 for future in concurrent.futures.as_completed(future_to_chapter): if check_stop(): print("❌ Translation stopped") executor.shutdown(wait=False) return chapter_data = future_to_chapter[future] idx, chapter = chapter_data try: success, chap_num = future.result() if success: completed_in_batch += 1 print(f"✅ Chapter {chap_num} done ({completed_in_batch + failed_in_batch}/{len(current_batch)} in batch)") else: failed_in_batch += 1 print(f"❌ Chapter {chap_num} failed ({completed_in_batch + failed_in_batch}/{len(current_batch)} in batch)") except Exception as e: failed_in_batch += 1 print(f"❌ Chapter thread error: {e}") processed += 1 progress_percent = (processed / total_to_process) * 100 print(f"📊 Overall Progress: {processed}/{total_to_process} ({progress_percent:.1f}%)") print(f"\n📦 Batch Summary:") print(f" ✅ Successful: {completed_in_batch}") print(f" ❌ Failed: {failed_in_batch}") if batch_end < total_to_process: print(f"⏳ Waiting {config.DELAY}s before next batch...") time.sleep(config.DELAY) chapters_completed = batch_processor.chapters_completed chunks_completed = batch_processor.chunks_completed print(f"\n🎉 Parallel translation complete!") print(f" Total chapters processed: {processed}") # Count qa_failed chapters correctly qa_failed_count = 0 actual_successful = 0 for idx, c in enumerate(chapters): # Get the chapter's actual number if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): actual_num = c['num'] else: actual_num = c.get('actual_chapter_num', c['num']) # Check if this chapter was processed and has qa_failed status content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) # Check if this chapter exists in progress chapter_info = progress_manager.prog["chapters"].get(content_hash, {}) status = chapter_info.get("status") if status == "qa_failed": qa_failed_count += 1 elif status == "completed": actual_successful += 1 # Correct the displayed counts print(f" Successful: {actual_successful}") if qa_failed_count > 0: print(f"\n⚠️ {qa_failed_count} chapters failed due to content policy violations:") qa_failed_chapters = [] for idx, c in enumerate(chapters): if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): actual_num = c['num'] else: actual_num = c.get('actual_chapter_num', c['num']) content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) chapter_info = progress_manager.prog["chapters"].get(content_hash, {}) if chapter_info.get("status") == "qa_failed": qa_failed_chapters.append(actual_num) print(f" Failed chapters: {', '.join(map(str, sorted(qa_failed_chapters)))}") # Stop translation completely after batch mode print("\n📌 Batch translation completed.") elif not config.BATCH_TRANSLATION: translation_processor = TranslationProcessor(config, client, out, log_callback, check_stop, uses_zero_based, is_text_file) if config.DUPLICATE_DETECTION_MODE == 'ai-hunter': # Build the main config from environment variables and config object main_config = { 'duplicate_lookback_chapters': config.DUPLICATE_LOOKBACK_CHAPTERS, 'duplicate_detection_mode': config.DUPLICATE_DETECTION_MODE, } # Check if AI Hunter config was passed via environment variable ai_hunter_config_str = os.getenv('AI_HUNTER_CONFIG') if ai_hunter_config_str: try: ai_hunter_config = json.loads(ai_hunter_config_str) main_config['ai_hunter_config'] = ai_hunter_config print("🤖 AI Hunter: Loaded configuration from environment") except json.JSONDecodeError: print("⚠️ AI Hunter: Failed to parse AI_HUNTER_CONFIG from environment") # If no AI Hunter config in environment, try to load from file as fallback if 'ai_hunter_config' not in main_config: # Try multiple locations for config.json config_paths = [ os.path.join(os.getcwd(), 'config.json'), os.path.join(out, '..', 'config.json'), ] if getattr(sys, 'frozen', False): config_paths.append(os.path.join(os.path.dirname(sys.executable), 'config.json')) else: script_dir = os.path.dirname(os.path.abspath(__file__)) config_paths.extend([ os.path.join(script_dir, 'config.json'), os.path.join(os.path.dirname(script_dir), 'config.json') ]) for config_path in config_paths: if os.path.exists(config_path): try: with open(config_path, 'r', encoding='utf-8') as f: file_config = json.load(f) if 'ai_hunter_config' in file_config: main_config['ai_hunter_config'] = file_config['ai_hunter_config'] print(f"🤖 AI Hunter: Loaded configuration from {config_path}") break except Exception as e: print(f"⚠️ Failed to load config from {config_path}: {e}") # Always create and inject the improved AI Hunter when ai-hunter mode is selected ai_hunter = ImprovedAIHunterDetection(main_config) # The TranslationProcessor class has a method that checks for duplicates # We need to replace it with our enhanced AI Hunter # Create a wrapper to match the expected signature def enhanced_duplicate_check(self, result, idx, prog, out, actual_num=None): # If actual_num is not provided, try to get it from progress if actual_num is None: # Look for the chapter being processed for ch_key, ch_info in prog.get("chapters", {}).items(): if ch_info.get("chapter_idx") == idx: actual_num = ch_info.get("actual_num", idx + 1) break # Fallback to idx+1 if not found if actual_num is None: actual_num = idx + 1 return ai_hunter.detect_duplicate_ai_hunter_enhanced(result, idx, prog, out, actual_num) # Bind the enhanced method to the processor instance translation_processor.check_duplicate_content = enhanced_duplicate_check.__get__(translation_processor, TranslationProcessor) print("🤖 AI Hunter: Using enhanced detection with configurable thresholds") # First pass: set actual chapter numbers respecting the config for idx, c in enumerate(chapters): raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config) #print(f"[DEBUG] Extracted raw_num={raw_num} from {c.get('original_basename', 'unknown')}") # Apply offset if configured offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0 raw_num += offset if config.DISABLE_ZERO_DETECTION: # Use raw numbers without adjustment c['actual_chapter_num'] = raw_num c['raw_chapter_num'] = raw_num c['zero_adjusted'] = False else: # Store raw number c['raw_chapter_num'] = raw_num # Apply 0-based adjustment if detected if uses_zero_based: c['actual_chapter_num'] = raw_num + 1 c['zero_adjusted'] = True else: c['actual_chapter_num'] = raw_num c['zero_adjusted'] = False # Second pass: process chapters for idx, c in enumerate(chapters): chap_num = c["num"] # Check if this is a pre-split text chunk with decimal number if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): actual_num = c['num'] # Preserve the decimal for text files only else: actual_num = c.get('actual_chapter_num', c['num']) content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) if start is not None and not (start <= actual_num <= end): #print(f"[SKIP] Chapter {actual_num} (file: {c.get('original_basename', 'unknown')}) outside range {start}-{end}") continue needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status( idx, actual_num, content_hash, out, c # Pass the chapter object ) # Add explicit file check for supposedly completed chapters if not needs_translation and existing_file: file_path = os.path.join(out, existing_file) if not os.path.exists(file_path): print(f"⚠️ Output file missing for chapter {actual_num}: {existing_file}") needs_translation = True skip_reason = None # Update status to file_missing progress_manager.update(idx, actual_num, content_hash, None, status="file_missing") progress_manager.save() if not needs_translation: # Modify skip_reason to use appropriate terminology is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" # Replace "Chapter" with appropriate terminology in skip_reason skip_reason_modified = skip_reason.replace("Chapter", terminology) print(f"[SKIP] {skip_reason_modified}") continue chapter_position = f"{chapters_completed + 1}/{chapters_to_process}" # Determine if this is a text file is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" # Determine file reference based on type if c.get('is_chunk', False): file_ref = f"Section_{c['num']}" else: file_ref = c.get('original_basename', f'{terminology}_{actual_num}') print(f"\n🔄 Processing #{idx+1}/{total_chapters} (Actual: {terminology} {actual_num}) ({chapter_position} to translate): {c['title']} [File: {file_ref}]") chunk_context_manager.start_chapter(chap_num, c['title']) has_images = c.get('has_images', False) has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"]) text_size = c.get('file_size', 0) is_empty_chapter = (not has_images and text_size < 10) is_image_only_chapter = (has_images and not has_meaningful_text) is_mixed_content = (has_images and has_meaningful_text) is_text_only = (not has_images and has_meaningful_text) if is_empty_chapter: print(f"📄 Empty chapter {actual_num} detected") # Create filename for empty chapter if isinstance(c['num'], float): fname = FileUtilities.create_chapter_filename(c, c['num']) else: fname = FileUtilities.create_chapter_filename(c, actual_num) # Save original content with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: f.write(c["body"]) # Update progress tracking progress_manager.update(idx, actual_num, content_hash, fname, status="completed_empty") progress_manager.save() chapters_completed += 1 # CRITICAL: Skip translation! continue elif is_image_only_chapter: print(f"📸 Image-only chapter: {c.get('image_count', 0)} images") translated_html = c["body"] image_translations = {} # Step 1: Process images if image translation is enabled if image_translator and config.ENABLE_IMAGE_TRANSLATION: print(f"🖼️ Translating {c.get('image_count', 0)} images...") image_translator.set_current_chapter(chap_num) translated_html, image_translations = process_chapter_images( c["body"], actual_num, image_translator, check_stop ) if image_translations: print(f"✅ Translated {len(image_translations)} images") # Step 2: Check for headers/titles that need translation from bs4 import BeautifulSoup soup = BeautifulSoup(c["body"], 'html.parser') # Look for headers headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']) # If we have headers, we should translate them even in "image-only" chapters if headers and any(h.get_text(strip=True) for h in headers): print(f"📝 Found headers to translate in image-only chapter") # Create a minimal HTML with just the headers for translation headers_html = "" for header in headers: if header.get_text(strip=True): headers_html += str(header) + "\n" if headers_html: print(f"📤 Translating chapter headers...") # Send just the headers for translation header_msgs = base_msg + [{"role": "user", "content": headers_html}] # Use the standard filename fname = FileUtilities.create_chapter_filename(c, actual_num) client.set_output_filename(fname) # Simple API call for headers header_result, _ = client.send( header_msgs, temperature=config.TEMP, max_tokens=config.MAX_OUTPUT_TOKENS ) if header_result: # Clean the result header_result = re.sub(r"^```(?:html)?\s*\n?", "", header_result, count=1, flags=re.MULTILINE) header_result = re.sub(r"\n?```\s*$", "", header_result, count=1, flags=re.MULTILINE) # Parse both the translated headers and the original body soup_headers = BeautifulSoup(header_result, 'html.parser') soup_body = BeautifulSoup(translated_html, 'html.parser') # Replace headers in the body with translated versions translated_headers = soup_headers.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']) original_headers = soup_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']) # Match and replace headers for orig, trans in zip(original_headers, translated_headers): if trans and trans.get_text(strip=True): orig.string = trans.get_text(strip=True) translated_html = str(soup_body) print(f"✅ Headers translated successfully") status = "completed" else: print(f"⚠️ Failed to translate headers") status = "completed_image_only" else: status = "completed_image_only" else: print(f"ℹ️ No headers found to translate") status = "completed_image_only" # Step 3: Save with correct filename fname = FileUtilities.create_chapter_filename(c, actual_num) with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: f.write(translated_html) print(f"[Chapter {idx+1}/{total_chapters}] ✅ Saved image-only chapter") progress_manager.update(idx, actual_num, content_hash, fname, status=status) progress_manager.save() chapters_completed += 1 continue else: # Set default text to translate text_to_translate = c["body"] image_translations = {} if is_mixed_content and image_translator and config.ENABLE_IMAGE_TRANSLATION: print(f"🖼️ Processing {c.get('image_count', 0)} images first...") print(f"[DEBUG] Content before image processing (first 200 chars):") print(c["body"][:200]) print(f"[DEBUG] Has h1 tags: {'

    ' in c['body']}") print(f"[DEBUG] Has h2 tags: {'

    ' in c['body']}") image_translator.set_current_chapter(chap_num) # Store the original body before processing original_body = c["body"] # Calculate original chapter tokens before modification original_chapter_tokens = chapter_splitter.count_tokens(original_body) # Process images and get body with translations body_with_images, image_translations = process_chapter_images( c["body"], actual_num, image_translator, check_stop ) if image_translations: print(f"✅ Translated {len(image_translations)} images") # Store the body with images for later merging c["body_with_images"] = c["body"] # For chapters with only images and title, we still need to translate the title # Extract clean text for translation from ORIGINAL body from bs4 import BeautifulSoup soup_clean = BeautifulSoup(original_body, 'html.parser') # Remove images from the original to get pure text for img in soup_clean.find_all('img'): img.decompose() # Set clean text for translation - use prettify() or str() on the full document c["body"] = str(soup_clean) if soup_clean.body else original_body # If there's no meaningful text content after removing images, # the text translation will just translate the title, which is correct print(f" 📝 Clean text for translation: {len(c['body'])} chars") # Update text_size to reflect actual text to translate text_size = len(c["body"]) # Recalculate the actual token count for clean text actual_text_tokens = chapter_splitter.count_tokens(c["body"]) print(f" 📊 Actual text tokens: {actual_text_tokens} (was counting {original_chapter_tokens} with images)") else: print(f"ℹ️ No translatable text found in images") # Keep original body if no image translations c["body"] = original_body print(f"📖 Translating text content ({text_size} characters)") progress_manager.update(idx, actual_num, content_hash, output_file=None, status="in_progress") progress_manager.save() # Apply ignore filtering to the content before chunk splitting batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active if (ignore_title_tag or ignore_header_tags) and c["body"]: from bs4 import BeautifulSoup content_soup = BeautifulSoup(c["body"], 'html.parser') # Remove title tags if ignored if ignore_title_tag: for title_tag in content_soup.find_all('title'): title_tag.decompose() # Remove header tags if ignored if ignore_header_tags: for header_tag in content_soup.find_all(['h1', 'h2', 'h3']): header_tag.decompose() c["body"] = str(content_soup) # Update the chapter body # Check if this chapter is already a chunk from text file splitting if c.get('is_chunk', False): # This is already a pre-split chunk, but still check if it needs further splitting # Calculate based on OUTPUT limit only max_output_tokens = config.MAX_OUTPUT_TOKENS safety_margin_output = 500 # CJK to English typically compresses to 0.7-0.9x compression_factor = config.COMPRESSION_FACTOR available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor) # Ensure minimum available_tokens = max(available_tokens, 1000) print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})") chapter_tokens = chapter_splitter.count_tokens(c["body"]) if chapter_tokens > available_tokens: # Even pre-split chunks might need further splitting chunks = chapter_splitter.split_chapter(c["body"], available_tokens) print(f"📄 Section {c['num']} (pre-split from text file) needs further splitting into {len(chunks)} chunks") else: chunks = [(c["body"], 1, 1)] print(f"📄 Section {c['num']} (pre-split from text file)") else: # Normal splitting logic for non-text files # Calculate based on OUTPUT limit only max_output_tokens = config.MAX_OUTPUT_TOKENS safety_margin_output = 500 # CJK to English typically compresses to 0.7-0.9x compression_factor = config.COMPRESSION_FACTOR available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor) # Ensure minimum available_tokens = max(available_tokens, 1000) print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})") chunks = chapter_splitter.split_chapter(c["body"], available_tokens) # Use consistent terminology is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" print(f"📄 {terminology} will be processed in {len(chunks)} chunk(s)") # Recalculate tokens on the actual text to be translated actual_chapter_tokens = chapter_splitter.count_tokens(c["body"]) if len(chunks) > 1: is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" print(f" ℹ️ {terminology} size: {actual_chapter_tokens:,} tokens (limit: {available_tokens:,} tokens per chunk)") else: is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" print(f" ℹ️ {terminology} size: {actual_chapter_tokens:,} tokens (within limit of {available_tokens:,} tokens)") chapter_key_str = str(idx) if chapter_key_str not in progress_manager.prog["chapter_chunks"]: progress_manager.prog["chapter_chunks"][chapter_key_str] = { "total": len(chunks), "completed": [], "chunks": {} } progress_manager.prog["chapter_chunks"][chapter_key_str]["total"] = len(chunks) translated_chunks = [] for chunk_html, chunk_idx, total_chunks in chunks: chapter_key_str = content_hash old_key_str = str(idx) if chapter_key_str not in progress_manager.prog.get("chapter_chunks", {}) and old_key_str in progress_manager.prog.get("chapter_chunks", {}): progress_manager.prog["chapter_chunks"][chapter_key_str] = progress_manager.prog["chapter_chunks"][old_key_str] del progress_manager.prog["chapter_chunks"][old_key_str] #print(f"[PROGRESS] Migrated chunks for chapter {chap_num} to new tracking system") if chapter_key_str not in progress_manager.prog["chapter_chunks"]: progress_manager.prog["chapter_chunks"][chapter_key_str] = { "total": len(chunks), "completed": [], "chunks": {} } progress_manager.prog["chapter_chunks"][chapter_key_str]["total"] = len(chunks) # Get chapter status to check for qa_failed chapter_info = progress_manager.prog["chapters"].get(chapter_key_str, {}) chapter_status = chapter_info.get("status") if chapter_status == "qa_failed": # Force retranslation of qa_failed chapters print(f" [RETRY] Chunk {chunk_idx}/{total_chunks} - retranslating due to QA failure") if config.CONTEXTUAL and history_manager.will_reset_on_next_append(config.HIST_LIMIT): print(f" 📌 History will reset after this chunk (current: {len(history_manager.load_history())//2}/{config.HIST_LIMIT} exchanges)") if check_stop(): print(f"❌ Translation stopped during chapter {actual_num}, chunk {chunk_idx}") return current_chunk_number += 1 progress_percent = (current_chunk_number / total_chunks_needed) * 100 if total_chunks_needed > 0 else 0 if chunks_completed > 0: elapsed_time = time.time() - translation_start_time avg_time_per_chunk = elapsed_time / chunks_completed remaining_chunks = total_chunks_needed - current_chunk_number + 1 eta_seconds = remaining_chunks * avg_time_per_chunk eta_hours = int(eta_seconds // 3600) eta_minutes = int((eta_seconds % 3600) // 60) eta_str = f"{eta_hours}h {eta_minutes}m" if eta_hours > 0 else f"{eta_minutes}m" else: eta_str = "calculating..." if total_chunks > 1: print(f" 🔄 Translating chunk {chunk_idx}/{total_chunks} for #{idx+1} (Overall: {current_chunk_number}/{total_chunks_needed} - {progress_percent:.1f}% - ETA: {eta_str})") print(f" ⏳ Chunk size: {len(chunk_html):,} characters (~{chapter_splitter.count_tokens(chunk_html):,} tokens)") else: # Determine terminology and file reference is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" # Consistent file reference if c.get('is_chunk', False): file_ref = f"Section_{c['num']}" else: file_ref = c.get('original_basename', f'{terminology}_{actual_num}') print(f" 📄 Translating {terminology.lower()} content (Overall: {current_chunk_number}/{total_chunks_needed} - {progress_percent:.1f}% - ETA: {eta_str}) [File: {file_ref}]") print(f" 📊 {terminology} {actual_num} size: {len(chunk_html):,} characters (~{chapter_splitter.count_tokens(chunk_html):,} tokens)") print(f" ℹ️ This may take 30-60 seconds. Stop will take effect after completion.") if log_callback: if hasattr(log_callback, '__self__') and hasattr(log_callback.__self__, 'append_chunk_progress'): if total_chunks == 1: # Determine terminology based on source type is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" log_callback.__self__.append_chunk_progress( 1, 1, "text", f"{terminology} {actual_num}", overall_current=current_chunk_number, overall_total=total_chunks_needed, extra_info=f"{len(chunk_html):,} chars" ) else: log_callback.__self__.append_chunk_progress( chunk_idx, total_chunks, "text", f"{terminology} {actual_num}", overall_current=current_chunk_number, overall_total=total_chunks_needed ) else: # Determine terminology based on source type is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" terminology_lower = "section" if is_text_source else "chapter" if total_chunks == 1: log_callback(f"📄 Processing {terminology} {actual_num} ({chapters_completed + 1}/{chapters_to_process}) - {progress_percent:.1f}% complete") else: log_callback(f"📄 processing chunk {chunk_idx}/{total_chunks} for {terminology_lower} {actual_num} - {progress_percent:.1f}% complete") # Get custom chunk prompt template from environment chunk_prompt_template = os.getenv("TRANSLATION_CHUNK_PROMPT", "[PART {chunk_idx}/{total_chunks}]\n{chunk_html}") if total_chunks > 1: user_prompt = chunk_prompt_template.format( chunk_idx=chunk_idx, total_chunks=total_chunks, chunk_html=chunk_html ) else: user_prompt = chunk_html if config.CONTEXTUAL: history = history_manager.load_history() trimmed = history[-config.HIST_LIMIT*2:] chunk_context = chunk_context_manager.get_context_messages(limit=2) else: history = [] # Set empty history when not contextual trimmed = [] chunk_context = [] # Build the current system prompt from the original each time, and append the last summary block if present current_system_content = original_system_prompt if config.USE_ROLLING_SUMMARY and last_summary_block_text: current_system_content = ( current_system_content + "\n\n[Rolling Summary of Previous Chapter]\n" + "(For AI: Use as context only; do not include in output)\n" + last_summary_block_text + "\n[End of Rolling Summary]" ) current_base = [{"role": "system", "content": current_system_content}] # If we have a prepared rolling summary from previous chapter, include it as a separate message (do NOT mutate system prompt) summary_msgs_list = [] if config.USE_ROLLING_SUMMARY and last_summary_block_text: summary_msgs_list = [{ "role": os.getenv("SUMMARY_ROLE", "user"), "content": ( "CONTEXT ONLY - DO NOT INCLUDE IN TRANSLATION:\n" "[MEMORY] Previous context summary:\n\n" f"{last_summary_block_text}\n\n" "[END MEMORY]\n" "END OF CONTEXT - BEGIN ACTUAL CONTENT TO TRANSLATE:" ) }] msgs = current_base + summary_msgs_list + chunk_context + trimmed + [{"role": "user", "content": user_prompt}] c['__index'] = idx c['__progress'] = progress_manager.prog c['history_manager'] = history_manager result, finish_reason = translation_processor.translate_with_retry( msgs, chunk_html, c, chunk_idx, total_chunks ) if result is None: progress_manager.update(idx, actual_num, content_hash, output_file=None, status="failed") progress_manager.save() continue if config.REMOVE_AI_ARTIFACTS: result = ContentProcessor.clean_ai_artifacts(result, True) if config.EMERGENCY_RESTORE: result = ContentProcessor.emergency_restore_paragraphs(result, chunk_html) if config.REMOVE_AI_ARTIFACTS: lines = result.split('\n') json_line_count = 0 for i, line in enumerate(lines[:5]): if line.strip() and any(pattern in line for pattern in [ '"role":', '"content":', '"messages":', '{"role"', '{"content"', '[{', '}]' ]): json_line_count = i + 1 else: break if json_line_count > 0 and json_line_count < len(lines): remaining = '\n'.join(lines[json_line_count:]) if remaining.strip() and len(remaining) > 100: result = remaining print(f"✂️ Removed {json_line_count} lines of JSON artifacts") result = re.sub(r'\[PART \d+/\d+\]\s*', '', result, flags=re.IGNORECASE) translated_chunks.append((result, chunk_idx, total_chunks)) chunk_context_manager.add_chunk(user_prompt, result, chunk_idx, total_chunks) progress_manager.prog["chapter_chunks"][chapter_key_str]["completed"].append(chunk_idx) progress_manager.prog["chapter_chunks"][chapter_key_str]["chunks"][str(chunk_idx)] = result progress_manager.save() chunks_completed += 1 will_reset = history_manager.will_reset_on_next_append( config.HIST_LIMIT if config.CONTEXTUAL else 0, config.TRANSLATION_HISTORY_ROLLING ) history = history_manager.append_to_history( user_prompt, result, config.HIST_LIMIT if config.CONTEXTUAL else 0, reset_on_limit=True, rolling_window=config.TRANSLATION_HISTORY_ROLLING ) if chunk_idx < total_chunks: # Handle float delays while checking for stop full_seconds = int(config.DELAY) fractional_second = config.DELAY - full_seconds # Check stop signal every second for full seconds for i in range(full_seconds): if check_stop(): print("❌ Translation stopped during delay") return time.sleep(1) # Handle the fractional part if any if fractional_second > 0: if check_stop(): print("❌ Translation stopped during delay") return time.sleep(fractional_second) if check_stop(): print(f"❌ Translation stopped before saving chapter {actual_num}") return if len(translated_chunks) > 1: print(f" 📎 Merging {len(translated_chunks)} chunks...") translated_chunks.sort(key=lambda x: x[1]) merged_result = chapter_splitter.merge_translated_chunks(translated_chunks) else: merged_result = translated_chunks[0][0] if translated_chunks else "" if config.CONTEXTUAL and len(translated_chunks) > 1: user_summary, assistant_summary = chunk_context_manager.get_summary_for_history() if user_summary and assistant_summary: history_manager.append_to_history( user_summary, assistant_summary, config.HIST_LIMIT, reset_on_limit=False, rolling_window=config.TRANSLATION_HISTORY_ROLLING ) print(f" 📝 Added chapter summary to history") chunk_context_manager.clear() # For text file chunks, ensure we pass the decimal number if is_text_file and c.get('is_chunk', False) and isinstance(c.get('num'), float): fname = FileUtilities.create_chapter_filename(c, c['num']) # Use the decimal num directly else: fname = FileUtilities.create_chapter_filename(c, actual_num) client.set_output_filename(fname) cleaned = re.sub(r"^```(?:html)?\s*\n?", "", merged_result, count=1, flags=re.MULTILINE) cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE) cleaned = ContentProcessor.clean_ai_artifacts(cleaned, remove_artifacts=config.REMOVE_AI_ARTIFACTS) if is_mixed_content and image_translations: print(f"🔀 Merging {len(image_translations)} image translations with text...") from bs4 import BeautifulSoup # Parse the translated text (which has the translated title/header) soup_translated = BeautifulSoup(cleaned, 'html.parser') # For each image translation, insert it into the document for img_path, translation_html in image_translations.items(): if translation_html and ' 0: combined.write(f"\n\n{'='*50}\n\n") # Write the original chapter title (without Part X/Y suffix) original_title = chapter_data['title'] # Remove the (Part X/Y) suffix if present if ' (Part ' in original_title: original_title = original_title.split(' (Part ')[0] combined.write(f"{original_title}\n\n") # Add the chunk content combined.write(content) # Add spacing between chunks of the same chapter if chunk_idx < total_chunks: combined.write("\n\n") else: # This is a standalone chapter current_main_chapter = chapter_data['num'] # Add separator if not first chapter if i > 0: combined.write(f"\n\n{'='*50}\n\n") # Write the chapter title combined.write(f"{chapter_data['title']}\n\n") # Add the content combined.write(content) print(f" • Combined file with preserved sections: {combined_path}") total_time = time.time() - translation_start_time hours = int(total_time // 3600) minutes = int((total_time % 3600) // 60) seconds = int(total_time % 60) print(f"\n⏱️ Total translation time: {hours}h {minutes}m {seconds}s") print(f"📊 Chapters completed: {chapters_completed}") print(f"✅ Text file translation complete!") if log_callback: log_callback(f"✅ Text file translation complete! Created {combined_path}") except Exception as e: print(f"❌ Error creating combined text file: {e}") if log_callback: log_callback(f"❌ Error creating combined text file: {e}") else: print("🔍 Checking for translated chapters...") # Respect retain extension toggle: if enabled, don't look for response_ prefix if should_retain_source_extension(): response_files = [f for f in os.listdir(out) if f.endswith('.html') and not f.startswith('chapter_')] else: response_files = [f for f in os.listdir(out) if f.startswith('response_') and f.endswith('.html')] chapter_files = [f for f in os.listdir(out) if f.startswith('chapter_') and f.endswith('.html')] if not response_files and chapter_files: if should_retain_source_extension(): print(f"⚠️ No translated files found, but {len(chapter_files)} original chapters exist") print("ℹ️ Retain-source-extension mode is ON: skipping placeholder creation and using original files for EPUB compilation.") else: print(f"⚠️ No translated files found, but {len(chapter_files)} original chapters exist") print("📝 Creating placeholder response files for EPUB compilation...") for chapter_file in chapter_files: response_file = chapter_file.replace('chapter_', 'response_', 1) src = os.path.join(out, chapter_file) dst = os.path.join(out, response_file) try: with open(src, 'r', encoding='utf-8') as f: content = f.read() soup = BeautifulSoup(content, 'html.parser') notice = soup.new_tag('p') notice.string = "[Note: This chapter could not be translated - showing original content]" notice['style'] = "color: red; font-style: italic;" if soup.body: soup.body.insert(0, notice) with open(dst, 'w', encoding='utf-8') as f: f.write(str(soup)) except Exception as e: print(f"⚠️ Error processing {chapter_file}: {e}") try: shutil.copy2(src, dst) except: pass print(f"✅ Created {len(chapter_files)} placeholder response files") print("⚠️ Note: The EPUB will contain untranslated content") print("📘 Building final EPUB…") try: from epub_converter import fallback_compile_epub fallback_compile_epub(out, log_callback=log_callback) print("✅ All done: your final EPUB is in", out) total_time = time.time() - translation_start_time hours = int(total_time // 3600) minutes = int((total_time % 3600) // 60) seconds = int(total_time % 60) print(f"\n📊 Translation Statistics:") print(f" • Total chunks processed: {chunks_completed}") print(f" • Total time: {hours}h {minutes}m {seconds}s") if chunks_completed > 0: avg_time = total_time / chunks_completed print(f" • Average time per chunk: {avg_time:.1f} seconds") stats = progress_manager.get_stats(out) print(f"\n📊 Progress Tracking Summary:") print(f" • Total chapters tracked: {stats['total_tracked']}") print(f" • Successfully completed: {stats['completed']}") print(f" • Missing files: {stats['missing_files']}") print(f" • In progress: {stats['in_progress']}") except Exception as e: print("❌ EPUB build failed:", e) print("TRANSLATION_COMPLETE_SIGNAL") if __name__ == "__main__": main()