# TransateKRtoEN.py
# -*- coding: utf-8 -*-
import json
import logging
import shutil
import threading
import queue
import uuid
import inspect
import os, sys, io, zipfile, time, re, mimetypes, subprocess, tiktoken
import builtins
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
try:
from bs4 import XMLParsedAsHTMLWarning
import warnings
# Suppress the warning since we handle both HTML and XHTML content
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
except ImportError:
# Older versions of BeautifulSoup might not have this warning
pass
from collections import Counter
from unified_api_client import UnifiedClient, UnifiedClientError
import hashlib
import tempfile
import unicodedata
from difflib import SequenceMatcher
import unicodedata
import re
import time
from history_manager import HistoryManager
from chapter_splitter import ChapterSplitter
from image_translator import ImageTranslator
from typing import Dict, List, Tuple
from txt_processor import TextFileProcessor
from ai_hunter_enhanced import ImprovedAIHunterDetection
import csv
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
# Module-level functions for ProcessPoolExecutor compatibility
def _check_sentence_batch_for_terms(args):
"""Check a batch of sentences for term matches - used by ProcessPoolExecutor"""
batch_sentences, terms = args
filtered = []
# Use pre-compiled term list for fast checking
for sentence in batch_sentences:
# Quick check using any() - stops at first match
if any(term in sentence for term in terms):
filtered.append(sentence)
return filtered
def _process_sentence_batch_for_extraction(args):
"""Process sentences to extract terms - used by ProcessPoolExecutor"""
batch_sentences, batch_idx, combined_pattern, exclude_check_data = args
from collections import Counter
import re
local_word_freq = Counter()
local_important = []
local_seen = set()
# Rebuild the exclusion check function from data
honorifics_to_exclude, title_patterns_str, common_words, chinese_nums = exclude_check_data
title_patterns = [re.compile(p) for p in title_patterns_str]
def should_exclude_term(term):
term_lower = term.lower()
# Check if it's a common word
if term in common_words or term_lower in common_words:
return True
# Check if it contains honorifics
for honorific in honorifics_to_exclude:
if honorific in term or (honorific.startswith('-') and term.endswith(honorific[1:])):
return True
# Check if it matches title patterns
for pattern in title_patterns:
if pattern.search(term):
return True
# Check if it's a number
if term in chinese_nums or term.isdigit():
return True
return False
for sentence in batch_sentences:
sentence = sentence.strip()
if len(sentence) < 10 or len(sentence) > 500:
continue
# Find all potential terms in this sentence
matches = re.findall(combined_pattern, sentence)
if matches:
# Filter out excluded terms
filtered_matches = []
for match in matches:
if not should_exclude_term(match):
local_word_freq[match] += 1
filtered_matches.append(match)
# Keep sentences with valid potential terms
if filtered_matches:
sentence_key = ' '.join(sorted(filtered_matches))
if sentence_key not in local_seen:
local_important.append(sentence)
local_seen.add(sentence_key)
return local_word_freq, local_important, local_seen, batch_idx
from tqdm import tqdm
def is_traditional_translation_api(model: str) -> bool:
"""Check if the model is a traditional translation API"""
return model in ['deepl', 'google-translate', 'google-translate-free'] or model.startswith('deepl/') or model.startswith('google-translate/')
def get_chapter_terminology(is_text_file, chapter_data=None):
"""Get appropriate terminology (Chapter/Section) based on source type"""
if is_text_file:
return "Section"
if chapter_data:
if chapter_data.get('filename', '').endswith('.txt') or chapter_data.get('is_chunk', False):
return "Section"
return "Chapter"
# =====================================================
# CONFIGURATION AND ENVIRONMENT MANAGEMENT
# =====================================================
class TranslationConfig:
"""Centralized configuration management"""
def __init__(self):
self.MODEL = os.getenv("MODEL", "gemini-1.5-flash")
self.input_path = os.getenv("input_path", "default.epub")
self.PROFILE_NAME = os.getenv("PROFILE_NAME", "korean").lower()
self.CONTEXTUAL = os.getenv("CONTEXTUAL", "1") == "1"
self.DELAY = float(os.getenv("SEND_INTERVAL_SECONDS", "1"))
self.SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "").strip()
self.REMOVE_AI_ARTIFACTS = os.getenv("REMOVE_AI_ARTIFACTS", "0") == "1"
self.TEMP = float(os.getenv("TRANSLATION_TEMPERATURE", "0.3"))
self.HIST_LIMIT = int(os.getenv("TRANSLATION_HISTORY_LIMIT", "20"))
self.MAX_OUTPUT_TOKENS = int(os.getenv("MAX_OUTPUT_TOKENS", "8192"))
self.EMERGENCY_RESTORE = os.getenv("EMERGENCY_PARAGRAPH_RESTORE", "1") == "1"
self.BATCH_TRANSLATION = os.getenv("BATCH_TRANSLATION", "0") == "1"
self.BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))
self.ENABLE_IMAGE_TRANSLATION = os.getenv("ENABLE_IMAGE_TRANSLATION", "1") == "1"
self.TRANSLATE_BOOK_TITLE = os.getenv("TRANSLATE_BOOK_TITLE", "1") == "1"
self.DISABLE_ZERO_DETECTION = os.getenv("DISABLE_ZERO_DETECTION", "0") == "1"
self.ENABLE_AUTO_GLOSSARY = os.getenv("ENABLE_AUTO_GLOSSARY", "0") == "1"
self.COMPREHENSIVE_EXTRACTION = os.getenv("COMPREHENSIVE_EXTRACTION", "0") == "1"
self.MANUAL_GLOSSARY = os.getenv("MANUAL_GLOSSARY")
self.RETRY_TRUNCATED = os.getenv("RETRY_TRUNCATED", "0") == "1"
self.RETRY_DUPLICATE_BODIES = os.getenv("RETRY_DUPLICATE_BODIES", "1") == "1"
self.RETRY_TIMEOUT = os.getenv("RETRY_TIMEOUT", "0") == "1"
self.CHUNK_TIMEOUT = int(os.getenv("CHUNK_TIMEOUT", "900"))
self.MAX_RETRY_TOKENS = int(os.getenv("MAX_RETRY_TOKENS", "16384"))
self.DUPLICATE_LOOKBACK_CHAPTERS = int(os.getenv("DUPLICATE_LOOKBACK_CHAPTERS", "3"))
self.USE_ROLLING_SUMMARY = os.getenv("USE_ROLLING_SUMMARY", "0") == "1"
self.ROLLING_SUMMARY_EXCHANGES = int(os.getenv("ROLLING_SUMMARY_EXCHANGES", "5"))
self.ROLLING_SUMMARY_MODE = os.getenv("ROLLING_SUMMARY_MODE", "replace")
# New: maximum number of rolling summary entries to retain when in append mode (0 = unlimited)
self.ROLLING_SUMMARY_MAX_ENTRIES = int(os.getenv("ROLLING_SUMMARY_MAX_ENTRIES", "10"))
self.DUPLICATE_DETECTION_MODE = os.getenv("DUPLICATE_DETECTION_MODE", "basic")
self.AI_HUNTER_THRESHOLD = int(os.getenv("AI_HUNTER_THRESHOLD", "75"))
self.TRANSLATION_HISTORY_ROLLING = os.getenv("TRANSLATION_HISTORY_ROLLING", "0") == "1"
self.API_KEY = (os.getenv("API_KEY") or
os.getenv("OPENAI_API_KEY") or
os.getenv("OPENAI_OR_Gemini_API_KEY") or
os.getenv("GEMINI_API_KEY"))
# NEW: Simple chapter number offset
self.CHAPTER_NUMBER_OFFSET = int(os.getenv("CHAPTER_NUMBER_OFFSET", "0"))
self.ENABLE_WATERMARK_REMOVAL = os.getenv("ENABLE_WATERMARK_REMOVAL", "1") == "1"
self.SAVE_CLEANED_IMAGES = os.getenv("SAVE_CLEANED_IMAGES", "1") == "1"
self.WATERMARK_PATTERN_THRESHOLD = int(os.getenv("WATERMARK_PATTERN_THRESHOLD", "10"))
self.WATERMARK_CLAHE_LIMIT = float(os.getenv("WATERMARK_CLAHE_LIMIT", "3.0"))
self.COMPRESSION_FACTOR = float(os.getenv("COMPRESSION_FACTOR", "1.0"))
# Multi API key support
self.use_multi_api_keys = os.environ.get('USE_MULTI_API_KEYS', '0') == '1'
self.multi_api_keys = []
if self.use_multi_api_keys:
multi_keys_json = os.environ.get('MULTI_API_KEYS', '[]')
try:
self.multi_api_keys = json.loads(multi_keys_json)
print(f"Loaded {len(self.multi_api_keys)} API keys for multi-key mode")
except Exception as e:
print(f"Failed to load multi API keys: {e}")
self.use_multi_api_keys = False
# =====================================================
# UNIFIED PATTERNS AND CONSTANTS
# =====================================================
class PatternManager:
"""Centralized pattern management"""
CHAPTER_PATTERNS = [
# English patterns
(r'chapter[\s_-]*(\d+)', re.IGNORECASE, 'english_chapter'),
(r'\bch\.?\s*(\d+)\b', re.IGNORECASE, 'english_ch'),
(r'part[\s_-]*(\d+)', re.IGNORECASE, 'english_part'),
(r'episode[\s_-]*(\d+)', re.IGNORECASE, 'english_episode'),
# Chinese patterns
(r'第\s*(\d+)\s*[章节話话回]', 0, 'chinese_chapter'),
(r'第\s*([一二三四五六七八九十百千万]+)\s*[章节話话回]', 0, 'chinese_chapter_cn'),
(r'(\d+)[章节話话回]', 0, 'chinese_short'),
# Japanese patterns
(r'第\s*(\d+)\s*話', 0, 'japanese_wa'),
(r'第\s*(\d+)\s*章', 0, 'japanese_chapter'),
(r'その\s*(\d+)', 0, 'japanese_sono'),
(r'(\d+)話目', 0, 'japanese_wame'),
# Korean patterns
(r'제\s*(\d+)\s*[장화권부편]', 0, 'korean_chapter'),
(r'(\d+)\s*[장화권부편]', 0, 'korean_short'),
(r'에피소드\s*(\d+)', 0, 'korean_episode'),
# Generic numeric patterns
(r'^\s*(\d+)\s*[-–—.\:]', re.MULTILINE, 'generic_numbered'),
(r'_(\d+)\.x?html?$', re.IGNORECASE, 'filename_number'),
(r'/(\d+)\.x?html?$', re.IGNORECASE, 'path_number'),
(r'(\d+)', 0, 'any_number'),
]
FILENAME_EXTRACT_PATTERNS = [
# IMPORTANT: More specific patterns MUST come first
r'^\d{3}(\d)_(\d{2})_\.x?html?$', # Captures both parts for decimal: group1.group2
r'^\d{4}_(\d+)\.x?html?$', # "0000_1.xhtml" - extracts 1, not 0000
r'^\d+_(\d+)[_\.]', # Any digits followed by underscore then capture next digits
r'^(\d+)[_\.]', # Standard: "0249_" or "0249."
r'response_(\d+)_', # Standard pattern: response_001_
r'response_(\d+)\.', # Pattern: response_001.
r'(\d{3,5})[_\.]', # 3-5 digit pattern with padding
r'[Cc]hapter[_\s]*(\d+)', # Chapter word pattern
r'[Cc]h[_\s]*(\d+)', # Ch abbreviation
r'No(\d+)Chapter', # No prefix with Chapter - matches "No00013Chapter.xhtml"
r'No(\d+)Section', # No prefix with Section - matches "No00013Section.xhtml"
r'No(\d+)(?=\.|_|$)', # No prefix followed by end, dot, or underscore (not followed by text)
r'第(\d+)[章话回]', # Chinese chapter markers
r'_(\d+)(?:_|\.|$)', # Number between underscores or at end
r'^(\d+)(?:_|\.|$)', # Starting with number
r'(\d+)', # Any number (fallback)
]
CJK_HONORIFICS = {
'korean': [
# Modern honorifics
'님', '씨', '선배', '후배', '동기', '형', '누나', '언니', '오빠', '동생',
'선생님', '교수님', '박사님', '사장님', '회장님', '부장님', '과장님', '대리님',
'팀장님', '실장님', '이사님', '전무님', '상무님', '부사장님', '고문님',
# Classical/formal honorifics
'공', '옹', '군', '양', '낭', '랑', '생', '자', '부', '모', '시', '제', '족하',
# Royal/noble address forms
'마마', '마노라', '대감', '영감', '나리', '도령', '낭자', '아씨', '규수',
'각하', '전하', '폐하', '저하', '합하', '대비', '대왕', '왕자', '공주',
# Buddhist/religious
'스님', '사부님', '조사님', '큰스님', '화상', '대덕', '대사', '법사',
'선사', '율사', '보살님', '거사님', '신부님', '목사님', '장로님', '집사님',
# Confucian/scholarly
'부자', '선생', '대인', '어른', '어르신', '존자', '현자', '군자', '대부',
'학사', '진사', '문하생', '제자',
# Kinship honorifics
'어르신', '할아버님', '할머님', '아버님', '어머님', '형님', '누님',
'아주버님', '아주머님', '삼촌', '이모님', '고모님', '외삼촌', '장인어른',
'장모님', '시아버님', '시어머님', '처남', '처형', '매형', '손님',
# Verb-based honorific endings and speech levels
'습니다', 'ㅂ니다', '습니까', 'ㅂ니까', '시다', '세요', '셔요', '십시오', '시오',
'이에요', '예요', '이예요', '에요', '어요', '아요', '여요', '해요', '이세요', '으세요',
'으시', '시', '으십니다', '십니다', '으십니까', '십니까', '으셨', '셨',
'드립니다', '드려요', '드릴게요', '드리겠습니다', '올립니다', '올려요',
'사옵니다', '사뢰', '여쭙니다', '여쭤요', '아뢰', '뵙니다', '뵈요', '모십니다',
'시지요', '시죠', '시네요', '시는군요', '시는구나', '으실', '실',
'드시다', '잡수시다', '주무시다', '계시다', '가시다', '오시다',
# Common verb endings with 있다/없다/하다
'있어요', '있습니다', '있으세요', '있으십니까', '없어요', '없습니다', '없으세요',
'해요', '합니다', '하세요', '하십시오', '하시죠', '하시네요', '했어요', '했습니다',
'되세요', '되셨어요', '되십니다', '됩니다', '되요', '돼요',
'이야', '이네', '이구나', '이군', '이네요', '인가요', '인가', '일까요', '일까',
'거예요', '거에요', '겁니다', '건가요', '게요', '을게요', '을까요', '었어요', '었습니다',
'겠습니다', '겠어요', '겠네요', '을겁니다', '을거예요', '을거에요',
# Common endings
'요', '죠', '네요', '는데요', '거든요', '니까', '으니까', '는걸요', '군요', '구나',
'는구나', '는군요', '더라고요', '더군요', '던데요', '나요', '가요', '까요',
'라고요', '다고요', '냐고요', '자고요', '란다', '단다', '냔다', '잔다',
# Formal archaic endings
'나이다', '사옵나이다', '옵니다', '오', '소서', '으오', '으옵소서', '사이다',
'으시옵니다', '시옵니다', '으시옵니까', '시옵니까', '나이까', '리이까', '리이다',
'옵소서', '으소서', '소이다', '로소이다', '이옵니다', '이올시다', '하옵니다'
],
'japanese': [
# Modern honorifics
'さん', 'ちゃん', '君', 'くん', '様', 'さま', '先生', 'せんせい', '殿', 'どの', '先輩', 'せんぱい',
# Classical/historical
'氏', 'し', '朝臣', 'あそん', '宿禰', 'すくね', '連', 'むらじ', '臣', 'おみ', '君', 'きみ',
'真人', 'まひと', '道師', 'みちのし', '稲置', 'いなぎ', '直', 'あたい', '造', 'みやつこ',
# Court titles
'卿', 'きょう', '大夫', 'たいふ', '郎', 'ろう', '史', 'し', '主典', 'さかん',
# Buddhist titles
'和尚', 'おしょう', '禅師', 'ぜんじ', '上人', 'しょうにん', '聖人', 'しょうにん',
'法師', 'ほうし', '阿闍梨', 'あじゃり', '大和尚', 'だいおしょう',
# Shinto titles
'大宮司', 'だいぐうじ', '宮司', 'ぐうじ', '禰宜', 'ねぎ', '祝', 'はふり',
# Samurai era
'守', 'かみ', '介', 'すけ', '掾', 'じょう', '目', 'さかん', '丞', 'じょう',
# Keigo (honorific language) verb forms
'です', 'ます', 'ございます', 'いらっしゃる', 'いらっしゃいます', 'おっしゃる', 'おっしゃいます',
'なさる', 'なさいます', 'くださる', 'くださいます', 'いただく', 'いただきます',
'おります', 'でございます', 'ございません', 'いたします', 'いたしました',
'申す', '申します', '申し上げる', '申し上げます', '存じる', '存じます', '存じ上げる',
'伺う', '伺います', '参る', '参ります', 'お目にかかる', 'お目にかかります',
'拝見', '拝見します', '拝聴', '拝聴します', '承る', '承ります',
# Respectful prefixes/suffixes
'お', 'ご', '御', 'み', '美', '貴', '尊'
],
'chinese': [
# Modern forms
'先生', '小姐', '夫人', '公子', '大人', '老师', '师父', '师傅', '同志', '同学',
# Ancient/classical forms
'子', '丈', '翁', '公', '侯', '伯', '叔', '仲', '季', '父', '甫', '卿', '君', '生',
# Imperial court
'陛下', '殿下', '千岁', '万岁', '圣上', '皇上', '天子', '至尊', '御前', '爷',
# Nobility/officials
'阁下', '大人', '老爷', '相公', '官人', '郎君', '娘子', '夫子', '足下',
# Religious titles
'上人', '法师', '禅师', '大师', '高僧', '圣僧', '神僧', '活佛', '仁波切',
'真人', '天师', '道长', '道友', '仙长', '上仙', '祖师', '掌教',
# Scholarly/Confucian
'夫子', '圣人', '贤人', '君子', '大儒', '鸿儒', '宗师', '泰斗', '巨擘',
# Martial arts
'侠士', '大侠', '少侠', '女侠', '英雄', '豪杰', '壮士', '义士',
# Family/kinship
'令尊', '令堂', '令郎', '令爱', '贤弟', '贤侄', '愚兄', '小弟', '家父', '家母',
# Humble forms
'在下', '小人', '鄙人', '不才', '愚', '某', '仆', '妾', '奴', '婢',
# Polite verbal markers
'请', '请问', '敢问', '恭请', '敬请', '烦请', '有请', '请教', '赐教',
'惠顾', '惠赐', '惠存', '笑纳', '雅正', '指正', '斧正', '垂询',
'拜', '拜见', '拜访', '拜读', '拜托', '拜谢', '敬上', '谨上', '顿首'
],
'english': [
# Modern Korean romanizations (Revised Romanization of Korean - 2000)
'-nim', '-ssi', '-seonbae', '-hubae', '-donggi', '-hyeong', '-nuna',
'-eonni', '-oppa', '-dongsaeng', '-seonsaengnim', '-gyosunim',
'-baksanim', '-sajangnim', '-hoejangnim', '-bujangnim', '-gwajangnim',
'-daerim', '-timjangnim', '-siljangnim', '-isanim', '-jeonmunim',
'-sangmunim', '-busajangnim', '-gomunnim',
# Classical/formal Korean romanizations
'-gong', '-ong', '-gun', '-yang', '-nang', '-rang', '-saeng', '-ja',
'-bu', '-mo', '-si', '-je', '-jokha',
# Royal/noble Korean romanizations
'-mama', '-manora', '-daegam', '-yeonggam', '-nari', '-doryeong',
'-nangja', '-assi', '-gyusu', '-gakha', '-jeonha', '-pyeha', '-jeoha',
'-hapka', '-daebi', '-daewang', '-wangja', '-gongju',
# Buddhist/religious Korean romanizations
'-seunim', '-sabunim', '-josanim', '-keunseunim', '-hwasang',
'-daedeok', '-daesa', '-beopsa', '-seonsa', '-yulsa', '-bosalnim',
'-geosanim', '-sinbunim', '-moksanim', '-jangnonim', '-jipsanim',
# Confucian/scholarly Korean romanizations
'-buja', '-seonsaeng', '-daein', '-eoreun', '-eoreusin', '-jonja',
'-hyeonja', '-gunja', '-daebu', '-haksa', '-jinsa', '-munhasaeng', '-jeja',
# Kinship Korean romanizations
'-harabeonim', '-halmeonim', '-abeonim', '-eomeonim', '-hyeongnim',
'-nunim', '-ajubeonim', '-ajumeonim', '-samchon', '-imonim', '-gomonim',
'-oesamchon', '-jangineoreun', '-jangmonim', '-siabeonim', '-sieomeonim',
'-cheonam', '-cheohyeong', '-maehyeong', '-sonnim',
# Korean verb endings romanized (Revised Romanization)
'-seumnida', '-mnida', '-seumnikka', '-mnikka', '-sida', '-seyo',
'-syeoyo', '-sipsio', '-sio', '-ieyo', '-yeyo', '-iyeyo', '-eyo',
'-eoyo', '-ayo', '-yeoyo', '-haeyo', '-iseyo', '-euseyo',
'-eusi', '-si', '-eusimnida', '-simnida', '-eusimnikka', '-simnikka',
'-eusyeot', '-syeot', '-deurimnida', '-deuryeoyo', '-deurilgeyo',
'-deurigesseumnida', '-ollimnida', '-ollyeoyo', '-saomnida', '-saroe',
'-yeojjumnida', '-yeojjwoyo', '-aroe', '-boemnida', '-boeyo', '-mosimnida',
'-sijiyo', '-sijyo', '-sineyo', '-sineungunyo', '-sineunguna', '-eusil', '-sil',
'-deusida', '-japsusida', '-jumusida', '-gyesida', '-gasida', '-osida',
# Common Korean verb endings romanized
'-isseoyo', '-isseumnida', '-isseuseyo', '-isseusimnikka',
'-eopseoyo', '-eopseumnida', '-eopseuseyo', '-hamnida', '-haseyo',
'-hasipsio', '-hasijyo', '-hasineyo', '-haesseoyo', '-haesseumnida',
'-doeseyo', '-doesyeosseoyo', '-doesimnida', '-doemnida', '-doeyo', '-dwaeyo',
'-iya', '-ine', '-iguna', '-igun', '-ineyo', '-ingayo', '-inga',
'-ilkkayo', '-ilkka', '-geoyeyo', '-geoeyo', '-geomnida', '-geongayo',
'-geyo', '-eulgeyo', '-eulkkayo', '-eosseoyo', '-eosseumnida',
'-gesseumnida', '-gesseoyo', '-genneyo', '-eulgeommida', '-eulgeoyeyo', '-eulgeoeyo',
# Common Korean endings romanized
'-yo', '-jyo', '-neyo', '-neundeyo', '-geodeunyo', '-nikka',
'-eunikka', '-neungeolyo', '-gunyo', '-guna', '-neunguna', '-neungunyo',
'-deoragoyo', '-deogunyo', '-deondeyo', '-nayo', '-gayo', '-kkayo',
'-ragoyo', '-dagoyo', '-nyagoyo', '-jagoyo', '-randa', '-danda',
'-nyanda', '-janda',
# Formal archaic Korean romanized
'-naida', '-saomnaida', '-omnida', '-o', '-soseo', '-euo',
'-euopsoseo', '-saida', '-eusiomnida', '-siomnida', '-eusiomnikka',
'-siomnikka', '-naikka', '-riikka', '-riida', '-opsoseo', '-eusoseo',
'-soida', '-rosoida', '-iomnida', '-iolsida', '-haomnida',
# Japanese keigo romanized (keeping existing)
'-san', '-chan', '-kun', '-sama', '-sensei', '-senpai', '-dono',
'-shi', '-tan', '-chin', '-desu', '-masu', '-gozaimasu',
'-irassharu', '-irasshaimasu', '-ossharu', '-osshaimasu',
'-nasaru', '-nasaimasu', '-kudasaru', '-kudasaimasu', '-itadaku',
'-itadakimasu', '-orimasu', '-degozaimasu', '-gozaimasen',
'-itashimasu', '-itashimashita', '-mousu', '-moushimasu',
'-moushiageru', '-moushiagemasu', '-zonjiru', '-zonjimasu',
'-ukagau', '-ukagaimasu', '-mairu', '-mairimasu', '-haiken',
'-haikenshimasu',
# Chinese romanizations (keeping existing)
'-xiong', '-di', '-ge', '-gege', '-didi', '-jie', '-jiejie',
'-meimei', '-shixiong', '-shidi', '-shijie', '-shimei', '-gongzi',
'-guniang', '-xiaojie', '-daren', '-qianbei', '-daoyou', '-zhanglao',
'-shibo', '-shishu', '-shifu', '-laoshi', '-xiansheng', '-daxia',
'-shaoxia', '-nvxia', '-jushi', '-shanren', '-dazhang', '-zhenren',
# Ancient Chinese romanizations
'-zi', '-gong', '-hou', '-bo', '-jun', '-qing', '-weng', '-fu',
'-sheng', '-lang', '-langjun', '-niangzi', '-furen', '-gege',
'-jiejie', '-yeye', '-nainai',
# Chinese politeness markers romanized
'-qing', '-jing', '-gong', '-hui', '-ci', '-bai', '-gan', '-chui',
'qingwen', 'ganwen', 'gongjing', 'jingjing', 'baijian', 'baifang',
'baituo'
]
}
TITLE_PATTERNS = {
'korean': [
# Modern titles
r'\b(왕|여왕|왕자|공주|황제|황후|대왕|대공|공작|백작|자작|남작|기사|장군|대장|원수|제독|함장|대신|재상|총리|대통령|시장|지사|검사|판사|변호사|의사|박사|교수|신부|목사|스님|도사)\b',
r'\b(폐하|전하|각하|예하|님|대감|영감|나리|도련님|아가씨|부인|선생)\b',
# Historical/classical titles
r'\b(대왕|태왕|왕비|왕후|세자|세자빈|대군|군|옹주|공주|부마|원자|원손)\b',
r'\b(영의정|좌의정|우의정|판서|참판|참의|정승|판사|사또|현령|군수|목사|부사)\b',
r'\b(대제학|제학|대사간|사간|대사헌|사헌|도승지|승지|한림|사관|내시|환관)\b',
r'\b(병조판서|이조판서|호조판서|예조판서|형조판서|공조판서)\b',
r'\b(도원수|부원수|병마절도사|수군절도사|첨절제사|만호|천호|백호)\b',
r'\b(정일품|종일품|정이품|종이품|정삼품|종삼품|정사품|종사품|정오품|종오품)\b',
# Korean honorific verb endings patterns
r'(습니다|ㅂ니다|습니까|ㅂ니까|세요|셔요|십시오|시오)$',
r'(이에요|예요|이예요|에요|어요|아요|여요|해요)$',
r'(으시|시)(었|겠|ㄹ|을|는|던)*(습니다|ㅂ니다|어요|아요|세요)',
r'(드립니다|드려요|드릴게요|드리겠습니다|올립니다|올려요)$',
r'(사옵니다|여쭙니다|여쭤요|뵙니다|뵈요|모십니다)$',
r'(나이다|사옵나이다|옵니다|으오|으옵소서|사이다)$'
],
'japanese': [
# Modern titles
r'\b(王|女王|王子|姫|皇帝|皇后|天皇|皇太子|大王|大公|公爵|伯爵|子爵|男爵|騎士|将軍|大将|元帥|提督|艦長|大臣|宰相|総理|大統領|市長|知事|検事|裁判官|弁護士|医者|博士|教授|神父|牧師|僧侶|道士)\b',
r'\b(陛下|殿下|閣下|猊下|様|大人|殿|卿|君|氏)\b',
# Historical titles
r'\b(天皇|皇后|皇太子|親王|内親王|王|女王|太政大臣|左大臣|右大臣|内大臣|大納言|中納言|参議)\b',
r'\b(関白|摂政|征夷大将軍|管領|執権|守護|地頭|代官|奉行|与力|同心)\b',
r'\b(太政官|神祇官|式部省|治部省|民部省|兵部省|刑部省|大蔵省|宮内省)\b',
r'\b(大僧正|僧正|大僧都|僧都|律師|大法師|法師|大禅師|禅師)\b',
r'\b(正一位|従一位|正二位|従二位|正三位|従三位|正四位|従四位|正五位|従五位)\b',
r'\b(大和守|山城守|摂津守|河内守|和泉守|伊賀守|伊勢守|尾張守|三河守|遠江守)\b',
# Japanese keigo (honorific language) patterns
r'(です|ます|ございます)$',
r'(いらっしゃ|おっしゃ|なさ|くださ)(います|いました|る|った)$',
r'(いただ|お|ご|御)(き|きます|きました|く|ける|けます)',
r'(申し上げ|申し|存じ上げ|存じ|伺い|参り)(ます|ました|る)$',
r'(拝見|拝聴|承り|承)(します|しました|いたします|いたしました)$',
r'お[^あ-ん]+[になる|になります|くださる|くださいます]'
],
'chinese': [
# Modern titles
r'\b(王|女王|王子|公主|皇帝|皇后|大王|大公|公爵|伯爵|子爵|男爵|骑士|将军|大将|元帅|提督|舰长|大臣|宰相|总理|大总统|市长|知事|检察官|法官|律师|医生|博士|教授|神父|牧师|和尚|道士)\b',
r'\b(陛下|殿下|阁下|大人|老爷|夫人|小姐|公子|少爷|姑娘|先生)\b',
# Imperial titles
r'\b(天子|圣上|皇上|万岁|万岁爷|太上皇|皇太后|太后|皇后|贵妃|妃|嫔|贵人|常在|答应)\b',
r'\b(太子|皇子|皇孙|亲王|郡王|贝勒|贝子|公主|格格|郡主|县主|郡君|县君)\b',
# Ancient official titles
r'\b(丞相|相国|太师|太傅|太保|太尉|司徒|司空|大司马|大司农|大司寇)\b',
r'\b(尚书|侍郎|郎中|员外郎|主事|知府|知州|知县|同知|通判|推官|巡抚|总督)\b',
r'\b(御史大夫|御史中丞|监察御史|给事中|都察院|翰林院|国子监|钦天监)\b',
r'\b(大学士|学士|侍读|侍讲|编修|检讨|庶吉士|举人|进士|状元|榜眼|探花)\b',
# Military ranks
r'\b(大元帅|元帅|大将军|将军|都督|都指挥使|指挥使|千户|百户|总兵|副将|参将|游击|都司|守备)\b',
r'\b(提督|总兵官|副总兵|参将|游击将军|都司|守备|千总|把总|外委)\b',
# Religious titles
r'\b(国师|帝师|法王|活佛|堪布|仁波切|大和尚|方丈|住持|首座|维那|知客)\b',
r'\b(天师|真人|道长|掌教|监院|高功|都讲|总理|提点|知观)\b',
# Nobility ranks
r'\b(公|侯|伯|子|男|开国公|郡公|国公|郡侯|县侯|郡伯|县伯|县子|县男)\b',
r'\b(一品|二品|三品|四品|五品|六品|七品|八品|九品|正一品|从一品|正二品|从二品)\b',
# Chinese politeness markers
r'(请|敢|恭|敬|烦|有)(问|请|赐|教|告|示)',
r'(拜|惠|赐|垂|雅|笑)(见|访|读|托|谢|顾|赐|存|纳|正|询)',
r'(敬|谨|顿)(上|呈|启|白|首)'
],
'english': [
# Western titles
r'\b(King|Queen|Prince|Princess|Emperor|Empress|Duke|Duchess|Marquis|Marquess|Earl|Count|Countess|Viscount|Viscountess|Baron|Baroness|Knight|Lord|Lady|Sir|Dame|General|Admiral|Captain|Major|Colonel|Commander|Lieutenant|Sergeant|Minister|Chancellor|President|Mayor|Governor|Judge|Doctor|Professor|Father|Reverend|Master|Mistress)\b',
r'\b(His|Her|Your|Their)\s+(Majesty|Highness|Grace|Excellency|Honor|Worship|Lordship|Ladyship)\b',
# Romanized historical titles
r'\b(Tianzi|Huangdi|Huanghou|Taizi|Qinwang|Junwang|Beile|Beizi|Gongzhu|Gege)\b',
r'\b(Chengxiang|Zaixiang|Taishi|Taifu|Taibao|Taiwei|Situ|Sikong|Dasima)\b',
r'\b(Shogun|Daimyo|Samurai|Ronin|Ninja|Tenno|Mikado|Kampaku|Sessho)\b',
r'\b(Taewang|Wangbi|Wanghu|Seja|Daegun|Gun|Ongju|Gongju|Buma)\b'
]
}
# Expanded Chinese numbers including classical forms
CHINESE_NUMS = {
# Basic numbers
'一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
'六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
'十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15,
'十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20,
'二十一': 21, '二十二': 22, '二十三': 23, '二十四': 24, '二十五': 25,
'三十': 30, '四十': 40, '五十': 50, '六十': 60,
'七十': 70, '八十': 80, '九十': 90, '百': 100,
# Classical/formal numbers
'壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5,
'陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
'佰': 100, '仟': 1000, '萬': 10000, '万': 10000,
# Ordinal indicators
'第一': 1, '第二': 2, '第三': 3, '第四': 4, '第五': 5,
'首': 1, '次': 2, '初': 1, '末': -1,
}
# Common words - keeping the same for filtering
COMMON_WORDS = {
'이', '그', '저', '우리', '너희', '자기', '당신', '여기', '거기', '저기',
'오늘', '내일', '어제', '지금', '아까', '나중', '먼저', '다음', '마지막',
'모든', '어떤', '무슨', '이런', '그런', '저런', '같은', '다른', '새로운',
'하다', '있다', '없다', '되다', '하는', '있는', '없는', '되는',
'것', '수', '때', '년', '월', '일', '시', '분', '초',
'은', '는', '이', '가', '을', '를', '에', '의', '와', '과', '도', '만',
'에서', '으로', '로', '까지', '부터', '에게', '한테', '께', '께서',
'この', 'その', 'あの', 'どの', 'これ', 'それ', 'あれ', 'どれ',
'わたし', 'あなた', 'かれ', 'かのじょ', 'わたしたち', 'あなたたち',
'きょう', 'あした', 'きのう', 'いま', 'あとで', 'まえ', 'つぎ',
'の', 'は', 'が', 'を', 'に', 'で', 'と', 'も', 'や', 'から', 'まで',
'这', '那', '哪', '这个', '那个', '哪个', '这里', '那里', '哪里',
'我', '你', '他', '她', '它', '我们', '你们', '他们', '她们',
'今天', '明天', '昨天', '现在', '刚才', '以后', '以前', '后来',
'的', '了', '在', '是', '有', '和', '与', '或', '但', '因为', '所以',
'一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
'1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
}
# =====================================================
# CHUNK CONTEXT MANAGER (unchanged - already optimal)
# =====================================================
class ChunkContextManager:
"""Manage context within a chapter separate from history"""
def __init__(self):
self.current_chunks = []
self.chapter_num = None
self.chapter_title = None
def start_chapter(self, chapter_num, chapter_title):
"""Start a new chapter context"""
self.current_chunks = []
self.chapter_num = chapter_num
self.chapter_title = chapter_title
def add_chunk(self, user_content, assistant_content, chunk_idx, total_chunks):
"""Add a chunk to the current chapter context"""
self.current_chunks.append({
"user": user_content,
"assistant": assistant_content,
"chunk_idx": chunk_idx,
"total_chunks": total_chunks
})
def get_context_messages(self, limit=3):
"""Get last N chunks as messages for API context"""
context = []
for chunk in self.current_chunks[-limit:]:
context.extend([
{"role": "user", "content": chunk["user"]},
{"role": "assistant", "content": chunk["assistant"]}
])
return context
def get_summary_for_history(self):
"""Create a summary representation for the history"""
if not self.current_chunks:
return None, None
total_chunks = len(self.current_chunks)
user_summary = f"[Chapter {self.chapter_num}: {self.chapter_title}]\n"
user_summary += f"[{total_chunks} chunks processed]\n"
if self.current_chunks:
first_chunk = self.current_chunks[0]['user']
if len(first_chunk) > 500:
user_summary += first_chunk[:500] + "..."
else:
user_summary += first_chunk
assistant_summary = f"[Chapter {self.chapter_num} Translation Complete]\n"
assistant_summary += f"[Translated in {total_chunks} chunks]\n"
if self.current_chunks:
samples = []
first_trans = self.current_chunks[0]['assistant']
samples.append(f"Beginning: {first_trans[:200]}..." if len(first_trans) > 200 else f"Beginning: {first_trans}")
if total_chunks > 2:
mid_idx = total_chunks // 2
mid_trans = self.current_chunks[mid_idx]['assistant']
samples.append(f"Middle: {mid_trans[:200]}..." if len(mid_trans) > 200 else f"Middle: {mid_trans}")
if total_chunks > 1:
last_trans = self.current_chunks[-1]['assistant']
samples.append(f"End: {last_trans[:200]}..." if len(last_trans) > 200 else f"End: {last_trans}")
assistant_summary += "\n".join(samples)
return user_summary, assistant_summary
def clear(self):
"""Clear the current chapter context"""
self.current_chunks = []
self.chapter_num = None
self.chapter_title = None
# =====================================================
# UNIFIED UTILITIES
# =====================================================
class FileUtilities:
"""Utilities for file and path operations"""
@staticmethod
def extract_actual_chapter_number(chapter, patterns=None, config=None):
"""Extract actual chapter number from filename using improved logic"""
# IMPORTANT: Check if this is a pre-split TEXT FILE chunk first
if (chapter.get('is_chunk', False) and
'num' in chapter and
isinstance(chapter['num'], float) and
chapter.get('filename', '').endswith('.txt')):
# For text file chunks only, preserve the decimal number
return chapter['num'] # This will be 1.1, 1.2, etc.
# Get filename for extraction
filename = chapter.get('original_basename') or chapter.get('filename', '')
# Use our improved extraction function
# Note: We don't have opf_spine_position here, so pass None
actual_num, method = extract_chapter_number_from_filename(filename, opf_spine_position=None)
# If extraction succeeded, return the result
if actual_num is not None:
#print(f"[DEBUG] Extracted {actual_num} from '{filename}' using method: {method}")
return actual_num
# Fallback to original complex logic for edge cases
actual_num = None
if patterns is None:
patterns = PatternManager.FILENAME_EXTRACT_PATTERNS
# Try to extract from original basename first
if chapter.get('original_basename'):
basename = chapter['original_basename']
# Check if decimal chapters are enabled for EPUBs
enable_decimal = os.getenv('ENABLE_DECIMAL_CHAPTERS', '0') == '1'
# For EPUBs, only check decimal patterns if the toggle is enabled
if enable_decimal:
# Check for standard decimal chapter numbers (e.g., Chapter_1.1, 1.2.html)
decimal_match = re.search(r'(\d+)\.(\d+)', basename)
if decimal_match:
actual_num = float(f"{decimal_match.group(1)}.{decimal_match.group(2)}")
return actual_num
# Check for the XXXX_YY pattern where it represents X.YY decimal chapters
decimal_prefix_match = re.match(r'^(\d{4})_(\d{1,2})(?:_|\.)?(?:x?html?)?$', basename)
if decimal_prefix_match:
first_part = decimal_prefix_match.group(1)
second_part = decimal_prefix_match.group(2)
if len(second_part) == 2 and int(second_part) > 9:
chapter_num = int(first_part[-1])
decimal_part = second_part
actual_num = float(f"{chapter_num}.{decimal_part}")
return actual_num
# Standard XXXX_Y format handling (existing logic)
prefix_suffix_match = re.match(r'^(\d+)_(\d+)', basename)
if prefix_suffix_match:
second_part = prefix_suffix_match.group(2)
if not enable_decimal:
actual_num = int(second_part)
return actual_num
else:
if len(second_part) == 1 or (len(second_part) == 2 and int(second_part) <= 9):
actual_num = int(second_part)
return actual_num
# Check other patterns if no match yet
for pattern in patterns:
if pattern in [r'^(\d+)[_\.]', r'(\d{3,5})[_\.]', r'^(\d+)_']:
continue
match = re.search(pattern, basename, re.IGNORECASE)
if match:
actual_num = int(match.group(1))
break
# Final fallback to chapter num
if actual_num is None:
actual_num = chapter.get("num", 0)
print(f"[DEBUG] No pattern matched, using chapter num: {actual_num}")
return actual_num
@staticmethod
def create_chapter_filename(chapter, actual_num=None):
"""Create consistent chapter filename"""
# Check if we should use header as output name
use_header_output = os.getenv("USE_HEADER_AS_OUTPUT", "0") == "1"
# Check if this is for a text file
is_text_file = chapter.get('filename', '').endswith('.txt') or chapter.get('is_chunk', False)
# Respect toggle: retain source extension and remove 'response_' prefix
retain = should_retain_source_extension()
# Helper to compute full original extension chain (e.g., '.html.xhtml')
def _full_ext_from_original(ch):
fn = ch.get('original_filename')
if not fn:
return '.html'
bn = os.path.basename(fn)
root, ext = os.path.splitext(bn)
if not ext:
return '.html'
full_ext = ''
while ext:
full_ext = ext + full_ext
root, ext = os.path.splitext(root)
return full_ext or '.html'
if use_header_output and chapter.get('title'):
safe_title = make_safe_filename(chapter['title'], actual_num or chapter.get('num', 0))
if safe_title and safe_title != f"chapter_{actual_num or chapter.get('num', 0):03d}":
if is_text_file:
return f"{safe_title}.txt" if retain else f"response_{safe_title}.txt"
else:
# If retaining, use full original ext chain; else default .html
if retain:
return f"{safe_title}{_full_ext_from_original(chapter)}"
return f"response_{safe_title}.html"
# Check if decimal chapters are enabled
enable_decimal = os.getenv('ENABLE_DECIMAL_CHAPTERS', '0') == '1'
# For EPUBs with decimal detection enabled
if enable_decimal and 'original_basename' in chapter and chapter['original_basename']:
basename = chapter['original_basename']
# Check for standard decimal pattern (e.g., Chapter_1.1)
decimal_match = re.search(r'(\d+)\.(\d+)', basename)
if decimal_match:
# Create a modified basename that preserves the decimal
base = os.path.splitext(basename)[0]
# Replace dots with underscores for filesystem compatibility
base = base.replace('.', '_')
# Use .txt extension for text files
if is_text_file:
return f"{base}.txt" if retain else f"response_{base}.txt"
else:
if retain:
return f"{base}{_full_ext_from_original(chapter)}"
return f"response_{base}.html"
# Check for the special XXXX_YY decimal pattern
decimal_prefix_match = re.match(r'^(\d{4})_(\d{1,2})(?:_|\.)?(?:x?html?)?$', basename)
if decimal_prefix_match:
first_part = decimal_prefix_match.group(1)
second_part = decimal_prefix_match.group(2)
# If this matches our decimal pattern (e.g., 0002_33 -> 2.33)
if len(second_part) == 2 and int(second_part) > 9:
chapter_num = int(first_part[-1])
decimal_part = second_part
# Create filename reflecting the decimal interpretation
if is_text_file:
return f"{chapter_num:04d}_{decimal_part}.txt" if retain else f"response_{chapter_num:04d}_{decimal_part}.txt"
else:
return f"{chapter_num:04d}_{decimal_part}{_full_ext_from_original(chapter)}" if retain else f"response_{chapter_num:04d}_{decimal_part}.html"
# Standard EPUB handling - use original basename
if 'original_basename' in chapter and chapter['original_basename']:
base = os.path.splitext(chapter['original_basename'])[0]
# Use .txt extension for text files
if is_text_file:
return f"{base}.txt" if retain else f"response_{base}.txt"
else:
if retain:
# Preserve the full original extension chain
return f"{base}{_full_ext_from_original(chapter)}"
return f"response_{base}.html"
else:
# Text file handling (no original basename)
if actual_num is None:
actual_num = chapter.get('actual_chapter_num', chapter.get('num', 0))
# Handle decimal chapter numbers from text file splitting
if isinstance(actual_num, float):
major = int(actual_num)
minor = int(round((actual_num - major) * 10))
if is_text_file:
return f"{major:04d}_{minor}.txt" if retain else f"response_{major:04d}_{minor}.txt"
else:
return f"{major:04d}_{minor}.html" if retain else f"response_{major:04d}_{minor}.html"
else:
if is_text_file:
return f"{actual_num:04d}.txt" if retain else f"response_{actual_num:04d}.txt"
else:
return f"{actual_num:04d}.html" if retain else f"response_{actual_num:04d}.html"
# =====================================================
# UNIFIED PROGRESS MANAGER
# =====================================================
class ProgressManager:
"""Unified progress management"""
def __init__(self, payloads_dir):
self.payloads_dir = payloads_dir
self.PROGRESS_FILE = os.path.join(payloads_dir, "translation_progress.json")
self.prog = self._init_or_load()
def _init_or_load(self):
"""Initialize or load progress tracking with improved structure"""
if os.path.exists(self.PROGRESS_FILE):
try:
with open(self.PROGRESS_FILE, "r", encoding="utf-8") as pf:
prog = json.load(pf)
except json.JSONDecodeError as e:
print(f"⚠️ Warning: Progress file is corrupted: {e}")
print("🔧 Attempting to fix JSON syntax...")
try:
with open(self.PROGRESS_FILE, "r", encoding="utf-8") as pf:
content = pf.read()
content = re.sub(r',\s*\]', ']', content)
content = re.sub(r',\s*\}', '}', content)
prog = json.loads(content)
with open(self.PROGRESS_FILE, "w", encoding="utf-8") as pf:
json.dump(prog, pf, ensure_ascii=False, indent=2)
print("✅ Successfully fixed and saved progress file")
except Exception as fix_error:
print(f"❌ Could not fix progress file: {fix_error}")
print("🔄 Creating backup and starting fresh...")
backup_name = f"translation_progress_backup_{int(time.time())}.json"
backup_path = os.path.join(self.payloads_dir, backup_name)
try:
shutil.copy(self.PROGRESS_FILE, backup_path)
print(f"📁 Backup saved to: {backup_name}")
except:
pass
prog = {
"chapters": {},
"chapter_chunks": {},
"version": "2.0"
}
if "chapters" not in prog:
prog["chapters"] = {}
for idx in prog.get("completed", []):
prog["chapters"][str(idx)] = {
"status": "completed",
"timestamp": None
}
if "chapter_chunks" not in prog:
prog["chapter_chunks"] = {}
else:
prog = {
"chapters": {},
"chapter_chunks": {},
"image_chunks": {},
"version": "2.1"
}
return prog
def save(self):
"""Save progress to file"""
try:
self.prog["completed_list"] = []
for chapter_key, chapter_info in self.prog.get("chapters", {}).items():
if chapter_info.get("status") == "completed" and chapter_info.get("output_file"):
self.prog["completed_list"].append({
"num": chapter_info.get("chapter_num", 0),
"idx": chapter_info.get("chapter_idx", 0),
"title": f"Chapter {chapter_info.get('chapter_num', 0)}",
"file": chapter_info.get("output_file", ""),
"key": chapter_key
})
if self.prog.get("completed_list"):
self.prog["completed_list"].sort(key=lambda x: x["num"])
temp_file = self.PROGRESS_FILE + '.tmp'
with open(temp_file, "w", encoding="utf-8") as pf:
json.dump(self.prog, pf, ensure_ascii=False, indent=2)
if os.path.exists(self.PROGRESS_FILE):
os.remove(self.PROGRESS_FILE)
os.rename(temp_file, self.PROGRESS_FILE)
except Exception as e:
print(f"⚠️ Warning: Failed to save progress: {e}")
temp_file = self.PROGRESS_FILE + '.tmp'
if os.path.exists(temp_file):
try:
os.remove(temp_file)
except:
pass
def update(self, idx, actual_num, content_hash, output_file, status="in_progress", ai_features=None, raw_num=None):
"""Update progress for a chapter"""
# CHANGE THIS LINE - Use actual_num instead of idx
chapter_key = str(actual_num) # WAS: chapter_key = str(idx)
chapter_info = {
"actual_num": actual_num,
"content_hash": content_hash,
"output_file": output_file,
"status": status,
"last_updated": time.time()
}
# Add raw number tracking
if raw_num is not None:
chapter_info["raw_chapter_num"] = raw_num
# Check if zero detection was disabled
if hasattr(builtins, '_DISABLE_ZERO_DETECTION') and builtins._DISABLE_ZERO_DETECTION:
chapter_info["zero_adjusted"] = False
else:
chapter_info["zero_adjusted"] = (raw_num != actual_num) if raw_num is not None else False
# FIXED: Store AI features if provided
if ai_features is not None:
chapter_info["ai_features"] = ai_features
# Preserve existing AI features if not overwriting
elif chapter_key in self.prog["chapters"] and "ai_features" in self.prog["chapters"][chapter_key]:
chapter_info["ai_features"] = self.prog["chapters"][chapter_key]["ai_features"]
self.prog["chapters"][chapter_key] = chapter_info
def check_chapter_status(self, chapter_idx, actual_num, content_hash, output_dir, chapter_obj=None):
"""Check if a chapter needs translation"""
chapter_key = str(actual_num)
# Check if we have tracking for this chapter
if chapter_key in self.prog["chapters"]:
chapter_info = self.prog["chapters"][chapter_key]
status = chapter_info.get("status")
# Failed statuses ALWAYS trigger retranslation
if status in ["qa_failed", "failed", "error", "file_missing"]:
return True, None, None
# Completed - check file exists
if status in ["completed", "completed_empty", "completed_image_only"]:
output_file = chapter_info.get("output_file")
if output_file:
output_path = os.path.join(output_dir, output_file)
if os.path.exists(output_path):
return False, f"Chapter {actual_num} already translated: {output_file}", output_file
# File missing - retranslate
del self.prog["chapters"][chapter_key]
if chapter_key in self.prog.get("chapter_chunks", {}):
del self.prog["chapter_chunks"][chapter_key]
self.save()
return True, None, None
# Any other status - retranslate
return True, None, None
# BEFORE auto-discovery, check if ANY entry exists for this chapter's file
if chapter_obj:
from TransateKRtoEN import FileUtilities
output_filename = FileUtilities.create_chapter_filename(chapter_obj, actual_num)
# Check if ANY entry has this output file
for key, info in self.prog["chapters"].items():
if info.get("output_file") == output_filename:
# Entry exists somewhere else - don't auto-discover
return True, None, None
# NOW check if file exists for auto-discovery
output_path = os.path.join(output_dir, output_filename)
if os.path.exists(output_path):
print(f"📁 Found existing file for chapter {actual_num}: {output_filename}")
self.prog["chapters"][chapter_key] = {
"actual_num": actual_num,
"content_hash": content_hash,
"output_file": output_filename,
"status": "completed",
"last_updated": os.path.getmtime(output_path),
"auto_discovered": True
}
self.save()
return False, f"Chapter {actual_num} already exists: {output_filename}", output_filename
# No entry and no file - needs translation
return True, None, None
def cleanup_missing_files(self, output_dir):
"""Remove missing files and duplicates - NO RESTORATION BULLSHIT"""
cleaned_count = 0
# Remove entries for missing files
for chapter_key, chapter_info in list(self.prog["chapters"].items()):
output_file = chapter_info.get("output_file")
if output_file:
output_path = os.path.join(output_dir, output_file)
if not os.path.exists(output_path):
print(f"🗑️ Removing entry for missing file: {output_file}")
# Delete the entry
del self.prog["chapters"][chapter_key]
# Remove chunk data
if chapter_key in self.prog.get("chapter_chunks", {}):
del self.prog["chapter_chunks"][chapter_key]
cleaned_count += 1
if cleaned_count > 0:
print(f"🔄 Removed {cleaned_count} entries - will retranslate")
def migrate_to_content_hash(self, chapters):
"""Change keys to match actual_num values for proper mapping and sort by chapter number"""
new_chapters = {}
migrated_count = 0
for old_key, chapter_info in self.prog["chapters"].items():
actual_num = chapter_info.get("actual_num")
if actual_num is not None:
new_key = str(actual_num)
# If key needs to change
if old_key != new_key:
print(f" Migrating: key '{old_key}' → '{new_key}' (actual_num: {actual_num})")
migrated_count += 1
# Check for collision
if new_key in new_chapters:
print(f" ⚠️ Warning: Key '{new_key}' already exists, keeping newer entry")
if chapter_info.get("last_updated", 0) > new_chapters[new_key].get("last_updated", 0):
new_chapters[new_key] = chapter_info
else:
new_chapters[new_key] = chapter_info
else:
# Key already matches actual_num
new_chapters[old_key] = chapter_info
else:
# No actual_num, keep as-is
print(f" ⚠️ Warning: No actual_num for key '{old_key}', keeping as-is")
new_chapters[old_key] = chapter_info
# Sort chapters by actual_num field, then by key as fallback
def sort_key(item):
key, chapter_info = item
actual_num = chapter_info.get("actual_num")
if actual_num is not None:
return actual_num
else:
# Fallback to key if no actual_num
try:
return int(key)
except ValueError:
# For non-numeric keys, sort them at the end
return float('inf')
sorted_chapters = dict(sorted(new_chapters.items(), key=sort_key))
if migrated_count > 0:
# Also migrate and sort chapter_chunks if they exist
if "chapter_chunks" in self.prog:
new_chunks = {}
for old_key, chunk_data in self.prog["chapter_chunks"].items():
if old_key in self.prog["chapters"] and "actual_num" in self.prog["chapters"][old_key]:
new_key = str(self.prog["chapters"][old_key]["actual_num"])
new_chunks[new_key] = chunk_data
else:
new_chunks[old_key] = chunk_data
# Sort chapter_chunks using the same sorting logic
sorted_chunks = dict(sorted(new_chunks.items(), key=sort_key))
self.prog["chapter_chunks"] = sorted_chunks
self.prog["chapters"] = sorted_chapters
self.save()
print(f"✅ Migrated {migrated_count} entries to use actual_num as key and sorted by chapter number")
else:
# Even if no migration occurred, still apply sorting
self.prog["chapters"] = sorted_chapters
if "chapter_chunks" in self.prog:
sorted_chunks = dict(sorted(self.prog["chapter_chunks"].items(), key=sort_key))
self.prog["chapter_chunks"] = sorted_chunks
self.save()
print("✅ Sorted chapters by chapter number")
def get_stats(self, output_dir):
"""Get statistics about translation progress"""
stats = {
"total_tracked": len(self.prog["chapters"]),
"completed": 0,
"missing_files": 0,
"in_progress": 0
}
for chapter_info in self.prog["chapters"].values():
status = chapter_info.get("status")
output_file = chapter_info.get("output_file")
if status == "completed" and output_file:
output_path = os.path.join(output_dir, output_file)
if os.path.exists(output_path):
stats["completed"] += 1
else:
stats["missing_files"] += 1
elif status == "in_progress":
stats["in_progress"] += 1
elif status == "file_missing":
stats["missing_files"] += 1
return stats
# =====================================================
# UNIFIED CONTENT PROCESSOR
# =====================================================
class ContentProcessor:
"""Unified content processing"""
@staticmethod
def clean_ai_artifacts(text, remove_artifacts=True):
"""Remove AI response artifacts from text - but ONLY when enabled"""
if not remove_artifacts:
return text
# First, remove thinking tags if they exist
text = ContentProcessor._remove_thinking_tags(text)
# After removing thinking tags, re-analyze the text structure
# to catch AI artifacts that may now be at the beginning
lines = text.split('\n')
# Clean up empty lines at the beginning
while lines and not lines[0].strip():
lines.pop(0)
if not lines:
return text
# Check the first non-empty line for AI artifacts
first_line = lines[0].strip()
ai_patterns = [
r'^(?:Sure|Okay|Understood|Of course|Got it|Alright|Certainly|Here\'s|Here is)',
r'^(?:I\'ll|I will|Let me) (?:translate|help|assist)',
r'^(?:System|Assistant|AI|User|Human|Model)\s*:',
r'^\[PART\s+\d+/\d+\]',
r'^(?:Translation note|Note|Here\'s the translation|I\'ve translated)',
r'^```(?:html|xml|text)?\s*$', # Enhanced code block detection
r'^', remaining_text, re.IGNORECASE) or
len(remaining_text.strip()) > 50): # Reduced from 100 to 50
print(f"✂️ Removed AI artifact: {first_line[:50]}...")
return remaining_text.lstrip()
if first_line.lower() in ['html', 'text', 'content', 'translation', 'output']:
remaining_lines = lines[1:]
remaining_text = '\n'.join(remaining_lines)
if remaining_text.strip():
print(f"✂️ Removed single word artifact: {first_line}")
return remaining_text.lstrip()
return '\n'.join(lines)
@staticmethod
def _remove_thinking_tags(text):
"""Remove thinking tags that some AI models produce"""
if not text:
return text
# Common thinking tag patterns used by various AI models
thinking_patterns = [
# XML-style thinking tags
(r'.*?', 'thinking'),
(r'.*?', 'think'),
(r'.*?', 'thoughts'),
(r'.*?', 'reasoning'),
(r'.*?', 'analysis'),
(r'.*?', 'reflection'),
# OpenAI o1-style reasoning blocks - fix the regex escaping
(r'<\|thinking\|>.*?\|thinking\|>', 'o1-thinking'),
# Claude-style thinking blocks
(r'\[thinking\].*?\[/thinking\]', 'claude-thinking'),
# Generic bracketed thinking patterns
(r'\[THINKING\].*?\[/THINKING\]', 'bracketed-thinking'),
(r'\[ANALYSIS\].*?\[/ANALYSIS\]', 'bracketed-analysis'),
]
original_text = text
removed_count = 0
for pattern, tag_type in thinking_patterns:
# Use DOTALL flag to match across newlines
matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)
if matches:
text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
removed_count += len(matches)
# Also remove standalone code block markers that might be artifacts
# But preserve all actual content - only remove the ``` markers themselves
code_block_removed = 0
code_block_patterns = [
(r'^```\w*\s*\n', '\n'), # Opening code blocks - replace with newline
(r'\n```\s*$', ''), # Closing code blocks at end - remove entirely
(r'^```\w*\s*$', ''), # Standalone ``` on its own line - remove entirely
]
for pattern, replacement in code_block_patterns:
matches = re.findall(pattern, text, re.MULTILINE)
if matches:
text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
code_block_removed += len(matches)
# Clean up any extra whitespace or empty lines left after removing thinking tags
total_removed = removed_count + code_block_removed
if total_removed > 0:
# Remove multiple consecutive newlines
text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
# Remove leading/trailing whitespace
text = text.strip()
if removed_count > 0 and code_block_removed > 0:
print(f"🧠 Removed {removed_count} thinking tag(s) and {code_block_removed} code block marker(s)")
elif removed_count > 0:
print(f"🧠 Removed {removed_count} thinking tag(s)")
elif code_block_removed > 0:
print(f"📝 Removed {code_block_removed} code block marker(s)")
return text
@staticmethod
def clean_memory_artifacts(text):
"""Remove any memory/summary artifacts that leaked into the translation"""
text = re.sub(r'\[MEMORY\].*?\[END MEMORY\]', '', text, flags=re.DOTALL)
lines = text.split('\n')
cleaned_lines = []
skip_next = False
for line in lines:
if any(marker in line for marker in ['[MEMORY]', '[END MEMORY]', 'Previous context summary:',
'memory summary', 'context summary', '[Context]']):
skip_next = True
continue
if skip_next and line.strip() == '':
skip_next = False
continue
skip_next = False
cleaned_lines.append(line)
return '\n'.join(cleaned_lines)
@staticmethod
def emergency_restore_paragraphs(text, original_html=None, verbose=True):
"""Emergency restoration when AI returns wall of text without proper paragraph tags"""
def log(message):
if verbose:
print(message)
if text.count('
') >= 3:
return text
if original_html:
original_para_count = original_html.count('')
current_para_count = text.count('
')
if current_para_count < original_para_count / 2:
log(f"⚠️ Paragraph mismatch! Original: {original_para_count}, Current: {current_para_count}")
log("🔧 Attempting emergency paragraph restoration...")
if '
' not in text and len(text) > 300:
log("❌ No paragraph tags found - applying emergency restoration")
if '\n\n' in text:
parts = text.split('\n\n')
paragraphs = ['' + part.strip() + '
' for part in parts if part.strip()]
return '\n'.join(paragraphs)
dialogue_pattern = r'(?<=[.!?])\s+(?=[""\u201c\u201d])'
if re.search(dialogue_pattern, text):
parts = re.split(dialogue_pattern, text)
paragraphs = []
for part in parts:
part = part.strip()
if part:
if not part.startswith(''):
part = '
' + part
if not part.endswith('
'):
part = part + ''
paragraphs.append(part)
return '\n'.join(paragraphs)
sentence_boundary = r'(?<=[.!?])\s+(?=[A-Z\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af])'
sentences = re.split(sentence_boundary, text)
if len(sentences) > 1:
paragraphs = []
current_para = []
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
current_para.append(sentence)
should_break = (
len(current_para) >= 3 or
sentence.rstrip().endswith(('"', '"', '"')) or
'* * *' in sentence or
'***' in sentence or
'---' in sentence
)
if should_break:
para_text = ' '.join(current_para)
if not para_text.startswith(''):
para_text = '
' + para_text
if not para_text.endswith('
'):
para_text = para_text + ''
paragraphs.append(para_text)
current_para = []
if current_para:
para_text = ' '.join(current_para)
if not para_text.startswith(''):
para_text = '
' + para_text
if not para_text.endswith('
'):
para_text = para_text + ''
paragraphs.append(para_text)
result = '\n'.join(paragraphs)
log(f"✅ Restored {len(paragraphs)} paragraphs from wall of text")
return result
words = text.split()
if len(words) > 100:
paragraphs = []
words_per_para = max(100, len(words) // 10)
for i in range(0, len(words), words_per_para):
chunk = ' '.join(words[i:i + words_per_para])
if chunk.strip():
paragraphs.append('' + chunk.strip() + '
')
return '\n'.join(paragraphs)
elif '' in text and text.count('
') < 3 and len(text) > 1000:
log("⚠️ Very few paragraphs for long text - checking if more breaks needed")
soup = BeautifulSoup(text, 'html.parser')
existing_paras = soup.find_all('p')
new_paragraphs = []
for para in existing_paras:
para_text = para.get_text()
if len(para_text) > 500:
sentences = re.split(r'(?<=[.!?])\s+', para_text)
if len(sentences) > 5:
chunks = []
current = []
for sent in sentences:
current.append(sent)
if len(current) >= 3:
chunks.append('
' + ' '.join(current) + '
')
current = []
if current:
chunks.append('' + ' '.join(current) + '
')
new_paragraphs.extend(chunks)
else:
new_paragraphs.append(str(para))
else:
new_paragraphs.append(str(para))
return '\n'.join(new_paragraphs)
return text
@staticmethod
def get_content_hash(html_content):
"""Create a stable hash of content"""
try:
soup = BeautifulSoup(html_content, 'html.parser')
for tag in soup(['script', 'style', 'meta', 'link']):
tag.decompose()
text_content = soup.get_text(separator=' ', strip=True)
text_content = ' '.join(text_content.split())
return hashlib.md5(text_content.encode('utf-8')).hexdigest()
except Exception as e:
print(f"[WARNING] Failed to create hash: {e}")
return hashlib.md5(html_content.encode('utf-8')).hexdigest()
@staticmethod
def is_meaningful_text_content(html_content):
"""Check if chapter has meaningful text beyond just structure"""
try:
# Check if this is plain text from enhanced extraction (html2text output)
# html2text output characteristics:
# - Often starts with # for headers
# - Contains markdown-style formatting
# - Doesn't have HTML tags
content_stripped = html_content.strip()
# Quick check for plain text/markdown content
is_plain_text = False
if content_stripped and (
not content_stripped.startswith('<') or # Doesn't start with HTML tag
content_stripped.startswith('#') or # Markdown header
'\n\n' in content_stripped[:500] or # Markdown paragraphs
not '' in content_stripped[:500] and not '
' in content_stripped[:500] # No common HTML tags
):
# This looks like plain text or markdown from html2text
is_plain_text = True
if is_plain_text:
# For plain text, just check the length
text_length = len(content_stripped)
# Be more lenient with plain text since it's already extracted
return text_length > 50 # Much lower threshold for plain text
# Original HTML parsing logic
soup = BeautifulSoup(html_content, 'html.parser')
soup_copy = BeautifulSoup(str(soup), 'html.parser')
for img in soup_copy.find_all('img'):
img.decompose()
text_elements = soup_copy.find_all(['p', 'div', 'span'])
text_content = ' '.join(elem.get_text(strip=True) for elem in text_elements)
headers = soup_copy.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
header_text = ' '.join(h.get_text(strip=True) for h in headers)
if headers and len(text_content.strip()) > 1:
return True
if len(text_content.strip()) > 200:
return True
if len(header_text.strip()) > 100:
return True
return False
except Exception as e:
print(f"Warning: Error checking text content: {e}")
return True
# =====================================================
# UNIFIED CHAPTER EXTRACTOR
# =====================================================
class ChapterExtractor:
"""Unified chapter extraction with three modes: Smart, Comprehensive, and Full"""
def __init__(self, progress_callback=None):
self.pattern_manager = PatternManager()
self.progress_callback = progress_callback # Add progress callback
self.parser = self._get_best_parser() # Determine best parser on init
def _get_best_parser(self):
"""Determine the best parser available, preferring lxml for CJK text"""
try:
import lxml
return 'lxml'
except ImportError:
return 'html.parser'
def _sort_by_opf_spine(self, chapters, opf_path):
"""Sort chapters according to OPF spine order"""
try:
import xml.etree.ElementTree as ET
# Read OPF file
with open(opf_path, 'r', encoding='utf-8') as f:
opf_content = f.read()
# Parse OPF
root = ET.fromstring(opf_content)
# Find namespaces
ns = {'opf': 'http://www.idpf.org/2007/opf'}
if root.tag.startswith('{'):
default_ns = root.tag[1:root.tag.index('}')]
ns = {'opf': default_ns}
# Build manifest map (id -> href)
manifest = {}
for item in root.findall('.//opf:manifest/opf:item', ns):
item_id = item.get('id')
href = item.get('href')
if item_id and href:
manifest[item_id] = href
# Get spine order
spine_order = []
spine = root.find('.//opf:spine', ns)
if spine is not None:
for itemref in spine.findall('opf:itemref', ns):
idref = itemref.get('idref')
if idref and idref in manifest:
href = manifest[idref]
spine_order.append(href)
if not spine_order:
print("⚠️ No spine order found in OPF, keeping original order")
return chapters
# Create a mapping of filenames to spine position
spine_map = {}
for idx, href in enumerate(spine_order):
# Try different matching strategies
basename = os.path.basename(href)
spine_map[basename] = idx
spine_map[href] = idx
# Also store without extension for flexible matching
name_no_ext = os.path.splitext(basename)[0]
spine_map[name_no_ext] = idx
print(f"📋 OPF spine contains {len(spine_order)} items")
# Sort chapters based on spine order
def get_spine_position(chapter):
# Try to match chapter to spine
filename = chapter.get('filename', '')
basename = chapter.get('original_basename', '')
# Try exact filename match
if filename in spine_map:
return spine_map[filename]
# Try basename match
if basename in spine_map:
return spine_map[basename]
# Try basename of filename
if filename:
fname_base = os.path.basename(filename)
if fname_base in spine_map:
return spine_map[fname_base]
# Try without extension
if basename:
if basename + '.html' in spine_map:
return spine_map[basename + '.html']
if basename + '.xhtml' in spine_map:
return spine_map[basename + '.xhtml']
# Fallback to chapter number * 1000 (to sort after spine items)
return 1000000 + chapter.get('num', 0)
# Sort chapters
sorted_chapters = sorted(chapters, key=get_spine_position)
# Renumber chapters based on new order
for idx, chapter in enumerate(sorted_chapters, 1):
chapter['spine_order'] = idx
# Optionally update chapter numbers to match spine order
# chapter['num'] = idx # Uncomment if you want to renumber
# Log reordering info
reordered_count = 0
for idx, chapter in enumerate(sorted_chapters):
original_idx = chapters.index(chapter)
if original_idx != idx:
reordered_count += 1
if reordered_count > 0:
print(f"🔄 Reordered {reordered_count} chapters to match OPF spine")
else:
print(f"✅ Chapter order already matches OPF spine")
return sorted_chapters
except Exception as e:
print(f"⚠️ Could not sort by OPF spine: {e}")
import traceback
traceback.print_exc()
return chapters
def protect_angle_brackets_with_korean(self, text: str) -> str:
"""Protect CJK text in angle brackets from HTML parsing"""
if text is None:
return ""
import re
# Extended pattern to include Korean, Chinese, and Japanese characters
cjk_pattern = r'[가-힣ㄱ-ㅎㅏ-ㅣ一-龿ぁ-ゟァ-ヿ]'
bracket_pattern = rf'<([^<>]*{cjk_pattern}[^<>]*)>'
def replace_brackets(match):
content = match.group(1)
return f'<{content}>'
return re.sub(bracket_pattern, replace_brackets, text)
def ensure_all_opf_chapters_extracted(zf, chapters, out):
"""Ensure ALL chapters from OPF spine are extracted, not just what ChapterExtractor found"""
# Parse OPF to get ALL chapters in spine
opf_chapters = []
try:
# Find content.opf
opf_content = None
for name in zf.namelist():
if name.endswith('content.opf'):
opf_content = zf.read(name)
break
if not opf_content:
return chapters # No OPF, return original
import xml.etree.ElementTree as ET
root = ET.fromstring(opf_content)
# Handle namespaces
ns = {'opf': 'http://www.idpf.org/2007/opf'}
if root.tag.startswith('{'):
default_ns = root.tag[1:root.tag.index('}')]
ns = {'opf': default_ns}
# Get manifest
manifest = {}
for item in root.findall('.//opf:manifest/opf:item', ns):
item_id = item.get('id')
href = item.get('href')
media_type = item.get('media-type', '')
if item_id and href and ('html' in media_type.lower() or href.endswith(('.html', '.xhtml', '.htm'))):
manifest[item_id] = href
# Get spine order
spine = root.find('.//opf:spine', ns)
if spine is not None:
for itemref in spine.findall('opf:itemref', ns):
idref = itemref.get('idref')
if idref and idref in manifest:
href = manifest[idref]
filename = os.path.basename(href)
# Skip nav, toc, cover
if any(skip in filename.lower() for skip in ['nav', 'toc', 'cover']):
continue
opf_chapters.append(href)
print(f"📚 OPF spine contains {len(opf_chapters)} chapters")
# Check which OPF chapters are missing from extraction
extracted_files = set()
for c in chapters:
if 'filename' in c:
extracted_files.add(c['filename'])
if 'original_basename' in c:
extracted_files.add(c['original_basename'])
missing_chapters = []
for opf_chapter in opf_chapters:
basename = os.path.basename(opf_chapter)
if basename not in extracted_files and opf_chapter not in extracted_files:
missing_chapters.append(opf_chapter)
if missing_chapters:
print(f"⚠️ {len(missing_chapters)} chapters in OPF but not extracted!")
print(f" Missing: {missing_chapters[:5]}{'...' if len(missing_chapters) > 5 else ''}")
# Extract the missing chapters
for href in missing_chapters:
try:
# Read the chapter content
content = zf.read(href).decode('utf-8')
# Extract chapter number
import re
basename = os.path.basename(href)
matches = re.findall(r'(\d+)', basename)
if matches:
chapter_num = int(matches[-1])
else:
chapter_num = len(chapters) + 1
# Create chapter entry
from bs4 import BeautifulSoup
parser = 'lxml' if 'lxml' in sys.modules else 'html.parser'
soup = BeautifulSoup(content, parser)
# Get title
title = "Chapter " + str(chapter_num)
title_tag = soup.find('title')
if title_tag:
title = title_tag.get_text().strip() or title
else:
for tag in ['h1', 'h2', 'h3']:
header = soup.find(tag)
if header:
title = header.get_text().strip() or title
break
# Save the chapter file
output_filename = f"chapter_{chapter_num:04d}_{basename}"
output_path = os.path.join(out, output_filename)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(content)
# Add to chapters list
new_chapter = {
'num': chapter_num,
'title': title,
'body': content,
'filename': href,
'original_basename': basename,
'file_size': len(content),
'has_images': bool(soup.find_all('img')),
'detection_method': 'opf_recovery',
'content_hash': None # Will be calculated later
}
chapters.append(new_chapter)
print(f" ✅ Recovered chapter {chapter_num}: {basename}")
except Exception as e:
print(f" ❌ Failed to extract {href}: {e}")
# Re-sort chapters by number
chapters.sort(key=lambda x: x['num'])
print(f"✅ Total chapters after OPF recovery: {len(chapters)}")
except Exception as e:
print(f"⚠️ Error checking OPF chapters: {e}")
import traceback
traceback.print_exc()
return chapters
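# Minimal usage sketch (hypothetical call site, not from the original pipeline): run the
# recovery pass after the main extraction so spine entries the extractor missed are
# appended and re-sorted by number.
#
#   with zipfile.ZipFile(epub_path) as zf:
#       chapters, _lang = extractor._extract_chapters_universal(zf, "smart")
#       chapters = ensure_all_opf_chapters_extracted(zf, chapters, output_dir)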
def extract_chapters(self, zf, output_dir):
"""Extract chapters and all resources from EPUB using ThreadPoolExecutor"""
import time
# Check stop at the very beginning
if is_stop_requested():
print("❌ Extraction stopped by user")
return []
print("🚀 Starting EPUB extraction with ThreadPoolExecutor...")
print(f"📄 Using parser: {self.parser} {'(optimized for CJK)' if self.parser == 'lxml' else '(standard)'}")
# Initial progress
if self.progress_callback:
self.progress_callback("Starting EPUB extraction...")
# First, extract and save content.opf for reference
for name in zf.namelist():
if name.endswith('.opf'):
try:
opf_content = zf.read(name).decode('utf-8', errors='ignore')
opf_output_path = os.path.join(output_dir, 'content.opf')
with open(opf_output_path, 'w', encoding='utf-8') as f:
f.write(opf_content)
print(f"📋 Saved OPF file: {name} → content.opf")
break
except Exception as e:
print(f"⚠️ Could not save OPF file: {e}")
# Get extraction mode from environment
extraction_mode = os.getenv("EXTRACTION_MODE", "smart").lower()
print(f"✅ Using {extraction_mode.capitalize()} extraction mode")
# Get number of workers from environment or use default
max_workers = int(os.getenv("EXTRACTION_WORKERS", "2"))
print(f"🔧 Using {max_workers} workers for parallel processing")
extracted_resources = self._extract_all_resources(zf, output_dir)
# Check stop after resource extraction
if is_stop_requested():
print("❌ Extraction stopped by user")
return []
metadata_path = os.path.join(output_dir, 'metadata.json')
if os.path.exists(metadata_path):
print("📋 Loading existing metadata...")
with open(metadata_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
else:
print("📋 Extracting fresh metadata...")
metadata = self._extract_epub_metadata(zf)
print(f"📋 Extracted metadata: {list(metadata.keys())}")
chapters, detected_language = self._extract_chapters_universal(zf, extraction_mode)
# Sort chapters according to OPF spine order if available
opf_path = os.path.join(output_dir, 'content.opf')
if os.path.exists(opf_path) and chapters:
print("📋 Sorting chapters according to OPF spine order...")
chapters = self._sort_by_opf_spine(chapters, opf_path)
print(f"✅ Chapters sorted according to OPF reading order")
# Check stop after chapter extraction
if is_stop_requested():
print("❌ Extraction stopped by user")
return []
if not chapters:
print("❌ No chapters could be extracted!")
return []
chapters_info_path = os.path.join(output_dir, 'chapters_info.json')
chapters_info = []
chapters_info_lock = threading.Lock()
def process_chapter(chapter):
"""Process a single chapter"""
# Check stop in worker
if is_stop_requested():
return None
info = {
'num': chapter['num'],
'title': chapter['title'],
'original_filename': chapter.get('filename', ''),
'has_images': chapter.get('has_images', False),
'image_count': chapter.get('image_count', 0),
'text_length': chapter.get('file_size', len(chapter.get('body', ''))),
'detection_method': chapter.get('detection_method', 'unknown'),
'content_hash': chapter.get('content_hash', '')
}
if chapter.get('has_images'):
try:
soup = BeautifulSoup(chapter.get('body', ''), self.parser)
images = soup.find_all('img')
info['images'] = [img.get('src', '') for img in images]
except:
info['images'] = []
return info
# Process chapters in parallel
print(f"🔄 Processing {len(chapters)} chapters in parallel...")
if self.progress_callback:
self.progress_callback(f"Processing {len(chapters)} chapters...")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all tasks
future_to_chapter = {
executor.submit(process_chapter, chapter): chapter
for chapter in chapters
}
# Process completed tasks
completed = 0
for future in as_completed(future_to_chapter):
if is_stop_requested():
print("❌ Extraction stopped by user")
# Cancel remaining futures
for f in future_to_chapter:
f.cancel()
return []
try:
result = future.result()
if result:
with chapters_info_lock:
chapters_info.append(result)
completed += 1
# Yield to GUI periodically (can be disabled for max speed)
if completed % 5 == 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1":
time.sleep(0.001)
# Progress updates
if completed % 10 == 0 or completed == len(chapters):
progress_msg = f"Processed {completed}/{len(chapters)} chapters"
print(f" 📊 {progress_msg}")
if self.progress_callback:
self.progress_callback(progress_msg)
except Exception as e:
chapter = future_to_chapter[future]
print(f" ❌ Error processing chapter {chapter['num']}: {e}")
# Sort chapters_info by chapter number to maintain order
chapters_info.sort(key=lambda x: x['num'])
print(f"✅ Successfully processed {len(chapters_info)} chapters")
with open(chapters_info_path, 'w', encoding='utf-8') as f:
json.dump(chapters_info, f, ensure_ascii=False, indent=2)
print(f"💾 Saved detailed chapter info to: chapters_info.json")
metadata.update({
'chapter_count': len(chapters),
'detected_language': detected_language,
'extracted_resources': extracted_resources,
'extraction_mode': extraction_mode,
'extraction_summary': {
'total_chapters': len(chapters),
'chapter_range': f"{chapters[0]['num']}-{chapters[-1]['num']}",
'resources_extracted': sum(len(files) for files in extracted_resources.values())
}
})
metadata['chapter_titles'] = {
str(c['num']): c['title'] for c in chapters
}
with open(metadata_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=2)
print(f"💾 Saved comprehensive metadata to: {metadata_path}")
self._create_extraction_report(output_dir, metadata, chapters, extracted_resources)
self._log_extraction_summary(chapters, extracted_resources, detected_language)
print(f"🔍 VERIFICATION: {extraction_mode.capitalize()} chapter extraction completed successfully")
print(f"⚡ Used {max_workers} workers for parallel processing")
return chapters
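# Illustrative shape of one chapters_info.json entry written above (all values are
# examples, not taken from a real book):
#
#   {
#     "num": 1,
#     "title": "Chapter 1",
#     "original_filename": "Text/chapter001.xhtml",
#     "has_images": false,
#     "image_count": 0,
#     "text_length": 5234,
#     "detection_method": "comprehensive_sequential",
#     "content_hash": "<hash>"
#   }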
def _extract_all_resources(self, zf, output_dir):
"""Extract all resources with parallel processing"""
import time
extracted_resources = {
'css': [],
'fonts': [],
'images': [],
'epub_structure': [],
'other': []
}
# Check if already extracted
extraction_marker = os.path.join(output_dir, '.resources_extracted')
if os.path.exists(extraction_marker):
print("📦 Resources already extracted, skipping...")
return self._count_existing_resources(output_dir, extracted_resources)
self._cleanup_old_resources(output_dir)
# Create directories
for resource_type in ['css', 'fonts', 'images']:
os.makedirs(os.path.join(output_dir, resource_type), exist_ok=True)
print(f"📦 Extracting resources in parallel...")
# Get list of files to process
file_list = [f for f in zf.namelist() if not f.endswith('/') and os.path.basename(f)]
# Thread-safe lock for extracted_resources
resource_lock = threading.Lock()
def extract_single_resource(file_path):
if is_stop_requested():
return None
try:
file_data = zf.read(file_path)
resource_info = self._categorize_resource(file_path, os.path.basename(file_path))
if resource_info:
resource_type, target_dir, safe_filename = resource_info
target_path = os.path.join(output_dir, target_dir, safe_filename) if target_dir else os.path.join(output_dir, safe_filename)
with open(target_path, 'wb') as f:
f.write(file_data)
# Thread-safe update
with resource_lock:
extracted_resources[resource_type].append(safe_filename)
return (resource_type, safe_filename)
except Exception as e:
print(f"[WARNING] Failed to extract {file_path}: {e}")
return None
# Process files in parallel
total_resources = len(file_list)
extracted_count = 0
# Use same worker count as chapter processing
resource_workers = int(os.getenv("EXTRACTION_WORKERS", "2"))
with ThreadPoolExecutor(max_workers=resource_workers) as executor:
futures = {executor.submit(extract_single_resource, file_path): file_path
for file_path in file_list}
for future in as_completed(futures):
if is_stop_requested():
executor.shutdown(wait=False)
break
extracted_count += 1
# Progress update every 20 files
if extracted_count % 20 == 0 and self.progress_callback:
self.progress_callback(f"Extracting resources: {extracted_count}/{total_resources}")
# Yield to GUI periodically (can be disabled for max speed)
if extracted_count % 10 == 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1":
time.sleep(0.001)
result = future.result()
if result:
resource_type, filename = result
# Only print for important resources
if extracted_count < 10 or resource_type in ['css', 'fonts']:
print(f" 📄 Extracted {resource_type}: {filename}")
# Mark as complete
with open(extraction_marker, 'w') as f:
f.write(f"Resources extracted at {time.time()}")
self._validate_critical_files(output_dir, extracted_resources)
return extracted_resources
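# Illustrative return shape (example filenames only):
#
#   {
#     "css": ["stylesheet.css"],
#     "fonts": ["NotoSansKR-Regular.otf"],
#     "images": ["cover.jpg", "illust_001.png"],
#     "epub_structure": ["container.xml", "content.opf", "toc.ncx"],
#     "other": []
#   }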
def _extract_chapters_universal(self, zf, extraction_mode="smart"):
"""Universal chapter extraction with four modes: smart, comprehensive, full, enhanced
All modes merge Section/Chapter pairs unless chapter merging is disabled
Enhanced mode uses html2text for cleaner text processing
Files are processed in parallel for improved performance
"""
# Check stop at the beginning
if is_stop_requested():
print("❌ Chapter extraction stopped by user")
return [], 'unknown'
# Import time for yielding
import time
# Initialize enhanced extractor if using enhanced mode
enhanced_extractor = None
enhanced_filtering = extraction_mode # Default fallback
preserve_structure = True
# Independent control: translate cover.html when requested
translate_cover_html = os.getenv("TRANSLATE_COVER_HTML", "0") == "1"
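# Configuration sketch: environment variables read along this extraction path.
# Values shown are examples, not the defaults enforced in the code.
#
#   os.environ["EXTRACTION_MODE"] = "enhanced"        # smart | comprehensive | full | enhanced
#   os.environ["ENHANCED_FILTERING"] = "smart"        # filtering level used by enhanced mode
#   os.environ["EXTRACTION_WORKERS"] = "4"            # thread count for parallel processing
#   os.environ["DISABLE_CHAPTER_MERGING"] = "1"       # process Section/Chapter files independently
#   os.environ["TRANSLATE_COVER_HTML"] = "1"          # include cover.html instead of skipping it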
if extraction_mode == "enhanced":
print("🚀 Initializing Enhanced extraction mode with html2text...")
# Get enhanced mode configuration from environment
enhanced_filtering = os.getenv("ENHANCED_FILTERING", "smart")
# Avoid 'full' with html2text to prevent XML declaration artifacts; use 'comprehensive' instead
if str(enhanced_filtering).lower() == 'full':
enhanced_filtering = 'comprehensive'
preserve_structure = os.getenv("ENHANCED_PRESERVE_STRUCTURE", "1") == "1"
print(f" • Enhanced filtering level: {enhanced_filtering}")
print(f" • Preserve structure: {preserve_structure}")
# Try to initialize enhanced extractor
try:
# Import our enhanced extractor (assume it's in the same directory or importable)
from enhanced_text_extractor import EnhancedTextExtractor
enhanced_extractor = EnhancedTextExtractor(
filtering_mode=enhanced_filtering,
preserve_structure=preserve_structure
)
print("✅ Enhanced text extractor initialized successfully")
except ImportError as e:
print(f"❌ Enhanced text extractor module not found: {e}")
print(f"❌ Cannot use enhanced extraction mode. Please install enhanced_text_extractor or select a different extraction mode.")
raise e
except Exception as e:
print(f"❌ Enhanced extractor initialization failed: {e}")
print(f"❌ Cannot use enhanced extraction mode. Please select a different extraction mode.")
raise e
chapters = []
sample_texts = []
# First phase: Collect HTML files
html_files = []
file_list = zf.namelist()
total_files = len(file_list)
# Update progress for file collection
if self.progress_callback and total_files > 100:
self.progress_callback(f"Scanning {total_files} files in EPUB...")
for idx, name in enumerate(file_list):
# Check stop while collecting files
if is_stop_requested():
print("❌ Chapter extraction stopped by user")
return [], 'unknown'
# Yield to GUI every 50 files (can be disabled for max speed)
if idx % 50 == 0 and idx > 0:
if os.getenv("ENABLE_GUI_YIELD", "1") == "1":
time.sleep(0.001) # Brief yield to GUI
if self.progress_callback and total_files > 100:
self.progress_callback(f"Scanning files: {idx}/{total_files}")
if name.lower().endswith(('.xhtml', '.html', '.htm')):
# Skip cover files by default unless override is enabled
basename = os.path.basename(name).lower()
if basename in ['cover.html', 'cover.xhtml', 'cover.htm'] and not translate_cover_html:
print(f"[SKIP] Cover file excluded from all modes: {name}")
continue
# Apply filtering based on the actual extraction mode (or enhanced_filtering for enhanced mode)
current_filtering = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
if current_filtering == "smart":
# Smart mode: aggressive filtering
lower_name = name.lower()
if any(skip in lower_name for skip in [
'nav', 'toc', 'contents', 'title', 'index',
'copyright', 'acknowledgment', 'dedication'
]):
continue
elif current_filtering == "comprehensive":
# Comprehensive mode: moderate filtering
skip_keywords = ['nav.', 'toc.', 'contents.', 'copyright.']
basename = os.path.basename(name.lower())
should_skip = False
for skip in skip_keywords:
if basename == skip + 'xhtml' or basename == skip + 'html' or basename == skip + 'htm':
should_skip = True
break
if should_skip:
print(f"[SKIP] Navigation/TOC file: {name}")
continue
# else: full mode - no filtering at all (except cover which is filtered above)
html_files.append(name)
# Update mode description to include enhanced mode
mode_description = {
"smart": "potential content files",
"comprehensive": "HTML files",
"full": "ALL HTML/XHTML files (no filtering)",
"enhanced": f"files (enhanced with {enhanced_filtering} filtering)"
}
print(f"📚 Found {len(html_files)} {mode_description.get(extraction_mode, 'files')} in EPUB")
# Sort files to ensure proper order
html_files.sort()
# Check if merging is disabled via environment variable
disable_merging = os.getenv("DISABLE_CHAPTER_MERGING", "0") == "1"
processed_files = set()
merge_candidates = {} # Store potential merges without reading files yet
if disable_merging:
print("📌 Chapter merging is DISABLED - processing all files independently")
else:
print("📌 Chapter merging is ENABLED")
# Only do merging logic if not disabled
file_groups = {}
# Group files by their base number to detect Section/Chapter pairs
for file_path in html_files:
filename = os.path.basename(file_path)
# Try different patterns to extract base number
base_num = None
# Pattern 1: "No00014" from "No00014Section.xhtml"
match = re.match(r'(No\d+)', filename)
if match:
base_num = match.group(1)
else:
# Pattern 2: "0014" from "0014_section.html" or "0014_chapter.html"
match = re.match(r'^(\d+)[_\-]', filename)
if match:
base_num = match.group(1)
else:
# Pattern 3: Just numbers at the start
match = re.match(r'^(\d+)', filename)
if match:
base_num = match.group(1)
if base_num:
if base_num not in file_groups:
file_groups[base_num] = []
file_groups[base_num].append(file_path)
# Identify merge candidates WITHOUT reading files yet
for base_num, group_files in sorted(file_groups.items()):
if len(group_files) == 2:
# Check if we have a Section/Chapter pair based on filenames only
section_file = None
chapter_file = None
for file_path in group_files:
basename = os.path.basename(file_path)
# More strict detection - must have 'section' or 'chapter' in the filename
if 'section' in basename.lower() and 'chapter' not in basename.lower():
section_file = file_path
elif 'chapter' in basename.lower() and 'section' not in basename.lower():
chapter_file = file_path
if section_file and chapter_file:
# Store as potential merge candidate
merge_candidates[chapter_file] = section_file
processed_files.add(section_file)
print(f"[DEBUG] Potential merge candidate: {base_num}")
print(f" Section: {os.path.basename(section_file)}")
print(f" Chapter: {os.path.basename(chapter_file)}")
# Filter out section files that were marked for merging
files_to_process = []
for file_path in html_files:
if not disable_merging and file_path in processed_files:
print(f"[DEBUG] Skipping section file: {file_path}")
continue
files_to_process.append(file_path)
print(f"📚 Processing {len(files_to_process)} files after merge analysis")
# Thread-safe collections
sample_texts_lock = threading.Lock()
file_size_groups_lock = threading.Lock()
h1_count_lock = threading.Lock()
h2_count_lock = threading.Lock()
# Initialize counters
file_size_groups = {}
h1_count = 0
h2_count = 0
processed_count = 0
processed_count_lock = threading.Lock()
# Progress tracking
total_files = len(files_to_process)
# Function to process a single HTML file
def process_single_html_file(file_path, file_index):
nonlocal h1_count, h2_count, processed_count
# Check stop
if is_stop_requested():
return None
# Update progress
with processed_count_lock:
processed_count += 1
current_count = processed_count
if self.progress_callback and current_count % 5 == 0:
progress_msg = f"Processing chapters: {current_count}/{total_files} ({current_count*100//total_files}%)"
self.progress_callback(progress_msg)
try:
# Read file data
file_data = zf.read(file_path)
# Decode the file data
html_content = None
detected_encoding = None
for encoding in ['utf-8', 'utf-16', 'gb18030', 'shift_jis', 'euc-kr', 'gbk', 'big5']:
try:
html_content = file_data.decode(encoding)
detected_encoding = encoding
break
except UnicodeDecodeError:
continue
if not html_content:
print(f"[WARNING] Could not decode {file_path}")
return None
# Check if this file needs merging
if not disable_merging and file_path in merge_candidates:
section_file = merge_candidates[file_path]
print(f"[DEBUG] Processing merge for: {file_path}")
try:
# Read section file
section_data = zf.read(section_file)
section_html = None
for encoding in ['utf-8', 'utf-16', 'gb18030', 'shift_jis', 'euc-kr', 'gbk', 'big5']:
try:
section_html = section_data.decode(encoding)
break
except UnicodeDecodeError:
continue
if section_html:
# Quick check if section is small enough to merge
section_soup = BeautifulSoup(section_html, self.parser)
section_text = section_soup.get_text(strip=True)
if len(section_text) < 200: # Merge if section is small
# Extract body content
chapter_soup = BeautifulSoup(html_content, self.parser)
if section_soup.body:
section_body_content = ''.join(str(child) for child in section_soup.body.children)
else:
section_body_content = section_html
if chapter_soup.body:
chapter_body_content = ''.join(str(child) for child in chapter_soup.body.children)
else:
chapter_body_content = html_content
# Merge content
html_content = section_body_content + "\n\n" + chapter_body_content  # join section intro and chapter body
print(f" → MERGED: Section ({len(section_text)} chars) + Chapter")
else:
print(f" → NOT MERGED: Section too large ({len(section_text)} chars)")
# Remove from processed files so it gets processed separately
processed_files.discard(section_file)
except Exception as e:
print(f"[WARNING] Failed to merge {file_path}: {e}")
# === ENHANCED EXTRACTION POINT ===
# Initialize variables that will be set by extraction
content_html = None
content_text = None
chapter_title = None
enhanced_extraction_used = False
# Determine whether to use enhanced extractor based on toggle and provider
use_enhanced = enhanced_extractor and extraction_mode == "enhanced"
force_bs_traditional = False
try:
force_bs = os.getenv('FORCE_BS_FOR_TRADITIONAL', '0') == '1'
model_env = os.getenv('MODEL', '')
if force_bs and is_traditional_translation_api(model_env):
use_enhanced = False
force_bs_traditional = True
except Exception:
pass
# Use enhanced extractor if available and allowed
if use_enhanced:
print(f"🚀 Using enhanced extraction for: {os.path.basename(file_path)}")
# Get clean text from html2text
clean_content, _, chapter_title = enhanced_extractor.extract_chapter_content(
html_content, enhanced_filtering
)
enhanced_extraction_used = True
print(f"✅ Enhanced extraction complete: {len(clean_content)} chars")
# For enhanced mode, store the markdown/plain text
# This will be sent to the translation API as-is
content_html = clean_content # This is MARKDOWN/PLAIN TEXT from html2text
content_text = clean_content # Same clean text for analysis
# BeautifulSoup method (only for non-enhanced modes)
if not enhanced_extraction_used:
if extraction_mode == "enhanced" and not force_bs_traditional:
# Enhanced mode failed - skip this file
print(f"❌ Skipping {file_path} - enhanced extraction required but not available")
return None
# Parse the (possibly merged) content
protected_html = self.protect_angle_brackets_with_korean(html_content)
# Use lxml parser which handles both HTML and XHTML well
soup = BeautifulSoup(protected_html, self.parser)
# Get effective mode for filtering
effective_filtering = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
# In full mode, keep the entire HTML structure
if effective_filtering == "full":
content_html = html_content # Keep EVERYTHING
content_text = soup.get_text(strip=True)
else:
# Smart and comprehensive modes extract body content
if soup.body:
content_html = str(soup.body)
content_text = soup.body.get_text(strip=True)
else:
content_html = html_content
content_text = soup.get_text(strip=True)
# Extract title (with ignore settings support)
chapter_title = None
# Check ignore settings for batch translation
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
# Extract from title tag if not ignored
if not ignore_title_tag and soup.title and soup.title.string:
chapter_title = soup.title.string.strip()
# Extract from header tags if not ignored and no title found
if not chapter_title and not ignore_header_tags:
for header_tag in ['h1', 'h2', 'h3']:
header = soup.find(header_tag)
if header:
chapter_title = header.get_text(strip=True)
break
# Fallback to filename if nothing found
if not chapter_title:
chapter_title = os.path.splitext(os.path.basename(file_path))[0]
# Get the effective extraction mode for processing logic
effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
# Skip truly empty files in smart mode
# BUT: Never skip anything when merging is disabled (to ensure section files are processed)
if effective_mode == "smart" and not disable_merging and len(content_text.strip()) < 10:
print(f"[SKIP] Nearly empty file: {file_path} ({len(content_text)} chars)")
return None
# Get actual chapter number based on original position in files_to_process
actual_chapter_num = file_index + 1
# Mode-specific logic
if effective_mode == "comprehensive" or effective_mode == "full":
# For comprehensive/full mode, use sequential numbering
chapter_num = actual_chapter_num
if not chapter_title:
chapter_title = os.path.splitext(os.path.basename(file_path))[0]
detection_method = f"{extraction_mode}_sequential" if extraction_mode == "enhanced" else f"{effective_mode}_sequential"
elif effective_mode == "smart":
# For smart mode, when merging is disabled, use sequential numbering
if disable_merging:
chapter_num = actual_chapter_num
if not chapter_title:
chapter_title = os.path.splitext(os.path.basename(file_path))[0]
detection_method = f"{extraction_mode}_sequential_no_merge" if extraction_mode == "enhanced" else "sequential_no_merge"
else:
# When merging is enabled, try to extract chapter info
protected_html = self.protect_angle_brackets_with_korean(html_content)
soup = BeautifulSoup(protected_html, self.parser)
# Count headers (thread-safe)
h1_tags = soup.find_all('h1')
h2_tags = soup.find_all('h2')
if h1_tags:
with h1_count_lock:
h1_count += 1
if h2_tags:
with h2_count_lock:
h2_count += 1
# Try to extract chapter number and title
chapter_num, extracted_title, detection_method = self._extract_chapter_info(
soup, file_path, content_text, html_content
)
# Use extracted title if we don't have one
if extracted_title and not chapter_title:
chapter_title = extracted_title
# For hash-based filenames, chapter_num might be None
if chapter_num is None:
chapter_num = actual_chapter_num # Use actual chapter count
detection_method = f"{extraction_mode}_sequential_fallback" if extraction_mode == "enhanced" else "sequential_fallback"
print(f"[DEBUG] No chapter number found in {file_path}, assigning: {chapter_num}")
# Filter content_html for ignore settings (before processing)
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
if (ignore_title_tag or ignore_header_tags) and content_html and not enhanced_extraction_used:
# Parse the content HTML to remove ignored tags
content_soup = BeautifulSoup(content_html, self.parser)
# Remove title tags if ignored
if ignore_title_tag:
for title_tag in content_soup.find_all('title'):
title_tag.decompose()
# Remove header tags if ignored
if ignore_header_tags:
for header_tag in content_soup.find_all(['h1', 'h2', 'h3']):
header_tag.decompose()
# Update content_html with filtered version
content_html = str(content_soup)
# Process images and metadata (same for all modes)
protected_html = self.protect_angle_brackets_with_korean(html_content)
soup = BeautifulSoup(protected_html, self.parser)
images = soup.find_all('img')
has_images = len(images) > 0
is_image_only_chapter = has_images and len(content_text.strip()) < 500
if is_image_only_chapter:
print(f"[DEBUG] Image-only chapter detected: {file_path} ({len(images)} images, {len(content_text)} chars)")
content_hash = ContentProcessor.get_content_hash(content_html)
# Collect file size groups for smart mode (thread-safe)
if effective_mode == "smart":
file_size = len(content_text)
with file_size_groups_lock:
if file_size not in file_size_groups:
file_size_groups[file_size] = []
file_size_groups[file_size].append(file_path)
# Collect sample texts (thread-safe)
with sample_texts_lock:
if len(sample_texts) < 5:
sample_texts.append(content_text[:1000])
# Ensure chapter_num is always an integer
if isinstance(chapter_num, float):
chapter_num = int(chapter_num)
# Create chapter info
chapter_info = {
"num": chapter_num, # Now guaranteed to have a value
"title": chapter_title or f"Chapter {chapter_num}",
"body": content_html,
"filename": file_path,
"original_filename": os.path.basename(file_path),
"original_basename": os.path.splitext(os.path.basename(file_path))[0],
"content_hash": content_hash,
"detection_method": detection_method if detection_method else "pending",
"file_size": len(content_text),
"has_images": has_images,
"image_count": len(images),
"is_empty": len(content_text.strip()) == 0,
"is_image_only": is_image_only_chapter,
"extraction_mode": extraction_mode,
"file_index": file_index # Store original file index for sorting
}
# Add enhanced extraction info if used
if enhanced_extraction_used:
chapter_info["enhanced_extraction"] = True
chapter_info["enhanced_filtering"] = enhanced_filtering
chapter_info["preserve_structure"] = preserve_structure
# Add merge info if applicable
if not disable_merging and file_path in merge_candidates:
chapter_info["was_merged"] = True
chapter_info["merged_with"] = merge_candidates[file_path]
if effective_mode == "smart":
chapter_info["language_sample"] = content_text[:500]
# Debug for section files
if 'section' in chapter_info['original_basename'].lower():
print(f"[DEBUG] Added section file to candidates: {chapter_info['original_basename']} (size: {chapter_info['file_size']})")
return chapter_info
except Exception as e:
print(f"[ERROR] Failed to process {file_path}: {e}")
import traceback
traceback.print_exc()
return None
# Process files in parallel or sequentially based on file count
print(f"🚀 Processing {len(files_to_process)} HTML files...")
# Initial progress
if self.progress_callback:
self.progress_callback(f"Processing {len(files_to_process)} chapters...")
candidate_chapters = [] # For smart mode
chapters_direct = [] # For other modes
# Decide whether to use parallel processing
use_parallel = len(files_to_process) > 10
if use_parallel:
# Get worker count from environment variable
max_workers = int(os.getenv("EXTRACTION_WORKERS", "2"))
print(f"📦 Using parallel processing with {max_workers} workers...")
# Process files in parallel
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all files for processing
future_to_file = {
executor.submit(process_single_html_file, file_path, idx): (file_path, idx)
for idx, file_path in enumerate(files_to_process)
}
# Collect results as they complete
for future in as_completed(future_to_file):
if is_stop_requested():
print("❌ Chapter processing stopped by user")
executor.shutdown(wait=False)
return [], 'unknown'
try:
chapter_info = future.result()
if chapter_info:
effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
# For smart mode when merging is enabled, collect candidates
# Otherwise, add directly to chapters
if effective_mode == "smart" and not disable_merging:
candidate_chapters.append(chapter_info)
else:
chapters_direct.append(chapter_info)
except Exception as e:
file_path, idx = future_to_file[future]
print(f"[ERROR] Thread error processing {file_path}: {e}")
else:
print("📦 Using sequential processing (small file count)...")
# Process files sequentially for small EPUBs
for idx, file_path in enumerate(files_to_process):
if is_stop_requested():
print("❌ Chapter processing stopped by user")
return [], 'unknown'
chapter_info = process_single_html_file(file_path, idx)
if chapter_info:
effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
# For smart mode when merging is enabled, collect candidates
# Otherwise, add directly to chapters
if effective_mode == "smart" and not disable_merging:
candidate_chapters.append(chapter_info)
else:
chapters_direct.append(chapter_info)
# Final progress update
if self.progress_callback:
self.progress_callback(f"Chapter processing complete: {len(candidate_chapters) + len(chapters_direct)} chapters")
# Sort direct chapters by file index to maintain order
chapters_direct.sort(key=lambda x: x["file_index"])
# Post-process smart mode candidates (only when merging is enabled)
effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode
if effective_mode == "smart" and candidate_chapters and not disable_merging:
# Check stop before post-processing
if is_stop_requested():
print("❌ Chapter post-processing stopped by user")
return chapters, 'unknown'
print(f"\n[SMART MODE] Processing {len(candidate_chapters)} candidate files...")
# Sort candidates by file index to maintain order
candidate_chapters.sort(key=lambda x: x["file_index"])
# Debug: Show what files we have
section_files = [c for c in candidate_chapters if 'section' in c['original_basename'].lower()]
chapter_files = [c for c in candidate_chapters if 'chapter' in c['original_basename'].lower() and 'section' not in c['original_basename'].lower()]
other_files = [c for c in candidate_chapters if c not in section_files and c not in chapter_files]
print(f" 📊 File breakdown:")
print(f" • Section files: {len(section_files)}")
print(f" • Chapter files: {len(chapter_files)}")
print(f" • Other files: {len(other_files)}")
# Original smart mode logic when merging is enabled
# First, separate files with detected chapter numbers from those without
numbered_chapters = []
unnumbered_chapters = []
for idx, chapter in enumerate(candidate_chapters):
# Yield periodically during categorization (can be disabled for max speed)
if idx % 10 == 0 and idx > 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1":
time.sleep(0.001)
if chapter["num"] is not None:
numbered_chapters.append(chapter)
else:
unnumbered_chapters.append(chapter)
print(f" • Files with chapter numbers: {len(numbered_chapters)}")
print(f" • Files without chapter numbers: {len(unnumbered_chapters)}")
# Check if we have hash-based filenames (no numbered chapters found)
if not numbered_chapters and unnumbered_chapters:
print(" ⚠️ No chapter numbers found - likely hash-based filenames")
print(" → Using file order as chapter sequence")
# Sort by file index to maintain order
unnumbered_chapters.sort(key=lambda x: x["file_index"])
# Assign sequential numbers
for i, chapter in enumerate(unnumbered_chapters, 1):
chapter["num"] = i
chapter["detection_method"] = f"{extraction_mode}_hash_filename_sequential" if extraction_mode == "enhanced" else "hash_filename_sequential"
if not chapter["title"] or chapter["title"] == chapter["original_basename"]:
chapter["title"] = f"Chapter {i}"
chapters = unnumbered_chapters
else:
# We have some numbered chapters
chapters = numbered_chapters
# For unnumbered files, check if they might be duplicates or appendices
if unnumbered_chapters:
print(f" → Analyzing {len(unnumbered_chapters)} unnumbered files...")
# Get the max chapter number
max_num = max(c["num"] for c in numbered_chapters)
# Check each unnumbered file
for chapter in unnumbered_chapters:
# Check stop in post-processing loop
if is_stop_requested():
print("❌ Chapter post-processing stopped by user")
return chapters, 'unknown'
# Check if it's very small (might be a separator or note)
if chapter["file_size"] < 200:
print(f" [SKIP] Very small file: {chapter['filename']} ({chapter['file_size']} chars)")
continue
# Check if it has similar size to existing chapters (might be duplicate)
size = chapter["file_size"]
similar_chapters = [c for c in numbered_chapters
if abs(c["file_size"] - size) < 50]
if similar_chapters:
# Might be a duplicate, skip it
print(f" [SKIP] Possible duplicate: {chapter['filename']} (similar size to {len(similar_chapters)} chapters)")
continue
# Otherwise, add as appendix
max_num += 1
chapter["num"] = max_num
chapter["detection_method"] = f"{extraction_mode}_appendix_sequential" if extraction_mode == "enhanced" else "appendix_sequential"
if not chapter["title"] or chapter["title"] == chapter["original_basename"]:
chapter["title"] = f"Appendix {max_num}"
chapters.append(chapter)
print(f" [ADD] Added as chapter {max_num}: {chapter['filename']}")
else:
# For other modes or smart mode with merging disabled
chapters = chapters_direct
# Sort chapters by number
chapters.sort(key=lambda x: x["num"])
# Ensure chapter numbers are integers
# When merging is disabled, all chapters should have integer numbers anyway
for chapter in chapters:
if isinstance(chapter["num"], float):
chapter["num"] = int(chapter["num"])
# Final validation
if chapters:
print(f"\n✅ Final chapter count: {len(chapters)}")
print(f" • Chapter range: {chapters[0]['num']} - {chapters[-1]['num']}")
# Enhanced mode summary
if extraction_mode == "enhanced":
enhanced_count = sum(1 for c in chapters if c.get('enhanced_extraction', False))
print(f" 🚀 Enhanced extraction used: {enhanced_count}/{len(chapters)} chapters")
# Check for gaps
chapter_nums = [c["num"] for c in chapters]
expected_nums = list(range(min(chapter_nums), max(chapter_nums) + 1))
missing = set(expected_nums) - set(chapter_nums)
if missing:
print(f" ⚠️ Missing chapter numbers: {sorted(missing)}")
# Language detection
combined_sample = ' '.join(sample_texts) if effective_mode == "smart" else ''
detected_language = self._detect_content_language(combined_sample) if combined_sample else 'unknown'
if chapters:
self._print_extraction_summary(chapters, detected_language, extraction_mode,
h1_count if effective_mode == "smart" else 0,
h2_count if effective_mode == "smart" else 0,
file_size_groups if effective_mode == "smart" else {})
return chapters, detected_language
def _extract_chapter_info(self, soup, file_path, content_text, html_content):
"""Extract chapter number and title from various sources with parallel pattern matching"""
chapter_num = None
chapter_title = None
detection_method = None
# SPECIAL HANDLING: When we have Section/Chapter pairs, differentiate them
filename = os.path.basename(file_path)
# Handle different naming patterns for Section/Chapter files
if ('section' in filename.lower() or '_section' in filename.lower()) and 'chapter' not in filename.lower():
# For Section files, add 0.1 to the base number
# Try different patterns
match = re.search(r'No(\d+)', filename)
if not match:
match = re.search(r'^(\d+)[_\-]', filename)
if not match:
match = re.search(r'^(\d+)', filename)
if match:
base_num = int(match.group(1))
chapter_num = base_num + 0.1 # Section gets .1
detection_method = "filename_section_special"
elif ('chapter' in filename.lower() or '_chapter' in filename.lower()) and 'section' not in filename.lower():
# For Chapter files, use the base number
# Try different patterns
match = re.search(r'No(\d+)', filename)
if not match:
match = re.search(r'^(\d+)[_\-]', filename)
if not match:
match = re.search(r'^(\d+)', filename)
if match:
chapter_num = int(match.group(1))
detection_method = "filename_chapter_special"
# If not handled by special logic, continue with normal extraction
if not chapter_num:
# Try filename first - use parallel pattern matching for better performance
chapter_patterns = [(pattern, flags, method) for pattern, flags, method in self.pattern_manager.CHAPTER_PATTERNS
if method.endswith('_number')]
if len(chapter_patterns) > 3: # Only parallelize if we have enough patterns
# Parallel pattern matching for filename
with ThreadPoolExecutor(max_workers=min(4, len(chapter_patterns))) as executor:
def try_pattern(pattern_info):
pattern, flags, method = pattern_info
match = re.search(pattern, file_path, flags)
if match:
try:
num_str = match.group(1)
if num_str.isdigit():
return int(num_str), f"filename_{method}"
elif method == 'chinese_chapter_cn':
converted = self._convert_chinese_number(num_str)
if converted:
return converted, f"filename_{method}"
except (ValueError, IndexError):
pass
return None, None
# Submit all patterns
futures = [executor.submit(try_pattern, pattern_info) for pattern_info in chapter_patterns]
# Check results as they complete
for future in as_completed(futures):
try:
num, method = future.result()
if num:
chapter_num = num
detection_method = method
# Cancel remaining futures
for f in futures:
f.cancel()
break
except Exception:
continue
else:
# Sequential processing for small pattern sets
for pattern, flags, method in chapter_patterns:
match = re.search(pattern, file_path, flags)
if match:
try:
num_str = match.group(1)
if num_str.isdigit():
chapter_num = int(num_str)
detection_method = f"filename_{method}"
break
elif method == 'chinese_chapter_cn':
converted = self._convert_chinese_number(num_str)
if converted:
chapter_num = converted
detection_method = f"filename_{method}"
break
except (ValueError, IndexError):
continue
# Try content if not found in filename
if not chapter_num:
# Check ignore settings for batch translation
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
# Prepare all text sources to check in parallel
text_sources = []
# Add title tag if not ignored
if not ignore_title_tag and soup.title and soup.title.string:
title_text = soup.title.string.strip()
text_sources.append(("title", title_text, True)) # True means this can be chapter_title
# Add headers if not ignored
if not ignore_header_tags:
for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
headers = soup.find_all(header_tag)
for header in headers[:3]: # Limit to first 3 of each type
header_text = header.get_text(strip=True)
if header_text:
text_sources.append((f"header_{header_tag}", header_text, True))
# Add first paragraphs
first_elements = soup.find_all(['p', 'div'])[:5]
for elem in first_elements:
elem_text = elem.get_text(strip=True)
if elem_text:
text_sources.append(("content", elem_text, False)) # False means don't use as chapter_title
# Process text sources in parallel if we have many
if len(text_sources) > 5:
with ThreadPoolExecutor(max_workers=min(6, len(text_sources))) as executor:
def extract_from_source(source_info):
source_type, text, can_be_title = source_info
num, method = self._extract_from_text(text, source_type)
return num, method, text if (num and can_be_title) else None
# Submit all text sources
future_to_source = {executor.submit(extract_from_source, source): source
for source in text_sources}
# Process results as they complete
for future in as_completed(future_to_source):
try:
num, method, title = future.result()
if num:
chapter_num = num
detection_method = method
if title and not chapter_title:
chapter_title = title
# Cancel remaining futures
for f in future_to_source:
f.cancel()
break
except Exception:
continue
else:
# Sequential processing for small text sets
for source_type, text, can_be_title in text_sources:
num, method = self._extract_from_text(text, source_type)
if num:
chapter_num = num
detection_method = method
if can_be_title and not chapter_title:
chapter_title = text
break
# Final fallback to filename patterns
if not chapter_num:
filename_base = os.path.basename(file_path)
# Parallel pattern matching for filename extraction
if len(self.pattern_manager.FILENAME_EXTRACT_PATTERNS) > 3:
with ThreadPoolExecutor(max_workers=min(4, len(self.pattern_manager.FILENAME_EXTRACT_PATTERNS))) as executor:
def try_filename_pattern(pattern):
match = re.search(pattern, filename_base, re.IGNORECASE)
if match:
try:
return int(match.group(1))
except (ValueError, IndexError):
pass
return None
futures = [executor.submit(try_filename_pattern, pattern)
for pattern in self.pattern_manager.FILENAME_EXTRACT_PATTERNS]
for future in as_completed(futures):
try:
num = future.result()
if num:
chapter_num = num
detection_method = "filename_number"
for f in futures:
f.cancel()
break
except Exception:
continue
else:
# Sequential for small pattern sets
for pattern in self.pattern_manager.FILENAME_EXTRACT_PATTERNS:
match = re.search(pattern, filename_base, re.IGNORECASE)
if match:
chapter_num = int(match.group(1))
detection_method = "filename_number"
break
# Extract title if not already found (with ignore settings support)
if not chapter_title:
# Check ignore settings for batch translation
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
# Try title tag if not ignored
if not ignore_title_tag and soup.title and soup.title.string:
chapter_title = soup.title.string.strip()
# Try header tags if not ignored and no title found
if not chapter_title and not ignore_header_tags:
for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
header = soup.find(header_tag)
if header:
chapter_title = header.get_text(strip=True)
break
# Final fallback
if not chapter_title:
chapter_title = f"Chapter {chapter_num}" if chapter_num else None
chapter_title = re.sub(r'\s+', ' ', chapter_title).strip() if chapter_title else None
return chapter_num, chapter_title, detection_method
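# Illustrative results of the special Section/Chapter handling above (hypothetical
# filenames): a Section file gets a .1 offset so it sorts just after its base number.
#
#   "No00014Section.xhtml" -> (14.1, title_or_None, "filename_section_special")
#   "No00014Chapter.xhtml" -> (14,   title_or_None, "filename_chapter_special")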
def _extract_from_text(self, text, source_type):
"""Extract chapter number from text using patterns with parallel matching for large pattern sets"""
# Get patterns that don't end with '_number'
text_patterns = [(pattern, flags, method) for pattern, flags, method in self.pattern_manager.CHAPTER_PATTERNS
if not method.endswith('_number')]
# Only use parallel processing if we have many patterns
if len(text_patterns) > 5:
with ThreadPoolExecutor(max_workers=min(4, len(text_patterns))) as executor:
def try_text_pattern(pattern_info):
pattern, flags, method = pattern_info
match = re.search(pattern, text, flags)
if match:
try:
num_str = match.group(1)
if num_str.isdigit():
return int(num_str), f"{source_type}_{method}"
elif method == 'chinese_chapter_cn':
converted = self._convert_chinese_number(num_str)
if converted:
return converted, f"{source_type}_{method}"
except (ValueError, IndexError):
pass
return None, None
# Submit all patterns
futures = [executor.submit(try_text_pattern, pattern_info) for pattern_info in text_patterns]
# Check results as they complete
for future in as_completed(futures):
try:
num, method = future.result()
if num:
# Cancel remaining futures
for f in futures:
f.cancel()
return num, method
except Exception:
continue
else:
# Sequential processing for small pattern sets
for pattern, flags, method in text_patterns:
match = re.search(pattern, text, flags)
if match:
try:
num_str = match.group(1)
if num_str.isdigit():
return int(num_str), f"{source_type}_{method}"
elif method == 'chinese_chapter_cn':
converted = self._convert_chinese_number(num_str)
if converted:
return converted, f"{source_type}_{method}"
except (ValueError, IndexError):
continue
return None, None
def _convert_chinese_number(self, cn_num):
"""Convert Chinese number to integer"""
if cn_num in self.pattern_manager.CHINESE_NUMS:
return self.pattern_manager.CHINESE_NUMS[cn_num]
if '十' in cn_num:
parts = cn_num.split('十')
if len(parts) == 2:
tens = self.pattern_manager.CHINESE_NUMS.get(parts[0], 1) if parts[0] else 1
ones = self.pattern_manager.CHINESE_NUMS.get(parts[1], 0) if parts[1] else 0
return tens * 10 + ones
return None
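# Worked examples of the 十-composition rule above (assuming the usual digit map in
# self.pattern_manager.CHINESE_NUMS):
#
#   '十'     -> 10   (1 * 10 + 0)
#   '十五'   -> 15   (1 * 10 + 5)
#   '二十'   -> 20   (2 * 10 + 0)
#   '二十三' -> 23   (2 * 10 + 3)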
def _detect_content_language(self, text_sample):
"""Detect the primary language of content with parallel processing for large texts"""
# For very short texts, use sequential processing
if len(text_sample) < 1000:
scripts = {
'korean': 0,
'japanese_hiragana': 0,
'japanese_katakana': 0,
'chinese': 0,
'latin': 0
}
for char in text_sample:
code = ord(char)
if 0xAC00 <= code <= 0xD7AF:
scripts['korean'] += 1
elif 0x3040 <= code <= 0x309F:
scripts['japanese_hiragana'] += 1
elif 0x30A0 <= code <= 0x30FF:
scripts['japanese_katakana'] += 1
elif 0x4E00 <= code <= 0x9FFF:
scripts['chinese'] += 1
elif 0x0020 <= code <= 0x007F:
scripts['latin'] += 1
else:
# For longer texts, use parallel processing
# Split text into chunks for parallel processing
chunk_size = max(500, len(text_sample) // (os.cpu_count() or 4))
chunks = [text_sample[i:i + chunk_size] for i in range(0, len(text_sample), chunk_size)]
# Thread-safe accumulator
scripts_lock = threading.Lock()
scripts = {
'korean': 0,
'japanese_hiragana': 0,
'japanese_katakana': 0,
'chinese': 0,
'latin': 0
}
def process_chunk(text_chunk):
"""Process a chunk of text and return script counts"""
local_scripts = {
'korean': 0,
'japanese_hiragana': 0,
'japanese_katakana': 0,
'chinese': 0,
'latin': 0
}
for char in text_chunk:
code = ord(char)
if 0xAC00 <= code <= 0xD7AF:
local_scripts['korean'] += 1
elif 0x3040 <= code <= 0x309F:
local_scripts['japanese_hiragana'] += 1
elif 0x30A0 <= code <= 0x30FF:
local_scripts['japanese_katakana'] += 1
elif 0x4E00 <= code <= 0x9FFF:
local_scripts['chinese'] += 1
elif 0x0020 <= code <= 0x007F:
local_scripts['latin'] += 1
return local_scripts
# Process chunks in parallel
with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 4, len(chunks))) as executor:
# Submit all chunks
futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
# Collect results
for future in as_completed(futures):
try:
chunk_scripts = future.result()
# Thread-safe accumulation
with scripts_lock:
for script, count in chunk_scripts.items():
scripts[script] += count
except Exception as e:
print(f"[WARNING] Error processing chunk in language detection: {e}")
# Language determination logic (same as original)
total_cjk = scripts['korean'] + scripts['japanese_hiragana'] + scripts['japanese_katakana'] + scripts['chinese']
if scripts['korean'] > total_cjk * 0.3:
return 'korean'
elif scripts['japanese_hiragana'] + scripts['japanese_katakana'] > total_cjk * 0.2:
return 'japanese'
elif scripts['chinese'] > total_cjk * 0.3:
return 'chinese'
elif scripts['latin'] > len(text_sample) * 0.7:
return 'english'
else:
return 'unknown'
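# Threshold summary for the decision above, with total_cjk = korean + hiragana + katakana + chinese:
#   korean  > 30% of total_cjk          -> 'korean'
#   kana    > 20% of total_cjk          -> 'japanese'
#   chinese > 30% of total_cjk          -> 'chinese'
#   latin   > 70% of the whole sample   -> 'english'
#   otherwise                           -> 'unknown'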
def _print_extraction_summary(self, chapters, detected_language, extraction_mode, h1_count, h2_count, file_size_groups):
"""Print extraction summary"""
print(f"\n📊 Chapter Extraction Summary ({extraction_mode.capitalize()} Mode):")
print(f" • Total chapters extracted: {len(chapters)}")
# Format chapter range handling both int and float
first_num = chapters[0]['num']
last_num = chapters[-1]['num']
print(f" • Chapter range: {first_num} to {last_num}")
print(f" • Detected language: {detected_language}")
if extraction_mode == "smart":
print(f" • Primary header type: {'<h2>' if h2_count > h1_count else '<h1>'}")
image_only_count = sum(1 for c in chapters if c.get('is_image_only', False))
text_only_count = sum(1 for c in chapters if not c.get('has_images', False) and c.get('file_size', 0) >= 500)
mixed_count = sum(1 for c in chapters if c.get('has_images', False) and c.get('file_size', 0) >= 500)
empty_count = sum(1 for c in chapters if c.get('file_size', 0) < 50)
print(f" • Text-only chapters: {text_only_count}")
print(f" • Image-only chapters: {image_only_count}")
print(f" • Mixed content chapters: {mixed_count}")
print(f" • Empty/minimal content: {empty_count}")
# Check for merged chapters
merged_count = sum(1 for c in chapters if c.get('was_merged', False))
if merged_count > 0:
print(f" • Merged chapters: {merged_count}")
# Check for missing chapters (only for integer sequences)
expected_chapters = set(range(chapters[0]['num'], chapters[-1]['num'] + 1))
actual_chapters = set(c['num'] for c in chapters)
missing = expected_chapters - actual_chapters
if missing:
print(f" ⚠️ Missing chapter numbers: {sorted(missing)}")
if extraction_mode == "smart":
method_stats = Counter(c['detection_method'] for c in chapters)
print(f" 📈 Detection methods used:")
for method, count in method_stats.most_common():
print(f" • {method}: {count} chapters")
large_groups = [size for size, files in file_size_groups.items() if len(files) > 1]
if large_groups:
print(f" ⚠️ Found {len(large_groups)} file size groups with potential duplicates")
else:
print(f" • Empty/placeholder: {empty_count}")
if extraction_mode == "full":
print(f" 🔍 Full extraction preserved all HTML structure and tags")
def _extract_epub_metadata(self, zf):
"""Extract comprehensive metadata from EPUB file including all custom fields"""
meta = {}
# Use lxml for XML if available
xml_parser = 'lxml-xml' if self.parser == 'lxml' else 'xml'
try:
for name in zf.namelist():
if name.lower().endswith('.opf'):
opf_content = zf.read(name)
soup = BeautifulSoup(opf_content, xml_parser)
# Extract ALL Dublin Core elements (expanded list)
dc_elements = ['title', 'creator', 'subject', 'description',
'publisher', 'contributor', 'date', 'type',
'format', 'identifier', 'source', 'language',
'relation', 'coverage', 'rights']
for element in dc_elements:
tag = soup.find(element)
if tag and tag.get_text(strip=True):
meta[element] = tag.get_text(strip=True)
# Extract ALL meta tags (not just series)
meta_tags = soup.find_all('meta')
for meta_tag in meta_tags:
# Try different attribute names for the metadata name
name = meta_tag.get('name') or meta_tag.get('property', '')
content = meta_tag.get('content', '')
if name and content:
# Store original name for debugging
original_name = name
# Clean up common prefixes
if name.startswith('calibre:'):
name = name[8:] # Remove 'calibre:' prefix
elif name.startswith('dc:'):
name = name[3:] # Remove 'dc:' prefix
elif name.startswith('opf:'):
name = name[4:] # Remove 'opf:' prefix
# Normalize the field name - replace hyphens with underscores
name = name.replace('-', '_')
# Don't overwrite if already exists (prefer direct tags over meta tags)
if name not in meta:
meta[name] = content
# Debug output for custom fields
if original_name != name:
print(f" • Found custom field: {original_name} → {name}")
# Special handling for series information (maintain compatibility)
if 'series' not in meta:
series_tags = soup.find_all('meta', attrs={'name': lambda x: x and 'series' in x.lower()})
for series_tag in series_tags:
series_name = series_tag.get('content', '')
if series_name:
meta['series'] = series_name
break
# Extract refines metadata (used by some EPUB creators)
refines_metas = soup.find_all('meta', attrs={'refines': True})
for refine in refines_metas:
property_name = refine.get('property', '')
content = refine.get_text(strip=True) or refine.get('content', '')
if property_name and content:
# Clean property name
if ':' in property_name:
property_name = property_name.split(':')[-1]
property_name = property_name.replace('-', '_')
if property_name not in meta:
meta[property_name] = content
# Log extraction summary
print(f"📋 Extracted {len(meta)} metadata fields")
# Show standard vs custom fields
standard_keys = {'title', 'creator', 'language', 'subject', 'description',
'publisher', 'date', 'identifier', 'source', 'rights',
'contributor', 'type', 'format', 'relation', 'coverage'}
custom_keys = set(meta.keys()) - standard_keys
if custom_keys:
print(f"📋 Standard fields: {len(standard_keys & set(meta.keys()))}")
print(f"📋 Custom fields found: {sorted(custom_keys)}")
# Show sample values for custom fields (truncated)
for key in sorted(custom_keys)[:5]: # Show first 5 custom fields
value = str(meta[key])
if len(value) > 50:
value = value[:47] + "..."
print(f" • {key}: {value}")
if len(custom_keys) > 5:
print(f" • ... and {len(custom_keys) - 5} more custom fields")
break
except Exception as e:
print(f"[WARNING] Failed to extract metadata: {e}")
import traceback
traceback.print_exc()
return meta
def _categorize_resource(self, file_path, file_name):
"""Categorize a file and return (resource_type, target_dir, safe_filename)"""
file_path_lower = file_path.lower()
file_name_lower = file_name.lower()
if file_path_lower.endswith('.css'):
return 'css', 'css', sanitize_resource_filename(file_name)
elif file_path_lower.endswith(('.ttf', '.otf', '.woff', '.woff2', '.eot')):
return 'fonts', 'fonts', sanitize_resource_filename(file_name)
elif file_path_lower.endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg', '.bmp', '.webp')):
return 'images', 'images', sanitize_resource_filename(file_name)
elif (file_path_lower.endswith(('.opf', '.ncx')) or
file_name_lower == 'container.xml' or
'container.xml' in file_path_lower):
if 'container.xml' in file_path_lower:
safe_filename = 'container.xml'
else:
safe_filename = file_name
return 'epub_structure', None, safe_filename
elif file_path_lower.endswith(('.js', '.xml', '.txt')):
return 'other', None, sanitize_resource_filename(file_name)
return None
def _cleanup_old_resources(self, output_dir):
"""Clean up old resource directories and EPUB structure files"""
print("🧹 Cleaning up any existing resource directories...")
cleanup_success = True
for resource_type in ['css', 'fonts', 'images']:
resource_dir = os.path.join(output_dir, resource_type)
if os.path.exists(resource_dir):
try:
shutil.rmtree(resource_dir)
print(f" 🗑️ Removed old {resource_type} directory")
except PermissionError as e:
print(f" ⚠️ Cannot remove {resource_type} directory (permission denied) - will merge with existing files")
cleanup_success = False
except Exception as e:
print(f" ⚠️ Error removing {resource_type} directory: {e} - will merge with existing files")
cleanup_success = False
epub_structure_files = ['container.xml', 'content.opf', 'toc.ncx']
for epub_file in epub_structure_files:
input_path = os.path.join(output_dir, epub_file)
if os.path.exists(input_path):
try:
os.remove(input_path)
print(f" 🗑️ Removed old {epub_file}")
except PermissionError:
print(f" ⚠️ Cannot remove {epub_file} (permission denied) - will use existing file")
except Exception as e:
print(f" ⚠️ Error removing {epub_file}: {e}")
try:
for file in os.listdir(output_dir):
if file.lower().endswith(('.opf', '.ncx')):
file_path = os.path.join(output_dir, file)
try:
os.remove(file_path)
print(f" 🗑️ Removed old EPUB file: {file}")
except PermissionError:
print(f" ⚠️ Cannot remove {file} (permission denied)")
except Exception as e:
print(f" ⚠️ Error removing {file}: {e}")
except Exception as e:
print(f"⚠️ Error scanning for EPUB files: {e}")
if not cleanup_success:
print("⚠️ Some cleanup operations failed due to file permissions")
print(" The program will continue and merge with existing files")
return cleanup_success
def _count_existing_resources(self, output_dir, extracted_resources):
"""Count existing resources when skipping extraction"""
for resource_type in ['css', 'fonts', 'images', 'epub_structure']:
if resource_type == 'epub_structure':
epub_files = []
for file in ['container.xml', 'content.opf', 'toc.ncx']:
if os.path.exists(os.path.join(output_dir, file)):
epub_files.append(file)
try:
for file in os.listdir(output_dir):
if file.lower().endswith(('.opf', '.ncx')) and file not in epub_files:
epub_files.append(file)
except:
pass
extracted_resources[resource_type] = epub_files
else:
resource_dir = os.path.join(output_dir, resource_type)
if os.path.exists(resource_dir):
try:
files = [f for f in os.listdir(resource_dir) if os.path.isfile(os.path.join(resource_dir, f))]
extracted_resources[resource_type] = files
except:
extracted_resources[resource_type] = []
total_existing = sum(len(files) for files in extracted_resources.values())
print(f"✅ Found {total_existing} existing resource files")
return extracted_resources
def _validate_critical_files(self, output_dir, extracted_resources):
"""Validate that critical EPUB files were extracted"""
total_extracted = sum(len(files) for files in extracted_resources.values())
print(f"✅ Extracted {total_extracted} resource files:")
for resource_type, files in extracted_resources.items():
if files:
if resource_type == 'epub_structure':
print(f" • EPUB Structure: {len(files)} files")
for file in files:
print(f" - {file}")
else:
print(f" • {resource_type.title()}: {len(files)} files")
critical_files = ['container.xml']
missing_critical = [f for f in critical_files if not os.path.exists(os.path.join(output_dir, f))]
if missing_critical:
print(f"⚠️ WARNING: Missing critical EPUB files: {missing_critical}")
print(" This may prevent proper EPUB reconstruction!")
else:
print("✅ All critical EPUB structure files extracted successfully")
opf_files = [f for f in extracted_resources['epub_structure'] if f.lower().endswith('.opf')]
if not opf_files:
print("⚠️ WARNING: No OPF file found! This will prevent EPUB reconstruction.")
else:
print(f"✅ Found OPF file(s): {opf_files}")
def _create_extraction_report(self, output_dir, metadata, chapters, extracted_resources):
"""Create comprehensive extraction report with HTML file tracking"""
report_path = os.path.join(output_dir, 'extraction_report.txt')
with open(report_path, 'w', encoding='utf-8') as f:
f.write("EPUB Extraction Report\n")
f.write("=" * 50 + "\n\n")
f.write(f"EXTRACTION MODE: {metadata.get('extraction_mode', 'unknown').upper()}\n\n")
f.write("METADATA:\n")
for key, value in metadata.items():
if key not in ['chapter_titles', 'extracted_resources', 'extraction_mode']:
f.write(f" {key}: {value}\n")
f.write(f"\nCHAPTERS ({len(chapters)}):\n")
text_chapters = []
image_only_chapters = []
mixed_chapters = []
for chapter in chapters:
if chapter.get('has_images') and chapter.get('file_size', 0) < 500:
image_only_chapters.append(chapter)
elif chapter.get('has_images') and chapter.get('file_size', 0) >= 500:
mixed_chapters.append(chapter)
else:
text_chapters.append(chapter)
if text_chapters:
f.write(f"\n TEXT CHAPTERS ({len(text_chapters)}):\n")
for c in text_chapters:
f.write(f" {c['num']:3d}. {c['title']} ({c['detection_method']})\n")
if c.get('original_html_file'):
f.write(f" → {c['original_html_file']}\n")
if image_only_chapters:
f.write(f"\n IMAGE-ONLY CHAPTERS ({len(image_only_chapters)}):\n")
for c in image_only_chapters:
f.write(f" {c['num']:3d}. {c['title']} (images: {c.get('image_count', 0)})\n")
if c.get('original_html_file'):
f.write(f" → {c['original_html_file']}\n")
if 'body' in c:
try:
soup = BeautifulSoup(c['body'], 'html.parser')
images = soup.find_all('img')
for img in images[:3]:
src = img.get('src', 'unknown')
f.write(f" • Image: {src}\n")
if len(images) > 3:
f.write(f" • ... and {len(images) - 3} more images\n")
except:
pass
if mixed_chapters:
f.write(f"\n MIXED CONTENT CHAPTERS ({len(mixed_chapters)}):\n")
for c in mixed_chapters:
f.write(f" {c['num']:3d}. {c['title']} (text: {c.get('file_size', 0)} chars, images: {c.get('image_count', 0)})\n")
if c.get('original_html_file'):
f.write(f" → {c['original_html_file']}\n")
f.write(f"\nRESOURCES EXTRACTED:\n")
for resource_type, files in extracted_resources.items():
if files:
if resource_type == 'epub_structure':
f.write(f" EPUB Structure: {len(files)} files\n")
for file in files:
f.write(f" - {file}\n")
else:
f.write(f" {resource_type.title()}: {len(files)} files\n")
for file in files[:5]:
f.write(f" - {file}\n")
if len(files) > 5:
f.write(f" ... and {len(files) - 5} more\n")
f.write(f"\nHTML FILES WRITTEN:\n")
html_files_written = metadata.get('html_files_written', 0)
f.write(f" Total: {html_files_written} files\n")
f.write(f" Location: Main directory and 'originals' subdirectory\n")
f.write(f"\nPOTENTIAL ISSUES:\n")
issues = []
if image_only_chapters:
issues.append(f" • {len(image_only_chapters)} chapters contain only images (may need OCR)")
missing_html = sum(1 for c in chapters if not c.get('original_html_file'))
if missing_html > 0:
issues.append(f" • {missing_html} chapters failed to write HTML files")
if not extracted_resources.get('epub_structure'):
issues.append(" • No EPUB structure files found (may affect reconstruction)")
if not issues:
f.write(" None detected - extraction appears successful!\n")
else:
for issue in issues:
f.write(issue + "\n")
print(f"📄 Saved extraction report to: {report_path}")
def _log_extraction_summary(self, chapters, extracted_resources, detected_language, html_files_written=0):
"""Log final extraction summary with HTML file information"""
extraction_mode = chapters[0].get('extraction_mode', 'unknown') if chapters else 'unknown'
print(f"\n✅ {extraction_mode.capitalize()} extraction complete!")
print(f" 📚 Chapters: {len(chapters)}")
print(f" 📄 HTML files written: {html_files_written}")
print(f" 🎨 Resources: {sum(len(files) for files in extracted_resources.values())}")
print(f" 🌍 Language: {detected_language}")
image_only_count = sum(1 for c in chapters if c.get('has_images') and c.get('file_size', 0) < 500)
if image_only_count > 0:
print(f" 📸 Image-only chapters: {image_only_count}")
epub_files = extracted_resources.get('epub_structure', [])
if epub_files:
print(f" 📋 EPUB Structure: {len(epub_files)} files ({', '.join(epub_files)})")
else:
print(f" ⚠️ No EPUB structure files extracted!")
print(f"\n🔍 Pre-flight check readiness:")
print(f" ✅ HTML files: {'READY' if html_files_written > 0 else 'NOT READY'}")
print(f" ✅ Metadata: READY")
print(f" ✅ Resources: READY")
# =====================================================
# UNIFIED TRANSLATION PROCESSOR
# =====================================================
class TranslationProcessor:
"""Handles the translation of individual chapters"""
def __init__(self, config, client, out_dir, log_callback=None, stop_callback=None, uses_zero_based=False, is_text_file=False):
self.config = config
self.client = client
self.out_dir = out_dir
self.log_callback = log_callback
self.stop_callback = stop_callback
self.chapter_splitter = ChapterSplitter(model_name=config.MODEL)
self.uses_zero_based = uses_zero_based
self.is_text_file = is_text_file
# Check and log multi-key status
if hasattr(self.client, 'use_multi_keys') and self.client.use_multi_keys:
stats = self.client.get_stats()
self._log(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys")
self._log(f" Active keys: {stats.get('active_keys', 0)}")
def _log(self, message):
"""Log a message"""
if self.log_callback:
self.log_callback(message)
else:
print(message)
def report_key_status(self):
"""Report multi-key status if available"""
if hasattr(self.client, 'get_stats'):
stats = self.client.get_stats()
if stats.get('multi_key_mode', False):
self._log(f"\n📊 API Key Status:")
self._log(f" Active Keys: {stats.get('active_keys', 0)}/{stats.get('total_keys', 0)}")
self._log(f" Success Rate: {stats.get('success_rate', 0):.1%}")
self._log(f" Total Requests: {stats.get('total_requests', 0)}\n")
def check_stop(self):
"""Check if translation should stop"""
if self.stop_callback and self.stop_callback():
print("❌ Translation stopped by user request.")
return True
return False
def check_duplicate_content(self, result, idx, prog, out, actual_num=None):
"""Check if translated content is duplicate - with mode selection"""
# Get detection mode from config
detection_mode = getattr(self.config, 'DUPLICATE_DETECTION_MODE', 'basic')
print(f" 🔍 DEBUG: Detection mode = '{detection_mode}'")
print(f" 🔍 DEBUG: Lookback chapters = {self.config.DUPLICATE_LOOKBACK_CHAPTERS}")
# Extract content_hash if available from progress
content_hash = None
if detection_mode == 'ai-hunter':
# Try to get content_hash from the current chapter info
# Use actual_num if provided, otherwise fallback to idx+1
if actual_num is not None:
chapter_key = str(actual_num)
else:
chapter_key = str(idx + 1)
if chapter_key in prog.get("chapters", {}):
chapter_info = prog["chapters"][chapter_key]
content_hash = chapter_info.get("content_hash")
print(f" 🔍 DEBUG: Found content_hash for chapter {idx}: {content_hash}")
if detection_mode == 'ai-hunter':
print(" 🤖 DEBUG: Routing to AI Hunter detection...")
# Check if AI Hunter method is available (injected by the wrapper)
if hasattr(self, '_check_duplicate_ai_hunter'):
return self._check_duplicate_ai_hunter(result, idx, prog, out, content_hash)
else:
print(" ⚠️ AI Hunter method not available, falling back to basic detection")
return self._check_duplicate_basic(result, idx, prog, out)
elif detection_mode == 'cascading':
print(" 🔄 DEBUG: Routing to Cascading detection...")
return self._check_duplicate_cascading(result, idx, prog, out)
else:
print(" 📋 DEBUG: Routing to Basic detection...")
return self._check_duplicate_basic(result, idx, prog, out)
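# Basic detection compares the first 1,000 characters of the stripped, lowercased output against
# the last DUPLICATE_LOOKBACK_CHAPTERS outputs on disk via difflib.SequenceMatcher and flags
# anything at or above 85% similarity.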
def _check_duplicate_basic(self, result, idx, prog, out):
"""Original basic duplicate detection"""
try:
result_clean = re.sub(r'<[^>]+>', '', result).strip().lower()
result_sample = result_clean[:1000]
lookback_chapters = self.config.DUPLICATE_LOOKBACK_CHAPTERS
for prev_idx in range(max(0, idx - lookback_chapters), idx):
prev_key = str(prev_idx)
if prev_key in prog["chapters"] and prog["chapters"][prev_key].get("output_file"):
prev_file = prog["chapters"][prev_key]["output_file"]
prev_path = os.path.join(out, prev_file)
if os.path.exists(prev_path):
try:
with open(prev_path, 'r', encoding='utf-8') as f:
prev_content = f.read()
prev_clean = re.sub(r'<[^>]+>', '', prev_content).strip().lower()
prev_sample = prev_clean[:1000]
# Use SequenceMatcher for similarity comparison
similarity = SequenceMatcher(None, result_sample, prev_sample).ratio()
if similarity >= 0.85: # 85% threshold
print(f" 🚀 Basic detection: Duplicate found ({int(similarity*100)}%)")
return True, int(similarity * 100)
except Exception as e:
print(f" Warning: Failed to read {prev_path}: {e}")
continue
return False, 0
except Exception as e:
print(f" Warning: Failed to check duplicate content: {e}")
return False, 0
def _check_duplicate_cascading(self, result, idx, prog, out):
"""Cascading detection - basic first, then AI Hunter for borderline cases"""
# Step 1: Basic
is_duplicate_basic, similarity_basic = self._check_duplicate_basic(result, idx, prog, out)
if is_duplicate_basic:
return True, similarity_basic
# Step 2: If basic detection finds moderate similarity, use AI Hunter
if similarity_basic >= 60: # Borderline-similarity threshold (currently hardcoded at 60%)
print(f" 🤖 Moderate similarity ({similarity_basic}%) - running AI Hunter analysis...")
if hasattr(self, '_check_duplicate_ai_hunter'):
is_duplicate_ai, similarity_ai = self._check_duplicate_ai_hunter(result, idx, prog, out)
if is_duplicate_ai:
return True, similarity_ai
else:
print(" ⚠️ AI Hunter method not available for cascading analysis")
return False, max(similarity_basic, 0)
def _extract_text_features(self, text):
"""Extract multiple features from text for AI Hunter analysis"""
features = {
'semantic': {},
'structural': {},
'characters': [],
'patterns': {}
}
# Build semantic, structural, character, and pattern features directly from the raw text
# Character extraction (names that appear 3+ times)
words = re.findall(r'\b[A-Z][a-z]+\b', text)
word_freq = Counter(words)
features['characters'] = [name for name, count in word_freq.items() if count >= 3]
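# Example: a name like "Mina" appearing five times qualifies as a character, while capitalized
# sentence-starters that occur only once or twice are filtered out by the count >= 3 rule.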
# Dialogue patterns
dialogue_patterns = re.findall(r'"([^"]+)"', text)
features['semantic']['dialogue_count'] = len(dialogue_patterns)
features['semantic']['dialogue_lengths'] = [len(d) for d in dialogue_patterns[:10]]
# Speaker patterns
speaker_patterns = re.findall(r'(\w+)\s+(?:said|asked|replied|shouted|whispered)', text.lower())
features['semantic']['speakers'] = list(set(speaker_patterns[:20]))
# Number extraction
numbers = re.findall(r'\b\d+\b', text)
features['patterns']['numbers'] = numbers[:20]
# Structural signature
para_lengths = []
dialogue_count = 0
for para in text.split('\n\n'):
if para.strip():
para_lengths.append(len(para))
if '"' in para:
dialogue_count += 1
features['structural']['para_count'] = len(para_lengths)
features['structural']['avg_para_length'] = sum(para_lengths) / max(1, len(para_lengths))
features['structural']['dialogue_ratio'] = dialogue_count / max(1, len(para_lengths))
# Create structural pattern string
pattern = []
for para in text.split('\n\n')[:20]: # First 20 paragraphs
if para.strip():
if '"' in para:
pattern.append('D') # Dialogue
elif len(para) > 300:
pattern.append('L') # Long
elif len(para) < 100:
pattern.append('S') # Short
else:
pattern.append('M') # Medium
features['structural']['pattern'] = ''.join(pattern)
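# Example: a chapter opening with a dialogue paragraph, a long description, a short beat and a
# medium paragraph produces the pattern string "DLSM".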
return features
def _calculate_exact_similarity(self, text1, text2):
"""Calculate exact text similarity"""
return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
def _calculate_smart_similarity(self, text1, text2):
"""Smart similarity with length-aware sampling"""
# Check length ratio first
len_ratio = len(text1) / max(1, len(text2))
if len_ratio < 0.7 or len_ratio > 1.3:
return 0.0
# Smart sampling for large texts
if len(text1) > 10000:
sample_size = 3000
samples1 = [
text1[:sample_size],
text1[len(text1)//2 - sample_size//2:len(text1)//2 + sample_size//2],
text1[-sample_size:]
]
samples2 = [
text2[:sample_size],
text2[len(text2)//2 - sample_size//2:len(text2)//2 + sample_size//2],
text2[-sample_size:]
]
similarities = [SequenceMatcher(None, s1.lower(), s2.lower()).ratio()
for s1, s2 in zip(samples1, samples2)]
return sum(similarities) / len(similarities)
else:
# Use first 2000 chars for smaller texts
return SequenceMatcher(None, text1[:2000].lower(), text2[:2000].lower()).ratio()
def _calculate_semantic_similarity(self, sem1, sem2):
"""Calculate semantic fingerprint similarity"""
score = 0.0
max_score = 0.0
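# Each feature group present in both fingerprints adds 1.0 to max_score; the weighted sub-scores
# accumulated below are normalized by it at the end.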
# Compare dialogue counts
if 'dialogue_count' in sem1 and 'dialogue_count' in sem2:
max_score += 1.0
ratio = min(sem1['dialogue_count'], sem2['dialogue_count']) / max(1, max(sem1['dialogue_count'], sem2['dialogue_count']))
score += ratio * 0.3
# Compare speakers
if 'speakers' in sem1 and 'speakers' in sem2:
max_score += 1.0
if sem1['speakers'] and sem2['speakers']:
overlap = len(set(sem1['speakers']) & set(sem2['speakers']))
total = len(set(sem1['speakers']) | set(sem2['speakers']))
score += (overlap / max(1, total)) * 0.4
# Compare dialogue lengths pattern
if 'dialogue_lengths' in sem1 and 'dialogue_lengths' in sem2:
max_score += 1.0
if sem1['dialogue_lengths'] and sem2['dialogue_lengths']:
# Compare dialogue length patterns
len1 = sem1['dialogue_lengths'][:10]
len2 = sem2['dialogue_lengths'][:10]
if len1 and len2:
avg1 = sum(len1) / len(len1)
avg2 = sum(len2) / len(len2)
ratio = min(avg1, avg2) / max(1, max(avg1, avg2))
score += ratio * 0.3
return score / max(1, max_score)
def _calculate_structural_similarity(self, struct1, struct2):
"""Calculate structural signature similarity"""
score = 0.0
# Compare paragraph patterns
if 'pattern' in struct1 and 'pattern' in struct2:
pattern_sim = SequenceMatcher(None, struct1['pattern'], struct2['pattern']).ratio()
score += pattern_sim * 0.4
# Compare paragraph statistics
if all(k in struct1 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']) and \
all(k in struct2 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']):
# Paragraph count ratio
para_ratio = min(struct1['para_count'], struct2['para_count']) / max(1, max(struct1['para_count'], struct2['para_count']))
score += para_ratio * 0.2
# Average length ratio
avg_ratio = min(struct1['avg_para_length'], struct2['avg_para_length']) / max(1, max(struct1['avg_para_length'], struct2['avg_para_length']))
score += avg_ratio * 0.2
# Dialogue ratio similarity
dialogue_diff = abs(struct1['dialogue_ratio'] - struct2['dialogue_ratio'])
score += (1 - dialogue_diff) * 0.2
return score
def _calculate_character_similarity(self, chars1, chars2):
"""Calculate character name similarity"""
if not chars1 or not chars2:
return 0.0
# Find overlapping characters
set1 = set(chars1)
set2 = set(chars2)
overlap = len(set1 & set2)
total = len(set1 | set2)
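# Jaccard index: size of the intersection over size of the union of the two name sets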
return overlap / max(1, total)
def _calculate_pattern_similarity(self, pat1, pat2):
"""Calculate pattern-based similarity"""
score = 0.0
# Compare numbers (they rarely change in translations)
if 'numbers' in pat1 and 'numbers' in pat2:
nums1 = set(pat1['numbers'])
nums2 = set(pat2['numbers'])
if nums1 and nums2:
overlap = len(nums1 & nums2)
total = len(nums1 | nums2)
score = overlap / max(1, total)
return score
def generate_rolling_summary(self, history_manager, chapter_num, base_system_content=None, source_text=None):
"""Generate rolling summary after a chapter for context continuity.
Uses a dedicated summary system prompt (with glossary) distinct from translation.
Writes the summary to rolling_summary.txt and returns the summary string.
"""
if not self.config.USE_ROLLING_SUMMARY:
return None
current_history = history_manager.load_history()
messages_to_include = self.config.ROLLING_SUMMARY_EXCHANGES * 2
# Prefer directly provided source text (e.g., just-translated chapter) when available
assistant_responses = []
if source_text and isinstance(source_text, str) and source_text.strip():
assistant_responses = [source_text]
else:
if len(current_history) >= 2:
recent_messages = current_history[-messages_to_include:] if messages_to_include > 0 else current_history
for h in recent_messages:
if h.get("role") == "assistant":
assistant_responses.append(h["content"])
# If still empty, skip quietly
if not assistant_responses:
return None
# Build a dedicated summary system prompt (do NOT reuse main translation system prompt)
# Append glossary to keep terminology consistent
summary_system_template = os.getenv("ROLLING_SUMMARY_SYSTEM_PROMPT", "You create concise summaries for continuity.").strip()
try:
glossary_path = find_glossary_file(self.out_dir)
except Exception:
glossary_path = None
system_prompt = build_system_prompt(summary_system_template, glossary_path)
# Add explicit instruction for clarity
system_prompt += "\n\n[Instruction: Generate a concise rolling summary of the previous chapter. Use glossary terms consistently. Do not include warnings or explanations.]"
user_prompt_template = os.getenv(
"ROLLING_SUMMARY_USER_PROMPT",
"Summarize the key events, characters, tone, and important details from these translations. "
"Focus on: character names/relationships, plot developments, and any special terminology used.\n\n"
"{translations}"
)
translations_text = "\n---\n".join(assistant_responses)
user_prompt = user_prompt_template.replace("{translations}", translations_text)
summary_msgs = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": f"[Rolling Summary of Chapter {chapter_num}]\n" + user_prompt}
]
try:
summary_resp, _ = send_with_interrupt(
summary_msgs, self.client, self.config.TEMP,
min(2000, self.config.MAX_OUTPUT_TOKENS),
self.check_stop,
context='summary'
)
# Save the summary to the output folder
summary_file = os.path.join(self.out_dir, "rolling_summary.txt")
header = f"=== Rolling Summary of Chapter {chapter_num} ===\n(This is a summary of the previous chapter for context)\n"
mode = "a" if self.config.ROLLING_SUMMARY_MODE == "append" else "w"
with open(summary_file, mode, encoding="utf-8") as sf:
if mode == "a":
sf.write("\n\n")
sf.write(header)
sf.write(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}]\n")
sf.write(summary_resp.strip())
# If in append mode, trim to retain only the last N entries if configured
try:
if self.config.ROLLING_SUMMARY_MODE == "append":
max_entries = int(getattr(self.config, "ROLLING_SUMMARY_MAX_ENTRIES", 0) or 0)
if max_entries > 0:
with open(summary_file, 'r', encoding='utf-8') as rf:
content = rf.read()
# Find the start of each summary block by header line
headers = [m.start() for m in re.finditer(r"(?m)^===\s*Rolling Summary.*$", content)]
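# Each summary block starts with a header line like "=== Rolling Summary of Chapter 12 ===",
# so these offsets mark the beginning of every stored entry.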
if len(headers) > max_entries:
# Keep only the last max_entries blocks
keep_starts = headers[-max_entries:]
blocks = []
for i, s in enumerate(keep_starts):
e = keep_starts[i + 1] if i + 1 < len(keep_starts) else len(content)
block = content[s:e].strip()
if block:
blocks.append(block)
trimmed_content = ("\n\n".join(blocks) + "\n") if blocks else ""
with open(summary_file, 'w', encoding='utf-8') as wf:
wf.write(trimmed_content)
# Optional log showing retained count
try:
self._log(f"📚 Total summaries in memory: {len(blocks)} (trimmed to last {max_entries})")
except Exception:
pass
except Exception as _trim_err:
try:
self._log(f"⚠️ Failed to trim rolling summaries: {_trim_err}")
except Exception:
pass
# Log to GUI if available, otherwise console
try:
self._log(f"📝 Generated rolling summary for Chapter {chapter_num} ({'append' if mode=='a' else 'replace'} mode)")
self._log(f" ➜ Saved to: {summary_file} ({len(summary_resp.strip())} chars)")
except Exception:
print(f"📝 Generated rolling summary for Chapter {chapter_num} ({'append' if mode=='a' else 'replace'} mode)")
print(f" ➜ Saved to: {summary_file} ({len(summary_resp.strip())} chars)")
return summary_resp.strip()
except Exception as e:
try:
self._log(f"⚠️ Failed to generate rolling summary: {e}")
except Exception:
print(f"⚠️ Failed to generate rolling summary: {e}")
return None
def translate_with_retry(self, msgs, chunk_html, c, chunk_idx, total_chunks):
"""Handle translation with retry logic"""
# CRITICAL FIX: Reset client state for each chunk
if hasattr(self.client, 'reset_cleanup_state'):
self.client.reset_cleanup_state()
# Also ensure we're not in cleanup mode from previous operations
if hasattr(self.client, '_in_cleanup'):
self.client._in_cleanup = False
if hasattr(self.client, '_cancelled'):
self.client._cancelled = False
retry_count = 0
# Get retry attempts from AI Hunter config if available
ai_config = {}
try:
# Try to get AI Hunter config from environment variable first
ai_hunter_config_str = os.getenv('AI_HUNTER_CONFIG')
if ai_hunter_config_str:
ai_config = json.loads(ai_hunter_config_str)
else:
# Fallback to config attribute
ai_config = getattr(self.config, 'ai_hunter_config', {})
except (json.JSONDecodeError, AttributeError):
ai_config = {}
if isinstance(ai_config, dict):
max_retries = ai_config.get('retry_attempts', 3)
max_duplicate_retries = ai_config.get('retry_attempts', 6) # Same config key as max_retries; only the default differs
else:
max_retries = 3
max_duplicate_retries = 6
duplicate_retry_count = 0
timeout_retry_count = 0
max_timeout_retries = 2
history_purged = False
original_max_tokens = self.config.MAX_OUTPUT_TOKENS
original_temp = self.config.TEMP
original_user_prompt = msgs[-1]["content"]
chunk_timeout = None
if self.config.RETRY_TIMEOUT:
chunk_timeout = self.config.CHUNK_TIMEOUT
result = None
finish_reason = None
while True:
if self.check_stop():
return None, None
try:
current_max_tokens = self.config.MAX_OUTPUT_TOKENS
current_temp = self.config.TEMP
total_tokens = sum(self.chapter_splitter.count_tokens(m["content"]) for m in msgs)
# Determine file reference
if c.get('is_chunk', False):
file_ref = f"Section_{c['num']}"
else:
# Check if this is a text file - need to access from self
is_text_source = self.is_text_file or c.get('filename', '').endswith('.txt')
terminology = "Section" if is_text_source else "Chapter"
file_ref = c.get('original_basename', f'{terminology}_{c["num"]}')
print(f"[DEBUG] Chunk {chunk_idx}/{total_chunks} tokens = {total_tokens:,} / {self.get_token_budget_str()} [File: {file_ref}]")
self.client.context = 'translation'
# Generate filename for chunks
if chunk_idx and total_chunks > 1:
# This is a chunk - use chunk naming format
fname = f"response_{c['num']:03d}_chunk_{chunk_idx}.html"
else:
# Not a chunk - use regular naming
fname = FileUtilities.create_chapter_filename(c, c.get('actual_chapter_num', c['num']))
# Set output filename BEFORE the API call
if hasattr(self.client, 'set_output_filename'):
self.client.set_output_filename(fname)
# Track the filename so truncation logs know which file this is
if hasattr(self.client, '_current_output_file'):
self.client._current_output_file = fname
# Generate unique request ID for this chunk
#request_id = f"{c['num']:03d}_chunk{chunk_idx}_{uuid.uuid4().hex[:8]}"
result, finish_reason = send_with_interrupt(
msgs, self.client, current_temp, current_max_tokens,
self.check_stop, chunk_timeout
)
# Enhanced mode workflow:
# 1. Original HTML -> html2text -> Markdown/plain text (during extraction)
# 2. Markdown sent to translation API (better for translation quality)
# 3. Translated markdown -> HTML conversion (here)
if result and c.get("enhanced_extraction", False):
print(f"🔄 Converting translated markdown back to HTML...")
result = convert_enhanced_text_to_html(result, c)
retry_needed = False
retry_reason = ""
is_duplicate_retry = False
# ENHANCED: Force re-read environment variable for latest setting
retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1"
# Debug logging to verify the toggle state
#print(f" DEBUG: finish_reason='{finish_reason}', RETRY_TRUNCATED={retry_truncated_enabled}, config.RETRY_TRUNCATED={self.config.RETRY_TRUNCATED}")
#print(f" DEBUG: Current tokens={self.config.MAX_OUTPUT_TOKENS}, Min retry tokens={self.config.MAX_RETRY_TOKENS}, retry_count={retry_count}")
if finish_reason == "length" and (retry_truncated_enabled or self.config.RETRY_TRUNCATED):
if retry_count < max_retries:
# For truncated responses, ensure we never go below the minimum retry tokens
proposed_limit = self.config.MAX_OUTPUT_TOKENS * 2
# Always enforce minimum - never retry with tokens below the constraint
new_token_limit = max(proposed_limit, self.config.MAX_RETRY_TOKENS)
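# Example: with MAX_OUTPUT_TOKENS=4096 and MAX_RETRY_TOKENS=16384 the retry uses 16384 tokens;
# with MAX_OUTPUT_TOKENS=16384 and MAX_RETRY_TOKENS=8192 it simply doubles to 32768.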
if new_token_limit != self.config.MAX_OUTPUT_TOKENS:
retry_needed = True
retry_reason = "truncated output"
old_limit = self.config.MAX_OUTPUT_TOKENS
self.config.MAX_OUTPUT_TOKENS = new_token_limit
retry_count += 1
if old_limit < self.config.MAX_RETRY_TOKENS:
print(f" 🔄 TRUNCATION RETRY: Boosting tokens {old_limit} → {new_token_limit} (enforcing minimum: {self.config.MAX_RETRY_TOKENS})")
else:
print(f" 🔄 TRUNCATION RETRY: Doubling tokens {old_limit} → {new_token_limit} (above minimum: {self.config.MAX_RETRY_TOKENS})")
else:
print(f" ⚠️ TRUNCATION DETECTED: Token adjustment not needed - already at maximum {self.config.MAX_OUTPUT_TOKENS}")
else:
print(f" ⚠️ TRUNCATION DETECTED: Max retries ({max_retries}) reached - accepting truncated response")
elif finish_reason == "length" and not (retry_truncated_enabled or self.config.RETRY_TRUNCATED):
print(f" ⏭️ TRUNCATION DETECTED: Auto-retry is DISABLED - accepting truncated response")
elif finish_reason == "length":
print(f" ⚠️ TRUNCATION DETECTED: Unexpected condition - check logic")
if not retry_needed:
# Force re-read the environment variable to ensure we have current setting
duplicate_enabled = os.getenv("RETRY_DUPLICATE_BODIES", "0") == "1"
if duplicate_enabled and duplicate_retry_count < max_duplicate_retries:
idx = c.get('__index', 0)
prog = c.get('__progress', {})
print(f" 🔍 Checking for duplicate content...")
# Get actual chapter number for duplicate detection
actual_num = c.get('actual_chapter_num', c.get('num', idx + 1))
is_duplicate, similarity = self.check_duplicate_content(result, idx, prog, self.out_dir, actual_num)
if is_duplicate:
retry_needed = True
is_duplicate_retry = True
retry_reason = f"duplicate content (similarity: {similarity}%)"
duplicate_retry_count += 1
# Check if temperature change is disabled
disable_temp_change = ai_config.get('disable_temperature_change', False) if isinstance(ai_config, dict) else False
if duplicate_retry_count >= 3 and not history_purged:
print(f" 🧹 Clearing history after 3 attempts...")
if 'history_manager' in c:
c['history_manager'].save_history([])
history_purged = True
if not disable_temp_change:
self.config.TEMP = original_temp
else:
print(f" 🌡️ Temperature change disabled - keeping current temp: {self.config.TEMP}")
elif duplicate_retry_count == 1:
if disable_temp_change:
print(f" 🔄 First duplicate retry - temperature change disabled")
else:
print(f" 🔄 First duplicate retry - same temperature")
elif history_purged:
if not disable_temp_change:
attempts_since_purge = duplicate_retry_count - 3
self.config.TEMP = min(original_temp + (0.1 * attempts_since_purge), 1.0)
print(f" 🌡️ Post-purge temp: {self.config.TEMP}")
else:
print(f" 🌡️ Temperature change disabled - keeping temp: {self.config.TEMP}")
else:
if not disable_temp_change:
self.config.TEMP = min(original_temp + (0.1 * (duplicate_retry_count - 1)), 1.0)
print(f" 🌡️ Gradual temp increase: {self.config.TEMP}")
else:
print(f" 🌡️ Temperature change disabled - keeping temp: {self.config.TEMP}")
if duplicate_retry_count == 1:
user_prompt = f"[RETRY] Chapter {c['num']}: Ensure unique translation.\n{chunk_html}"
elif duplicate_retry_count <= 3:
user_prompt = f"[ATTEMPT {duplicate_retry_count}] Translate uniquely:\n{chunk_html}"
else:
user_prompt = f"Chapter {c['num']}:\n{chunk_html}"
msgs[-1] = {"role": "user", "content": user_prompt}
elif not duplicate_enabled:
print(f" ⏭️ Duplicate detection is DISABLED - skipping check")
if retry_needed:
if is_duplicate_retry:
print(f" 🔄 Duplicate retry {duplicate_retry_count}/{max_duplicate_retries}")
else:
print(f" 🔄 Retry {retry_count}/{max_retries}: {retry_reason}")
time.sleep(2)
continue
break
except UnifiedClientError as e:
error_msg = str(e)
if "stopped by user" in error_msg:
print("❌ Translation stopped by user during API call")
return None, None
if "took" in error_msg and "timeout:" in error_msg:
if timeout_retry_count < max_timeout_retries:
timeout_retry_count += 1
print(f" ⏱️ Chunk took too long, retry {timeout_retry_count}/{max_timeout_retries}")
print(f" 🔄 Retrying")
time.sleep(2)
continue
else:
print(f" ❌ Max timeout retries reached")
raise UnifiedClientError("Translation failed after timeout retries")
elif "timed out" in error_msg and "timeout:" not in error_msg:
print(f"⚠️ {error_msg}, retrying...")
time.sleep(5)
continue
elif getattr(e, "error_type", None) == "rate_limit" or getattr(e, "http_status", None) == 429:
# Rate limit errors - clean handling without traceback
print("⚠️ Rate limited, sleeping 60s…")
for i in range(60):
if self.check_stop():
print("❌ Translation stopped during rate limit wait")
return None, None
time.sleep(1)
continue
else:
# For unexpected errors, show the error message but suppress traceback in most cases
if getattr(e, "error_type", None) in ["api_error", "validation", "prohibited_content"]:
print(f"❌ API Error: {error_msg}")
raise UnifiedClientError(f"API Error: {error_msg}")
else:
raise
except Exception as e:
print(f"❌ Unexpected error during API call: {e}")
raise
self.config.MAX_OUTPUT_TOKENS = original_max_tokens
self.config.TEMP = original_temp
if retry_count > 0 or duplicate_retry_count > 0 or timeout_retry_count > 0:
if duplicate_retry_count > 0:
print(f" 🔄 Restored original temperature: {self.config.TEMP} (after {duplicate_retry_count} duplicate retries)")
elif timeout_retry_count > 0:
print(f" 🔄 Restored original settings after {timeout_retry_count} timeout retries")
elif retry_count > 0:
print(f" 🔄 Restored original settings after {retry_count} retries")
if duplicate_retry_count >= max_duplicate_retries:
print(f" ⚠️ WARNING: Duplicate content issue persists after {max_duplicate_retries} attempts")
return result, finish_reason
def get_token_budget_str(self):
"""Get token budget as string"""
_tok_env = os.getenv("MAX_INPUT_TOKENS", "1000000").strip()
max_tokens_limit, budget_str = parse_token_limit(_tok_env)
return budget_str
# =====================================================
# BATCH TRANSLATION PROCESSOR
# =====================================================
class BatchTranslationProcessor:
"""Handles batch/parallel translation processing"""
def __init__(self, config, client, base_msg, out_dir, progress_lock,
save_progress_fn, update_progress_fn, check_stop_fn,
image_translator=None, is_text_file=False):
self.config = config
self.client = client
self.base_msg = base_msg
self.out_dir = out_dir
self.progress_lock = progress_lock
self.save_progress_fn = save_progress_fn
self.update_progress_fn = update_progress_fn
self.check_stop_fn = check_stop_fn
self.image_translator = image_translator
self.chapters_completed = 0
self.chunks_completed = 0
self.is_text_file = is_text_file
# Optionally log multi-key status
if hasattr(self.client, 'use_multi_keys') and self.client.use_multi_keys:
stats = self.client.get_stats()
print(f"🔑 Batch processor using multi-key mode: {stats.get('total_keys', 0)} keys")
def process_single_chapter(self, chapter_data):
"""Process a single chapter (runs in thread)"""
# APPLY INTERRUPTIBLE THREADING DELAY FIRST
thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5"))
if thread_delay > 0:
# Check if we need to wait (same logic as unified_api_client)
if hasattr(self.client, '_thread_submission_lock') and hasattr(self.client, '_last_thread_submission_time'):
with self.client._thread_submission_lock:
current_time = time.time()
time_since_last = current_time - self.client._last_thread_submission_time
if time_since_last < thread_delay:
sleep_time = thread_delay - time_since_last
thread_name = threading.current_thread().name
# PRINT BEFORE THE DELAY STARTS
idx, chapter = chapter_data # Extract chapter info for better logging
print(f"🧵 [{thread_name}] Applying thread delay: {sleep_time:.1f}s for Chapter {idx+1}")
# Interruptible sleep - check stop flag every 0.1 seconds
elapsed = 0
check_interval = 0.1
while elapsed < sleep_time:
if self.check_stop_fn():
print(f"🛑 Threading delay interrupted by stop flag")
raise Exception("Translation stopped by user during threading delay")
sleep_chunk = min(check_interval, sleep_time - elapsed)
time.sleep(sleep_chunk)
elapsed += sleep_chunk
self.client._last_thread_submission_time = time.time()
if not hasattr(self.client, '_thread_submission_count'):
self.client._thread_submission_count = 0
self.client._thread_submission_count += 1
idx, chapter = chapter_data
chap_num = chapter["num"]
# Use the pre-calculated actual_chapter_num from the main loop
actual_num = chapter.get('actual_chapter_num')
# Fallback if not set (common in batch mode where first pass might be skipped)
if actual_num is None:
# Try to extract it using the same logic as non-batch mode
raw_num = FileUtilities.extract_actual_chapter_number(chapter, patterns=None, config=self.config)
# Apply offset if configured
offset = self.config.CHAPTER_NUMBER_OFFSET if hasattr(self.config, 'CHAPTER_NUMBER_OFFSET') else 0
raw_num += offset
# Check if zero detection is disabled
if hasattr(self.config, 'DISABLE_ZERO_DETECTION') and self.config.DISABLE_ZERO_DETECTION:
actual_num = raw_num
elif hasattr(self.config, '_uses_zero_based') and self.config._uses_zero_based:
# This is a 0-based novel, adjust the number
actual_num = raw_num + 1
else:
# Default to raw number (1-based or unknown)
actual_num = raw_num
print(f" 📖 Extracted actual chapter number: {actual_num} (from raw: {raw_num})")
try:
# Check if this is from a text file
ai_features = None
is_text_source = self.is_text_file or chapter.get('filename', '').endswith('.txt') or chapter.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
print(f"🔄 Starting #{idx+1} (Internal: {terminology} {chap_num}, Actual: {terminology} {actual_num}) (thread: {threading.current_thread().name}) [File: {chapter.get('original_basename', f'{terminology}_{chap_num}')}]")
content_hash = chapter.get("content_hash") or ContentProcessor.get_content_hash(chapter["body"])
with self.progress_lock:
self.update_progress_fn(idx, actual_num, content_hash, None, status="in_progress")
self.save_progress_fn()
chapter_body = chapter["body"]
if chapter.get('has_images') and self.image_translator and self.config.ENABLE_IMAGE_TRANSLATION:
print(f"🖼️ Processing images for Chapter {actual_num}...")
self.image_translator.set_current_chapter(actual_num)
chapter_body, image_translations = process_chapter_images(
chapter_body,
actual_num,
self.image_translator,
self.check_stop_fn
)
if image_translations:
# Create a copy of the processed body
from bs4 import BeautifulSoup
c = chapter
soup_for_text = BeautifulSoup(c["body"], 'html.parser')
# Remove all translated content
for trans_div in soup_for_text.find_all('div', class_='translated-text-only'):
trans_div.decompose()
# Use this cleaned version for text translation
text_to_translate = str(soup_for_text)
final_body_with_images = c["body"]
else:
text_to_translate = c["body"]
image_translations = {}
print(f"✅ Processed {len(image_translations)} images for Chapter {actual_num}")
chapter_msgs = self.base_msg + [{"role": "user", "content": chapter_body}]
# Generate filename before API call
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
self.client.set_output_filename(fname)
if hasattr(self.client, '_current_output_file'):
self.client._current_output_file = fname
print(f"📤 Sending Chapter {actual_num} to API...")
result, finish_reason = send_with_interrupt(
chapter_msgs, self.client, self.config.TEMP,
self.config.MAX_OUTPUT_TOKENS, self.check_stop_fn
)
print(f"📥 Received Chapter {actual_num} response, finish_reason: {finish_reason}")
# Enhanced mode workflow (same as non-batch):
# 1. Original HTML -> html2text -> Markdown/plain text (during extraction)
# 2. Markdown sent to translation API (better for translation quality)
# 3. Translated markdown -> HTML conversion (here)
if result and chapter.get("enhanced_extraction", False):
print(f"🔄 Converting translated markdown back to HTML...")
result = convert_enhanced_text_to_html(result, chapter)
if finish_reason in ["length", "max_tokens"]:
print(f"⚠️ Chapter {actual_num} response was TRUNCATED!")
if self.config.REMOVE_AI_ARTIFACTS:
result = ContentProcessor.clean_ai_artifacts(result, True)
result = ContentProcessor.clean_memory_artifacts(result)
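# Strip a leading ```html (or bare ```) fence and a trailing ``` fence that some models wrap around their output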
cleaned = re.sub(r"^```(?:html)?\s*\n?", "", result, count=1, flags=re.MULTILINE)
cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE)
cleaned = ContentProcessor.clean_ai_artifacts(cleaned, remove_artifacts=self.config.REMOVE_AI_ARTIFACTS)
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
if self.is_text_file:
# For text files, save as plain text
fname_txt = fname.replace('.html', '.txt') if fname.endswith('.html') else fname
# Merge image translations back into the translated HTML before flattening it
if 'final_body_with_images' in locals() and image_translations:
soup_with_images = BeautifulSoup(final_body_with_images, 'html.parser')
soup_with_text = BeautifulSoup(cleaned, 'html.parser')
# Insert the image-translation blocks at the top of the translated content
body_content = soup_with_text.body or soup_with_text
for trans_div in soup_with_images.find_all('div', class_='translated-text-only'):
body_content.insert(0, trans_div)
cleaned = str(soup_with_text)
# Extract plain text from the (possibly merged) HTML and save it under the .txt name
soup = BeautifulSoup(cleaned, 'html.parser')
text_content = soup.get_text(strip=True)
with open(os.path.join(self.out_dir, fname_txt), 'w', encoding='utf-8') as f:
f.write(text_content)
# Keep later progress updates and logs consistent with the file that was actually written
fname = fname_txt
# Update with .txt filename
with self.progress_lock:
self.update_progress_fn(idx, actual_num, content_hash, fname_txt, status="completed", ai_features=ai_features)
self.save_progress_fn()
else:
# Original code for EPUB files
with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f:
f.write(cleaned)
print(f"💾 Saved Chapter {actual_num}: {fname} ({len(cleaned)} chars)")
# ai_features was initialized to None above; batch mode currently has no AI Hunter feature extraction to populate it
# Extract and save AI features for future duplicate detection
if (self.config.RETRY_DUPLICATE_BODIES and
hasattr(self.config, 'DUPLICATE_DETECTION_MODE') and
self.config.DUPLICATE_DETECTION_MODE in ['ai-hunter', 'cascading']):
try:
# Extract features from the translated content
cleaned_text = re.sub(r'<[^>]+>', '', cleaned).strip()
# Note: self.translator doesn't exist, so we can't extract features here
# The features will need to be extracted during regular processing
print(f" ⚠️ AI features extraction not available in batch mode")
except Exception as e:
print(f" ⚠️ Failed to extract AI features: {e}")
with self.progress_lock:
# Check for QA failures with comprehensive detection
if is_qa_failed_response(cleaned):
chapter_status = "qa_failed"
failure_reason = get_failure_reason(cleaned)
print(f"⚠️ Batch: Chapter {actual_num} marked as qa_failed: {failure_reason}")
# Update progress to qa_failed status
self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features)
self.save_progress_fn()
# DO NOT increment chapters_completed for qa_failed
# Return False to indicate failure
return False, actual_num
else:
chapter_status = "completed"
# Update progress to completed status
self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features)
self.save_progress_fn()
# Only increment chapters_completed for successful chapters
self.chapters_completed += 1
self.chunks_completed += 1
print(f"✅ Chapter {actual_num} completed successfully")
return True, actual_num
except Exception as e:
print(f"❌ Chapter {actual_num} failed: {e}")
with self.progress_lock:
# content_hash may not have been computed yet if the failure happened early
failed_hash = chapter.get("content_hash") or ContentProcessor.get_content_hash(chapter["body"])
self.update_progress_fn(idx, actual_num, failed_hash, None, status="failed")
self.save_progress_fn()
return False, actual_num
# =====================================================
# GLOSSARY MANAGER - TRUE CSV FORMAT WITH FUZZY MATCHING
# =====================================================
class GlossaryManager:
"""Unified glossary management with true CSV format, fuzzy matching, and parallel processing"""
# Class-level shared lock for API submission timing
_api_submission_lock = threading.Lock()
_last_api_submission_time = 0
def __init__(self):
self.pattern_manager = PatternManager()
self._results_lock = threading.Lock() # Thread lock for collecting results
self._file_write_lock = threading.Lock() # Thread lock for file operations
def _atomic_write_file(self, filepath, content, encoding='utf-8'):
"""Atomically write to a file to prevent corruption from concurrent writes"""
# Create temp file in same directory to ensure same filesystem
dir_path = os.path.dirname(filepath)
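# Keeping the temp file next to the destination ensures the final rename never crosses filesystems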
with self._file_write_lock:
try:
# Write to temporary file first
with tempfile.NamedTemporaryFile(mode='w', encoding=encoding,
dir=dir_path, delete=False) as tmp_file:
tmp_file.write(content)
tmp_path = tmp_file.name
# os.replace is atomic on both Windows and POSIX when source and target share a filesystem
os.replace(tmp_path, filepath)
return True
except Exception as e:
print(f"⚠️ Atomic write failed: {e}")
# Cleanup temp file if it exists
if 'tmp_path' in locals() and os.path.exists(tmp_path):
try:
os.remove(tmp_path)
except:
pass
# Fallback to direct write with lock
try:
with open(filepath, 'w', encoding=encoding) as f:
f.write(content)
return True
except Exception as e2:
print(f"⚠️ Fallback write also failed: {e2}")
return False
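# save_glossary orchestrates automatic glossary generation: optional smart filtering, chunked
# extraction (batch or sequential), merging with any existing/manual glossaries, fuzzy
# deduplication, sorting, and an atomic CSV write.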
def save_glossary(self, output_dir, chapters, instructions, language="korean"):
"""Targeted glossary generator with true CSV format output and parallel processing"""
print("📑 Targeted Glossary Generator v6.0 (CSV Format + Parallel)")
# Ensure the output directory exists, then check the stop flag before doing any work
try:
os.makedirs(output_dir, exist_ok=True)
except Exception as _e:
print(f"⚠️ Could not ensure output directory exists: {output_dir} ({_e})")
if is_stop_requested():
print("📑 ❌ Glossary generation stopped by user")
return {}
# Check if glossary already exists; if so, we'll MERGE it later (do not return early)
glossary_path = os.path.join(output_dir, "glossary.csv")
existing_glossary_content = None
if os.path.exists(glossary_path):
print(f"📑 Existing glossary detected (will merge): {glossary_path}")
try:
with open(glossary_path, 'r', encoding='utf-8') as f:
existing_glossary_content = f.read()
except Exception as e:
print(f"⚠️ Could not read existing glossary: {e}")
# Rest of the method continues as before...
print("📑 Extracting names and terms with configurable options")
# Check stop flag before processing
if is_stop_requested():
print("📑 ❌ Glossary generation stopped by user")
return {}
# Check for manual glossary first (CSV only)
manual_glossary_path = os.getenv("MANUAL_GLOSSARY")
existing_glossary = None
if manual_glossary_path and os.path.exists(manual_glossary_path):
print(f"📑 Manual glossary detected: {os.path.basename(manual_glossary_path)}")
try:
with open(manual_glossary_path, 'r', encoding='utf-8') as f:
content = f.read()
# Treat as CSV text and stage it for merge; also copy to output for visibility
target_path = os.path.join(output_dir, "glossary.csv")
with open(target_path, 'w', encoding='utf-8') as f:
f.write(content)
print(f"📑 ✅ Manual CSV glossary copied to: {target_path}")
existing_glossary = content
except Exception as e:
print(f"⚠️ Could not copy manual glossary: {e}")
print(f"📑 Proceeding with automatic generation...")
# Check for existing glossary from manual extraction
glossary_folder_path = os.path.join(output_dir, "Glossary")
# existing_glossary may already be set by MANUAL_GLOSSARY above
if os.path.exists(glossary_folder_path):
for file in os.listdir(glossary_folder_path):
if file.endswith("_glossary.json"):
existing_path = os.path.join(glossary_folder_path, file)
try:
with open(existing_path, 'r', encoding='utf-8') as f:
existing_content = f.read()
existing_glossary = existing_content
print(f"📑 Found existing glossary from manual extraction: {file}")
break
except Exception as e:
print(f"⚠️ Could not load existing glossary: {e}")
# Get configuration from environment variables
min_frequency = int(os.getenv("GLOSSARY_MIN_FREQUENCY", "2"))
max_names = int(os.getenv("GLOSSARY_MAX_NAMES", "50"))
max_titles = int(os.getenv("GLOSSARY_MAX_TITLES", "30"))
batch_size = int(os.getenv("GLOSSARY_BATCH_SIZE", "50"))
strip_honorifics = os.getenv("GLOSSARY_STRIP_HONORIFICS", "1") == "1"
fuzzy_threshold = float(os.getenv("GLOSSARY_FUZZY_THRESHOLD", "0.90"))
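# fuzzy_threshold is the similarity ratio (0-1) at or above which two raw names are treated as
# the same entry during the fuzzy deduplication step applied further below.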
max_text_size = int(os.getenv("GLOSSARY_MAX_TEXT_SIZE", "50000"))
print(f"📑 Settings: Min frequency: {min_frequency}, Max names: {max_names}, Max titles: {max_titles}")
print(f"📑 Strip honorifics: {'✅ Yes' if strip_honorifics else '❌ No'}")
print(f"📑 Fuzzy matching threshold: {fuzzy_threshold}")
# Get custom prompt from environment
custom_prompt = os.getenv("AUTO_GLOSSARY_PROMPT", "").strip()
def clean_html(html_text):
"""Remove HTML tags to get clean text"""
soup = BeautifulSoup(html_text, 'html.parser')
return soup.get_text()
# Check stop before processing chapters
if is_stop_requested():
print("📑 ❌ Glossary generation stopped by user")
return {}
# Get chapter split threshold and filter mode
chapter_split_threshold = int(os.getenv("GLOSSARY_CHAPTER_SPLIT_THRESHOLD", "100000"))
filter_mode = os.getenv("GLOSSARY_FILTER_MODE", "all") # all, only_with_honorifics, only_without_honorifics
# Check if parallel extraction is enabled for automatic glossary
extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))
batch_translation = os.getenv("BATCH_TRANSLATION", "0") == "1"
api_batch_size = int(os.getenv("BATCH_SIZE", "5"))
# Log the settings
print(f"📑 Filter mode: {filter_mode}")
if extraction_workers > 1:
print(f"📑 Parallel extraction enabled: {extraction_workers} workers")
if batch_translation:
print(f"📑 Batch API calls enabled: {api_batch_size} chunks per batch")
all_text = ' '.join(clean_html(chapter["body"]) for chapter in chapters)
print(f"📑 Processing {len(all_text):,} characters of text")
# Apply smart filtering FIRST to check actual size needed
use_smart_filter = os.getenv("GLOSSARY_USE_SMART_FILTER", "1") == "1"
effective_text_size = len(all_text)
filtered_text_cache = None
if use_smart_filter and custom_prompt: # Only apply for AI extraction
print(f"📑 Smart filtering enabled - checking effective text size after filtering...")
# Perform filtering ONCE and reuse for chunking
filtered_sample, _ = self._filter_text_for_glossary(all_text, min_frequency)
filtered_text_cache = filtered_sample
effective_text_size = len(filtered_sample)
print(f"📑 Effective text size after filtering: {effective_text_size:,} chars (from {len(all_text):,})")
# Check if we need to split into chunks based on EFFECTIVE size after filtering
if chapter_split_threshold > 0 and effective_text_size > chapter_split_threshold:
print(f"📑 Effective text exceeds {chapter_split_threshold:,} chars, will process in chunks...")
# If using smart filter, we need to split the FILTERED text, not raw text
if use_smart_filter and custom_prompt:
# Split the filtered text into chunks (reuse cached filtered text)
filtered_text = filtered_text_cache if filtered_text_cache is not None else self._filter_text_for_glossary(all_text, min_frequency)[0]
chunks_to_process = []
# Split filtered text into chunks of appropriate size
chunk_size = chapter_split_threshold
for i in range(0, len(filtered_text), chunk_size):
chunk_text = filtered_text[i:i + chunk_size]
chunks_to_process.append((len(chunks_to_process) + 1, chunk_text))
print(f"📑 Split filtered text into {len(chunks_to_process)} chunks")
all_glossary_entries = []
else:
# Original logic for unfiltered text
all_glossary_entries = []
chunk_size = 0
chunk_chapters = []
chunks_to_process = []
for idx, chapter in enumerate(chapters):
if is_stop_requested():
print("📑 ❌ Glossary generation stopped by user")
return all_glossary_entries
chapter_text = clean_html(chapter["body"])
chunk_size += len(chapter_text)
chunk_chapters.append(chapter)
# Process chunk when it reaches threshold or last chapter
if chunk_size >= chapter_split_threshold or idx == len(chapters) - 1:
chunk_text = ' '.join(clean_html(ch["body"]) for ch in chunk_chapters)
chunks_to_process.append((len(chunks_to_process) + 1, chunk_text))
# Reset for next chunk
chunk_size = 0
chunk_chapters = []
print(f"📑 Split into {len(chunks_to_process)} chunks for processing")
# Batch toggle decides concurrency: ON => parallel API calls; OFF => strict sequential
if batch_translation and custom_prompt and len(chunks_to_process) > 1:
print(f"📑 Processing chunks in batch mode with {api_batch_size} chunks per batch...")
# Set fast mode for batch processing
os.environ["GLOSSARY_SKIP_ALL_VALIDATION"] = "1"
# Use batch API calls for AI extraction
all_csv_lines = self._process_chunks_batch_api(
chunks_to_process, custom_prompt, language,
min_frequency, max_names, max_titles,
output_dir, strip_honorifics, fuzzy_threshold,
filter_mode, api_batch_size, extraction_workers
)
# Reset validation mode
os.environ["GLOSSARY_SKIP_ALL_VALIDATION"] = "0"
print(f"📑 All chunks completed. Aggregated raw lines: {len(all_csv_lines)}")
# Process all collected entries at once (even if empty)
# Add header so downstream steps can work uniformly
all_csv_lines.insert(0, "type,raw_name,translated_name")
# Merge with any on-disk glossary first (to avoid overwriting user edits)
on_disk_path = os.path.join(output_dir, "glossary.csv")
if os.path.exists(on_disk_path):
try:
with open(on_disk_path, 'r', encoding='utf-8') as f:
on_disk_content = f.read()
all_csv_lines = self._merge_csv_entries(all_csv_lines, on_disk_content, strip_honorifics, language)
print("📑 Merged with existing on-disk glossary")
except Exception as e:
print(f"⚠️ Failed to merge with existing on-disk glossary: {e}")
# Apply filter mode if needed
if filter_mode == "only_with_honorifics":
filtered = [all_csv_lines[0]] # Keep header
for line in all_csv_lines[1:]:
parts = line.split(',', 2)
if len(parts) >= 3 and parts[0] == "character":
filtered.append(line)
all_csv_lines = filtered
print(f"📑 Filter applied: {len(all_csv_lines)-1} character entries with honorifics kept")
# Apply fuzzy deduplication (deferred until after all chunks)
try:
print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...")
all_csv_lines = self._deduplicate_glossary_with_fuzzy(all_csv_lines, fuzzy_threshold)
except Exception as e:
print(f"⚠️ Deduplication error: {e} — continuing without dedup")
# Sort by type and name
print(f"📑 Sorting glossary by type and name...")
header = all_csv_lines[0]
entries = all_csv_lines[1:]
if entries:
entries.sort(key=lambda x: (0 if x.startswith('character,') else 1, x.split(',')[1].lower() if ',' in x else x.lower()))
all_csv_lines = [header] + entries
# Save
# Check format preference
use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1'
if not use_legacy_format:
# Convert to token-efficient format
all_csv_lines = self._convert_to_token_efficient_format(all_csv_lines)
# Final sanitize to prevent stray headers
all_csv_lines = self._sanitize_final_glossary_lines(all_csv_lines, use_legacy_format)
# Save
csv_content = '\n'.join(all_csv_lines)
glossary_path = os.path.join(output_dir, "glossary.csv")
self._atomic_write_file(glossary_path, csv_content)
# Verify file exists; fallback direct write if needed
if not os.path.exists(glossary_path):
try:
with open(glossary_path, 'w', encoding='utf-8') as f:
f.write(csv_content)
print("📑 Fallback write succeeded for glossary.csv")
except Exception as e:
print(f"❌ Failed to write glossary.csv: {e}")
print(f"\n📑 ✅ GLOSSARY SAVED!")
print(f"📑 ✅ AI GLOSSARY SAVED!")
c_count, t_count, total = self._count_glossary_entries(all_csv_lines, use_legacy_format)
print(f"📑 Character entries: {c_count}")
print(f"📑 Term entries: {t_count}")
print(f"📑 Total entries: {total}")
return self._parse_csv_to_dict(csv_content)
else:
# Strict sequential processing (one API call at a time)
_prev_defer = os.getenv("GLOSSARY_DEFER_SAVE")
_prev_filtered = os.getenv("_CHUNK_ALREADY_FILTERED")
_prev_force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER")
os.environ["GLOSSARY_DEFER_SAVE"] = "1"
# Tell the extractor each chunk is already filtered to avoid re-running smart filter per chunk
os.environ["_CHUNK_ALREADY_FILTERED"] = "1"
os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = "1"
try:
for chunk_idx, chunk_text in chunks_to_process:
if is_stop_requested():
break
print(f"📑 Processing chunk {chunk_idx}/{len(chunks_to_process)} ({len(chunk_text):,} chars)...")
if custom_prompt:
chunk_glossary = self._extract_with_custom_prompt(
custom_prompt, chunk_text, language,
min_frequency, max_names, max_titles,
None, output_dir, # Don't pass existing glossary to chunks
strip_honorifics, fuzzy_threshold, filter_mode
)
else:
chunk_glossary = self._extract_with_patterns(
chunk_text, language, min_frequency,
max_names, max_titles, batch_size,
None, output_dir, # Don't pass existing glossary to chunks
strip_honorifics, fuzzy_threshold, filter_mode
)
# Normalize to CSV lines and aggregate
chunk_lines = []
if isinstance(chunk_glossary, list):
for line in chunk_glossary:
if line and not line.startswith('type,'):
all_glossary_entries.append(line)
chunk_lines.append(line)
else:
for raw_name, translated_name in chunk_glossary.items():
entry_type = "character" if self._has_honorific(raw_name) else "term"
line = f"{entry_type},{raw_name},{translated_name}"
all_glossary_entries.append(line)
chunk_lines.append(line)
# Incremental update
try:
self._incremental_update_glossary(output_dir, chunk_lines, strip_honorifics, language, filter_mode)
print(f"📑 Incremental write: +{len(chunk_lines)} entries")
except Exception as e2:
print(f"⚠️ Incremental write failed: {e2}")
finally:
if _prev_defer is None:
if "GLOSSARY_DEFER_SAVE" in os.environ:
del os.environ["GLOSSARY_DEFER_SAVE"]
else:
os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer
if _prev_filtered is None:
os.environ.pop("_CHUNK_ALREADY_FILTERED", None)
else:
os.environ["_CHUNK_ALREADY_FILTERED"] = _prev_filtered
if _prev_force_disable is None:
os.environ.pop("GLOSSARY_FORCE_DISABLE_SMART_FILTER", None)
else:
os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = _prev_force_disable
# Build CSV from aggregated entries
csv_lines = ["type,raw_name,translated_name"] + all_glossary_entries
# Merge with any provided existing glossary AND on-disk glossary to avoid overwriting
on_disk_path = os.path.join(output_dir, "glossary.csv")
merge_sources = []
if existing_glossary:
merge_sources.append(existing_glossary)
if os.path.exists(on_disk_path):
try:
with open(on_disk_path, 'r', encoding='utf-8') as f:
merge_sources.append(f.read())
print("📑 Found existing on-disk glossary to merge")
except Exception as e:
print(f"⚠️ Failed to read on-disk glossary for merging: {e}")
# Also merge the main on-disk glossary if it was present at start
if existing_glossary_content:
csv_lines = self._merge_csv_entries(csv_lines, existing_glossary_content, strip_honorifics, language)
for src in merge_sources:
csv_lines = self._merge_csv_entries(csv_lines, src, strip_honorifics, language)
# Apply filter mode to final results
csv_lines = self._filter_csv_by_mode(csv_lines, filter_mode)
# Apply fuzzy deduplication (deferred until after all chunks)
print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...")
original_count = len(csv_lines) - 1
csv_lines = self._deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold)
deduped_count = len(csv_lines) - 1
if original_count > deduped_count:
print(f"📑 Removed {original_count - deduped_count} duplicate entries")
# Sort by type and name
print(f"📑 Sorting glossary by type and name...")
header = csv_lines[0]
entries = csv_lines[1:]
entries.sort(key=lambda x: (0 if x.startswith('character,') else 1, x.split(',')[1].lower() if ',' in x else x.lower()))
csv_lines = [header] + entries
# Token-efficient format if enabled
use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1'
if not use_legacy_format:
csv_lines = self._convert_to_token_efficient_format(csv_lines)
# Final sanitize to prevent stray headers and section titles at end
csv_lines = self._sanitize_final_glossary_lines(csv_lines, use_legacy_format)
try:
# Save
csv_content = '\n'.join(csv_lines)
glossary_path = os.path.join(output_dir, "glossary.csv")
self._atomic_write_file(glossary_path, csv_content)
# Verify file exists; fallback direct write if needed
if not os.path.exists(glossary_path):
try:
with open(glossary_path, 'w', encoding='utf-8') as f:
f.write(csv_content)
print("📑 Fallback write succeeded for glossary.csv")
except Exception as e:
print(f"❌ Failed to write glossary.csv: {e}")
finally:
print(f"\n📑 ✅ CHUNKED GLOSSARY SAVED!")
print(f"📑 ✅ AI GLOSSARY SAVED!")
print(f"📑 File: {glossary_path}")
c_count, t_count, total = self._count_glossary_entries(csv_lines, use_legacy_format)
print(f"📑 Character entries: {c_count}")
print(f"📑 Term entries: {t_count}")
print(f"📑 Total entries: {total}")
return self._parse_csv_to_dict(csv_content)
# Original single-text processing
# Capture the result so the timing summary below actually runs and `return result` is well-defined
if custom_prompt:
result = self._extract_with_custom_prompt(custom_prompt, all_text, language,
min_frequency, max_names, max_titles,
existing_glossary, output_dir,
strip_honorifics, fuzzy_threshold, filter_mode)
else:
result = self._extract_with_patterns(all_text, language, min_frequency,
max_names, max_titles, batch_size,
existing_glossary, output_dir,
strip_honorifics, fuzzy_threshold, filter_mode)
total_time = time.time() - total_start_time
print(f"\n📑 ========== GLOSSARY GENERATION COMPLETE ==========")
print(f"📑 Total time: {total_time:.1f}s")
print(f"📑 Performance breakdown:")
print(f"📑 - Extraction: {getattr(self, '_extraction_time', 0):.1f}s")
print(f"📑 - API calls: {getattr(self, '_api_time', 0):.1f}s")
print(f"📑 - Frequency checking: {getattr(self, '_freq_check_time', 0):.1f}s")
print(f"📑 - Deduplication: {getattr(self, '_dedup_time', 0):.1f}s")
print(f"📑 - File I/O: {getattr(self, '_io_time', 0):.1f}s")
print(f"📑 ================================================")
return result
def _convert_to_token_efficient_format(self, csv_lines):
"""Convert CSV lines to token-efficient format with sections and asterisks"""
if len(csv_lines) <= 1:
return csv_lines
header = csv_lines[0]
entries = csv_lines[1:]
# Group by type (only from valid CSV lines)
import re as _re
grouped = {}
for line in entries:
if not line.strip():
continue
# Only accept proper CSV rows: at least 3 fields and a sane type token
parts_full = [p.strip() for p in line.split(',')]
if len(parts_full) < 3:
continue
entry_type = parts_full[0].lower()
if not _re.match(r'^[a-z_]+$', entry_type):
continue
if entry_type not in grouped:
grouped[entry_type] = []
grouped[entry_type].append(line)
# Rebuild with token-efficient format
result = []
result.append("Glossary: Characters, Terms, and Important Elements\n")
# Process in order: character first, then term, then others
type_order = ['character', 'term'] + [t for t in grouped.keys() if t not in ['character', 'term']]
for entry_type in type_order:
if entry_type not in grouped:
continue
entries = grouped[entry_type]
# Add section header
section_name = entry_type.upper() if entry_type.upper().endswith('S') else entry_type.upper() + 'S'
result.append(f"=== {section_name} ===")
# Add entries in new format
for line in entries:
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 3:
raw_name = parts[1]
translated_name = parts[2]
# Format: * TranslatedName (RawName)
entry_line = f"* {translated_name} ({raw_name})"
# Add gender if present and not Unknown
if len(parts) > 3 and parts[3] and parts[3] != 'Unknown':
entry_line += f" [{parts[3]}]"
# Add any additional fields as description
if len(parts) > 4:
description = ', '.join(parts[4:])
if description.strip():
entry_line += f": {description}"
result.append(entry_line)
result.append("") # Blank line between sections
return result
def _count_glossary_entries(self, lines, use_legacy_format=False):
"""Return (char_count, term_count, total_count) for either format."""
if not lines:
return 0, 0, 0
if use_legacy_format:
data = lines[1:] if lines and lines[0].lower().startswith('type,raw_name') else lines
char_count = sum(1 for ln in data if ln.startswith('character,'))
term_count = sum(1 for ln in data if ln.startswith('term,'))
total = sum(1 for ln in data if ln and ',' in ln)
return char_count, term_count, total
# token-efficient
current = None
char_count = term_count = total = 0
for ln in lines:
s = ln.strip()
if s.startswith('=== ') and 'CHARACTER' in s.upper():
current = 'character'
continue
if s.startswith('=== ') and 'TERM' in s.upper():
current = 'term'
continue
if s.startswith('* '):
total += 1
if current == 'character':
char_count += 1
elif current == 'term':
term_count += 1
return char_count, term_count, total
def _sanitize_final_glossary_lines(self, lines, use_legacy_format=False):
"""Remove stray CSV headers and normalize header placement before saving.
- In legacy CSV mode, ensure exactly one header at the very top.
- In token-efficient mode, remove any CSV header lines entirely.
"""
header_norm = "type,raw_name,translated_name"
if not lines:
return lines
if use_legacy_format:
sanitized = []
header_seen = False
for ln in lines:
txt = ln.strip()
if txt.lower().startswith("type,raw_name"):
if not header_seen:
sanitized.append(header_norm)
header_seen = True
# skip duplicates
else:
sanitized.append(ln)
# ensure header at top
if sanitized and not sanitized[0].strip().lower().startswith("type,raw_name"):
sanitized.insert(0, header_norm)
return sanitized
else:
# remove any CSV header lines anywhere and duplicate top headers/sections
cleaned = []
glossary_header_seen = False
for i, ln in enumerate(lines):
txt = ln.strip()
low = txt.lower()
# Drop CSV headers
if low.startswith("type,raw_name"):
continue
# Keep only the first main glossary header
if low.startswith("glossary:"):
if glossary_header_seen:
continue
glossary_header_seen = True
cleaned.append(ln)
continue
# Remove bogus section like '=== GLOSSARY: ... ==='
if low.startswith("=== glossary:"):
continue
cleaned.append(ln)
return cleaned
def _process_chunks_batch_api(self, chunks_to_process, custom_prompt, language,
min_frequency, max_names, max_titles,
output_dir, strip_honorifics, fuzzy_threshold,
filter_mode, api_batch_size, extraction_workers):
"""Process chunks using batch API calls for AI extraction with thread delay"""
print(f"📑 Using batch API mode with {api_batch_size} chunks per batch")
# Ensure we defer saving and heavy merging when processing chunks
_prev_defer = os.getenv("GLOSSARY_DEFER_SAVE")
os.environ["GLOSSARY_DEFER_SAVE"] = "1"
# Get thread submission delay
thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5"))
if thread_delay > 0:
print(f"📑 Thread submission delay: {thread_delay}s between parallel calls")
# CHANGE: Collect raw CSV lines instead of dictionary
all_csv_lines = [] # Collect all entries as CSV lines
total_chunks = len(chunks_to_process)
completed_chunks = 0
# Ensure per-chunk smart filtering is disabled globally during batch processing
_prev_filtered = os.getenv("_CHUNK_ALREADY_FILTERED")
_prev_force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER")
os.environ["_CHUNK_ALREADY_FILTERED"] = "1"
os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = "1"
# Process in API batches
for batch_start in range(0, len(chunks_to_process), api_batch_size):
if is_stop_requested():
break
batch_end = min(batch_start + api_batch_size, len(chunks_to_process))
batch_chunks = chunks_to_process[batch_start:batch_end]
print(f"📑 Processing API batch {batch_start//api_batch_size + 1}: chunks {batch_start+1}-{batch_end}")
# Use ThreadPoolExecutor for parallel API calls within batch
# Batch mode: issue multiple API calls in parallel within each batch (one worker per chunk)
with ThreadPoolExecutor(max_workers=len(batch_chunks)) as executor:
futures = {}
last_submission_time = 0
for chunk_idx, chunk_text in batch_chunks:
if is_stop_requested():
break
# Apply thread submission delay
if thread_delay > 0 and last_submission_time > 0:
time_since_last = time.time() - last_submission_time
if time_since_last < thread_delay:
sleep_time = thread_delay - time_since_last
print(f"🧵 Thread delay: {sleep_time:.1f}s for chunk {chunk_idx}")
time.sleep(sleep_time)
future = executor.submit(
self._extract_with_custom_prompt,
custom_prompt, chunk_text, language,
min_frequency, max_names, max_titles,
None, output_dir, strip_honorifics,
fuzzy_threshold, filter_mode
)
futures[future] = chunk_idx
last_submission_time = time.time()
# Collect results
for future in as_completed(futures):
if is_stop_requested():
break
try:
chunk_glossary = future.result()
print(f"📑 DEBUG: Chunk {futures[future]} returned type={type(chunk_glossary)}, len={len(chunk_glossary)}")
# Normalize to CSV lines (without header)
chunk_lines = []
if isinstance(chunk_glossary, dict):
for raw_name, translated_name in chunk_glossary.items():
entry_type = "character" if self._has_honorific(raw_name) else "term"
chunk_lines.append(f"{entry_type},{raw_name},{translated_name}")
elif isinstance(chunk_glossary, list):
for line in chunk_glossary:
if line and not line.startswith('type,'):
chunk_lines.append(line)
# Aggregate for end-of-run
all_csv_lines.extend(chunk_lines)
# Incremental update of glossary.csv in token-efficient format
try:
self._incremental_update_glossary(output_dir, chunk_lines, strip_honorifics, language, filter_mode)
print(f"📑 Incremental write: +{len(chunk_lines)} entries")
except Exception as e2:
print(f"⚠️ Incremental write failed: {e2}")
completed_chunks += 1
# Print progress for GUI
progress_percent = (completed_chunks / total_chunks) * 100
print(f"📑 Progress: {completed_chunks}/{total_chunks} chunks ({progress_percent:.0f}%)")
print(f"📑 Chunk {futures[future]} completed and aggregated")
except Exception as e:
print(f"⚠️ API call for chunk {futures[future]} failed: {e}")
completed_chunks += 1
progress_percent = (completed_chunks / total_chunks) * 100
print(f"📑 Progress: {completed_chunks}/{total_chunks} chunks ({progress_percent:.0f}%)")
# Add delay between API batches
if batch_end < len(chunks_to_process):
api_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2"))
print(f"⏱️ Waiting {api_delay}s before next API batch...")
time.sleep(api_delay)
# CHANGE: Return CSV lines instead of dictionary
# Restore per-chunk filter disabling envs
if _prev_filtered is None:
os.environ.pop("_CHUNK_ALREADY_FILTERED", None)
else:
os.environ["_CHUNK_ALREADY_FILTERED"] = _prev_filtered
if _prev_force_disable is None:
os.environ.pop("GLOSSARY_FORCE_DISABLE_SMART_FILTER", None)
else:
os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = _prev_force_disable
# Restore previous defer setting
if _prev_defer is None:
# Default back to not deferring if it wasn't set
if "GLOSSARY_DEFER_SAVE" in os.environ:
del os.environ["GLOSSARY_DEFER_SAVE"]
else:
os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer
return all_csv_lines
def _incremental_update_glossary(self, output_dir, chunk_lines, strip_honorifics, language, filter_mode):
"""Incrementally update glossary.csv (token-efficient) using an on-disk CSV aggregator.
This keeps glossary.csv present and growing after each chunk while preserving
token-efficient format for the visible file.
"""
if not chunk_lines:
return
# Paths
agg_path = os.path.join(output_dir, "glossary.incremental.csv")
vis_path = os.path.join(output_dir, "glossary.csv")
# Ensure output dir
os.makedirs(output_dir, exist_ok=True)
# Compose CSV with header for merging
new_csv_lines = ["type,raw_name,translated_name"] + chunk_lines
# Load existing aggregator content, if any
existing_csv = None
if os.path.exists(agg_path):
try:
with open(agg_path, 'r', encoding='utf-8') as f:
existing_csv = f.read()
except Exception as e:
print(f"⚠️ Incremental: cannot read aggregator: {e}")
# Merge (exact merge, no fuzzy to keep this fast)
merged_csv_lines = self._merge_csv_entries(new_csv_lines, existing_csv or "", strip_honorifics, language)
# Optional filter mode
merged_csv_lines = self._filter_csv_by_mode(merged_csv_lines, filter_mode)
# Save aggregator (CSV)
self._atomic_write_file(agg_path, "\n".join(merged_csv_lines))
# Convert to token-efficient format for visible glossary.csv
token_lines = self._convert_to_token_efficient_format(merged_csv_lines)
token_lines = self._sanitize_final_glossary_lines(token_lines, use_legacy_format=False)
self._atomic_write_file(vis_path, "\n".join(token_lines))
if not os.path.exists(vis_path):
with open(vis_path, 'w', encoding='utf-8') as f:
f.write("\n".join(token_lines))
def _process_single_chunk(self, chunk_idx, chunk_text, custom_prompt, language,
min_frequency, max_names, max_titles, batch_size,
output_dir, strip_honorifics, fuzzy_threshold, filter_mode,
already_filtered=False):
"""Process a single chunk - wrapper for parallel execution"""
print(f"📑 Worker processing chunk {chunk_idx} ({len(chunk_text):,} chars)...")
if custom_prompt:
# Pass flag to indicate if text is already filtered
os.environ["_CHUNK_ALREADY_FILTERED"] = "1" if already_filtered else "0"
_prev_defer = os.getenv("GLOSSARY_DEFER_SAVE")
os.environ["GLOSSARY_DEFER_SAVE"] = "1"
try:
result = self._extract_with_custom_prompt(
custom_prompt, chunk_text, language,
min_frequency, max_names, max_titles,
None, output_dir,
strip_honorifics, fuzzy_threshold, filter_mode
)
finally:
os.environ["_CHUNK_ALREADY_FILTERED"] = "0" # Reset
if _prev_defer is None:
if "GLOSSARY_DEFER_SAVE" in os.environ:
del os.environ["GLOSSARY_DEFER_SAVE"]
else:
os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer
return result
else:
return self._extract_with_patterns(
chunk_text, language, min_frequency,
max_names, max_titles, batch_size,
None, output_dir,
strip_honorifics, fuzzy_threshold, filter_mode
)
def _apply_final_filter(self, entries, filter_mode):
"""Apply final filtering based on mode to ensure only requested types are included"""
if filter_mode == "only_with_honorifics":
# Filter to keep only entries that look like they have honorifics
filtered = {}
for key, value in entries.items():
# Check if the key contains known honorific patterns
if self._has_honorific(key):
filtered[key] = value
print(f"📑 Final filter: Kept {len(filtered)} entries with honorifics (from {len(entries)} total)")
return filtered
elif filter_mode == "only_without_honorifics":
# Filter to keep only entries without honorifics
filtered = {}
for key, value in entries.items():
if not self._has_honorific(key):
filtered[key] = value
print(f"📑 Final filter: Kept {len(filtered)} entries without honorifics (from {len(entries)} total)")
return filtered
else:
return entries
def _looks_like_name(self, text):
"""Check if text looks like a character name"""
if not text:
return False
# Check for various name patterns
# Korean names (2-4 hangul characters)
if all(0xAC00 <= ord(char) <= 0xD7AF for char in text) and 2 <= len(text) <= 4:
return True
# Japanese names (mix of kanji/kana, 2-6 chars)
has_kanji = any(0x4E00 <= ord(char) <= 0x9FFF for char in text)
has_kana = any((0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF) for char in text)
if (has_kanji or has_kana) and 2 <= len(text) <= 6:
return True
# Chinese names (2-4 Chinese characters)
if all(0x4E00 <= ord(char) <= 0x9FFF for char in text) and 2 <= len(text) <= 4:
return True
# English names (starts with capital, mostly letters)
if text[0].isupper() and sum(1 for c in text if c.isalpha()) >= len(text) * 0.8:
return True
return False
def _has_honorific(self, term):
"""Check if a term contains an honorific using PatternManager's comprehensive list"""
if not term:
return False
term_lower = term.lower()
# Check all language honorifics from PatternManager
for language, honorifics_list in self.pattern_manager.CJK_HONORIFICS.items():
for honorific in honorifics_list:
# For romanized/English honorifics with spaces or dashes
if honorific.startswith(' ') or honorific.startswith('-'):
if term_lower.endswith(honorific.lower()):
return True
# For CJK honorifics (no separator)
else:
if honorific in term:
return True
return False
def _strip_all_honorifics(self, term, language='korean'):
"""Strip all honorifics from a term using PatternManager's lists"""
if not term:
return term
result = term
# Get honorifics for the specific language and English romanizations
honorifics_to_strip = []
if language in self.pattern_manager.CJK_HONORIFICS:
honorifics_to_strip.extend(self.pattern_manager.CJK_HONORIFICS[language])
honorifics_to_strip.extend(self.pattern_manager.CJK_HONORIFICS.get('english', []))
# Sort by length (longest first) to avoid partial matches
honorifics_to_strip.sort(key=len, reverse=True)
# Strip honorifics
for honorific in honorifics_to_strip:
if honorific.startswith(' ') or honorific.startswith('-'):
# For romanized honorifics with separators
if result.lower().endswith(honorific.lower()):
result = result[:-len(honorific)]
else:
# For CJK honorifics (no separator)
if result.endswith(honorific):
result = result[:-len(honorific)]
return result.strip()
def _convert_to_csv_format(self, data):
"""Convert various glossary formats to CSV string format with enforced 3 columns"""
csv_lines = ["type,raw_name,translated_name"]
if isinstance(data, str):
# Already CSV string
if data.strip().startswith('type,raw_name'):
return data
# Try to parse as JSON
try:
data = json.loads(data)
except Exception:
return data
if isinstance(data, list):
for item in data:
if isinstance(item, dict):
if 'type' in item and 'raw_name' in item:
# Already in correct format
line = f"{item['type']},{item['raw_name']},{item.get('translated_name', item['raw_name'])}"
csv_lines.append(line)
else:
# Old format - default to 'term' type
entry_type = 'term'
raw_name = item.get('original_name', '')
translated_name = item.get('name', raw_name)
if raw_name and translated_name:
csv_lines.append(f"{entry_type},{raw_name},{translated_name}")
elif isinstance(data, dict):
if 'entries' in data:
# Has metadata wrapper, extract entries
for original, translated in data['entries'].items():
csv_lines.append(f"term,{original},{translated}")
else:
# Plain dictionary - default to 'term' type
for original, translated in data.items():
csv_lines.append(f"term,{original},{translated}")
return '\n'.join(csv_lines)
def _parse_csv_to_dict(self, csv_content):
"""Parse CSV content to dictionary for backward compatibility"""
result = {}
lines = csv_content.strip().split('\n')
for line in lines[1:]: # Skip header
if not line.strip():
continue
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 3:
result[parts[1]] = parts[2] # raw_name -> translated_name
return result
def _fuzzy_match(self, term1, term2, threshold=0.90):
"""Check if two terms match using fuzzy matching"""
ratio = SequenceMatcher(None, term1.lower(), term2.lower()).ratio()
return ratio >= threshold
def _fuzzy_match_rapidfuzz(self, term_lower, text_lower, threshold, term_len):
"""Use rapidfuzz library for MUCH faster fuzzy matching"""
from rapidfuzz import fuzz
print(f"📑 Using RapidFuzz (C++ speed)...")
start_time = time.time()
matches_count = 0
threshold_percent = threshold * 100 # rapidfuzz uses 0-100 scale
# Can use smaller step because rapidfuzz is so fast
step = 1 # Check every position - rapidfuzz can handle it
# Process text
for i in range(0, len(text_lower) - term_len + 1, step):
# Check stop flag every 10000 positions
if i > 0 and i % 10000 == 0:
if is_stop_requested():
print(f"📑 RapidFuzz stopped at position {i}")
return matches_count
window = text_lower[i:i + term_len]
# rapidfuzz is fast enough we can check every position
if fuzz.ratio(term_lower, window) >= threshold_percent:
matches_count += 1
elapsed = time.time() - start_time
print(f"📑 RapidFuzz found {matches_count} matches in {elapsed:.2f}s")
return matches_count
def _batch_compute_frequencies(self, terms, all_text, fuzzy_threshold=0.90, min_frequency=2):
"""Compute frequencies for all terms at once - MUCH faster than individual checking"""
print(f"📑 Computing frequencies for {len(terms)} terms in batch mode...")
start_time = time.time()
# Result dictionary
term_frequencies = {}
# First pass: exact matching (very fast)
print(f"📑 Phase 1: Exact matching...")
text_lower = all_text.lower()
for term in terms:
if is_stop_requested():
return term_frequencies
term_lower = term.lower()
count = text_lower.count(term_lower)
term_frequencies[term] = count
exact_time = time.time() - start_time
high_freq_terms = sum(1 for count in term_frequencies.values() if count >= min_frequency)
print(f"📑 Exact matching complete: {high_freq_terms}/{len(terms)} terms meet threshold ({exact_time:.1f}s)")
# If fuzzy matching is disabled, we're done
if fuzzy_threshold >= 1.0:
return term_frequencies
# Second pass: fuzzy matching ONLY for low-frequency terms
low_freq_terms = [term for term, count in term_frequencies.items() if count < min_frequency]
if low_freq_terms:
print(f"📑 Phase 2: Fuzzy matching for {len(low_freq_terms)} low-frequency terms...")
# Try to use RapidFuzz batch processing
try:
from rapidfuzz import process, fuzz
# For very large texts, sample it for fuzzy matching
if len(text_lower) > 500000:
print(f"📑 Text too large ({len(text_lower):,} chars), sampling for fuzzy matching...")
# Sample every Nth character to reduce size
sample_rate = max(1, len(text_lower) // 100000)
sampled_text = text_lower[::sample_rate]
else:
sampled_text = text_lower
# Create chunks of text for fuzzy matching
chunk_size = 1000 # Process text in chunks
text_chunks = [sampled_text[i:i+chunk_size] for i in range(0, len(sampled_text), chunk_size//2)] # Overlapping chunks
print(f"📑 Processing {len(text_chunks)} text chunks...")
threshold_percent = fuzzy_threshold * 100
# Process in batches to avoid memory issues
batch_size = 100 # Process 100 terms at a time
for batch_start in range(0, len(low_freq_terms), batch_size):
if is_stop_requested():
break
batch_end = min(batch_start + batch_size, len(low_freq_terms))
batch_terms = low_freq_terms[batch_start:batch_end]
for term in batch_terms:
if is_stop_requested():
break
# Quick fuzzy search in chunks
fuzzy_count = 0
for chunk in text_chunks[:50]: # Limit to first 50 chunks for speed
if fuzz.partial_ratio(term.lower(), chunk) >= threshold_percent:
fuzzy_count += 1
if fuzzy_count > 0:
# Scale up based on sampling
if len(text_lower) > 500000:
fuzzy_count *= (len(text_lower) // len(sampled_text))
term_frequencies[term] += fuzzy_count
if (batch_end % 500 == 0) or (batch_end == len(low_freq_terms)):
elapsed = time.time() - start_time
print(f"📑 Processed {batch_end}/{len(low_freq_terms)} terms ({elapsed:.1f}s)")
except ImportError:
print("📑 RapidFuzz not available, skipping fuzzy matching")
total_time = time.time() - start_time
final_high_freq = sum(1 for count in term_frequencies.values() if count >= min_frequency)
print(f"📑 Batch frequency computation complete: {final_high_freq}/{len(terms)} terms accepted ({total_time:.1f}s)")
return term_frequencies
def _find_fuzzy_matches(self, term, text, threshold=0.90):
"""Find fuzzy matches of a term in text using efficient method with parallel processing"""
start_time = time.time()
term_lower = term.lower()
text_lower = text.lower()
term_len = len(term)
# Only log for debugging if explicitly enabled
debug_search = os.getenv("GLOSSARY_DEBUG_SEARCH", "0") == "1"
if debug_search and len(text) > 100000:
print(f"📑 Searching for '{term}' in {len(text):,} chars (threshold: {threshold})")
# Strategy 1: Use exact matching first for efficiency
exact_start = time.time()
matches_count = text_lower.count(term_lower)
exact_time = time.time() - exact_start
if matches_count > 0:
if debug_search and len(text) > 100000:
print(f"📑 Found {matches_count} exact matches in {exact_time:.3f}s")
return matches_count
# Strategy 2: Try rapidfuzz if available (much faster)
if matches_count == 0 and threshold < 1.0:
try:
from rapidfuzz import fuzz
return self._fuzzy_match_rapidfuzz(term_lower, text_lower, threshold, term_len)
except ImportError:
pass # Fall back to parallel/sequential
# Strategy 3: Fall back to parallel/sequential if rapidfuzz not available
# Check if parallel processing is enabled
extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))
if extraction_workers > 1 and len(text) > 50000: # Use parallel for large texts
return self._parallel_fuzzy_search(term_lower, text_lower, threshold, term_len, extraction_workers)
else:
return self._sequential_fuzzy_search(term_lower, text_lower, threshold, term_len)
def _parallel_fuzzy_search(self, term_lower, text_lower, threshold, term_len, num_workers):
"""Parallel fuzzy search using ThreadPoolExecutor"""
print(f"📑 Starting parallel fuzzy search with {num_workers} workers...")
text_len = len(text_lower)
matches_count = 0
# Split text into overlapping chunks for parallel processing
chunk_size = max(text_len // num_workers, term_len * 100)
chunks = []
for i in range(0, text_len, chunk_size):
# Add overlap to avoid missing matches at boundaries
end = min(i + chunk_size + term_len - 1, text_len)
chunks.append((i, text_lower[i:end]))
print(f"📑 Split into {len(chunks)} chunks of ~{chunk_size:,} chars each")
# Process chunks in parallel
with ThreadPoolExecutor(max_workers=num_workers) as executor:
futures = []
for chunk_idx, (start_pos, chunk_text) in enumerate(chunks):
if is_stop_requested():
return matches_count
future = executor.submit(
self._fuzzy_search_chunk,
term_lower, chunk_text, threshold, term_len, chunk_idx, len(chunks)
)
futures.append(future)
# Collect results
for future in as_completed(futures):
if is_stop_requested():
executor.shutdown(wait=False)
return matches_count
try:
chunk_matches = future.result()
matches_count += chunk_matches
except Exception as e:
print(f"📑 ⚠️ Chunk processing error: {e}")
print(f"📑 Parallel fuzzy search found {matches_count} matches")
return matches_count
def _fuzzy_search_chunk(self, term_lower, chunk_text, threshold, term_len, chunk_idx, total_chunks):
"""Process a single chunk for fuzzy matches"""
chunk_matches = 0
# Use a more efficient step size - no need to check every position
step = max(1, term_len // 3) # Check every third of term length
for i in range(0, len(chunk_text) - term_len + 1, step):
# Check stop flag periodically
if i > 0 and i % 1000 == 0:
if is_stop_requested():
return chunk_matches
window = chunk_text[i:i + term_len]
# Use SequenceMatcher for fuzzy matching
if SequenceMatcher(None, term_lower, window).ratio() >= threshold:
chunk_matches += 1
# Log progress for this chunk
if total_chunks > 1:
print(f"📑 Chunk {chunk_idx + 1}/{total_chunks} completed: {chunk_matches} matches")
return chunk_matches
def _sequential_fuzzy_search(self, term_lower, text_lower, threshold, term_len):
"""Sequential fuzzy search (fallback for small texts or single worker)"""
print(f"📑 Starting sequential fuzzy search...")
fuzzy_start = time.time()
matches_count = 0
# More efficient step size
step = max(1, term_len // 3)
total_windows = (len(text_lower) - term_len + 1) // step
print(f"📑 Checking ~{total_windows:,} windows with step size {step}")
windows_checked = 0
for i in range(0, len(text_lower) - term_len + 1, step):
# Check stop flag frequently
if i > 0 and i % (step * 100) == 0:
if is_stop_requested():
return matches_count
# Progress log for very long operations
if windows_checked % 1000 == 0 and windows_checked > 0:
elapsed = time.time() - fuzzy_start
rate = windows_checked / elapsed if elapsed > 0 else 0
eta = (total_windows - windows_checked) / rate if rate > 0 else 0
print(f"📑 Progress: {windows_checked}/{total_windows} windows, {rate:.0f} w/s, ETA: {eta:.1f}s")
window = text_lower[i:i + term_len]
if SequenceMatcher(None, term_lower, window).ratio() >= threshold:
matches_count += 1
windows_checked += 1
fuzzy_time = time.time() - fuzzy_start
print(f"📑 Sequential fuzzy search completed in {fuzzy_time:.2f}s, found {matches_count} matches")
return matches_count
def _strip_honorific(self, term, language_hint='unknown'):
"""Strip honorific from a term if present"""
if not term:
return term
# Get honorifics for the detected language
honorifics_to_check = []
if language_hint in self.pattern_manager.CJK_HONORIFICS:
honorifics_to_check.extend(self.pattern_manager.CJK_HONORIFICS[language_hint])
honorifics_to_check.extend(self.pattern_manager.CJK_HONORIFICS.get('english', []))
# Check and remove honorifics
for honorific in honorifics_to_check:
if honorific.startswith('-') or honorific.startswith(' '):
# English-style suffix
if term.endswith(honorific):
return term[:-len(honorific)].strip()
else:
# CJK-style suffix (no separator)
if term.endswith(honorific):
return term[:-len(honorific)]
return term
def _translate_chunk_traditional(self, chunk_text, chunk_index, total_chunks, chapter_title=""):
"""Simplified translation for traditional APIs (DeepL, Google Translate)"""
print(f"📝 Using traditional translation API for chunk {chunk_index}/{total_chunks}")
# Traditional APIs don't use complex prompts, just need the text
messages = []
# Add minimal system context for language detection
profile = self.active_profile
if profile == 'korean':
lang_hint = "Translating from Korean to English"
elif profile == 'japanese':
lang_hint = "Translating from Japanese to English"
elif profile == 'chinese':
lang_hint = "Translating from Chinese to English"
else:
lang_hint = "Translating to English"
messages.append({
"role": "system",
"content": lang_hint
})
# For traditional APIs, we need to handle glossary differently
# Apply glossary terms as preprocessing if available
processed_text = chunk_text
if hasattr(self, 'glossary_manager') and self.glossary_manager and self.glossary_manager.entries:
# Pre-process: Mark glossary terms with placeholders
glossary_placeholders = {}
placeholder_index = 0
for entry in self.glossary_manager.entries:
source = entry.get('source', '')
target = entry.get('target', '')
if source and target and source in processed_text:
# Create unique placeholder
placeholder = f"[[GLOSS_{placeholder_index}]]"
glossary_placeholders[placeholder] = target
processed_text = processed_text.replace(source, placeholder)
placeholder_index += 1
print(f"📚 Applied {len(glossary_placeholders)} glossary placeholders")
# Add the text to translate
messages.append({
"role": "user",
"content": processed_text
})
# Send to API
try:
response = self.client.send(messages)
if response and response.content:
translated_text = response.content
# Post-process: Replace placeholders with glossary terms
if 'glossary_placeholders' in locals():
for placeholder, target in glossary_placeholders.items():
translated_text = translated_text.replace(placeholder, target)
print(f"✅ Restored {len(glossary_placeholders)} glossary terms")
# Log detected language if available
if hasattr(response, 'usage') and response.usage:
detected_lang = response.usage.get('detected_source_lang')
if detected_lang:
print(f"🔍 Detected source language: {detected_lang}")
return translated_text
else:
print("❌ No translation received from traditional API")
return None
except Exception as e:
print(f"❌ Traditional API translation error: {e}")
return None
def _filter_text_for_glossary(self, text, min_frequency=2):
"""Filter text to extract only meaningful content for glossary extraction"""
import re
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
filter_start_time = time.time()
print(f"📑 Starting smart text filtering...")
print(f"📑 Input text size: {len(text):,} characters")
# Clean HTML if present
print(f"📑 Step 1/7: Cleaning HTML tags...")
from bs4 import BeautifulSoup
soup = BeautifulSoup(text, 'html.parser')
clean_text = soup.get_text()
print(f"📑 Clean text size: {len(clean_text):,} characters")
# Detect primary language for better filtering
print(f"📑 Step 2/7: Detecting primary language...")
def detect_primary_language(text_sample):
sample = text_sample[:1000]
korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF)
japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF))
chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF)
if korean_chars > 50:
return 'korean'
elif japanese_kana > 20:
return 'japanese'
elif chinese_chars > 50 and japanese_kana < 10:
return 'chinese'
else:
return 'english'
primary_lang = detect_primary_language(clean_text)
print(f"📑 Detected primary language: {primary_lang}")
# Split into sentences for better context
print(f"📑 Step 3/7: Splitting text into sentences...")
sentences = re.split(r'[.!?。!?]+', clean_text)
print(f"📑 Found {len(sentences):,} sentences")
# Extract potential terms (words/phrases that appear multiple times)
print(f"📑 Step 4/7: Setting up extraction patterns and exclusion rules...")
word_freq = Counter()
# Pattern for detecting potential names/terms based on capitalization or special characters
# Korean names: 2-4 hangul characters WITHOUT honorifics
korean_pattern = r'[가-힣]{2,4}'
# Japanese names: kanji/hiragana/katakana combinations
japanese_pattern = r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]{2,6}'
# Chinese names: 2-4 Chinese characters
chinese_pattern = r'[\u4e00-\u9fff]{2,4}'
# English proper nouns: Capitalized words
english_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
# Combine patterns
combined_pattern = f'({korean_pattern}|{japanese_pattern}|{chinese_pattern}|{english_pattern})'
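# Illustrative matches for the combined pattern (made-up samples):
#   "김철수" (2-4 hangul), "東京タワー" (kanji/kana run), "龍王" (2-4 han characters),
#   "Demon King" (capitalized English words); common words and honorific forms are
#   removed afterwards by should_exclude_term.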
print(f"📑 Using combined regex pattern for {primary_lang} text")
# Get honorifics and title patterns for the detected language
honorifics_to_exclude = set()
if primary_lang in self.pattern_manager.CJK_HONORIFICS:
honorifics_to_exclude.update(self.pattern_manager.CJK_HONORIFICS[primary_lang])
# Also add English romanizations
honorifics_to_exclude.update(self.pattern_manager.CJK_HONORIFICS.get('english', []))
# Compile title patterns for the language
title_patterns = []
if primary_lang in self.pattern_manager.TITLE_PATTERNS:
for pattern in self.pattern_manager.TITLE_PATTERNS[primary_lang]:
title_patterns.append(re.compile(pattern))
# Function to check if a term should be excluded
def should_exclude_term(term):
term_lower = term.lower()
# Check if it's a common word
if term in self.pattern_manager.COMMON_WORDS or term_lower in self.pattern_manager.COMMON_WORDS:
return True
# Check if it contains honorifics
for honorific in honorifics_to_exclude:
if honorific in term or (honorific.startswith('-') and term.endswith(honorific[1:])):
return True
# Check if it matches title patterns
for pattern in title_patterns:
if pattern.search(term):
return True
# Check if it's a number (including Chinese numbers)
if term in self.pattern_manager.CHINESE_NUMS:
return True
# Check if it's just digits
if term.isdigit():
return True
return False
# Extract potential terms from each sentence
print(f"📑 Step 5/7: Extracting and filtering terms from sentences...")
# Check if we should use parallel processing
extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))
# Auto-detect optimal workers if not set
if extraction_workers == 1 and len(sentences) > 1000:
# Use more cores for better parallelization
cpu_count = os.cpu_count() or 4
extraction_workers = min(cpu_count, 12) # Use up to 12 cores
print(f"📑 Auto-detected {cpu_count} CPU cores, using {extraction_workers} workers")
use_parallel = extraction_workers > 1 and len(sentences) > 100
if use_parallel:
print(f"📑 Using parallel processing with {extraction_workers} workers")
print(f"📑 Estimated speedup: {extraction_workers}x faster")
important_sentences = []
seen_contexts = set()
processed_count = 0
total_sentences = len(sentences)
last_progress_time = time.time()
def process_sentence_batch(batch_sentences, batch_idx):
"""Process a batch of sentences"""
local_word_freq = Counter()
local_important = []
local_seen = set()
for sentence in batch_sentences:
sentence = sentence.strip()
if len(sentence) < 10 or len(sentence) > 500:
continue
# Find all potential terms in this sentence
matches = re.findall(combined_pattern, sentence)
if matches:
# Filter out excluded terms
filtered_matches = []
for match in matches:
if not should_exclude_term(match):
local_word_freq[match] += 1
filtered_matches.append(match)
# Keep sentences with valid potential terms
if filtered_matches:
sentence_key = ' '.join(sorted(filtered_matches))
if sentence_key not in local_seen:
local_important.append(sentence)
local_seen.add(sentence_key)
return local_word_freq, local_important, local_seen, batch_idx
if use_parallel:
# Force SMALL batches for real parallelization
# We want MANY small batches, not few large ones!
# Calculate based on total sentences
total_sentences = len(sentences)
if total_sentences < 1000:
# Small dataset: 50-100 sentences per batch
optimal_batch_size = 100
elif total_sentences < 10000:
# Medium dataset: 200 sentences per batch
optimal_batch_size = 200
elif total_sentences < 50000:
# Large dataset: 300 sentences per batch
optimal_batch_size = 300
else:
# Very large dataset: 400 sentences per batch max
optimal_batch_size = 400
# Ensure we have enough batches for all workers
min_batches = extraction_workers * 3 # At least 3 batches per worker
max_batch_size = max(50, total_sentences // min_batches)
optimal_batch_size = min(optimal_batch_size, max_batch_size)
print(f"📑 Total sentences: {total_sentences:,}")
print(f"📑 Target batch size: {optimal_batch_size} sentences")
# Calculate expected number of batches
expected_batches = (total_sentences + optimal_batch_size - 1) // optimal_batch_size
print(f"📑 Expected batches: {expected_batches} (for {extraction_workers} workers)")
print(f"📑 Batches per worker: ~{expected_batches // extraction_workers} batches")
batches = [sentences[i:i + optimal_batch_size] for i in range(0, len(sentences), optimal_batch_size)]
print(f"📑 Processing {len(batches)} batches of ~{optimal_batch_size} sentences each")
print(f"📑 Expected speedup: {min(extraction_workers, len(batches))}x (using {extraction_workers} workers)")
# Decide between ThreadPoolExecutor and ProcessPoolExecutor
import multiprocessing
in_subprocess = multiprocessing.current_process().name != 'MainProcess'
# Use ProcessPoolExecutor for better parallelism on larger datasets
use_process_pool = (not in_subprocess and len(sentences) > 5000)
if use_process_pool:
print(f"📑 Using ProcessPoolExecutor for maximum performance (true parallelism)")
executor_class = ProcessPoolExecutor
else:
print(f"📑 Using ThreadPoolExecutor for sentence processing")
executor_class = ThreadPoolExecutor
with executor_class(max_workers=extraction_workers) as executor:
futures = []
# Prepare data for ProcessPoolExecutor if needed
if use_process_pool:
# Serialize exclusion check data for process pool
exclude_check_data = (
list(honorifics_to_exclude),
[p.pattern for p in title_patterns], # Convert regex to strings
self.pattern_manager.COMMON_WORDS,
self.pattern_manager.CHINESE_NUMS
)
for idx, batch in enumerate(batches):
if use_process_pool:
# Use module-level function for ProcessPoolExecutor
future = executor.submit(_process_sentence_batch_for_extraction,
(batch, idx, combined_pattern, exclude_check_data))
else:
# Use local function for ThreadPoolExecutor
future = executor.submit(process_sentence_batch, batch, idx)
futures.append(future)
# Yield to GUI when submitting futures
if idx % 10 == 0:
time.sleep(0.001)
# Collect results with progress
completed_batches = 0
batch_start_time = time.time()
for future in as_completed(futures):
# Get result without timeout - as_completed already handles waiting
local_word_freq, local_important, local_seen, batch_idx = future.result()
# Merge results
word_freq.update(local_word_freq)
for sentence in local_important:
sentence_key = ' '.join(sorted(re.findall(combined_pattern, sentence)))
if sentence_key not in seen_contexts:
important_sentences.append(sentence)
seen_contexts.add(sentence_key)
processed_count += len(batches[batch_idx])
completed_batches += 1
# Show progress every 10 batches or at key milestones
if completed_batches % 10 == 0 or completed_batches == len(batches):
progress = (processed_count / total_sentences) * 100
elapsed = time.time() - batch_start_time
rate = (processed_count / elapsed) if elapsed > 0 else 0
print(f"📑 Progress: {processed_count:,}/{total_sentences:,} sentences ({progress:.1f}%) | Batch {completed_batches}/{len(batches)} | {rate:.0f} sent/sec")
# Yield to GUI after each batch completes
time.sleep(0.001)
else:
# Sequential processing with progress
for idx, sentence in enumerate(sentences):
sentence = sentence.strip()
if len(sentence) < 10 or len(sentence) > 500:
continue
# Find all potential terms in this sentence
matches = re.findall(combined_pattern, sentence)
if matches:
# Filter out excluded terms
filtered_matches = []
for match in matches:
if not should_exclude_term(match):
word_freq[match] += 1
filtered_matches.append(match)
# Keep sentences with valid potential terms
if filtered_matches:
sentence_key = ' '.join(sorted(filtered_matches))
if sentence_key not in seen_contexts:
important_sentences.append(sentence)
seen_contexts.add(sentence_key)
# Show progress every 1000 sentences or 2 seconds
if idx % 1000 == 0 or (time.time() - last_progress_time > 2):
progress = ((idx + 1) / total_sentences) * 100
print(f"📑 Processing sentences: {idx + 1:,}/{total_sentences:,} ({progress:.1f}%)")
last_progress_time = time.time()
# Yield to GUI thread every 1000 sentences
time.sleep(0.001) # Tiny sleep to let GUI update
print(f"📑 Found {len(important_sentences):,} sentences with potential glossary terms")
# Step 6/7: Deduplicate and normalize terms
print(f"📑 Step 6/7: Normalizing and deduplicating {len(word_freq):,} unique terms...")
# Since should_exclude_term already filters honorifics, we just need to deduplicate
# based on normalized forms (lowercase, etc.)
combined_freq = Counter()
norm_to_original = {} # normalized form -> original term currently kept
term_count = 0
for term, count in word_freq.items():
# Normalize term for deduplication (but keep the original form)
normalized = term.lower().strip()
# Keep the version with the highest count for each normalized form
existing = norm_to_original.get(normalized)
if existing is not None:
if count > combined_freq[existing]:
# Replace the previously kept form with this higher-count form
del combined_freq[existing]
combined_freq[term] = count
norm_to_original[normalized] = term
else:
combined_freq[term] = count
norm_to_original[normalized] = term
term_count += 1
# Yield to GUI every 1000 terms
if term_count % 1000 == 0:
time.sleep(0.001)
print(f"📑 Deduplicated to {len(combined_freq):,} unique terms")
# Filter to keep only terms that appear at least min_frequency times
frequent_terms = {term: count for term, count in combined_freq.items() if count >= min_frequency}
# Build filtered text focusing on sentences containing frequent terms
print(f"📑 Step 7/7: Building filtered text from relevant sentences...")
# OPTIMIZATION: Skip sentences that already passed filtering in step 5
# These sentences already contain glossary terms, no need to check again!
# We just need to limit the sample size
filtered_sentences = important_sentences # Already filtered!
print(f"📑 Using {len(filtered_sentences):,} pre-filtered sentences (already contain glossary terms)")
# For extremely large datasets, we can optionally do additional filtering
if len(filtered_sentences) > 10000 and len(frequent_terms) > 1000:
print(f"📑 Large dataset detected - applying frequency-based filtering...")
print(f"📑 Filtering {len(filtered_sentences):,} sentences for top frequent terms...")
# Sort terms by frequency to prioritize high-frequency ones
sorted_terms = sorted(frequent_terms.items(), key=lambda x: x[1], reverse=True)
top_terms = dict(sorted_terms[:1000]) # Focus on top 1000 most frequent terms
print(f"📑 Using top {len(top_terms):,} most frequent terms for final filtering")
# Use parallel processing only if really needed
if use_parallel and len(filtered_sentences) > 5000:
# Create a simple set of terms for fast lookup (no variations needed)
term_set = set(top_terms.keys())
print(f"📑 Using parallel filtering with {extraction_workers} workers...")
# Optimize batch size
check_batch_size = 500 # Larger batches since we're doing simpler checks
check_batches = [filtered_sentences[i:i + check_batch_size]
for i in range(0, len(filtered_sentences), check_batch_size)]
print(f"📑 Processing {len(check_batches)} batches of ~{check_batch_size} sentences")
# Simple function to check if sentence contains any top term
def check_batch_simple(batch):
result = []
for sentence in batch:
# Simple substring check - much faster than regex
for term in term_set:
if term in sentence:
result.append(sentence)
break
return result
new_filtered = []
with ThreadPoolExecutor(max_workers=extraction_workers) as executor:
futures = [executor.submit(check_batch_simple, batch) for batch in check_batches]
for future in as_completed(futures):
new_filtered.extend(future.result())
filtered_sentences = new_filtered
print(f"📑 Filtered to {len(filtered_sentences):,} sentences containing top terms")
else:
# For smaller datasets, simple sequential filtering
print(f"📑 Using sequential filtering...")
new_filtered = []
for i, sentence in enumerate(filtered_sentences):
for term in top_terms:
if term in sentence:
new_filtered.append(sentence)
break
if i % 1000 == 0:
print(f"📑 Progress: {i:,}/{len(filtered_sentences):,} sentences")
time.sleep(0.001)
filtered_sentences = new_filtered
print(f"📑 Filtered to {len(filtered_sentences):,} sentences containing top terms")
print(f"📑 Selected {len(filtered_sentences):,} sentences containing frequent terms")
# Limit the number of sentences to reduce token usage
max_sentences = int(os.getenv("GLOSSARY_MAX_SENTENCES", "200"))
if len(filtered_sentences) > max_sentences:
print(f"📑 Limiting to {max_sentences} representative sentences (from {len(filtered_sentences):,})")
# Take a representative sample
step = len(filtered_sentences) // max_sentences
filtered_sentences = filtered_sentences[::step][:max_sentences]
filtered_text = ' '.join(filtered_sentences)
# Calculate and display filtering statistics
filter_end_time = time.time()
filter_duration = filter_end_time - filter_start_time
original_length = len(clean_text)
filtered_length = len(filtered_text)
reduction_percent = ((original_length - filtered_length) / original_length * 100) if original_length > 0 else 0
print(f"\n📑 === FILTERING COMPLETE ===")
print(f"📑 Duration: {filter_duration:.1f} seconds")
print(f"📑 Text reduction: {original_length:,} → {filtered_length:,} chars ({reduction_percent:.1f}% reduction)")
print(f"📑 Terms found: {len(frequent_terms):,} unique terms (min frequency: {min_frequency})")
print(f"📑 Final output: {len(filtered_sentences)} sentences, {filtered_length:,} characters")
print(f"📑 Performance: {(original_length / filter_duration / 1000):.1f}K chars/second")
print(f"📑 ========================\n")
return filtered_text, frequent_terms
def _extract_with_custom_prompt(self, custom_prompt, all_text, language,
min_frequency, max_names, max_titles,
existing_glossary, output_dir,
strip_honorifics=True, fuzzy_threshold=0.90, filter_mode='all'):
"""Extract glossary using custom AI prompt with proper filtering"""
print("📑 Using custom automatic glossary prompt")
extraction_start = time.time()
# Check stop flag
if is_stop_requested():
print("📑 ❌ Glossary extraction stopped by user")
return {}
# Note: Filter mode can be controlled via the configurable prompt environment variable
# No hardcoded filter instructions are added here
try:
MODEL = os.getenv("MODEL", "gemini-2.0-flash")
API_KEY = (os.getenv("API_KEY") or
os.getenv("OPENAI_API_KEY") or
os.getenv("OPENAI_OR_Gemini_API_KEY") or
os.getenv("GEMINI_API_KEY"))
if is_traditional_translation_api(MODEL) or not API_KEY:
# Traditional translation APIs (DeepL/Google Translate) cannot run extraction prompts,
# and without an API key no AI call is possible, so fall back to pattern-based extraction
if not API_KEY:
print(f"📑 No API key found, falling back to pattern-based extraction")
else:
print(f"📑 {MODEL} is a traditional translation API, falling back to pattern-based extraction")
return self._extract_with_patterns(all_text, language, min_frequency,
max_names, max_titles, 50,
existing_glossary, output_dir,
strip_honorifics, fuzzy_threshold, filter_mode)
else:
print(f"📑 Using AI-assisted extraction with custom prompt")
from unified_api_client import UnifiedClient, UnifiedClientError
client = UnifiedClient(model=MODEL, api_key=API_KEY, output_dir=output_dir)
if hasattr(client, 'reset_cleanup_state'):
client.reset_cleanup_state()
# Apply thread submission delay using the client's method
thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5"))
if thread_delay > 0:
client._apply_thread_submission_delay()
# Check if cancelled during delay
if hasattr(client, '_cancelled') and client._cancelled:
print("📑 ❌ Glossary extraction stopped during delay")
return {}
# Check if text is already filtered (from chunking)
already_filtered = os.getenv("_CHUNK_ALREADY_FILTERED", "0") == "1"
if already_filtered:
print("📑 Text already filtered during chunking, skipping re-filtering")
text_sample = all_text # Use as-is since it's already filtered
detected_terms = {}
else:
# Apply smart filtering to reduce noise and focus on meaningful content
force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER", "0") == "1"
use_smart_filter = (os.getenv("GLOSSARY_USE_SMART_FILTER", "1") == "1") and not force_disable
if use_smart_filter:
print("📑 Applying smart text filtering to reduce noise...")
text_sample, detected_terms = self._filter_text_for_glossary(all_text, min_frequency)
else:
print("📑 Smart filter disabled - using raw text sample")
# Fallback to simple truncation
max_text_size = int(os.getenv("GLOSSARY_MAX_TEXT_SIZE", "50000"))
text_sample = all_text[:max_text_size] if len(all_text) > max_text_size and max_text_size > 0 else all_text
detected_terms = {}
# Replace placeholders in prompt
prompt = custom_prompt.replace('{language}', language)
prompt = prompt.replace('{min_frequency}', str(min_frequency))
prompt = prompt.replace('{max_names}', str(max_names))
prompt = prompt.replace('{max_titles}', str(max_titles))
# Get the format instructions from environment variable
format_instructions = os.getenv("GLOSSARY_FORMAT_INSTRUCTIONS", "")
# If no format instructions are provided, use a default
if not format_instructions:
format_instructions = """
Return the results in EXACT CSV format with this header:
type,raw_name,translated_name
For example:
character,김상현,Kim Sang-hyu
character,갈편제,Gale Hardest
character,디히릿 아데,Dihirit Ade
Only include entries that actually appear in the text.
Do not use quotes around values unless they contain commas.
Text to analyze:
{text_sample}"""
# Replace placeholders in format instructions
format_instructions = format_instructions.replace('{text_sample}', text_sample)
# Combine the user's prompt with format instructions
enhanced_prompt = f"{prompt}\n\n{format_instructions}"
messages = [
{"role": "system", "content": "You are a glossary extraction assistant. Return ONLY CSV format with exactly 3 columns: type,raw_name,translated_name. The 'type' column should classify entries (e.g., character, term, location, etc.)."},
{"role": "user", "content": enhanced_prompt}
]
# Check stop before API call
if is_stop_requested():
print("📑 ❌ Glossary extraction stopped before API call")
return {}
try:
temperature = float(os.getenv("TEMPERATURE", "0.3"))
max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "4096"))
# Use send_with_interrupt for interruptible API call
chunk_timeout = int(os.getenv("CHUNK_TIMEOUT", "900")) # 15 minute default for glossary
print(f"📑 Sending AI extraction request (timeout: {chunk_timeout}s, interruptible)...")
# Before API call
api_start = time.time()
print(f"📑 Preparing API request (text size: {len(text_sample):,} chars)...")
print(f"📑 ⏳ Processing {len(text_sample):,} characters... Please wait, this may take 5-10 minutes")
response = send_with_interrupt(
messages=messages,
client=client,
temperature=temperature,
max_tokens=max_tokens,
stop_check_fn=is_stop_requested,
chunk_timeout=chunk_timeout
)
api_time = time.time() - api_start
print(f"📑 API call completed in {api_time:.1f}s")
# Get the actual text from the response
if hasattr(response, 'content'):
response_text = response.content
else:
response_text = str(response)
# Before processing response
process_start = time.time()
print(f"📑 Processing AI response...")
# Process response and build CSV
csv_lines = self._process_ai_response(response_text, all_text, min_frequency,
strip_honorifics, fuzzy_threshold,
language, filter_mode)
print(f"📑 AI extracted {len(csv_lines) - 1} valid terms (header excluded)")
process_time = time.time() - process_start
print(f"📑 Response processing took {process_time:.1f}s")
# If we're running per-chunk, defer all heavy work and saving
if os.getenv("GLOSSARY_DEFER_SAVE", "0") == "1":
return csv_lines
# Check stop before merging
if is_stop_requested():
print("📑 ❌ Glossary generation stopped before merging")
return {}
# Merge with existing glossary if present
if existing_glossary:
csv_lines = self._merge_csv_entries(csv_lines, existing_glossary, strip_honorifics, language)
# Fuzzy matching deduplication
skip_frequency_check = os.getenv("GLOSSARY_SKIP_FREQUENCY_CHECK", "0") == "1"
if not skip_frequency_check: # Only dedupe if we're checking frequencies
# Time the deduplication
dedup_start = time.time()
original_count = len(csv_lines) - 1 # Exclude header
csv_lines = self._deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold)
dedup_time = time.time() - dedup_start
final_count = len(csv_lines) - 1 # Exclude header
removed_count = original_count - final_count
print(f"📑 Deduplication completed in {dedup_time:.1f}s")
print(f"📑 - Original entries: {original_count}")
print(f"📑 - Duplicates removed: {removed_count}")
print(f"📑 - Final entries: {final_count}")
# Store for summary statistics
self._dedup_time = getattr(self, '_dedup_time', 0) + dedup_time
else:
print(f"📑 Skipping deduplication (frequency check disabled)")
# Apply filter mode to final results
csv_lines = self._filter_csv_by_mode(csv_lines, filter_mode)
# Check if we should use token-efficient format
use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1'
if not use_legacy_format:
# Convert to token-efficient format
csv_lines = self._convert_to_token_efficient_format(csv_lines)
# Final sanitize to prevent stray headers
csv_lines = self._sanitize_final_glossary_lines(csv_lines, use_legacy_format)
# Create final CSV content
csv_content = '\n'.join(csv_lines)
# Save glossary as CSV with proper extension
glossary_path = os.path.join(output_dir, "glossary.csv")
self._atomic_write_file(glossary_path, csv_content)
print(f"\n📑 ✅ AI-ASSISTED GLOSSARY SAVED!")
print(f"📑 File: {glossary_path}")
c_count, t_count, total = self._count_glossary_entries(csv_lines, use_legacy_format)
print(f"📑 Character entries: {c_count}")
print(f"📑 Term entries: {t_count}")
print(f"📑 Total entries: {total}")
total_time = time.time() - extraction_start
print(f"📑 Total extraction time: {total_time:.1f}s")
return self._parse_csv_to_dict(csv_content)
except UnifiedClientError as e:
if "stopped by user" in str(e).lower():
print(f"📑 ❌ AI extraction interrupted by user")
return {}
else:
print(f"⚠️ AI extraction failed: {e}")
print("📑 Falling back to pattern-based extraction")
return self._extract_with_patterns(all_text, language, min_frequency,
max_names, max_titles, 50,
existing_glossary, output_dir,
strip_honorifics, fuzzy_threshold, filter_mode)
except Exception as e:
print(f"⚠️ AI extraction failed: {e}")
import traceback
traceback.print_exc()
print("📑 Falling back to pattern-based extraction")
return self._extract_with_patterns(all_text, language, min_frequency,
max_names, max_titles, 50,
existing_glossary, output_dir,
strip_honorifics, fuzzy_threshold, filter_mode)
except Exception as e:
print(f"⚠️ Custom prompt processing failed: {e}")
import traceback
traceback.print_exc()
return self._extract_with_patterns(all_text, language, min_frequency,
max_names, max_titles, 50,
existing_glossary, output_dir,
strip_honorifics, fuzzy_threshold, filter_mode)
def _filter_csv_by_mode(self, csv_lines, filter_mode):
"""Filter CSV lines based on the filter mode"""
if filter_mode == "all":
return csv_lines
filtered = [csv_lines[0]] # Keep header
for line in csv_lines[1:]:
if not line.strip():
continue
parts = [p.strip() for p in line.split(',')]
if len(parts) < 3:
continue
entry_type = parts[0].lower()
raw_name = parts[1]
if filter_mode == "only_with_honorifics":
# Only keep character entries with honorifics
if entry_type == "character" and self._has_honorific(raw_name):
filtered.append(line)
elif filter_mode == "only_without_honorifics":
# Keep terms and characters without honorifics
if entry_type == "term" or (entry_type == "character" and not self._has_honorific(raw_name)):
filtered.append(line)
print(f"📑 Filter '{filter_mode}': {len(filtered)-1} entries kept from {len(csv_lines)-1}")
return filtered
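# Illustrative sketch of the filtering above (made-up rows; honorific detection
# ultimately depends on self._has_honorific):
#   csv_lines = ["type,raw_name,translated_name",
#                "character,김상현님,Kim Sang-hyun",   # character with honorific
#                "term,왕국,Kingdom"]                   # plain term
#   filter_mode="only_with_honorifics"    -> keeps only the 김상현님 row
#   filter_mode="only_without_honorifics" -> keeps only the 왕국 row
#   filter_mode="all"                     -> returns csv_lines unchanged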
def _process_ai_response(self, response_text, all_text, min_frequency,
strip_honorifics, fuzzy_threshold, language, filter_mode):
"""Process AI response and return CSV lines"""
# option to completely skip frequency validation for speed
skip_all_validation = os.getenv("GLOSSARY_SKIP_ALL_VALIDATION", "0") == "1"
if skip_all_validation:
print("📑 ⚡ FAST MODE: Skipping all frequency validation (accepting all AI results)")
# Clean response text
response_text = response_text.strip()
# Remove string representation artifacts if they wrap the entire response
if response_text.startswith('("') and response_text.endswith('")'):
response_text = response_text[2:-2]
elif response_text.startswith('"') and response_text.endswith('"'):
response_text = response_text[1:-1]
elif response_text.startswith('(') and response_text.endswith(')'):
response_text = response_text[1:-1]
# Unescape the string
response_text = response_text.replace('\\n', '\n')
response_text = response_text.replace('\\r', '')
response_text = response_text.replace('\\t', '\t')
response_text = response_text.replace('\\"', '"')
response_text = response_text.replace("\\'", "'")
response_text = response_text.replace('\\\\', '\\')
# Clean up markdown code blocks if present
if '```' in response_text:
parts = response_text.split('```')
for part in parts:
if 'csv' in part[:10].lower():
response_text = part[part.find('\n')+1:]
break
elif part.strip() and ('type,raw_name' in part or 'character,' in part or 'term,' in part):
response_text = part
break
# Normalize line endings
response_text = response_text.replace('\r\n', '\n').replace('\r', '\n')
lines = [line.strip() for line in response_text.strip().split('\n') if line.strip()]
csv_lines = []
header_found = False
# Check if we should skip frequency check
skip_frequency_check = os.getenv("GLOSSARY_SKIP_FREQUENCY_CHECK", "0") == "1"
# Add option to completely skip ALL validation for maximum speed
skip_all_validation = os.getenv("GLOSSARY_SKIP_ALL_VALIDATION", "0") == "1"
if skip_all_validation:
print("📑 ⚡ FAST MODE: Skipping all frequency validation (accepting all AI results)")
# Always use the enforced 3-column header
csv_lines.append("type,raw_name,translated_name")
# Process the AI response
for line in lines:
# Skip header lines
if 'type' in line.lower() and 'raw_name' in line.lower():
continue
# Parse CSV line
parts = [p.strip().strip('"\'') for p in line.split(',')]
if len(parts) >= 3:
# Has all 3 columns
entry_type = parts[0]
raw_name = parts[1]
translated_name = parts[2]
if raw_name and translated_name:
csv_lines.append(f"{entry_type},{raw_name},{translated_name}")
elif len(parts) == 2:
# Missing type, default to 'term'
raw_name = parts[0]
translated_name = parts[1]
if raw_name and translated_name:
csv_lines.append(f"term,{raw_name},{translated_name}")
print(f"📑 Fast mode: Accepted {len(csv_lines) - 1} entries without validation")
return csv_lines
# For "only_with_honorifics" mode, ALWAYS skip frequency check
if filter_mode == "only_with_honorifics":
skip_frequency_check = True
print("📑 Filter mode 'only_with_honorifics': Bypassing frequency checks")
print(f"📑 Processing {len(lines)} lines from AI response...")
print(f"📑 Text corpus size: {len(all_text):,} chars")
print(f"📑 Frequency checking: {'DISABLED' if skip_frequency_check else f'ENABLED (min: {min_frequency})'}")
print(f"📑 Fuzzy threshold: {fuzzy_threshold}")
# Collect all terms first for batch processing
all_terms_to_check = []
term_info_map = {} # Map term to its full info
if not skip_frequency_check:
# First pass: collect all terms that need frequency checking
for line in lines:
if 'type' in line.lower() and 'raw_name' in line.lower():
continue # Skip header
parts = [p.strip().strip('"\'') for p in line.split(',')]
if len(parts) >= 3:
entry_type = parts[0].lower()
raw_name = parts[1]
translated_name = parts[2]
elif len(parts) == 2:
entry_type = 'term'
raw_name = parts[0]
translated_name = parts[1]
else:
continue
if raw_name and translated_name:
# Store for batch processing
original_raw = raw_name
if strip_honorifics:
raw_name = self._strip_honorific(raw_name, language)
all_terms_to_check.append(raw_name)
term_info_map[raw_name] = {
'entry_type': entry_type,
'original_raw': original_raw,
'translated_name': translated_name,
'line': line
}
# Batch compute all frequencies at once
if all_terms_to_check:
print(f"📑 Computing frequencies for {len(all_terms_to_check)} terms...")
term_frequencies = self._batch_compute_frequencies(
all_terms_to_check, all_text, fuzzy_threshold, min_frequency
)
else:
term_frequencies = {}
# Now process the results using pre-computed frequencies
entries_processed = 0
entries_accepted = 0
# Process based on mode
if filter_mode == "only_with_honorifics" or skip_frequency_check:
# For these modes, accept all entries
csv_lines.append("type,raw_name,translated_name") # Header
for line in lines:
if 'type' in line.lower() and 'raw_name' in line.lower():
continue # Skip header
parts = [p.strip().strip('"\'') for p in line.split(',')]
if len(parts) >= 3:
entry_type = parts[0].lower()
raw_name = parts[1]
translated_name = parts[2]
elif len(parts) == 2:
entry_type = 'term'
raw_name = parts[0]
translated_name = parts[1]
else:
continue
if raw_name and translated_name:
csv_line = f"{entry_type},{raw_name},{translated_name}"
csv_lines.append(csv_line)
entries_accepted += 1
print(f"📑 Accepted {entries_accepted} entries (frequency check disabled)")
else:
# Use pre-computed frequencies
csv_lines.append("type,raw_name,translated_name") # Header
for term, info in term_info_map.items():
count = term_frequencies.get(term, 0)
# Also check original form if it was stripped
if info['original_raw'] != term:
count += term_frequencies.get(info['original_raw'], 0)
if count >= min_frequency:
csv_line = f"{info['entry_type']},{term},{info['translated_name']}"
csv_lines.append(csv_line)
entries_accepted += 1
# Log first few examples
if entries_accepted <= 5:
print(f"📑 ✓ Example: {term} -> {info['translated_name']} (freq: {count})")
print(f"📑 Frequency filtering complete: {entries_accepted}/{len(term_info_map)} terms accepted")
# Ensure we have at least the header
if len(csv_lines) == 0:
csv_lines.append("type,raw_name,translated_name")
# Print final summary
print(f"📑 Processing complete: {entries_accepted} terms accepted")
return csv_lines
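# Parsing sketch for the method above (hypothetical response lines, not real output):
#   "character, 김상현 , Kim Sang-hyun" -> type="character", raw="김상현", translated="Kim Sang-hyun"
#   "왕국,Kingdom"                       -> only two columns, so type defaults to "term"
# When frequency checking is enabled, an entry survives only if its (optionally
# honorific-stripped) raw name reaches min_frequency occurrences in all_text,
# as computed by _batch_compute_frequencies.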
def _deduplicate_glossary_with_fuzzy(self, csv_lines, fuzzy_threshold):
"""Apply fuzzy matching to remove duplicate entries from the glossary with stop flag checks"""
from difflib import SequenceMatcher
print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...")
# Check stop flag at start
if is_stop_requested():
print(f"📑 ❌ Deduplication stopped by user")
return csv_lines
header_line = csv_lines[0] # Keep header
entry_lines = csv_lines[1:] # Data lines
deduplicated = [header_line]
seen_entries = {} # Use dict for O(1) lookups instead of list
seen_names_lower = set() # Quick exact match check
removed_count = 0
total_entries = len(entry_lines)
# Pre-process all entries for faster comparison
print(f"📑 Processing {total_entries} entries for deduplication...")
for idx, line in enumerate(entry_lines):
# Check stop flag every 100 entries
if idx > 0 and idx % 100 == 0:
if is_stop_requested():
print(f"📑 ❌ Deduplication stopped at entry {idx}/{total_entries}")
return deduplicated
# Show progress for large glossaries
if total_entries > 500 and idx % 200 == 0:
progress = (idx / total_entries) * 100
print(f"📑 Deduplication progress: {progress:.1f}% ({idx}/{total_entries})")
if not line.strip():
continue
parts = [p.strip() for p in line.split(',')]
if len(parts) < 3:
continue
entry_type = parts[0]
raw_name = parts[1]
translated_name = parts[2]
raw_name_lower = raw_name.lower()
# Fast exact duplicate check first
if raw_name_lower in seen_names_lower:
removed_count += 1
continue
# For fuzzy matching, only check if threshold is less than 1.0
is_duplicate = False
if fuzzy_threshold < 1.0:
# Use a more efficient approach: only check similar length strings
name_len = len(raw_name)
min_len = int(name_len * 0.7)
max_len = int(name_len * 1.3)
# Only compare with entries of similar length
candidates = []
for seen_name, (seen_type, seen_trans) in seen_entries.items():
if min_len <= len(seen_name) <= max_len:
candidates.append(seen_name)
# Check fuzzy similarity with candidates
for seen_name in candidates:
# Quick character overlap check before expensive SequenceMatcher
char_overlap = len(set(raw_name_lower) & set(seen_name.lower()))
if char_overlap < len(raw_name_lower) * 0.5:
continue # Too different, skip
raw_similarity = SequenceMatcher(None, raw_name_lower, seen_name.lower()).ratio()
if raw_similarity >= fuzzy_threshold:
if removed_count < 10: # Only log first few
print(f"📑 Removing duplicate: '{raw_name}' ~= '{seen_name}' (similarity: {raw_similarity:.2%})")
removed_count += 1
is_duplicate = True
break
if not is_duplicate:
seen_entries[raw_name] = (entry_type, translated_name)
seen_names_lower.add(raw_name_lower)
deduplicated.append(line)
print(f"📑 ✅ Removed {removed_count} duplicates from glossary")
print(f"📑 Final glossary size: {len(deduplicated) - 1} unique entries")
return deduplicated
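# Rough illustration of the fuzzy duplicate test above (ratio is approximate):
#   SequenceMatcher(None, "kim sang-hyun", "kim sang hyun").ratio() ~= 0.92
# so with fuzzy_threshold=0.90 the second spelling is dropped as a duplicate,
# while with fuzzy_threshold=0.95 both spellings are kept.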
def _merge_csv_entries(self, new_csv_lines, existing_glossary, strip_honorifics, language):
"""Merge CSV entries with existing glossary with stop flag checks"""
# Check stop flag at start
if is_stop_requested():
print(f"📑 ❌ Glossary merge stopped by user")
return new_csv_lines
# Parse existing glossary
existing_lines = []
existing_names = set()
if isinstance(existing_glossary, str):
# Already CSV format
lines = existing_glossary.strip().split('\n')
total_lines = len(lines)
for idx, line in enumerate(lines):
# Check stop flag every 50 lines
if idx > 0 and idx % 50 == 0:
if is_stop_requested():
print(f"📑 ❌ Merge stopped while processing existing glossary at line {idx}/{total_lines}")
return new_csv_lines
if total_lines > 200:
progress = (idx / total_lines) * 100
print(f"📑 Processing existing glossary: {progress:.1f}%")
if 'type,raw_name' in line.lower():
continue # Skip header
line_stripped = line.strip()
# Skip token-efficient lines and section/bullet markers
if not line_stripped or line_stripped.startswith('===') or line_stripped.startswith('*') or line_stripped.lower().startswith('glossary:'):
continue
parts = [p.strip() for p in line.split(',')]
# Require at least 3 fields (type, raw_name, translated_name)
if len(parts) < 3:
continue
entry_type = parts[0].strip().lower()
# Only accept reasonable type tokens (letters/underscores only)
import re as _re
if not _re.match(r'^[a-z_]+$', entry_type):
continue
raw_name = parts[1]
if strip_honorifics:
raw_name = self._strip_honorific(raw_name, language)
parts[1] = raw_name
if raw_name not in existing_names:
existing_lines.append(','.join(parts))
existing_names.add(raw_name)
# Check stop flag before processing new names
if is_stop_requested():
print(f"📑 ❌ Merge stopped before processing new entries")
return new_csv_lines
# Get new names
new_names = set()
final_lines = []
for idx, line in enumerate(new_csv_lines):
# Check stop flag every 50 lines
if idx > 0 and idx % 50 == 0:
if is_stop_requested():
print(f"📑 ❌ Merge stopped while processing new entries at line {idx}")
return final_lines if final_lines else new_csv_lines
if 'type,raw_name' in line.lower():
final_lines.append(line) # Keep header
continue
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 2:
new_names.add(parts[1])
final_lines.append(line)
# Check stop flag before adding existing entries
if is_stop_requested():
print(f"📑 ❌ Merge stopped before combining entries")
return final_lines
# Add non-duplicate existing entries
added_count = 0
for idx, line in enumerate(existing_lines):
# Check stop flag every 50 additions
if idx > 0 and idx % 50 == 0:
if is_stop_requested():
print(f"📑 ❌ Merge stopped while adding existing entries ({added_count} added)")
return final_lines
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 2 and parts[1] not in new_names:
final_lines.append(line)
added_count += 1
print(f"📑 Merged {added_count} entries from existing glossary")
return final_lines
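# Merge sketch (hypothetical data): rows from the freshly extracted new_csv_lines
# always take precedence; a row from existing_glossary is appended only when its
# raw_name is absent from the new extraction, so an old "character,김상현,Kim Sanghyun"
# entry is skipped if the new CSV already covers 김상현.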
def _extract_with_patterns(self, all_text, language, min_frequency,
max_names, max_titles, batch_size,
existing_glossary, output_dir,
strip_honorifics=True, fuzzy_threshold=0.90, filter_mode='all'):
"""Extract glossary using pattern matching with true CSV format output and stop flag checks"""
print("📑 Using pattern-based extraction")
# Check stop flag at start
if is_stop_requested():
print("📑 ❌ Pattern-based extraction stopped by user")
return {}
def is_valid_name(name, language_hint='unknown'):
"""Strict validation for proper names only"""
if not name or len(name.strip()) < 1:
return False
name = name.strip()
if name.lower() in self.pattern_manager.COMMON_WORDS or name in self.pattern_manager.COMMON_WORDS:
return False
if language_hint == 'korean':
if not (2 <= len(name) <= 4):
return False
if not all(0xAC00 <= ord(char) <= 0xD7AF for char in name):
return False
if len(set(name)) == 1:
return False
elif language_hint == 'japanese':
if not (2 <= len(name) <= 6):
return False
has_kanji = any(0x4E00 <= ord(char) <= 0x9FFF for char in name)
has_kana = any((0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF) for char in name)
if not (has_kanji or has_kana):
return False
elif language_hint == 'chinese':
if not (2 <= len(name) <= 4):
return False
if not all(0x4E00 <= ord(char) <= 0x9FFF for char in name):
return False
elif language_hint == 'english':
if not name[0].isupper():
return False
if sum(1 for c in name if c.isalpha()) < len(name) * 0.8:
return False
if not (2 <= len(name) <= 20):
return False
return True
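# Illustrative checks (assuming none of these appear in pattern_manager.COMMON_WORDS):
#   is_valid_name("김상현", "korean")  -> True  (three Hangul syllables)
#   is_valid_name("가가가", "korean")  -> False (single repeated character)
#   is_valid_name("그", "korean")      -> False (shorter than two characters)
#   is_valid_name("alice", "english")  -> False (not capitalized)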
def detect_language_hint(text_sample):
"""Quick language detection for validation purposes"""
sample = text_sample[:1000]
korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF)
japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF))
chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF)
latin_chars = sum(1 for char in sample if 0x0041 <= ord(char) <= 0x007A)
if korean_chars > 50:
return 'korean'
elif japanese_kana > 20:
return 'japanese'
elif chinese_chars > 50 and japanese_kana < 10:
return 'chinese'
elif latin_chars > 100:
return 'english'
else:
return 'unknown'
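# Example: a 1000-character sample with several hundred Hangul syllables returns 'korean';
# one dominated by kana (more than 20 kana characters, little Hangul) returns 'japanese';
# mostly Latin text (more than 100 letters) returns 'english'; anything else is 'unknown'.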
language_hint = detect_language_hint(all_text)
print(f"📑 Detected primary language: {language_hint}")
# Check stop flag after language detection
if is_stop_requested():
print("📑 ❌ Extraction stopped after language detection")
return {}
honorifics_to_use = []
if language_hint in self.pattern_manager.CJK_HONORIFICS:
honorifics_to_use.extend(self.pattern_manager.CJK_HONORIFICS[language_hint])
honorifics_to_use.extend(self.pattern_manager.CJK_HONORIFICS.get('english', []))
print(f"📑 Using {len(honorifics_to_use)} honorifics for {language_hint}")
names_with_honorifics = {}
standalone_names = {}
# Check if parallel processing is enabled
extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))
# PARALLEL HONORIFIC PROCESSING
if extraction_workers > 1 and len(honorifics_to_use) > 3:
print(f"📑 Scanning for names with honorifics (parallel with {extraction_workers} workers)...")
# Create a wrapper function that can be called in parallel
def process_honorific(args):
"""Process a single honorific in a worker thread"""
honorific, idx, total = args
# Check stop flag
if is_stop_requested():
return None, None
print(f"📑 Worker processing honorific {idx}/{total}: '{honorific}'")
# Local dictionaries for this worker
local_names_with = {}
local_standalone = {}
# Call the extraction method
self._extract_names_for_honorific(
honorific, all_text, language_hint,
min_frequency, local_names_with,
local_standalone, is_valid_name, fuzzy_threshold
)
return local_names_with, local_standalone
# Prepare arguments for parallel processing
honorific_args = [
(honorific, idx + 1, len(honorifics_to_use))
for idx, honorific in enumerate(honorifics_to_use)
]
# Process honorifics in parallel
with ThreadPoolExecutor(max_workers=min(extraction_workers, len(honorifics_to_use))) as executor:
futures = []
for args in honorific_args:
if is_stop_requested():
executor.shutdown(wait=False)
return {}
future = executor.submit(process_honorific, args)
futures.append(future)
# Collect results as they complete
completed = 0
for future in as_completed(futures):
if is_stop_requested():
executor.shutdown(wait=False)
return {}
try:
result = future.result()
if result and result[0] is not None:
local_names_with, local_standalone = result
# Merge results (thread-safe since we're in main thread)
for name, count in local_names_with.items():
if name not in names_with_honorifics:
names_with_honorifics[name] = count
else:
names_with_honorifics[name] = max(names_with_honorifics[name], count)
for name, count in local_standalone.items():
if name not in standalone_names:
standalone_names[name] = count
else:
standalone_names[name] = max(standalone_names[name], count)
completed += 1
if completed % 5 == 0 or completed == len(honorifics_to_use):
print(f"📑 Honorific processing: {completed}/{len(honorifics_to_use)} completed")
except Exception as e:
print(f"⚠️ Failed to process honorific: {e}")
completed += 1
print(f"📑 Parallel honorific processing completed: found {len(names_with_honorifics)} names")
else:
# SEQUENTIAL PROCESSING (fallback)
print("📑 Scanning for names with honorifics...")
# Extract names with honorifics
total_honorifics = len(honorifics_to_use)
for idx, honorific in enumerate(honorifics_to_use):
# Check stop flag before each honorific
if is_stop_requested():
print(f"📑 ❌ Extraction stopped at honorific {idx}/{total_honorifics}")
return {}
print(f"📑 Processing honorific {idx + 1}/{total_honorifics}: '{honorific}'")
self._extract_names_for_honorific(honorific, all_text, language_hint,
min_frequency, names_with_honorifics,
standalone_names, is_valid_name, fuzzy_threshold)
# Check stop flag before processing terms
if is_stop_requested():
print("📑 ❌ Extraction stopped before processing terms")
return {}
# Apply filter mode
filtered_names = {}
if filter_mode == 'only_with_honorifics':
# Only keep names that have honorifics (no standalone names)
filtered_names = names_with_honorifics.copy()
print(f"📑 Filter: Keeping only names with honorifics ({len(filtered_names)} names)")
elif filter_mode == 'only_without_honorifics':
# Keep standalone names that were NOT found with honorifics
for name, count in standalone_names.items():
# Check if this name also appears with honorifics
appears_with_honorific = False
for honorific_name in names_with_honorifics.keys():
if self._strip_honorific(honorific_name, language_hint) == name:
appears_with_honorific = True
break
# Only add if it doesn't appear with honorifics
if not appears_with_honorific:
filtered_names[name] = count
print(f"📑 Filter: Keeping only names without honorifics ({len(filtered_names)} names)")
else: # 'all' mode
# Keep all names (both with and without honorifics)
filtered_names = names_with_honorifics.copy()
# Also add standalone names
for name, count in standalone_names.items():
if name not in filtered_names and not any(
self._strip_honorific(n, language_hint) == name for n in filtered_names.keys()
):
filtered_names[name] = count
print(f"📑 Filter: Keeping all names ({len(filtered_names)} names)")
# Process extracted terms
final_terms = {}
term_count = 0
total_terms = len(filtered_names)
for term, count in filtered_names.items():
term_count += 1
# Check stop flag every 20 terms
if term_count % 20 == 0:
if is_stop_requested():
print(f"📑 ❌ Term processing stopped at {term_count}/{total_terms}")
return {}
if strip_honorifics:
clean_term = self._strip_honorific(term, language_hint)
if clean_term in final_terms:
final_terms[clean_term] = final_terms[clean_term] + count
else:
final_terms[clean_term] = count
else:
final_terms[term] = count
# Check stop flag before finding titles
if is_stop_requested():
print("📑 ❌ Extraction stopped before finding titles")
return {}
# Find titles (but respect filter mode)
print("📑 Scanning for titles...")
found_titles = {}
# Extract titles for all modes EXCEPT "only_with_honorifics"
# (titles are included in "only_without_honorifics" since titles typically don't have honorifics)
if filter_mode != 'only_with_honorifics':
title_patterns_to_use = []
if language_hint in self.pattern_manager.TITLE_PATTERNS:
title_patterns_to_use.extend(self.pattern_manager.TITLE_PATTERNS[language_hint])
title_patterns_to_use.extend(self.pattern_manager.TITLE_PATTERNS.get('english', []))
total_patterns = len(title_patterns_to_use)
for pattern_idx, pattern in enumerate(title_patterns_to_use):
# Check stop flag before each pattern
if is_stop_requested():
print(f"📑 ❌ Title extraction stopped at pattern {pattern_idx}/{total_patterns}")
return {}
print(f"📑 Processing title pattern {pattern_idx + 1}/{total_patterns}")
matches = list(re.finditer(pattern, all_text, re.IGNORECASE if 'english' in pattern else 0))
for match_idx, match in enumerate(matches):
# Check stop flag every 50 matches
if match_idx > 0 and match_idx % 50 == 0:
if is_stop_requested():
print(f"📑 ❌ Title extraction stopped at match {match_idx}")
return {}
title = match.group(0)
# Skip if this title is already in names
if title in filtered_names or title in names_with_honorifics:
continue
count = self._find_fuzzy_matches(title, all_text, fuzzy_threshold)
# Check if stopped during fuzzy matching
if is_stop_requested():
print(f"📑 ❌ Title extraction stopped during fuzzy matching")
return {}
if count >= min_frequency:
if re.match(r'[A-Za-z]', title):
title = title.title()
if strip_honorifics:
title = self._strip_honorific(title, language_hint)
if title not in found_titles:
found_titles[title] = count
if filter_mode == 'only_without_honorifics':
print(f"📑 Found {len(found_titles)} titles (included in 'without honorifics' mode)")
else:
print(f"📑 Found {len(found_titles)} unique titles")
else:
print(f"📑 Skipping title extraction (filter mode: only_with_honorifics)")
# Check stop flag before sorting and translation
if is_stop_requested():
print("📑 ❌ Extraction stopped before sorting terms")
return {}
# Combine and sort
sorted_names = sorted(final_terms.items(), key=lambda x: x[1], reverse=True)[:max_names]
sorted_titles = sorted(found_titles.items(), key=lambda x: x[1], reverse=True)[:max_titles]
all_terms = []
for name, count in sorted_names:
all_terms.append(name)
for title, count in sorted_titles:
all_terms.append(title)
print(f"📑 Total terms to translate: {len(all_terms)}")
# Check stop flag before translation
if is_stop_requested():
print("📑 ❌ Extraction stopped before translation")
return {}
# Translate terms
if os.getenv("DISABLE_GLOSSARY_TRANSLATION", "0") == "1":
print("📑 Translation disabled - keeping original terms")
translations = {term: term for term in all_terms}
else:
print(f"📑 Translating {len(all_terms)} terms...")
translations = self._translate_terms_batch(all_terms, language_hint, batch_size, output_dir)
# Check if translation was stopped
if is_stop_requested():
print("📑 ❌ Extraction stopped after translation")
return translations # Return partial results
# Build CSV lines
csv_lines = ["type,raw_name,translated_name"]
for name, _ in sorted_names:
if name in translations:
csv_lines.append(f"character,{name},{translations[name]}")
for title, _ in sorted_titles:
if title in translations:
csv_lines.append(f"term,{title},{translations[title]}")
# Check stop flag before merging
if is_stop_requested():
print("📑 ❌ Extraction stopped before merging with existing glossary")
# Still save what we have
csv_content = '\n'.join(csv_lines)
glossary_path = os.path.join(output_dir, "glossary.csv")
self._atomic_write_file(glossary_path, csv_content)
return self._parse_csv_to_dict(csv_content)
# Merge with existing glossary
if existing_glossary:
csv_lines = self._merge_csv_entries(csv_lines, existing_glossary, strip_honorifics, language_hint)
# Check stop flag before deduplication
if is_stop_requested():
print("📑 ❌ Extraction stopped before deduplication")
csv_content = '\n'.join(csv_lines)
glossary_path = os.path.join(output_dir, "glossary.csv")
self._atomic_write_file(glossary_path, csv_content)
return self._parse_csv_to_dict(csv_content)
# Fuzzy matching deduplication
csv_lines = self._deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold)
# Create CSV content
csv_content = '\n'.join(csv_lines)
# Save glossary as CSV
glossary_path = os.path.join(output_dir, "glossary.csv")
self._atomic_write_file(glossary_path, csv_content)
print(f"\n📑 ✅ TARGETED GLOSSARY SAVED!")
print(f"📑 File: {glossary_path}")
print(f"📑 Total entries: {len(csv_lines) - 1}") # Exclude header
return self._parse_csv_to_dict(csv_content)
def _translate_terms_batch(self, term_list, profile_name, batch_size=50, output_dir=None):
"""Use fully configurable prompts for translation with interrupt support"""
if not term_list or os.getenv("DISABLE_GLOSSARY_TRANSLATION", "0") == "1":
print(f"📑 Glossary translation disabled or no terms to translate")
return {term: term for term in term_list}
# Check stop flag
if is_stop_requested():
print("📑 ❌ Glossary translation stopped by user")
return {term: term for term in term_list}
try:
MODEL = os.getenv("MODEL", "gemini-1.5-flash")
API_KEY = (os.getenv("API_KEY") or
os.getenv("OPENAI_API_KEY") or
os.getenv("OPENAI_OR_Gemini_API_KEY") or
os.getenv("GEMINI_API_KEY"))
if is_traditional_translation_api(MODEL):
return {term: term for term in term_list}
if not API_KEY:
print(f"📑 No API key found, skipping translation")
return {term: term for term in term_list}
print(f"📑 Translating {len(term_list)} {profile_name} terms to English using batch size {batch_size}...")
from unified_api_client import UnifiedClient, UnifiedClientError
client = UnifiedClient(model=MODEL, api_key=API_KEY, output_dir=output_dir)
if hasattr(client, 'reset_cleanup_state'):
client.reset_cleanup_state()
# Get custom translation prompt from environment
translation_prompt_template = os.getenv("GLOSSARY_TRANSLATION_PROMPT", "")
if not translation_prompt_template:
translation_prompt_template = """You are translating {language} character names and important terms to English.
For character names, provide English transliterations or keep as romanized.
Keep honorifics/suffixes only if they are integral to the name.
Respond with the same numbered format.
Terms to translate:
{terms_list}
Provide translations in the same numbered format."""
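# Substitution example for the template above (hypothetical two-term batch):
#   {language}   -> "korean"
#   {terms_list} -> "1. 김상현\n2. 왕국"
# The model is expected to reply in the same numbered form, e.g.
#   "1. 김상현 -> Kim Sang-hyun\n2. 왕국 -> Kingdom",
# which _parse_translation_response maps back onto the original terms.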
all_translations = {}
chunk_timeout = int(os.getenv("CHUNK_TIMEOUT", "300")) # 5 minute default
for i in range(0, len(term_list), batch_size):
# Check stop flag before each batch
if is_stop_requested():
print(f"📑 ❌ Translation stopped at batch {(i // batch_size) + 1}")
# Return partial translations
for term in term_list:
if term not in all_translations:
all_translations[term] = term
return all_translations
batch = term_list[i:i + batch_size]
batch_num = (i // batch_size) + 1
total_batches = (len(term_list) + batch_size - 1) // batch_size
print(f"📑 Processing batch {batch_num}/{total_batches} ({len(batch)} terms)...")
# Format terms list
terms_text = ""
for idx, term in enumerate(batch, 1):
terms_text += f"{idx}. {term}\n"
# Replace placeholders in prompt
prompt = translation_prompt_template.replace('{language}', profile_name)
prompt = prompt.replace('{terms_list}', terms_text.strip())
prompt = prompt.replace('{batch_size}', str(len(batch)))
messages = [
{"role": "user", "content": prompt}
]
try:
temperature = float(os.getenv("TEMPERATURE", "0.3"))
max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "4096"))
# Use send_with_interrupt for interruptible API call
print(f"📑 Sending translation request for batch {batch_num} (interruptible)...")
response = send_with_interrupt(
messages=messages,
client=client,
temperature=temperature,
max_tokens=max_tokens,
stop_check_fn=is_stop_requested,
chunk_timeout=chunk_timeout
)
# Handle response properly
if hasattr(response, 'content'):
response_text = response.content
else:
response_text = str(response)
batch_translations = self._parse_translation_response(response_text, batch)
all_translations.update(batch_translations)
print(f"📑 Batch {batch_num} completed: {len(batch_translations)} translations")
# Small delay between batches to avoid rate limiting (configurable)
if i + batch_size < len(term_list):
# Check stop before sleep
if is_stop_requested():
print(f"📑 ❌ Translation stopped after batch {batch_num}")
# Fill in missing translations
for term in term_list:
if term not in all_translations:
all_translations[term] = term
return all_translations
# Use configurable batch delay (default 0.001s, i.e. effectively no pause unless overridden)
batch_delay = float(os.getenv("GLOSSARY_BATCH_DELAY", "0.001"))
if batch_delay > 0:
time.sleep(batch_delay)
except UnifiedClientError as e:
if "stopped by user" in str(e).lower():
print(f"📑 ❌ Translation interrupted by user at batch {batch_num}")
# Fill in remaining terms with originals
for term in term_list:
if term not in all_translations:
all_translations[term] = term
return all_translations
else:
print(f"⚠️ Translation failed for batch {batch_num}: {e}")
for term in batch:
all_translations[term] = term
except Exception as e:
print(f"⚠️ Translation failed for batch {batch_num}: {e}")
for term in batch:
all_translations[term] = term
# Ensure all terms have translations
for term in term_list:
if term not in all_translations:
all_translations[term] = term
translated_count = sum(1 for term, translation in all_translations.items()
if translation != term and translation.strip())
print(f"📑 Successfully translated {translated_count}/{len(term_list)} terms")
return all_translations
except Exception as e:
print(f"⚠️ Glossary translation failed: {e}")
return {term: term for term in term_list}
def _extract_names_for_honorific(self, honorific, all_text, language_hint,
min_frequency, names_with_honorifics,
standalone_names, is_valid_name, fuzzy_threshold=0.90):
"""Extract names for a specific honorific with fuzzy matching and stop flag checks"""
# Check stop flag at start
if is_stop_requested():
print(f"📑 ❌ Name extraction for '{honorific}' stopped by user")
return
if language_hint == 'korean' and not honorific.startswith('-'):
pattern = r'([\uac00-\ud7af]{2,4})(?=' + re.escape(honorific) + r'(?:\s|[,.\!?]|$))'
matches = list(re.finditer(pattern, all_text))
total_matches = len(matches)
for idx, match in enumerate(matches):
# Check stop flag every 50 matches
if idx > 0 and idx % 50 == 0:
if is_stop_requested():
print(f"📑 ❌ Korean name extraction stopped at {idx}/{total_matches}")
return
# Show progress for large sets
if total_matches > 500:
progress = (idx / total_matches) * 100
print(f"📑 Processing Korean names: {progress:.1f}% ({idx}/{total_matches})")
potential_name = match.group(1)
if is_valid_name(potential_name, 'korean'):
full_form = potential_name + honorific
# Use fuzzy matching for counting with stop check
count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold)
# Check if stopped during fuzzy matching
if is_stop_requested():
print(f"📑 ❌ Name extraction stopped during fuzzy matching")
return
if count >= min_frequency:
context_patterns = [
full_form + r'[은는이가]',
full_form + r'[을를]',
full_form + r'[에게한테]',
r'["]' + full_form,
full_form + r'[,]',
]
context_count = 0
for ctx_pattern in context_patterns:
context_count += len(re.findall(ctx_pattern, all_text))
if context_count > 0:
names_with_honorifics[full_form] = count
standalone_names[potential_name] = count
elif language_hint == 'japanese' and not honorific.startswith('-'):
pattern = r'([\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]{2,5})(?=' + re.escape(honorific) + r'(?:\s|[、。!?]|$))'
matches = list(re.finditer(pattern, all_text))
total_matches = len(matches)
for idx, match in enumerate(matches):
# Check stop flag every 50 matches
if idx > 0 and idx % 50 == 0:
if is_stop_requested():
print(f"📑 ❌ Japanese name extraction stopped at {idx}/{total_matches}")
return
if total_matches > 500:
progress = (idx / total_matches) * 100
print(f"📑 Processing Japanese names: {progress:.1f}% ({idx}/{total_matches})")
potential_name = match.group(1)
if is_valid_name(potential_name, 'japanese'):
full_form = potential_name + honorific
count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold)
if is_stop_requested():
print(f"📑 ❌ Name extraction stopped during fuzzy matching")
return
if count >= min_frequency:
names_with_honorifics[full_form] = count
standalone_names[potential_name] = count
elif language_hint == 'chinese' and not honorific.startswith('-'):
pattern = r'([\u4e00-\u9fff]{2,4})(?=' + re.escape(honorific) + r'(?:\s|[,。!?]|$))'
matches = list(re.finditer(pattern, all_text))
total_matches = len(matches)
for idx, match in enumerate(matches):
# Check stop flag every 50 matches
if idx > 0 and idx % 50 == 0:
if is_stop_requested():
print(f"📑 ❌ Chinese name extraction stopped at {idx}/{total_matches}")
return
if total_matches > 500:
progress = (idx / total_matches) * 100
print(f"📑 Processing Chinese names: {progress:.1f}% ({idx}/{total_matches})")
potential_name = match.group(1)
if is_valid_name(potential_name, 'chinese'):
full_form = potential_name + honorific
count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold)
if is_stop_requested():
print(f"📑 ❌ Name extraction stopped during fuzzy matching")
return
if count >= min_frequency:
names_with_honorifics[full_form] = count
standalone_names[potential_name] = count
elif honorific.startswith('-') or honorific.startswith(' '):
is_space_separated = honorific.startswith(' ')
if is_space_separated:
pattern_english = r'\b([A-Z][a-zA-Z]+)' + re.escape(honorific) + r'(?=\s|[,.\!?]|$)'
else:
pattern_english = r'\b([A-Z][a-zA-Z]+)' + re.escape(honorific) + r'\b'
matches = list(re.finditer(pattern_english, all_text))
total_matches = len(matches)
for idx, match in enumerate(matches):
# Check stop flag every 50 matches
if idx > 0 and idx % 50 == 0:
if is_stop_requested():
print(f"📑 ❌ English name extraction stopped at {idx}/{total_matches}")
return
if total_matches > 500:
progress = (idx / total_matches) * 100
print(f"📑 Processing English names: {progress:.1f}% ({idx}/{total_matches})")
potential_name = match.group(1)
if is_valid_name(potential_name, 'english'):
full_form = potential_name + honorific
count = self._find_fuzzy_matches(full_form, all_text, fuzzy_threshold)
if is_stop_requested():
print(f"📑 ❌ Name extraction stopped during fuzzy matching")
return
if count >= min_frequency:
names_with_honorifics[full_form] = count
standalone_names[potential_name] = count
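# Example of the Korean branch above (honorific '님', illustrative sentence):
#   in "김상현님, 어서 오세요." the lookahead pattern captures "김상현",
#   full_form becomes "김상현님", and the pair is recorded only if its fuzzy-match
#   count reaches min_frequency and at least one particle/quote context pattern hits.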
def _parse_translation_response(self, response, original_terms):
"""Parse translation response - handles numbered format"""
translations = {}
# Handle UnifiedResponse object
if hasattr(response, 'content'):
response_text = response.content
else:
response_text = str(response)
lines = response_text.strip().split('\n')
for line in lines:
line = line.strip()
if not line or not line[0].isdigit():
continue
try:
number_match = re.match(r'^(\d+)\.?\s*(.+)', line)
if number_match:
num = int(number_match.group(1)) - 1
content = number_match.group(2).strip()
if 0 <= num < len(original_terms):
original_term = original_terms[num]
for separator in ['->', '→', ':', '-', '—', '=']:
if separator in content:
parts = content.split(separator, 1)
if len(parts) == 2:
translation = parts[1].strip()
translation = translation.strip('"\'()[]')
if translation and translation != original_term:
translations[original_term] = translation
break
else:
if content != original_term:
translations[original_term] = content
except (ValueError, IndexError):
continue
return translations
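# Parsing example (hypothetical line): with original_terms = ["김상현"], the response
#   "1. 김상현 -> Kim Sang-hyun"
# matches the numbered pattern, splits on '->' and yields
#   translations["김상현"] == "Kim Sang-hyun".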
# =====================================================
# UNIFIED UTILITIES
# =====================================================
def sanitize_resource_filename(filename):
"""Sanitize resource filenames for filesystem compatibility"""
filename = unicodedata.normalize('NFC', filename)
replacements = {
'/': '_', '\\': '_', ':': '_', '*': '_',
'?': '_', '"': '_', '<': '_', '>': '_',
'|': '_', '\0': '', '\n': '_', '\r': '_'
}
for old, new in replacements.items():
filename = filename.replace(old, new)
filename = ''.join(char for char in filename if ord(char) >= 32)
name, ext = os.path.splitext(filename)
if not name:
name = 'resource'
return name + ext
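# Example (illustrative): sanitize_resource_filename('cover: page?.jpg')
# returns 'cover_ page_.jpg' - ':' and '?' are replaced while spaces are kept.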
def should_retain_source_extension():
"""Read GUI toggle for retaining original extension and no 'response_' prefix.
This is stored in config or env by the GUI; we read env as bridge.
"""
return os.getenv('RETAIN_SOURCE_EXTENSION', os.getenv('retain_source_extension', '0')) in ('1', 'true', 'True')
def make_safe_filename(title, actual_num):
"""Create a safe filename that works across different filesystems"""
if not title:
return f"chapter_{actual_num:03d}"
title = unicodedata.normalize('NFC', str(title))
dangerous_chars = {
'/': '_', '\\': '_', ':': '_', '*': '_', '?': '_',
'"': '_', '<': '_', '>': '_', '|': '_', '\0': '',
'\n': ' ', '\r': ' ', '\t': ' '
}
for old, new in dangerous_chars.items():
title = title.replace(old, new)
title = ''.join(char for char in title if ord(char) >= 32)
title = re.sub(r'\s+', '_', title)
title = title.strip('_.• \t')
if not title or title == '_' * len(title):
title = f"chapter_{actual_num:03d}"
return title
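# Examples (illustrative): make_safe_filename("", 3) falls back to "chapter_003";
# make_safe_filename("Prologue: Awakening", 1) becomes "Prologue__Awakening"
# (':' -> '_', then the space collapses into another '_').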
def get_content_hash(html_content):
"""Create a stable hash of content"""
return ContentProcessor.get_content_hash(html_content)
def clean_ai_artifacts(text, remove_artifacts=True):
"""Remove AI response artifacts from text"""
return ContentProcessor.clean_ai_artifacts(text, remove_artifacts)
def find_glossary_file(output_dir):
"""Return path to glossary file preferring CSV over JSON, or None if not found"""
candidates = [
os.path.join(output_dir, "glossary.csv"),
os.path.join(output_dir, "glossary.json"),
]
for p in candidates:
if os.path.exists(p):
return p
return None
def clean_memory_artifacts(text):
"""Remove any memory/summary artifacts"""
return ContentProcessor.clean_memory_artifacts(text)
def emergency_restore_paragraphs(text, original_html=None, verbose=True):
"""Emergency restoration when AI returns wall of text"""
return ContentProcessor.emergency_restore_paragraphs(text, original_html, verbose)
def is_meaningful_text_content(html_content):
"""Check if chapter has meaningful text beyond just structure"""
return ContentProcessor.is_meaningful_text_content(html_content)
# =====================================================
# GLOBAL SETTINGS AND FLAGS
# =====================================================
logging.basicConfig(level=logging.DEBUG)
try:
if hasattr(sys.stdout, 'reconfigure'):
sys.stdout.reconfigure(encoding='utf-8', errors='ignore')
except AttributeError:
if sys.stdout is None:
devnull = open(os.devnull, "wb")
sys.stdout = io.TextIOWrapper(devnull, encoding='utf-8', errors='ignore')
elif hasattr(sys.stdout, 'buffer'):
try:
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore')
except:
pass
_stop_requested = False
def set_stop_flag(value):
"""Set the global stop flag"""
global _stop_requested
_stop_requested = value
def is_stop_requested():
"""Check if stop was requested"""
global _stop_requested
return _stop_requested
def set_output_redirect(log_callback=None):
"""Redirect print statements to a callback function for GUI integration"""
if log_callback:
class CallbackWriter:
def __init__(self, callback):
self.callback = callback
def write(self, text):
if text.strip():
self.callback(text.strip())
def flush(self):
pass
sys.stdout = CallbackWriter(log_callback)
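# Usage sketch (hypothetical GUI callback; append_log is not defined in this module):
#   set_output_redirect(lambda text: append_log(text))
# After this call, every non-empty print() line is forwarded to the callback
# instead of the original stdout.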
# =====================================================
# EPUB AND FILE PROCESSING
# =====================================================
def extract_chapter_number_from_filename(filename, opf_spine_position=None, opf_spine_data=None):
"""Extract chapter number from filename, prioritizing OPF spine order"""
# Priority 1: Use OPF spine position if available
if opf_spine_position is not None:
# Handle special non-chapter files (always chapter 0)
filename_lower = filename.lower()
name_without_ext = os.path.splitext(filename)[0].lower()
# Check for special keywords OR no numbers present
special_keywords = ['title', 'toc', 'cover', 'index', 'copyright', 'preface', 'nav']
has_special_keyword = any(name in filename_lower for name in special_keywords)
has_no_numbers = not re.search(r'\d', name_without_ext)
if has_special_keyword or has_no_numbers:
return 0, 'opf_special_file'
# Use spine position for regular chapters (0, 1, 2, 3...)
return opf_spine_position, 'opf_spine_order'
# Priority 2: Check if this looks like a special file (even without OPF)
name_without_ext = os.path.splitext(filename)[0].lower()
special_keywords = ['title', 'toc', 'cover', 'index', 'copyright', 'preface']
has_special_keyword = any(name in name_without_ext for name in special_keywords)
has_no_numbers = not re.search(r'\d', name_without_ext)
if has_special_keyword or has_no_numbers:
return 0, 'special_file'
# Priority 3: Try to extract sequential numbers (000, 001, 002...)
name_without_ext = os.path.splitext(filename)[0]
# Look for simple sequential patterns first
# Priority 3: Try to extract sequential numbers and decimals
sequential_patterns = [
(r'^(\d+)\.(\d+)$', 'decimal_number'), # 1.5, 2.3 (NEW!)
(r'^(\d{3,4})$', 'sequential_number'), # 000, 001, 0001
(r'^(\d+)$', 'direct_number'), # 0, 1, 2
]
for pattern, method in sequential_patterns:
match = re.search(pattern, name_without_ext)
if match:
if method == 'decimal_number':
# Return as float for decimal chapters
return float(f"{match.group(1)}.{match.group(2)}"), method
else:
return int(match.group(1)), method
# Priority 4: Fall back to existing filename parsing patterns
fallback_patterns = [
(r'^response_(\d+)[_\.]', 'response_prefix'),
(r'[Cc]hapter[_\s]*(\d+)', 'chapter_word'),
(r'[Cc]h[_\s]*(\d+)', 'ch_abbreviation'),
(r'No(\d+)', 'no_prefix'),
(r'第(\d+)[章话回]', 'chinese_chapter'),
(r'-h-(\d+)', 'h_suffix'), # For your -h-16 pattern
(r'_(\d+)', 'underscore_suffix'),
(r'-(\d+)', 'dash_suffix'),
(r'(\d+)', 'trailing_number'),
]
for pattern, method in fallback_patterns:
match = re.search(pattern, name_without_ext, re.IGNORECASE)
if match:
return int(match.group(1)), method
return None, None
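# Illustrative calls showing the priority order above:
#   extract_chapter_number_from_filename("cover.xhtml", opf_spine_position=0)
#       -> (0, 'opf_special_file')     # special keyword, always treated as chapter 0
#   extract_chapter_number_from_filename("0012.xhtml")
#       -> (12, 'sequential_number')   # zero-padded sequential name
#   extract_chapter_number_from_filename("chapter_7.html")
#       -> (7, 'chapter_word')         # fallback filename pattern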
def process_chapter_images(chapter_html: str, actual_num: int, image_translator: ImageTranslator,
check_stop_fn=None) -> Tuple[str, Dict[str, str]]:
"""Process and translate images in a chapter"""
from bs4 import BeautifulSoup
images = image_translator.extract_images_from_chapter(chapter_html)
if not images:
return chapter_html, {}
print(f"🖼️ Found {len(images)} images in chapter {actual_num}")
soup = BeautifulSoup(chapter_html, 'html.parser')
image_translations = {}
translated_count = 0
max_images_per_chapter = int(os.getenv('MAX_IMAGES_PER_CHAPTER', '10'))
if len(images) > max_images_per_chapter:
print(f" ⚠️ Chapter has {len(images)} images - processing first {max_images_per_chapter} only")
images = images[:max_images_per_chapter]
for idx, img_info in enumerate(images, 1):
if check_stop_fn and check_stop_fn():
print("❌ Image translation stopped by user")
break
img_src = img_info['src']
if img_src.startswith('../'):
img_path = os.path.join(image_translator.output_dir, img_src[3:])
elif img_src.startswith('./'):
img_path = os.path.join(image_translator.output_dir, img_src[2:])
elif img_src.startswith('/'):
img_path = os.path.join(image_translator.output_dir, img_src[1:])
else:
possible_paths = [
os.path.join(image_translator.images_dir, os.path.basename(img_src)),
os.path.join(image_translator.output_dir, img_src),
os.path.join(image_translator.output_dir, 'images', os.path.basename(img_src)),
os.path.join(image_translator.output_dir, os.path.basename(img_src)),
os.path.join(image_translator.output_dir, os.path.dirname(img_src), os.path.basename(img_src))
]
img_path = None
for path in possible_paths:
if os.path.exists(path):
img_path = path
print(f" ✅ Found image at: {path}")
break
if not img_path:
print(f" ❌ Image not found in any location for: {img_src}")
print(f" Tried: {possible_paths}")
continue
img_path = os.path.normpath(img_path)
if not os.path.exists(img_path):
print(f" ⚠️ Image not found: {img_path}")
print(f" 📁 Images directory: {image_translator.images_dir}")
print(f" 📁 Output directory: {image_translator.output_dir}")
print(f" 📁 Working directory: {os.getcwd()}")
if os.path.exists(image_translator.images_dir):
files = os.listdir(image_translator.images_dir)
print(f" 📁 Files in images dir: {files[:5]}...")
continue
print(f" 🔍 Processing image {idx}/{len(images)}: {os.path.basename(img_path)}")
context = ""
if img_info.get('alt'):
context += f", Alt text: {img_info['alt']}"
if translated_count > 0:
delay = float(os.getenv('IMAGE_API_DELAY', '1.0'))
time.sleep(delay)
translation_result = image_translator.translate_image(img_path, context, check_stop_fn)
print(f"\n🔍 DEBUG: Image {idx}/{len(images)}")
print(f" Translation result: {'Success' if translation_result and '[Image Translation Error:' not in translation_result else 'Failed'}")
if translation_result and "[Image Translation Error:" in translation_result:
print(f" Error message: {translation_result}")
if translation_result:
img_tag = None
for img in soup.find_all('img'):
if img.get('src') == img_src:
img_tag = img
break
if img_tag:
hide_label = os.getenv("HIDE_IMAGE_TRANSLATION_LABEL", "0") == "1"
print(f" 🔍 DEBUG: Integration Phase")
print(f" 🏷️ Hide label mode: {hide_label}")
print(f" 📍 Found img tag: {img_tag.get('src')}")
# Store the translation result in the dictionary FIRST
image_translations[img_path] = translation_result
# Parse the translation result to integrate into the chapter HTML
if '<div' in translation_result:
trans_soup = BeautifulSoup(translation_result, 'html.parser')
# Try to get the full container first
full_container = trans_soup.find('div', class_=['translated-text-only', 'image-with-translation'])
if full_container:
# Clone the container to avoid issues
new_container = BeautifulSoup(str(full_container), 'html.parser').find('div')
img_tag.replace_with(new_container)
print(f" ✅ Replaced image with full translation container")
else:
# Fallback: manually build the structure
trans_div = trans_soup.find('div', class_='image-translation')
if trans_div:
container = soup.new_tag('div', **{'class': 'translated-text-only' if hide_label else 'image-with-translation'})
img_tag.replace_with(container)
if not hide_label:
new_img = soup.new_tag('img', src=img_src)
if img_info.get('alt'):
new_img['alt'] = img_info.get('alt')
container.append(new_img)
# Clone the translation div content
new_trans_div = soup.new_tag('div', **{'class': 'image-translation'})
# Copy all children from trans_div to new_trans_div
for child in trans_div.children:
if hasattr(child, 'name'):
new_trans_div.append(BeautifulSoup(str(child), 'html.parser'))
else:
new_trans_div.append(str(child))
container.append(new_trans_div)
print(f" ✅ Built container with translation div")
else:
print(f" ⚠️ No translation div found in result")
continue
else:
# Plain text translation - build structure manually
container = soup.new_tag('div', **{'class': 'translated-text-only' if hide_label else 'image-with-translation'})
img_tag.replace_with(container)
if not hide_label:
new_img = soup.new_tag('img', src=img_src)
if img_info.get('alt'):
new_img['alt'] = img_info.get('alt')
container.append(new_img)
# Create translation div with content
translation_div = soup.new_tag('div', **{'class': 'image-translation'})
if not hide_label:
label_p = soup.new_tag('p')
label_em = soup.new_tag('em')
#label_em.string = "[Image text translation:]"
label_p.append(label_em)
translation_div.append(label_p)
trans_p = soup.new_tag('p')
trans_p.string = translation_result
translation_div.append(trans_p)
container.append(translation_div)
print(f" ✅ Created plain text translation structure")
translated_count += 1
# Save to translated_images folder
trans_filename = f"ch{actual_num:03d}_img{idx:02d}_translation.html"
trans_filepath = os.path.join(image_translator.translated_images_dir, trans_filename)
# Extract just the translation content for saving
save_soup = BeautifulSoup(translation_result, 'html.parser')
save_div = save_soup.find('div', class_='image-translation')
if not save_div:
# Create a simple div for plain text
save_div = f'<div class="image-translation"><p>{translation_result}</p></div>'
with open(trans_filepath, 'w', encoding='utf-8') as f:
f.write(f"""
Chapter {actual_num} - Image {idx} Translation
Chapter {actual_num} - Image {idx}
Original: {os.path.basename(img_path)}
{save_div}
""")
print(f" ✅ Saved translation to: {trans_filename}")
else:
print(f" ⚠️ Could not find image tag in HTML for: {img_src}")
if translated_count > 0:
print(f" 🖼️ Successfully translated {translated_count} images")
# Debug output
final_html = str(soup)
trans_count = final_html.count('class="image-translation"')
print(f" 📊 Final HTML has {trans_count} translation divs")
print(f" 📊 image_translations dict has {len(image_translations)} entries")
prog = image_translator.load_progress()
if "image_chunks" in prog:
completed_images = []
for img_key, img_data in prog["image_chunks"].items():
if len(img_data["completed"]) == img_data["total"]:
completed_images.append(img_key)
for img_key in completed_images:
del prog["image_chunks"][img_key]
if completed_images:
image_translator.save_progress(prog)
print(f" 🧹 Cleaned up progress for {len(completed_images)} completed images")
image_translator.save_translation_log(actual_num, image_translations)
return str(soup), image_translations
else:
print(f" ℹ️ No images were successfully translated")
return chapter_html, {}
def detect_novel_numbering(chapters):
"""Detect if the novel uses 0-based or 1-based chapter numbering with improved accuracy"""
print("[DEBUG] Detecting novel numbering system...")
if not chapters:
return False
if isinstance(chapters[0], str):
print("[DEBUG] Text file detected, skipping numbering detection")
return False
patterns = PatternManager.FILENAME_EXTRACT_PATTERNS
# Special check for prefix_suffix pattern like "0000_1.xhtml"
prefix_suffix_pattern = r'^(\d+)_(\d+)[_\.]'
# Track chapter numbers from different sources
filename_numbers = []
content_numbers = []
has_prefix_suffix = False
prefix_suffix_numbers = []
for idx, chapter in enumerate(chapters):
extracted_num = None
# Check filename patterns
if 'original_basename' in chapter and chapter['original_basename']:
filename = chapter['original_basename']
elif 'filename' in chapter:
filename = os.path.basename(chapter['filename'])
else:
continue
# First check for prefix_suffix pattern
prefix_match = re.search(prefix_suffix_pattern, filename, re.IGNORECASE)
if prefix_match:
has_prefix_suffix = True
# Use the SECOND number (after underscore)
suffix_num = int(prefix_match.group(2))
prefix_suffix_numbers.append(suffix_num)
extracted_num = suffix_num
print(f"[DEBUG] Prefix_suffix pattern matched: {filename} -> Chapter {suffix_num}")
else:
# Try other patterns
for pattern in patterns:
match = re.search(pattern, filename)
if match:
extracted_num = int(match.group(1))
#print(f"[DEBUG] Pattern '{pattern}' matched: {filename} -> Chapter {extracted_num}")
break
if extracted_num is not None:
filename_numbers.append(extracted_num)
# Also check chapter content for chapter declarations
if 'body' in chapter:
# Look for "Chapter N" in the first 1000 characters
content_preview = chapter['body'][:1000]
content_match = re.search(r'Chapter\s+(\d+)', content_preview, re.IGNORECASE)
if content_match:
content_num = int(content_match.group(1))
content_numbers.append(content_num)
print(f"[DEBUG] Found 'Chapter {content_num}' in content")
# Decision logic with improved heuristics
# 1. If using prefix_suffix pattern, trust those numbers exclusively
if has_prefix_suffix and prefix_suffix_numbers:
min_suffix = min(prefix_suffix_numbers)
if min_suffix >= 1:
print(f"[DEBUG] ✅ 1-based novel detected (prefix_suffix pattern starts at {min_suffix})")
return False
else:
print(f"[DEBUG] ✅ 0-based novel detected (prefix_suffix pattern starts at {min_suffix})")
return True
# 2. If we have content numbers, prefer those over filename numbers
if content_numbers:
min_content = min(content_numbers)
# Check if we have a good sequence starting from 0 or 1
if 0 in content_numbers and 1 in content_numbers:
print(f"[DEBUG] ✅ 0-based novel detected (found both Chapter 0 and Chapter 1 in content)")
return True
elif min_content == 1:
print(f"[DEBUG] ✅ 1-based novel detected (content chapters start at 1)")
return False
# 3. Fall back to filename numbers
if filename_numbers:
min_filename = min(filename_numbers)
max_filename = max(filename_numbers)
# Check for a proper sequence
# If we have 0,1,2,3... it's likely 0-based
# If we have 1,2,3,4... it's likely 1-based
# Count how many chapters we have in sequence starting from 0
zero_sequence_count = 0
for i in range(len(chapters)):
if i in filename_numbers:
zero_sequence_count += 1
else:
break
# Count how many chapters we have in sequence starting from 1
one_sequence_count = 0
for i in range(1, len(chapters) + 1):
if i in filename_numbers:
one_sequence_count += 1
else:
break
print(f"[DEBUG] Zero-based sequence length: {zero_sequence_count}")
print(f"[DEBUG] One-based sequence length: {one_sequence_count}")
# If we have a better sequence starting from 1, it's 1-based
if one_sequence_count > zero_sequence_count and min_filename >= 1:
print(f"[DEBUG] ✅ 1-based novel detected (better sequence match starting from 1)")
return False
# If we have any 0 in filenames and it's part of a sequence
if 0 in filename_numbers and zero_sequence_count >= 3:
print(f"[DEBUG] ✅ 0-based novel detected (found 0 in sequence)")
return True
# 4. Default to 1-based if uncertain
print(f"[DEBUG] ✅ Defaulting to 1-based novel (insufficient evidence for 0-based)")
return False
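# Illustrative examples of the heuristics above (filenames hypothetical):
#   0000_1.xhtml, 0001_2.xhtml, ...              -> prefix_suffix numbers start at 1 -> 1-based (returns False)
#   chapter00.xhtml ... chapter05.xhtml          -> 0 appears in a sequence of 3+    -> 0-based (returns True)
#   content contains both "Chapter 0" and "Chapter 1"                                -> 0-based (returns True)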
def validate_chapter_continuity(chapters):
"""Validate chapter continuity and warn about issues"""
if not chapters:
print("No chapters to translate")
return
issues = []
# Get all chapter numbers
chapter_nums = [c['num'] for c in chapters]
actual_nums = [c.get('actual_chapter_num', c['num']) for c in chapters]
# Check for duplicates
duplicates = [num for num in chapter_nums if chapter_nums.count(num) > 1]
if duplicates:
issues.append(f"Duplicate chapter numbers found: {set(duplicates)}")
# Check for gaps in sequence
min_num = min(chapter_nums)
max_num = max(chapter_nums)
expected = set(range(min_num, max_num + 1))
actual = set(chapter_nums)
missing = expected - actual
if missing:
issues.append(f"Missing chapter numbers: {sorted(missing)}")
# Show gaps more clearly
gaps = []
sorted_missing = sorted(missing)
if sorted_missing:
start = sorted_missing[0]
end = sorted_missing[0]
for num in sorted_missing[1:]:
if num == end + 1:
end = num
else:
gaps.append(f"{start}-{end}" if start != end else str(start))
start = end = num
gaps.append(f"{start}-{end}" if start != end else str(start))
issues.append(f"Gap ranges: {', '.join(gaps)}")
# Check for duplicate titles
title_map = {}
for c in chapters:
title_lower = c['title'].lower().strip()
if title_lower in title_map:
title_map[title_lower].append(c['num'])
else:
title_map[title_lower] = [c['num']]
for title, nums in title_map.items():
if len(nums) > 1:
issues.append(f"Duplicate title '{title}' in chapters: {nums}")
# Print summary
print("\n" + "="*60)
print("📚 CHAPTER VALIDATION SUMMARY")
print("="*60)
print(f"Total chapters: {len(chapters)}")
print(f"Chapter range: {min_num} to {max_num}")
print(f"Expected count: {max_num - min_num + 1}")
print(f"Actual count: {len(chapters)}")
if len(chapters) != (max_num - min_num + 1):
print(f"⚠️ Chapter count mismatch - missing {(max_num - min_num + 1) - len(chapters)} chapters")
if issues:
print("\n⚠️ Issues found:")
for issue in issues:
print(f" - {issue}")
else:
print("✅ No continuity issues detected")
print("="*60 + "\n")
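# Illustrative example (numbers hypothetical): chapter_nums = [1, 2, 5, 6, 9]
# reports "Missing chapter numbers: [3, 4, 7, 8]" and "Gap ranges: 3-4, 7-8"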
def validate_epub_structure(output_dir):
"""Validate that all necessary EPUB structure files are present"""
print("🔍 Validating EPUB structure...")
required_files = {
'container.xml': 'META-INF container file (critical)',
'*.opf': 'OPF package file (critical)',
'*.ncx': 'Navigation file (recommended)'
}
found_files = {}
missing_files = []
container_path = os.path.join(output_dir, 'container.xml')
if os.path.exists(container_path):
found_files['container.xml'] = 'Found'
print(" ✅ container.xml - Found")
else:
missing_files.append('container.xml')
print(" ❌ container.xml - Missing (CRITICAL)")
opf_files = []
ncx_files = []
for file in os.listdir(output_dir):
if file.lower().endswith('.opf'):
opf_files.append(file)
elif file.lower().endswith('.ncx'):
ncx_files.append(file)
if opf_files:
found_files['opf'] = opf_files
print(f" ✅ OPF file(s) - Found: {', '.join(opf_files)}")
else:
missing_files.append('*.opf')
print(" ❌ OPF file - Missing (CRITICAL)")
if ncx_files:
found_files['ncx'] = ncx_files
print(f" ✅ NCX file(s) - Found: {', '.join(ncx_files)}")
else:
missing_files.append('*.ncx')
print(" ⚠️ NCX file - Missing (navigation may not work)")
html_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.html') and f.startswith('response_')]
if html_files:
print(f" ✅ Translated chapters - Found: {len(html_files)} files")
else:
print(" ⚠️ No translated chapter files found")
critical_missing = [f for f in missing_files if f in ['container.xml', '*.opf']]
if not critical_missing:
print("✅ EPUB structure validation PASSED")
print(" All critical files present for EPUB reconstruction")
return True
else:
print("❌ EPUB structure validation FAILED")
print(f" Missing critical files: {', '.join(critical_missing)}")
print(" EPUB reconstruction may fail without these files")
return False
def check_epub_readiness(output_dir):
"""Check if the output directory is ready for EPUB compilation"""
print("📋 Checking EPUB compilation readiness...")
issues = []
if not validate_epub_structure(output_dir):
issues.append("Missing critical EPUB structure files")
html_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.html') and f.startswith('response_')]
if not html_files:
issues.append("No translated chapter files found")
else:
print(f" ✅ Found {len(html_files)} translated chapters")
metadata_path = os.path.join(output_dir, 'metadata.json')
if os.path.exists(metadata_path):
print(" ✅ Metadata file present")
try:
with open(metadata_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
if 'title' not in metadata:
issues.append("Metadata missing title")
except Exception as e:
issues.append(f"Metadata file corrupted: {e}")
else:
issues.append("Missing metadata.json file")
resource_dirs = ['css', 'fonts', 'images']
found_resources = 0
for res_dir in resource_dirs:
res_path = os.path.join(output_dir, res_dir)
if os.path.exists(res_path):
files = [f for f in os.listdir(res_path) if os.path.isfile(os.path.join(res_path, f))]
if files:
found_resources += len(files)
print(f" ✅ Found {len(files)} {res_dir} files")
if found_resources > 0:
print(f" ✅ Total resources: {found_resources} files")
else:
print(" ⚠️ No resource files found (this may be normal)")
if not issues:
print("🎉 EPUB compilation readiness: READY")
print(" All necessary files present for EPUB creation")
return True
else:
print("⚠️ EPUB compilation readiness: ISSUES FOUND")
for issue in issues:
print(f" • {issue}")
return False
def cleanup_previous_extraction(output_dir):
"""Clean up any files from previous extraction runs (preserves CSS files)"""
# Remove 'css' from cleanup_items to preserve CSS files
cleanup_items = [
'images', # Removed 'css' from this list
'.resources_extracted'
]
epub_structure_files = [
'container.xml', 'content.opf', 'toc.ncx'
]
cleaned_count = 0
# Clean up directories (except CSS)
for item in cleanup_items:
if item.startswith('.'):
continue
item_path = os.path.join(output_dir, item)
try:
if os.path.isdir(item_path):
shutil.rmtree(item_path)
print(f"🧹 Removed directory: {item}")
cleaned_count += 1
except Exception as e:
print(f"⚠️ Could not remove directory {item}: {e}")
# Clean up EPUB structure files
for epub_file in epub_structure_files:
file_path = os.path.join(output_dir, epub_file)
try:
if os.path.isfile(file_path):
os.remove(file_path)
print(f"🧹 Removed EPUB file: {epub_file}")
cleaned_count += 1
except Exception as e:
print(f"⚠️ Could not remove {epub_file}: {e}")
# Clean up any loose .opf and .ncx files
try:
for file in os.listdir(output_dir):
if file.lower().endswith(('.opf', '.ncx')):
file_path = os.path.join(output_dir, file)
if os.path.isfile(file_path):
os.remove(file_path)
print(f"🧹 Removed EPUB file: {file}")
cleaned_count += 1
except Exception as e:
print(f"⚠️ Error scanning for EPUB files: {e}")
# Remove extraction marker
marker_path = os.path.join(output_dir, '.resources_extracted')
try:
if os.path.isfile(marker_path):
os.remove(marker_path)
print(f"🧹 Removed extraction marker")
cleaned_count += 1
except Exception as e:
print(f"⚠️ Could not remove extraction marker: {e}")
# Check if CSS files exist and inform user they're being preserved
css_path = os.path.join(output_dir, 'css')
if os.path.exists(css_path):
try:
css_files = [f for f in os.listdir(css_path) if os.path.isfile(os.path.join(css_path, f))]
if css_files:
print(f"📚 Preserving {len(css_files)} CSS files")
except Exception:
pass
if cleaned_count > 0:
print(f"🧹 Cleaned up {cleaned_count} items from previous runs (CSS files preserved)")
return cleaned_count
# =====================================================
# API AND TRANSLATION UTILITIES
# =====================================================
def send_with_interrupt(messages, client, temperature, max_tokens, stop_check_fn, chunk_timeout=None, request_id=None, context=None):
"""Send API request with interrupt capability and optional timeout retry.
Optional context parameter is passed through to the client to improve payload labeling.
"""
# Import UnifiedClientError at function level to avoid scoping issues
from unified_api_client import UnifiedClientError
# The client.send() call will handle multi-key rotation automatically
# Generate request_id if not provided
#if request_id is None:
# request_id = str(uuid.uuid4())[:8]
result_queue = queue.Queue()
def api_call():
try:
start_time = time.time()
# Check if client.send accepts request_id parameter
send_params = {
'messages': messages,
'temperature': temperature,
'max_tokens': max_tokens
}
# Pass context (and, if re-enabled, request_id) only when the client's send() signature supports them
sig = inspect.signature(client.send)
if 'context' in sig.parameters and context is not None:
send_params['context'] = context
#if 'request_id' in sig.parameters:
# send_params['request_id'] = request_id
result = client.send(**send_params)
elapsed = time.time() - start_time
result_queue.put((result, elapsed))
except Exception as e:
result_queue.put(e)
api_thread = threading.Thread(target=api_call)
api_thread.daemon = True
api_thread.start()
timeout = chunk_timeout if chunk_timeout is not None else 86400
check_interval = 0.5
elapsed = 0
while elapsed < timeout:
try:
result = result_queue.get(timeout=check_interval)
if isinstance(result, Exception):
# For expected errors like rate limits, preserve the error type without extra traceback
if hasattr(result, 'error_type') and result.error_type == "rate_limit":
raise result
elif "429" in str(result) or "rate limit" in str(result).lower():
# Convert generic exceptions to UnifiedClientError for rate limits
raise UnifiedClientError(str(result), error_type="rate_limit")
else:
raise result
if isinstance(result, tuple):
api_result, api_time = result
if chunk_timeout and api_time > chunk_timeout:
# Set cleanup flag when chunk timeout occurs
if hasattr(client, '_in_cleanup'):
client._in_cleanup = True
if hasattr(client, 'cancel_current_operation'):
client.cancel_current_operation()
raise UnifiedClientError(f"API call took {api_time:.1f}s (timeout: {chunk_timeout}s)")
return api_result
return result
except queue.Empty:
if stop_check_fn():
# Set cleanup flag when user stops
if hasattr(client, '_in_cleanup'):
client._in_cleanup = True
if hasattr(client, 'cancel_current_operation'):
client.cancel_current_operation()
raise UnifiedClientError("Translation stopped by user")
elapsed += check_interval
# Set cleanup flag when timeout occurs
if hasattr(client, '_in_cleanup'):
client._in_cleanup = True
if hasattr(client, 'cancel_current_operation'):
client.cancel_current_operation()
raise UnifiedClientError(f"API call timed out after {timeout} seconds")
def handle_api_error(processor, error, chunk_info=""):
"""Handle API errors with multi-key support"""
error_str = str(error)
# Check for rate limit
if "429" in error_str or "rate limit" in error_str.lower():
if processor.config.use_multi_api_keys:
print(f"⚠️ Rate limit hit {chunk_info}, client should rotate to next key")
stats = processor.client.get_stats()
print(f"📊 API Stats - Active keys: {stats.get('active_keys', 0)}/{stats.get('total_keys', 0)}")
if stats.get('active_keys', 0) == 0:
print("⏳ All API keys are cooling down - will wait and retry")
print(f"🔄 Multi-key error handling: Rate limit processed, preparing for key rotation...")
time.sleep(0.1) # Brief pause after rate limit detection for stability
return True # Always retry
else:
print(f"⚠️ Rate limit hit {chunk_info}, waiting before retry...")
time.sleep(60)
print(f"🔄 Single-key error handling: Rate limit wait completed, ready for retry...")
time.sleep(0.1) # Brief pause after rate limit wait for stability
return True # Always retry
# Other errors
print(f"❌ API Error {chunk_info}: {error_str}")
return False
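# Hedged usage sketch: callers typically retry when this returns True, e.g.
#   except Exception as e:
#       if handle_api_error(processor, e, chunk_info="(chunk 2/3)"):
#           continue  # rate limit handled, retry the same chunk
#       raise  # non-retryable error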
def parse_token_limit(env_value):
"""Parse token limit from environment variable"""
if not env_value or env_value.strip() == "":
return None, "unlimited"
env_value = env_value.strip()
if env_value.lower() == "unlimited":
return None, "unlimited"
if env_value.isdigit() and int(env_value) > 0:
limit = int(env_value)
return limit, str(limit)
return 1000000, "1000000 (default)"
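# Examples of the parsing rules above:
#   parse_token_limit("")          -> (None, "unlimited")
#   parse_token_limit("unlimited") -> (None, "unlimited")
#   parse_token_limit("50000")     -> (50000, "50000")
#   parse_token_limit("abc")       -> (1000000, "1000000 (default)")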
def build_system_prompt(user_prompt, glossary_path=None):
"""Build the system prompt with glossary - TRUE BRUTE FORCE VERSION"""
append_glossary = os.getenv("APPEND_GLOSSARY", "1") == "1"
actual_glossary_path = glossary_path
system = user_prompt if user_prompt else ""
if append_glossary and actual_glossary_path and os.path.exists(actual_glossary_path):
try:
print(f"[DEBUG] ✅ Loading glossary from: {os.path.abspath(actual_glossary_path)}")
# Try to load as JSON first
try:
with open(actual_glossary_path, "r", encoding="utf-8") as gf:
glossary_data = json.load(gf)
glossary_text = json.dumps(glossary_data, ensure_ascii=False, indent=2)
print(f"[DEBUG] Loaded as JSON")
except json.JSONDecodeError:
# If JSON fails, just read as raw text
#print(f"[DEBUG] JSON parse failed, reading as raw text")
with open(actual_glossary_path, "r", encoding="utf-8") as gf:
glossary_text = gf.read()
if system:
system += "\n\n"
custom_prompt = os.getenv("APPEND_GLOSSARY_PROMPT", "Character/Term Glossary (use these translations consistently):").strip()
if not custom_prompt:
custom_prompt = "Character/Term Glossary (use these translations consistently):"
system += f"{custom_prompt}\n{glossary_text}"
print(f"[DEBUG] ✅ Entire glossary appended!")
print(f"[DEBUG] Glossary text length: {len(glossary_text)} characters")
except Exception as e:
print(f"[ERROR] Could not load glossary: {e}")
import traceback
print(f"[ERROR] Full traceback: {traceback.format_exc()}")
else:
if not append_glossary:
#print(f"[DEBUG] ❌ Glossary append disabled")
pass
elif not actual_glossary_path:
print(f"[DEBUG] ❌ No glossary path provided")
elif not os.path.exists(actual_glossary_path):
print(f"[DEBUG] ❌ Glossary file does not exist: {actual_glossary_path}")
print(f"🎯 Final system prompt length: {len(system)} characters")
return system
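# Sketch of the resulting prompt layout (glossary content illustrative):
#   <user system prompt>
#
#   Character/Term Glossary (use these translations consistently):
#   { "김철수": "Kim Chul-soo", ... }
# APPEND_GLOSSARY=0 skips the append; APPEND_GLOSSARY_PROMPT overrides the header line.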
def translate_title(title, client, system_prompt, user_prompt, temperature=0.3):
"""Translate the book title using the configured settings"""
if not title or not title.strip():
return title
print(f"📚 Processing book title: {title}")
try:
if os.getenv("TRANSLATE_BOOK_TITLE", "1") == "0":
print(f"📚 Book title translation disabled - keeping original")
return title
# Check if we're using a translation service (not AI)
client_type = getattr(client, 'client_type', '')
is_translation_service = client_type in ['deepl', 'google_translate']
if is_translation_service:
# For translation services, send only the text without AI prompts
print(f"📚 Using translation service ({client_type}) - sending text directly")
messages = [
{"role": "user", "content": title}
]
max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192"))
translated_title, _ = client.send(messages, temperature=temperature, max_tokens=max_tokens)
else:
# For AI services, use prompts as before
book_title_prompt = os.getenv("BOOK_TITLE_PROMPT",
"Translate this book title to English while retaining any acronyms:")
# Get the system prompt for book titles, with fallback to default
book_title_system_prompt = os.getenv("BOOK_TITLE_SYSTEM_PROMPT",
"You are a translator. Respond with only the translated text, nothing else. Do not add any explanation or additional content.")
messages = [
{"role": "system", "content": book_title_system_prompt},
{"role": "user", "content": f"{book_title_prompt}\n\n{title}"}
]
max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192"))
translated_title, _ = client.send(messages, temperature=temperature, max_tokens=max_tokens)
print(f"[DEBUG] Raw API response: '{translated_title}'")
print(f"[DEBUG] Response length: {len(translated_title)} (original: {len(title)})")
newline = '\n'
print(f"[DEBUG] Has newlines: {repr(translated_title) if newline in translated_title else 'No'}")
translated_title = translated_title.strip()
if ((translated_title.startswith('"') and translated_title.endswith('"')) or
(translated_title.startswith("'") and translated_title.endswith("'"))):
translated_title = translated_title[1:-1].strip()
if '\n' in translated_title:
print(f"⚠️ API returned multi-line content, keeping original title")
return title
# Check for JSON-like structured content, but allow simple brackets like [END]
if (any(char in translated_title for char in ['{', '}']) or
'"role":' in translated_title or
'"content":' in translated_title or
('[[' in translated_title and ']]' in translated_title)): # Only flag double brackets
print(f"⚠️ API returned structured content, keeping original title")
return title
if any(tag in translated_title.lower() for tag in ['<p', '<div', '<h1', '<h2', '<br', '<html']):
print(f"⚠️ API returned HTML tags, keeping original title")
return title
print(f"✅ Translated title: {translated_title}")
return translated_title
except Exception as e:
print(f"❌ Title translation failed: {e} - keeping original")
return title
def is_qa_failed_response(content):
"""Heuristic check for failed/refused API responses (see get_failure_reason below for the categories)"""
if not content:
return True
content_str = str(content).strip()
content_lower = content_str.lower()
error_indicators = ["translation failed", "api response unavailable", "authentication_error",
"rate_limit_error", "api_error", "content_filter", "timeout", "quota exceeded",
"i cannot", "i can't", "unable to process"]
error_count = sum(1 for indicator in error_indicators if indicator in content_lower)
# Multiple error indicators in the response
if error_count >= 2:
return True
# Single strong error indicator in very short response
if len(content_str) < 50 and error_count >= 1:
return True
return False
# Additional helper function for debugging
def get_failure_reason(content):
"""
Returns the specific reason why content was marked as qa_failed
Useful for debugging and logging
"""
if not content:
return "Empty content"
content_str = str(content).strip()
content_lower = content_str.lower()
# Check each category and return the first match
failure_categories = {
"Explicit Failure Marker": [
"[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]",
"[IMAGE TRANSLATION FAILED]",
"API response unavailable",
"[]"
],
"HTTP Error": [
"authentication_error", "rate_limit_error", "api_error"
],
"Content Filter": [
"content_filter", "safety filter", "blocked by safety"
],
"Timeout": [
"timeout", "timed out", "apitimeouterror"
],
"Rate Limit": [
"rate limit exceeded", "quota exceeded", "too many requests"
],
"Refusal Pattern": [
"i cannot", "i can't", "unable to process"
],
"Empty Response": [
'"text": ""', "choices: [ { text: ''"
]
}
for category, markers in failure_categories.items():
for marker in markers:
if marker in content_str or marker in content_lower:
return f"{category}: {marker}"
if len(content_str) < 50:
return f"Short response with error indicators: {content_str[:30]}..."
return "Unknown failure pattern"
def convert_enhanced_text_to_html(plain_text, chapter_info=None):
"""Convert markdown/plain text back to HTML after translation (for enhanced mode)
This function handles the conversion of translated markdown back to HTML.
The input is the TRANSLATED text that was originally extracted using html2text.
"""
import re
preserve_structure = chapter_info.get('preserve_structure', False) if chapter_info else False
# First, try to use markdown2 for proper markdown conversion
try:
import markdown2
# Check if the text contains markdown patterns
has_markdown = any([
'##' in plain_text, # Headers
'**' in plain_text, # Bold
'*' in plain_text and not '**' in plain_text, # Italic
'[' in plain_text and '](' in plain_text, # Links
'```' in plain_text, # Code blocks
'> ' in plain_text, # Blockquotes
'- ' in plain_text or '* ' in plain_text or '1. ' in plain_text # Lists
])
if has_markdown or preserve_structure:
# Use markdown2 for proper conversion
html = markdown2.markdown(plain_text, extras=[
'cuddled-lists', # Lists without blank lines
'fenced-code-blocks', # Code blocks with ```
'break-on-newline', # Treat single newlines as <br>
'smarty-pants', # Smart quotes and dashes
'tables', # Markdown tables
])
# Post-process to ensure proper paragraph structure
if not '<p>' in html:
# If markdown2 didn't create paragraphs, wrap content
lines = html.split('\n')
processed_lines = []
for line in lines:
line = line.strip()
if line and not line.startswith('<') and not line.endswith('>'):
processed_lines.append(f'<p>{line}</p>')
elif line:
processed_lines.append(line)
html = '\n'.join(processed_lines)
return html
except ImportError:
print("⚠️ markdown2 not available, using fallback HTML conversion")
# Fallback: Manual markdown-to-HTML conversion
lines = plain_text.strip().split('\n')
html_parts = []
in_code_block = False
code_block_content = []
for line in lines:
# Handle code blocks
if line.strip().startswith('```'):
if in_code_block:
# End code block
html_parts.append('<pre><code>' + '\n'.join(code_block_content) + '</code></pre>')
code_block_content = []
in_code_block = False
else:
# Start code block
in_code_block = True
continue
if in_code_block:
code_block_content.append(line)
continue
line = line.strip()
if not line:
# Preserve empty lines as paragraph breaks
if html_parts and not html_parts[-1].endswith('>'):
# Only add break if not already after a closing tag
html_parts.append('<br/>')
continue
# Check for markdown headers
if line.startswith('#'):
match = re.match(r'^(#+)\s*(.+)$', line)
if match:
level = min(len(match.group(1)), 6)
header_text = match.group(2).strip()
html_parts.append(f'<h{level}>{header_text}</h{level}>')
continue
# Check for blockquotes
if line.startswith('> '):
quote_text = line[2:].strip()
html_parts.append(f'<blockquote>{quote_text}</blockquote>')
continue
# Check for lists
if re.match(r'^[*\-+]\s+', line):
list_text = re.sub(r'^[*\-+]\s+', '', line)
html_parts.append(f'<li>{list_text}</li>')
continue
if re.match(r'^\d+\.\s+', line):
list_text = re.sub(r'^\d+\.\s+', '', line)
html_parts.append(f'<li>{list_text}</li>')
continue
# Convert inline markdown
# Bold
line = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', line)
line = re.sub(r'__(.+?)__', r'<strong>\1</strong>', line)
# Italic
line = re.sub(r'\*(.+?)\*', r'<em>\1</em>', line)
line = re.sub(r'_(.+?)_', r'<em>\1</em>', line)
# Links
line = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', line)
# Code inline
line = re.sub(r'`([^`]+)`', r'<code>\1</code>', line)
# Regular paragraph
html_parts.append(f'<p>{line}</p>')
# Post-process lists to wrap in ul/ol tags
final_html = []
in_list = False
list_type = None
for part in html_parts:
if part.startswith('<li>'):
if not in_list:
# Determine list type based on context (simplified)
list_type = 'ul' # Default to unordered
final_html.append(f'<{list_type}>')
in_list = True
final_html.append(part)
else:
if in_list:
final_html.append(f'</{list_type}>')
in_list = False
final_html.append(part)
# Close any open list
if in_list:
final_html.append(f'</{list_type}>')
return '\n'.join(final_html)
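# Illustrative round-trip (output shape approximate; exact HTML depends on whether markdown2 is installed):
#   convert_enhanced_text_to_html("## Title\n\nHe said **hi**.")
#   -> "<h2>Title</h2>\n<p>He said <strong>hi</strong>.</p>"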
# =====================================================
# MAIN TRANSLATION FUNCTION
# =====================================================
def main(log_callback=None, stop_callback=None):
"""Main translation function with enhanced duplicate detection and progress tracking"""
config = TranslationConfig()
builtins._DISABLE_ZERO_DETECTION = config.DISABLE_ZERO_DETECTION
if config.DISABLE_ZERO_DETECTION:
print("=" * 60)
print("⚠️ 0-BASED DETECTION DISABLED BY USER")
print("⚠️ All chapter numbers will be used exactly as found")
print("=" * 60)
args = None
chapters_completed = 0
chunks_completed = 0
input_path = config.input_path
if not input_path and len(sys.argv) > 1:
input_path = sys.argv[1]
is_text_file = input_path.lower().endswith('.txt')
if is_text_file:
os.environ["IS_TEXT_FILE_TRANSLATION"] = "1"
import json as _json
_original_load = _json.load
def debug_json_load(fp, *args, **kwargs):
result = _original_load(fp, *args, **kwargs)
if isinstance(result, list) and len(result) > 0:
if isinstance(result[0], dict) and 'original_name' in result[0]:
print(f"[DEBUG] Loaded glossary list with {len(result)} items from {fp.name if hasattr(fp, 'name') else 'unknown'}")
return result
_json.load = debug_json_load
if log_callback:
set_output_redirect(log_callback)
def check_stop():
if stop_callback and stop_callback():
print("❌ Translation stopped by user request.")
return True
return is_stop_requested()
if config.EMERGENCY_RESTORE:
print("✅ Emergency paragraph restoration is ENABLED")
else:
print("⚠️ Emergency paragraph restoration is DISABLED")
print(f"[DEBUG] REMOVE_AI_ARTIFACTS environment variable: {os.getenv('REMOVE_AI_ARTIFACTS', 'NOT SET')}")
print(f"[DEBUG] REMOVE_AI_ARTIFACTS parsed value: {config.REMOVE_AI_ARTIFACTS}")
if config.REMOVE_AI_ARTIFACTS:
print("⚠️ AI artifact removal is ENABLED - will clean AI response artifacts")
else:
print("✅ AI artifact removal is DISABLED - preserving all content as-is")
if '--epub' in sys.argv or (len(sys.argv) > 1 and sys.argv[1].endswith(('.epub', '.txt'))):
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('epub', help='Input EPUB or text file')
args = parser.parse_args()
input_path = args.epub
is_text_file = input_path.lower().endswith('.txt')
if is_text_file:
file_base = os.path.splitext(os.path.basename(input_path))[0]
else:
epub_base = os.path.splitext(os.path.basename(input_path))[0]
file_base = epub_base
out = file_base
os.makedirs(out, exist_ok=True)
print(f"[DEBUG] Created output folder → {out}")
cleanup_previous_extraction(out)
os.environ["EPUB_OUTPUT_DIR"] = out
payloads_dir = out
# clear history if CONTEXTUAL is disabled
if not config.CONTEXTUAL:
history_file = os.path.join(payloads_dir, "translation_history.json")
if os.path.exists(history_file):
os.remove(history_file)
print("[DEBUG] CONTEXTUAL disabled - cleared translation history")
history_manager = HistoryManager(payloads_dir)
chapter_splitter = ChapterSplitter(model_name=config.MODEL)
chunk_context_manager = ChunkContextManager()
progress_manager = ProgressManager(payloads_dir)
# Create ChapterExtractor with progress callback if available
chapter_progress_callback = None
if log_callback:
# Create a wrapper that formats progress messages for the log
def chapter_progress_callback(msg):
log_callback(f"📊 {msg}")
chapter_extractor = ChapterExtractor(progress_callback=chapter_progress_callback)
glossary_manager = GlossaryManager()
history_file = os.path.join(payloads_dir, "translation_history.json")
if os.path.exists(history_file):
os.remove(history_file)
print(f"[DEBUG] Purged translation history → {history_file}")
print("🔍 Checking for deleted output files...")
progress_manager.cleanup_missing_files(out)
progress_manager.save()
if check_stop():
return
if not config.API_KEY:
print("❌ Error: Set API_KEY, OPENAI_API_KEY, or OPENAI_OR_Gemini_API_KEY in your environment.")
return
#print(f"[DEBUG] Found API key: {config.API_KEY[:10]}...")
print(f"[DEBUG] Using model = {config.MODEL}")
print(f"[DEBUG] Max output tokens = {config.MAX_OUTPUT_TOKENS}")
client = UnifiedClient(model=config.MODEL, api_key=config.API_KEY, output_dir=out)
if hasattr(client, 'use_multi_keys') and client.use_multi_keys:
stats = client.get_stats()
print(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys loaded")
print(f" Active keys: {stats.get('active_keys', 0)}")
else:
print(f"🔑 Single-key mode: Using {config.MODEL}")
# Reset cleanup state when starting new translation
if hasattr(client, 'reset_cleanup_state'):
client.reset_cleanup_state()
if is_text_file:
print("📄 Processing text file...")
try:
txt_processor = TextFileProcessor(input_path, out)
chapters = txt_processor.extract_chapters()
txt_processor.save_original_structure()
metadata = {
"title": os.path.splitext(os.path.basename(input_path))[0],
"type": "text",
"chapter_count": len(chapters)
}
except ImportError as e:
print(f"❌ Error: Text file processor not available: {e}")
if log_callback:
log_callback(f"❌ Error: Text file processor not available: {e}")
return
except Exception as e:
print(f"❌ Error processing text file: {e}")
if log_callback:
log_callback(f"❌ Error processing text file: {e}")
return
else:
# Check if we should use async extraction (for GUI mode)
use_async_extraction = os.getenv("USE_ASYNC_CHAPTER_EXTRACTION", "0") == "1"
if use_async_extraction and log_callback:
print("🚀 Using async chapter extraction (subprocess mode)...")
from chapter_extraction_manager import ChapterExtractionManager
# Create manager with log callback
extraction_manager = ChapterExtractionManager(log_callback=log_callback)
# Get extraction mode
extraction_mode = os.getenv("EXTRACTION_MODE", "smart").lower()
# Define completion callback
extraction_result = {"completed": False, "result": None}
def on_extraction_complete(result):
extraction_result["completed"] = True
extraction_result["result"] = result
# Safety check for None result
if result is None:
log_callback("❌ Chapter extraction failed: No result returned")
return
if result.get("success"):
log_callback(f"✅ Chapter extraction completed: {result.get('chapters', 0)} chapters")
else:
log_callback(f"❌ Chapter extraction failed: {result.get('error', 'Unknown error')}")
# Start async extraction
extraction_manager.extract_chapters_async(
input_path,
out,
extraction_mode=extraction_mode,
progress_callback=lambda msg: log_callback(f"📊 {msg}"),
completion_callback=on_extraction_complete
)
# Wait for completion (with timeout)
timeout = 300 # 5 minutes timeout
start_time = time.time()
while not extraction_result["completed"]:
if check_stop():
extraction_manager.stop_extraction()
return
if time.time() - start_time > timeout:
log_callback("⚠️ Chapter extraction timeout")
extraction_manager.stop_extraction()
return
time.sleep(0.1) # Check every 100ms
# Check if extraction was successful
if not extraction_result["result"] or not extraction_result["result"].get("success"):
log_callback("❌ Chapter extraction failed")
return
# Load the extracted data
metadata_path = os.path.join(out, "metadata.json")
if os.path.exists(metadata_path):
with open(metadata_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
else:
metadata = extraction_result["result"].get("metadata", {})
# The async extraction should have saved chapters directly, similar to the sync version
# We need to reconstruct the chapters list with body content
# Check if the extraction actually created a chapters.json file with full content
chapters_full_path = os.path.join(out, "chapters_full.json")
chapters_info_path = os.path.join(out, "chapters_info.json")
chapters = []
# First try to load full chapters if saved
if os.path.exists(chapters_full_path):
log_callback("Loading full chapters data...")
with open(chapters_full_path, 'r', encoding='utf-8') as f:
chapters = json.load(f)
log_callback(f"✅ Loaded {len(chapters)} chapters with content")
elif os.path.exists(chapters_info_path):
# Fall back to loading from individual files
log_callback("Loading chapter info and searching for content files...")
with open(chapters_info_path, 'r', encoding='utf-8') as f:
chapters_info = json.load(f)
# List all files in the output directory
all_files = os.listdir(out)
log_callback(f"Found {len(all_files)} files in output directory")
# Try to match chapter files
for info in chapters_info:
chapter_num = info['num']
found = False
# Try different naming patterns
patterns = [
f"chapter_{chapter_num:04d}_", # With leading zeros
f"chapter_{chapter_num}_", # Without leading zeros
f"ch{chapter_num:04d}_", # Shortened with zeros
f"ch{chapter_num}_", # Shortened without zeros
f"{chapter_num:04d}_", # Just number with zeros
f"{chapter_num}_" # Just number
]
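# Example (filenames hypothetical): chapter 7 would match "chapter_0007_intro.html",
# "ch7_part1.xhtml" or "0007_intro.html" via the prefixes above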
for pattern in patterns:
# Find files matching this pattern (any extension)
matching_files = [f for f in all_files if f.startswith(pattern)]
if matching_files:
# Prefer HTML/XHTML files
html_files = [f for f in matching_files if f.endswith(('.html', '.xhtml', '.htm'))]
if html_files:
chapter_file = html_files[0]
else:
chapter_file = matching_files[0]
chapter_path = os.path.join(out, chapter_file)
try:
with open(chapter_path, 'r', encoding='utf-8') as f:
content = f.read()
chapters.append({
"num": chapter_num,
"title": info.get("title", f"Chapter {chapter_num}"),
"body": content,
"filename": info.get("original_filename", ""),
"has_images": info.get("has_images", False),
"file_size": len(content),
"content_hash": info.get("content_hash", "")
})
found = True
break
except Exception as e:
log_callback(f"⚠️ Error reading {chapter_file}: {e}")
if not found:
log_callback(f"⚠️ No file found for Chapter {chapter_num}")
# Log available files for debugging
if len(all_files) < 50:
similar_files = [f for f in all_files if str(chapter_num) in f]
if similar_files:
log_callback(f" Similar files: {similar_files[:3]}")
if not chapters:
log_callback("❌ No chapters could be loaded!")
log_callback(f"❌ Output directory: {out}")
log_callback(f"❌ Files in directory: {len(os.listdir(out))} files")
# Show first few files for debugging
sample_files = os.listdir(out)[:10]
log_callback(f"❌ Sample files: {sample_files}")
return
# Sort chapters by OPF spine order if available
opf_path = os.path.join(out, 'content.opf')
if os.path.exists(opf_path) and chapters:
log_callback("📋 Sorting chapters according to OPF spine order...")
# Use the existing chapter_extractor instance to sort
chapters = chapter_extractor._sort_by_opf_spine(chapters, opf_path)
log_callback("✅ Chapters sorted according to OPF reading order")
else:
print("🚀 Using comprehensive chapter extraction with resource handling...")
with zipfile.ZipFile(input_path, 'r') as zf:
metadata = chapter_extractor._extract_epub_metadata(zf)
chapters = chapter_extractor.extract_chapters(zf, out)
print(f"\n📚 Extraction Summary:")
print(f" Total chapters extracted: {len(chapters)}")
if chapters:
nums = [c.get('num', 0) for c in chapters]
print(f" Chapter range: {min(nums)} to {max(nums)}")
# Check for gaps in the sequence
expected_count = max(nums) - min(nums) + 1
if len(chapters) < expected_count:
print(f"\n⚠️ Potential missing chapters detected:")
print(f" Expected {expected_count} chapters (from {min(nums)} to {max(nums)})")
print(f" Actually found: {len(chapters)} chapters")
print(f" Potentially missing: {expected_count - len(chapters)} chapters")
validate_chapter_continuity(chapters)
print("\n" + "="*50)
validate_epub_structure(out)
print("="*50 + "\n")
progress_manager.migrate_to_content_hash(chapters)
progress_manager.save()
if check_stop():
return
metadata_path = os.path.join(out, "metadata.json")
if os.path.exists(metadata_path):
with open(metadata_path, 'r', encoding='utf-8') as mf:
metadata = json.load(mf)
metadata["chapter_count"] = len(chapters)
metadata["chapter_titles"] = {str(c["num"]): c["title"] for c in chapters}
print(f"[DEBUG] Initializing client with model = {config.MODEL}")
client = UnifiedClient(api_key=config.API_KEY, model=config.MODEL, output_dir=out)
if hasattr(client, 'use_multi_keys') and client.use_multi_keys:
stats = client.get_stats()
print(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys loaded")
print(f" Active keys: {stats.get('active_keys', 0)}")
else:
print(f"🔑 Single-key mode: Using {config.MODEL}")
# Reset cleanup state when starting new translation
if hasattr(client, 'reset_cleanup_state'):
client.reset_cleanup_state()
if "title" in metadata and config.TRANSLATE_BOOK_TITLE and not metadata.get("title_translated", False):
original_title = metadata["title"]
print(f"📚 Original title: {original_title}")
if not check_stop():
translated_title = translate_title(
original_title,
client,
None,
None,
config.TEMP
)
metadata["original_title"] = original_title
metadata["title"] = translated_title
metadata["title_translated"] = True
print(f"📚 Translated title: {translated_title}")
else:
print("❌ Title translation skipped due to stop request")
# Translate other metadata fields if configured
translate_metadata_fields_str = os.getenv('TRANSLATE_METADATA_FIELDS', '{}')
metadata_translation_mode = os.getenv('METADATA_TRANSLATION_MODE', 'together')
try:
translate_metadata_fields = json.loads(translate_metadata_fields_str)
if translate_metadata_fields and any(translate_metadata_fields.values()):
# Filter out fields that should be translated (excluding already translated fields)
fields_to_translate = {}
skipped_fields = []
for field_name, should_translate in translate_metadata_fields.items():
if should_translate and field_name != 'title' and field_name in metadata:
# Check if already translated
if metadata.get(f"{field_name}_translated", False):
skipped_fields.append(field_name)
print(f"✓ Skipping {field_name} - already translated")
else:
fields_to_translate[field_name] = should_translate
if fields_to_translate:
print("\n" + "="*50)
print("📋 METADATA TRANSLATION PHASE")
print("="*50)
print(f"🌐 Translating {len(fields_to_translate)} metadata fields...")
# Get ALL configuration from environment - NO DEFAULTS
system_prompt = os.getenv('BOOK_TITLE_SYSTEM_PROMPT', '')
if not system_prompt:
print("❌ No system prompt configured, skipping metadata translation")
else:
# Get field-specific prompts
field_prompts_str = os.getenv('METADATA_FIELD_PROMPTS', '{}')
try:
field_prompts = json.loads(field_prompts_str)
except:
field_prompts = {}
if not field_prompts and not field_prompts.get('_default'):
print("❌ No field prompts configured, skipping metadata translation")
else:
# Get language configuration
lang_behavior = os.getenv('LANG_PROMPT_BEHAVIOR', 'auto')
forced_source_lang = os.getenv('FORCED_SOURCE_LANG', 'Korean')
output_language = os.getenv('OUTPUT_LANGUAGE', 'English')
# Determine source language
source_lang = metadata.get('language', '').lower()
if lang_behavior == 'never':
lang_str = ""
elif lang_behavior == 'always':
lang_str = forced_source_lang
else: # auto
if 'zh' in source_lang or 'chinese' in source_lang:
lang_str = 'Chinese'
elif 'ja' in source_lang or 'japanese' in source_lang:
lang_str = 'Japanese'
elif 'ko' in source_lang or 'korean' in source_lang:
lang_str = 'Korean'
else:
lang_str = ''
# Check if batch translation is enabled for parallel processing
batch_translate_enabled = os.getenv('BATCH_TRANSLATION', '0') == '1'
batch_size = int(os.getenv('BATCH_SIZE', '50')) # Default batch size
if batch_translate_enabled and len(fields_to_translate) > 1:
print(f"⚡ Using parallel metadata translation mode ({len(fields_to_translate)} fields, batch size: {batch_size})...")
# Import ThreadPoolExecutor for parallel processing
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
# Thread-safe results storage
translation_results = {}
results_lock = threading.Lock()
def translate_metadata_field(field_name, original_value):
"""Translate a single metadata field"""
try:
print(f"\n📋 Translating {field_name}: {original_value[:100]}..."
if len(str(original_value)) > 100 else f"\n📋 Translating {field_name}: {original_value}")
# Get field-specific prompt
prompt_template = field_prompts.get(field_name, field_prompts.get('_default', ''))
if not prompt_template:
print(f"⚠️ No prompt configured for field '{field_name}', skipping")
return None
# Replace variables in prompt
field_prompt = prompt_template.replace('{source_lang}', lang_str)
field_prompt = field_prompt.replace('{output_lang}', output_language)
field_prompt = field_prompt.replace('English', output_language)
field_prompt = field_prompt.replace('{field_value}', str(original_value))
# Check if we're using a translation service (not AI)
client_type = getattr(client, 'client_type', '')
is_translation_service = client_type in ['deepl', 'google_translate']
if is_translation_service:
# For translation services, send only the field value without AI prompts
print(f"🌐 Using translation service ({client_type}) - sending field directly")
messages = [
{"role": "user", "content": str(original_value)}
]
else:
# For AI services, use prompts as before
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": f"{field_prompt}\n\n{original_value}"}
]
# Add delay for rate limiting
if config.DELAY > 0:
time.sleep(config.DELAY)
# Make API call
content, finish_reason = client.send(
messages,
temperature=config.TEMP,
max_tokens=config.MAX_OUTPUT_TOKENS
)
translated_value = content.strip()
# Store result thread-safely
with results_lock:
translation_results[field_name] = {
'original': original_value,
'translated': translated_value,
'success': True
}
print(f"✅ Translated {field_name}: {translated_value}")
return translated_value
except Exception as e:
print(f"❌ Failed to translate {field_name}: {e}")
with results_lock:
translation_results[field_name] = {
'original': original_value,
'translated': None,
'success': False,
'error': str(e)
}
return None
# Execute parallel translations with limited workers
max_workers = min(len(fields_to_translate), batch_size)
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all translation tasks
futures = {}
for field_name in fields_to_translate:
if field_name in metadata and not check_stop():
original_value = metadata[field_name]
future = executor.submit(translate_metadata_field, field_name, original_value)
futures[future] = field_name
# Wait for completion
for future in as_completed(futures):
if check_stop():
print("❌ Metadata translation stopped by user")
break
# Apply results to metadata
for field_name, result in translation_results.items():
if result['success'] and result['translated']:
metadata[f"original_{field_name}"] = result['original']
metadata[field_name] = result['translated']
metadata[f"{field_name}_translated"] = True
else:
# Sequential translation mode (individual translation)
mode_desc = "sequential" if not batch_translate_enabled else "sequential (single field)"
print(f"📝 Using {mode_desc} translation mode...")
for field_name in fields_to_translate:
if not check_stop() and field_name in metadata:
original_value = metadata[field_name]
print(f"\n📋 Translating {field_name}: {original_value[:100]}..."
if len(str(original_value)) > 100 else f"\n📋 Translating {field_name}: {original_value}")
# Get field-specific prompt
prompt_template = field_prompts.get(field_name, field_prompts.get('_default', ''))
if not prompt_template:
print(f"⚠️ No prompt configured for field '{field_name}', skipping")
continue
# Replace variables in prompt
field_prompt = prompt_template.replace('{source_lang}', lang_str)
field_prompt = field_prompt.replace('{output_lang}', output_language)
field_prompt = field_prompt.replace('English', output_language)
field_prompt = field_prompt.replace('{field_value}', str(original_value))
# Check if we're using a translation service (not AI)
client_type = getattr(client, 'client_type', '')
is_translation_service = client_type in ['deepl', 'google_translate']
if is_translation_service:
# For translation services, send only the field value without AI prompts
print(f"🌐 Using translation service ({client_type}) - sending field directly")
messages = [
{"role": "user", "content": str(original_value)}
]
else:
# For AI services, use prompts as before
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": f"{field_prompt}\n\n{original_value}"}
]
try:
# Add delay using the config instance from main()
if config.DELAY > 0: # ✅ FIXED - use config.DELAY instead of config.SEND_INTERVAL
time.sleep(config.DELAY)
# Use the same client instance from main()
# ✅ FIXED - Properly unpack tuple response and provide max_tokens
content, finish_reason = client.send(
messages,
temperature=config.TEMP,
max_tokens=config.MAX_OUTPUT_TOKENS # ✅ FIXED - provide max_tokens to avoid NoneType error
)
translated_value = content.strip() # ✅ FIXED - use content from unpacked tuple
metadata[f"original_{field_name}"] = original_value
metadata[field_name] = translated_value
metadata[f"{field_name}_translated"] = True
print(f"✅ Translated {field_name}: {translated_value}")
except Exception as e:
print(f"❌ Failed to translate {field_name}: {e}")
else:
if check_stop():
print("❌ Metadata translation stopped by user")
break
else:
print("📋 No additional metadata fields to translate")
except Exception as e:
print(f"⚠️ Error processing metadata translation settings: {e}")
import traceback
traceback.print_exc()
with open(metadata_path, 'w', encoding='utf-8') as mf:
json.dump(metadata, mf, ensure_ascii=False, indent=2)
print(f"💾 Saved metadata with {'translated' if metadata.get('title_translated', False) else 'original'} title")
print("\n" + "="*50)
print("📑 GLOSSARY GENERATION PHASE")
print("="*50)
print(f"📑 DEBUG: ENABLE_AUTO_GLOSSARY = '{os.getenv('ENABLE_AUTO_GLOSSARY', 'NOT SET')}'")
print(f"📑 DEBUG: MANUAL_GLOSSARY = '{config.MANUAL_GLOSSARY}'")
print(f"📑 DEBUG: Manual glossary exists? {os.path.isfile(config.MANUAL_GLOSSARY) if config.MANUAL_GLOSSARY else False}")
# Check if glossary.csv already exists in the source folder
existing_glossary_csv = os.path.join(out, "glossary.csv")
existing_glossary_json = os.path.join(out, "glossary.json")
print(f"📑 DEBUG: Existing glossary.csv? {os.path.exists(existing_glossary_csv)}")
print(f"📑 DEBUG: Existing glossary.json? {os.path.exists(existing_glossary_json)}")
if config.MANUAL_GLOSSARY and os.path.isfile(config.MANUAL_GLOSSARY):
ext = os.path.splitext(config.MANUAL_GLOSSARY)[1].lower()
target_name = "glossary.csv" if ext == ".csv" else "glossary.json"
target_path = os.path.join(out, target_name)
if os.path.abspath(config.MANUAL_GLOSSARY) != os.path.abspath(target_path):
shutil.copy(config.MANUAL_GLOSSARY, target_path)
print("📑 Using manual glossary from:", config.MANUAL_GLOSSARY)
else:
print("📑 Using existing glossary:", config.MANUAL_GLOSSARY)
elif os.path.exists(existing_glossary_csv) or os.path.exists(existing_glossary_json):
print("📑 Existing glossary file detected in source folder - skipping automatic generation")
if os.path.exists(existing_glossary_csv):
print(f"📑 Using existing glossary.csv: {existing_glossary_csv}")
elif os.path.exists(existing_glossary_json):
print(f"📑 Using existing glossary.json: {existing_glossary_json}")
elif os.getenv("ENABLE_AUTO_GLOSSARY", "0") == "1":
model = os.getenv("MODEL", "gpt-4")
if is_traditional_translation_api(model):
print("📑 Automatic glossary generation disabled")
print(f" {model} does not support glossary extraction")
print(" Traditional translation APIs cannot identify character names/terms")
else:
print("📑 Starting automatic glossary generation...")
try:
# Use the new process-safe glossary worker
from glossary_process_worker import generate_glossary_in_process
import concurrent.futures
import multiprocessing
instructions = ""
# Get extraction workers setting
extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))
if extraction_workers == 1:
# Auto-detect for better performance
extraction_workers = min(os.cpu_count() or 4, 4)
print(f"📑 Using {extraction_workers} CPU cores for glossary generation")
# Collect environment variables to pass to subprocess
env_vars = {}
important_vars = [
'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES',
'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS',
'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'AUTO_GLOSSARY_PROMPT',
'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV', 'GLOSSARY_PARALLEL_ENABLED',
'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK', 'GLOSSARY_SKIP_ALL_VALIDATION',
'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY', 'MAX_OUTPUT_TOKENS',
'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY'
]
for var in important_vars:
if var in os.environ:
env_vars[var] = os.environ[var]
# Create a Queue for real-time log streaming
manager = multiprocessing.Manager()
log_queue = manager.Queue()
# Use ProcessPoolExecutor for true parallelism (completely bypasses GIL)
print("📑 Starting glossary generation in separate process...")
with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
# Submit to separate process WITH log queue
future = executor.submit(
generate_glossary_in_process,
out,
chapters,
instructions,
env_vars,
log_queue # Pass the queue for real-time logs
)
# Poll for completion and stream logs in real-time
poll_count = 0
while not future.done():
poll_count += 1
# Check for logs from subprocess and print them immediately
try:
while not log_queue.empty():
log_line = log_queue.get_nowait()
print(log_line) # Print to GUI
except:
pass
# Super short sleep to yield to GUI
time.sleep(0.001)
# Check for stop every 100 polls
if poll_count % 100 == 0:
if check_stop():
print("📑 ❌ Glossary generation cancelled")
executor.shutdown(wait=False, cancel_futures=True)
return
# Get any remaining logs from queue
try:
while not log_queue.empty():
log_line = log_queue.get_nowait()
print(log_line)
except:
pass
# Get result
if future.done():
try:
result = future.result(timeout=0.1)
if isinstance(result, dict):
if result.get('success'):
print(f"📑 ✅ Glossary generation completed successfully")
else:
print(f"📑 ❌ Glossary generation failed: {result.get('error')}")
if result.get('traceback'):
print(f"📑 Error details:\n{result.get('traceback')}")
except Exception as e:
print(f"📑 ❌ Error retrieving glossary result: {e}")
print("✅ Automatic glossary generation COMPLETED")
# Handle deferred glossary appending
if os.getenv('DEFER_GLOSSARY_APPEND') == '1':
print("📑 Processing deferred glossary append to system prompt...")
glossary_path = find_glossary_file(out)
if glossary_path and os.path.exists(glossary_path):
try:
glossary_block = None
if glossary_path.lower().endswith('.csv'):
with open(glossary_path, 'r', encoding='utf-8') as f:
glossary_block = f.read()
else:
with open(glossary_path, 'r', encoding='utf-8') as f:
glossary_data = json.load(f)
formatted_entries = {}
if isinstance(glossary_data, dict) and 'entries' in glossary_data:
formatted_entries = glossary_data['entries']
elif isinstance(glossary_data, dict):
formatted_entries = {k: v for k, v in glossary_data.items() if k != "metadata"}
if formatted_entries:
glossary_block = json.dumps(formatted_entries, ensure_ascii=False, indent=2)
else:
glossary_block = None
if glossary_block:
glossary_prompt = os.getenv('GLOSSARY_APPEND_PROMPT',
"Character/Term Glossary (use these translations consistently):")
current_prompt = config.PROMPT
if current_prompt:
current_prompt += "\n\n"
current_prompt += f"{glossary_prompt}\n{glossary_block}"
config.PROMPT = current_prompt
print(f"✅ Added auto-generated glossary to system prompt ({os.path.basename(glossary_path)})")
if 'DEFER_GLOSSARY_APPEND' in os.environ:
del os.environ['DEFER_GLOSSARY_APPEND']
if 'GLOSSARY_APPEND_PROMPT' in os.environ:
del os.environ['GLOSSARY_APPEND_PROMPT']
else:
print("⚠️ Auto-generated glossary has no entries - skipping append")
if 'DEFER_GLOSSARY_APPEND' in os.environ:
del os.environ['DEFER_GLOSSARY_APPEND']
if 'GLOSSARY_APPEND_PROMPT' in os.environ:
del os.environ['GLOSSARY_APPEND_PROMPT']
except Exception as e:
print(f"⚠️ Failed to append auto-generated glossary: {e}")
else:
print("⚠️ No glossary file found after automatic generation")
except Exception as e:
print(f"❌ Glossary generation failed: {e}")
else:
print("📑 Automatic glossary generation disabled")
# Don't create an empty glossary - let any existing manual glossary remain
glossary_file = find_glossary_file(out)
if glossary_file and os.path.exists(glossary_file):
try:
if glossary_file.lower().endswith('.csv'):
# Quick CSV stats
with open(glossary_file, 'r', encoding='utf-8') as f:
lines = [ln.strip() for ln in f.readlines() if ln.strip()]
entry_count = max(0, len(lines) - 1) if lines and ',' in lines[0] else len(lines)
print(f"📑 Glossary ready (CSV) with {entry_count} entries")
print("📑 Sample glossary lines:")
for ln in lines[1:4]:
print(f" • {ln}")
else:
with open(glossary_file, 'r', encoding='utf-8') as f:
glossary_data = json.load(f)
if isinstance(glossary_data, dict):
if 'entries' in glossary_data and isinstance(glossary_data['entries'], dict):
entry_count = len(glossary_data['entries'])
sample_items = list(glossary_data['entries'].items())[:3]
else:
entry_count = len(glossary_data)
sample_items = list(glossary_data.items())[:3]
print(f"📑 Glossary ready with {entry_count} entries")
print("📑 Sample glossary entries:")
for key, value in sample_items:
print(f" • {key} → {value}")
elif isinstance(glossary_data, list):
print(f"📑 Glossary ready with {len(glossary_data)} entries")
print("📑 Sample glossary entries:")
for i, entry in enumerate(glossary_data[:3]):
if isinstance(entry, dict):
original = entry.get('original_name', '?')
translated = entry.get('name', original)
print(f" • {original} → {translated}")
else:
print(f"⚠️ Unexpected glossary format: {type(glossary_data)}")
except Exception as e:
print(f"⚠️ Failed to inspect glossary file: {e}")
else:
print("📑 No glossary file found")
print("="*50)
print("🚀 STARTING MAIN TRANSLATION PHASE")
print("="*50 + "\n")
glossary_path = find_glossary_file(out)
if glossary_path and os.path.exists(glossary_path) and glossary_path.lower().endswith('.json'):
try:
with open(glossary_path, 'r', encoding='utf-8') as f:
g_data = json.load(f)
print(f"[DEBUG] Glossary type before translation: {type(g_data)}")
if isinstance(g_data, list):
print(f"[DEBUG] Glossary is a list")
except Exception as e:
print(f"[DEBUG] Error checking glossary: {e}")
glossary_path = find_glossary_file(out)
system = build_system_prompt(config.SYSTEM_PROMPT, glossary_path)
base_msg = [{"role": "system", "content": system}]
# Preserve the original system prompt to avoid in-place mutations
original_system_prompt = system
last_summary_block_text = None # Will hold the last rolling summary text for the NEXT chapter only
image_translator = None
if config.ENABLE_IMAGE_TRANSLATION:
print(f"🖼️ Image translation enabled for model: {config.MODEL}")
print("🖼️ Image translation will use your custom system prompt and glossary")
image_translator = ImageTranslator(
client,
out,
config.PROFILE_NAME,
system,
config.TEMP,
log_callback ,
progress_manager,
history_manager,
chunk_context_manager
)
known_vision_models = [
'gemini-1.5-pro', 'gemini-1.5-flash', 'gemini-2.0-flash', 'gemini-2.5-flash', 'gemini-2.5-pro',
'gpt-4-turbo', 'gpt-4o', 'gpt-4.1-mini', 'gpt-4.1-nano', 'o4-mini'
]
if config.MODEL.lower() not in known_vision_models:
print(f"⚠️ Note: {config.MODEL} may not have vision capabilities. Image translation will be attempted anyway.")
else:
print("ℹ️ Image translation disabled by user")
total_chapters = len(chapters)
# Only detect numbering if the toggle is not disabled
if config.DISABLE_ZERO_DETECTION:
print(f"📊 0-based detection disabled by user setting")
uses_zero_based = False
# Important: Set a flag that can be checked throughout the codebase
config._force_disable_zero_detection = True
else:
if chapters:
uses_zero_based = detect_novel_numbering(chapters)
print(f"📊 Novel numbering detected: {'0-based' if uses_zero_based else '1-based'}")
else:
uses_zero_based = False
config._force_disable_zero_detection = False
# Store this for later use
config._uses_zero_based = uses_zero_based
rng = os.getenv("CHAPTER_RANGE", "")
start = None
end = None
if rng and re.match(r"^\d+\s*-\s*\d+$", rng):
start, end = map(int, rng.split("-", 1))
if config.DISABLE_ZERO_DETECTION:
print(f"📊 0-based detection disabled - using range as specified: {start}-{end}")
elif uses_zero_based:
print(f"📊 0-based novel detected")
print(f"📊 User range {start}-{end} will be used as-is (chapters are already adjusted)")
else:
print(f"📊 1-based novel detected")
print(f"📊 Using range as specified: {start}-{end}")
print("📊 Calculating total chunks needed...")
total_chunks_needed = 0
chunks_per_chapter = {}
chapters_to_process = 0
# When setting actual chapter numbers (in the main function)
for idx, c in enumerate(chapters):
chap_num = c["num"]
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
# Extract the raw chapter number from the file
raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)
#print(f"[DEBUG] Extracted raw_num={raw_num} from {c.get('original_basename', 'unknown')}")
# Apply the offset
offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0
raw_num += offset
# When toggle is disabled, use raw numbers without any 0-based adjustment
if config.DISABLE_ZERO_DETECTION:
c['actual_chapter_num'] = raw_num
# Store raw number for consistency
c['raw_chapter_num'] = raw_num
c['zero_adjusted'] = False
else:
# Store raw number
c['raw_chapter_num'] = raw_num
# Apply adjustment only if this is a 0-based novel
if uses_zero_based:
c['actual_chapter_num'] = raw_num + 1
c['zero_adjusted'] = True
else:
c['actual_chapter_num'] = raw_num
c['zero_adjusted'] = False
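# Example: a file yielding raw_num=0 in a 0-based novel becomes actual_chapter_num=1 (zero_adjusted=True);
# with DISABLE_ZERO_DETECTION or a 1-based novel the raw number is kept unchanged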
# Now we can safely use actual_num
actual_num = c['actual_chapter_num']
if start is not None:
if not (start <= c['actual_chapter_num'] <= end):
#print(f"[SKIP] Chapter {c['actual_chapter_num']} outside range {start}-{end}")
continue
needs_translation, skip_reason, _ = progress_manager.check_chapter_status(
idx, actual_num, content_hash, out
)
if not needs_translation:
chunks_per_chapter[idx] = 0
continue
chapters_to_process += 1
chapter_key = str(actual_num)
if chapter_key in progress_manager.prog["chapters"] and progress_manager.prog["chapters"][chapter_key].get("status") == "in_progress":
pass
# Calculate based on OUTPUT limit only
max_output_tokens = config.MAX_OUTPUT_TOKENS
safety_margin_output = 500
# Korean to English typically compresses to 0.7-0.9x
compression_factor = config.COMPRESSION_FACTOR
available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
# Ensure minimum
available_tokens = max(available_tokens, 1000)
#print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")
# For mixed content chapters, calculate on clean text
# Pass the body as-is in both cases; the chunk estimate may be slightly off for
# image-heavy chapters, but c["body"] is never modified during chunk calculation
chunks = chapter_splitter.split_chapter(c["body"], available_tokens)
chapter_key_str = content_hash
old_key_str = str(idx)
if chapter_key_str not in progress_manager.prog.get("chapter_chunks", {}) and old_key_str in progress_manager.prog.get("chapter_chunks", {}):
progress_manager.prog["chapter_chunks"][chapter_key_str] = progress_manager.prog["chapter_chunks"][old_key_str]
del progress_manager.prog["chapter_chunks"][old_key_str]
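# Chunk progress was previously keyed by the chapter's list index; it is now keyed by the
# chapter's content hash so progress survives re-extraction or re-ordering. The block above
# migrates an old index-keyed entry to the hash key the first time it is encountered.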
#print(f"[PROGRESS] Migrated chunks for chapter {actual_num} to new tracking system")
# Always count actual chunks - ignore "completed" tracking
chunks_per_chapter[idx] = len(chunks)
total_chunks_needed += chunks_per_chapter[idx]
terminology = "Sections" if is_text_file else "Chapters"
print(f"📊 Total chunks to translate: {total_chunks_needed}")
print(f"📚 {terminology} to process: {chapters_to_process}")
multi_chunk_chapters = [(idx, count) for idx, count in chunks_per_chapter.items() if count > 1]
if multi_chunk_chapters:
# Determine terminology based on file type
terminology = "Sections" if is_text_file else "Chapters"
print(f"📄 {terminology} requiring multiple chunks:")
for idx, chunk_count in multi_chunk_chapters:
chap = chapters[idx]
section_term = "Section" if is_text_file else "Chapter"
print(f" • {section_term} {idx+1} ({chap['title'][:30]}...): {chunk_count} chunks")
translation_start_time = time.time()
chunks_completed = 0
chapters_completed = 0
current_chunk_number = 0
if config.BATCH_TRANSLATION:
print(f"\n📦 PARALLEL TRANSLATION MODE ENABLED")
print(f"📦 Processing chapters with up to {config.BATCH_SIZE} concurrent API calls")
import concurrent.futures
from threading import Lock
progress_lock = Lock()
chapters_to_translate = []
# FIX: First pass to set actual chapter numbers for ALL chapters
# This ensures batch mode has the same chapter numbering as non-batch mode
print("📊 Setting chapter numbers...")
for idx, c in enumerate(chapters):
raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)
# Apply offset if configured
offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0
raw_num += offset
if config.DISABLE_ZERO_DETECTION:
# Use raw numbers without adjustment
c['actual_chapter_num'] = raw_num
c['raw_chapter_num'] = raw_num
c['zero_adjusted'] = False
else:
# Store raw number
c['raw_chapter_num'] = raw_num
# Apply 0-based adjustment if detected
if uses_zero_based:
c['actual_chapter_num'] = raw_num + 1
c['zero_adjusted'] = True
else:
c['actual_chapter_num'] = raw_num
c['zero_adjusted'] = False
for idx, c in enumerate(chapters):
chap_num = c["num"]
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
# Check if this is a pre-split text chunk with decimal number
if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)):
actual_num = c['num'] # Preserve the decimal for text files only
else:
actual_num = c.get('actual_chapter_num', c['num']) # Now this will exist!
# Skip chapters outside the range
if start is not None and not (start <= actual_num <= end):
continue
# Check if chapter needs translation
needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status(
idx, actual_num, content_hash, out, c # Pass the chapter object
)
# Add explicit file check for supposedly completed chapters
if not needs_translation and existing_file:
file_path = os.path.join(out, existing_file)
if not os.path.exists(file_path):
print(f"⚠️ Output file missing for chapter {actual_num}: {existing_file}")
needs_translation = True
skip_reason = None
# Update status to file_missing
progress_manager.update(idx, actual_num, content_hash, None, status="file_missing")
progress_manager.save()
if not needs_translation:
# Modify skip_reason to use appropriate terminology
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
# Replace "Chapter" with appropriate terminology in skip_reason
skip_reason_modified = skip_reason.replace("Chapter", terminology)
print(f"[SKIP] {skip_reason_modified}")
chapters_completed += 1
continue
# Check for empty or image-only chapters
has_images = c.get('has_images', False)
has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"])
text_size = c.get('file_size', 0)
is_empty_chapter = (not has_images and text_size < 10)
is_image_only_chapter = (has_images and not has_meaningful_text)
# Handle empty chapters
if is_empty_chapter:
print(f"📄 Empty chapter {chap_num} - will process individually")
safe_title = make_safe_filename(c['title'], c['num'])
fname = FileUtilities.create_chapter_filename(c, c['num'])
with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
f.write(c["body"])
progress_manager.update(idx, actual_num, content_hash, fname, status="completed_empty")
progress_manager.save()
chapters_completed += 1
continue
# Add to chapters to translate
chapters_to_translate.append((idx, c))
print(f"📊 Found {len(chapters_to_translate)} chapters to translate in parallel")
# Continue with the rest of the existing batch processing code...
batch_processor = BatchTranslationProcessor(
config, client, base_msg, out, progress_lock,
progress_manager.save,
lambda idx, actual_num, content_hash, output_file=None, status="completed", **kwargs: progress_manager.update(idx, actual_num, content_hash, output_file, status, **kwargs),
check_stop,
image_translator,
is_text_file=is_text_file
)
total_to_process = len(chapters_to_translate)
processed = 0
# Apply conservative batching setting
batch_multiplier = 3 if os.getenv('CONSERVATIVE_BATCHING', '0') == '1' else 1
batch_group_size = config.BATCH_SIZE * batch_multiplier
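# Example (illustrative): with BATCH_SIZE=10 and CONSERVATIVE_BATCHING=1, chapters are
# submitted in groups of 30 while only 10 API calls run concurrently, since the executor
# below is capped at max_workers=config.BATCH_SIZE.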
if batch_multiplier > 1:
print(f"📦 Using conservative batching: {batch_group_size} chapters per group, {config.BATCH_SIZE} parallel")
else:
print(f"📦 Using direct batching (default): {batch_group_size} chapters per group, {config.BATCH_SIZE} parallel")
with concurrent.futures.ThreadPoolExecutor(max_workers=config.BATCH_SIZE) as executor:
for batch_start in range(0, total_to_process, batch_group_size):
if check_stop():
print("❌ Translation stopped during parallel processing")
executor.shutdown(wait=False)
return
batch_end = min(batch_start + batch_group_size, total_to_process)
current_batch = chapters_to_translate[batch_start:batch_end]
batch_number = (batch_start // batch_group_size) + 1
print(f"\n📦 Submitting batch {batch_number}: {len(current_batch)} chapters")
future_to_chapter = {
executor.submit(batch_processor.process_single_chapter, chapter_data): chapter_data
for chapter_data in current_batch
}
active_count = 0
completed_in_batch = 0
failed_in_batch = 0
for future in concurrent.futures.as_completed(future_to_chapter):
if check_stop():
print("❌ Translation stopped")
executor.shutdown(wait=False)
return
chapter_data = future_to_chapter[future]
idx, chapter = chapter_data
try:
success, chap_num = future.result()
if success:
completed_in_batch += 1
print(f"✅ Chapter {chap_num} done ({completed_in_batch + failed_in_batch}/{len(current_batch)} in batch)")
else:
failed_in_batch += 1
print(f"❌ Chapter {chap_num} failed ({completed_in_batch + failed_in_batch}/{len(current_batch)} in batch)")
except Exception as e:
failed_in_batch += 1
print(f"❌ Chapter thread error: {e}")
processed += 1
progress_percent = (processed / total_to_process) * 100
print(f"📊 Overall Progress: {processed}/{total_to_process} ({progress_percent:.1f}%)")
print(f"\n📦 Batch Summary:")
print(f" ✅ Successful: {completed_in_batch}")
print(f" ❌ Failed: {failed_in_batch}")
if batch_end < total_to_process:
print(f"⏳ Waiting {config.DELAY}s before next batch...")
time.sleep(config.DELAY)
chapters_completed = batch_processor.chapters_completed
chunks_completed = batch_processor.chunks_completed
print(f"\n🎉 Parallel translation complete!")
print(f" Total chapters processed: {processed}")
# Count qa_failed chapters correctly
qa_failed_count = 0
actual_successful = 0
for idx, c in enumerate(chapters):
# Get the chapter's actual number
if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)):
actual_num = c['num']
else:
actual_num = c.get('actual_chapter_num', c['num'])
# Check if this chapter was processed and has qa_failed status
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
# Check if this chapter exists in progress
chapter_info = progress_manager.prog["chapters"].get(content_hash, {})
status = chapter_info.get("status")
if status == "qa_failed":
qa_failed_count += 1
elif status == "completed":
actual_successful += 1
# Correct the displayed counts
print(f" Successful: {actual_successful}")
if qa_failed_count > 0:
print(f"\n⚠️ {qa_failed_count} chapters failed due to content policy violations:")
qa_failed_chapters = []
for idx, c in enumerate(chapters):
if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)):
actual_num = c['num']
else:
actual_num = c.get('actual_chapter_num', c['num'])
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
chapter_info = progress_manager.prog["chapters"].get(content_hash, {})
if chapter_info.get("status") == "qa_failed":
qa_failed_chapters.append(actual_num)
print(f" Failed chapters: {', '.join(map(str, sorted(qa_failed_chapters)))}")
# Stop translation completely after batch mode
print("\n📌 Batch translation completed.")
elif not config.BATCH_TRANSLATION:
translation_processor = TranslationProcessor(config, client, out, log_callback, check_stop, uses_zero_based, is_text_file)
if config.DUPLICATE_DETECTION_MODE == 'ai-hunter':
# Build the main config from environment variables and config object
main_config = {
'duplicate_lookback_chapters': config.DUPLICATE_LOOKBACK_CHAPTERS,
'duplicate_detection_mode': config.DUPLICATE_DETECTION_MODE,
}
# Check if AI Hunter config was passed via environment variable
ai_hunter_config_str = os.getenv('AI_HUNTER_CONFIG')
if ai_hunter_config_str:
try:
ai_hunter_config = json.loads(ai_hunter_config_str)
main_config['ai_hunter_config'] = ai_hunter_config
print("🤖 AI Hunter: Loaded configuration from environment")
except json.JSONDecodeError:
print("⚠️ AI Hunter: Failed to parse AI_HUNTER_CONFIG from environment")
# If no AI Hunter config in environment, try to load from file as fallback
if 'ai_hunter_config' not in main_config:
# Try multiple locations for config.json
config_paths = [
os.path.join(os.getcwd(), 'config.json'),
os.path.join(out, '..', 'config.json'),
]
if getattr(sys, 'frozen', False):
config_paths.append(os.path.join(os.path.dirname(sys.executable), 'config.json'))
else:
script_dir = os.path.dirname(os.path.abspath(__file__))
config_paths.extend([
os.path.join(script_dir, 'config.json'),
os.path.join(os.path.dirname(script_dir), 'config.json')
])
for config_path in config_paths:
if os.path.exists(config_path):
try:
with open(config_path, 'r', encoding='utf-8') as f:
file_config = json.load(f)
if 'ai_hunter_config' in file_config:
main_config['ai_hunter_config'] = file_config['ai_hunter_config']
print(f"🤖 AI Hunter: Loaded configuration from {config_path}")
break
except Exception as e:
print(f"⚠️ Failed to load config from {config_path}: {e}")
# Always create and inject the improved AI Hunter when ai-hunter mode is selected
ai_hunter = ImprovedAIHunterDetection(main_config)
# The TranslationProcessor class has a method that checks for duplicates
# We need to replace it with our enhanced AI Hunter
# Create a wrapper to match the expected signature
def enhanced_duplicate_check(self, result, idx, prog, out, actual_num=None):
# If actual_num is not provided, try to get it from progress
if actual_num is None:
# Look for the chapter being processed
for ch_key, ch_info in prog.get("chapters", {}).items():
if ch_info.get("chapter_idx") == idx:
actual_num = ch_info.get("actual_num", idx + 1)
break
# Fallback to idx+1 if not found
if actual_num is None:
actual_num = idx + 1
return ai_hunter.detect_duplicate_ai_hunter_enhanced(result, idx, prog, out, actual_num)
# Bind the enhanced method to the processor instance
translation_processor.check_duplicate_content = enhanced_duplicate_check.__get__(translation_processor, TranslationProcessor)
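# Binding note: enhanced_duplicate_check.__get__(instance, cls) uses Python's descriptor
# protocol to produce a bound method, so the processor instance is passed as `self` when
# check_duplicate_content is called, without subclassing TranslationProcessor.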
print("🤖 AI Hunter: Using enhanced detection with configurable thresholds")
# First pass: set actual chapter numbers respecting the config
for idx, c in enumerate(chapters):
raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)
#print(f"[DEBUG] Extracted raw_num={raw_num} from {c.get('original_basename', 'unknown')}")
# Apply offset if configured
offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0
raw_num += offset
if config.DISABLE_ZERO_DETECTION:
# Use raw numbers without adjustment
c['actual_chapter_num'] = raw_num
c['raw_chapter_num'] = raw_num
c['zero_adjusted'] = False
else:
# Store raw number
c['raw_chapter_num'] = raw_num
# Apply 0-based adjustment if detected
if uses_zero_based:
c['actual_chapter_num'] = raw_num + 1
c['zero_adjusted'] = True
else:
c['actual_chapter_num'] = raw_num
c['zero_adjusted'] = False
# Second pass: process chapters
for idx, c in enumerate(chapters):
chap_num = c["num"]
# Check if this is a pre-split text chunk with decimal number
if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)):
actual_num = c['num'] # Preserve the decimal for text files only
else:
actual_num = c.get('actual_chapter_num', c['num'])
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
if start is not None and not (start <= actual_num <= end):
#print(f"[SKIP] Chapter {actual_num} (file: {c.get('original_basename', 'unknown')}) outside range {start}-{end}")
continue
needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status(
idx, actual_num, content_hash, out, c # Pass the chapter object
)
# Add explicit file check for supposedly completed chapters
if not needs_translation and existing_file:
file_path = os.path.join(out, existing_file)
if not os.path.exists(file_path):
print(f"⚠️ Output file missing for chapter {actual_num}: {existing_file}")
needs_translation = True
skip_reason = None
# Update status to file_missing
progress_manager.update(idx, actual_num, content_hash, None, status="file_missing")
progress_manager.save()
if not needs_translation:
# Modify skip_reason to use appropriate terminology
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
# Replace "Chapter" with appropriate terminology in skip_reason
skip_reason_modified = skip_reason.replace("Chapter", terminology)
print(f"[SKIP] {skip_reason_modified}")
continue
chapter_position = f"{chapters_completed + 1}/{chapters_to_process}"
# Determine if this is a text file
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
# Determine file reference based on type
if c.get('is_chunk', False):
file_ref = f"Section_{c['num']}"
else:
file_ref = c.get('original_basename', f'{terminology}_{actual_num}')
print(f"\n🔄 Processing #{idx+1}/{total_chapters} (Actual: {terminology} {actual_num}) ({chapter_position} to translate): {c['title']} [File: {file_ref}]")
chunk_context_manager.start_chapter(chap_num, c['title'])
has_images = c.get('has_images', False)
has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"])
text_size = c.get('file_size', 0)
is_empty_chapter = (not has_images and text_size < 10)
is_image_only_chapter = (has_images and not has_meaningful_text)
is_mixed_content = (has_images and has_meaningful_text)
is_text_only = (not has_images and has_meaningful_text)
if is_empty_chapter:
print(f"📄 Empty chapter {actual_num} detected")
# Create filename for empty chapter
if isinstance(c['num'], float):
fname = FileUtilities.create_chapter_filename(c, c['num'])
else:
fname = FileUtilities.create_chapter_filename(c, actual_num)
# Save original content
with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
f.write(c["body"])
# Update progress tracking
progress_manager.update(idx, actual_num, content_hash, fname, status="completed_empty")
progress_manager.save()
chapters_completed += 1
# CRITICAL: Skip translation!
continue
elif is_image_only_chapter:
print(f"📸 Image-only chapter: {c.get('image_count', 0)} images")
translated_html = c["body"]
image_translations = {}
# Step 1: Process images if image translation is enabled
if image_translator and config.ENABLE_IMAGE_TRANSLATION:
print(f"🖼️ Translating {c.get('image_count', 0)} images...")
image_translator.set_current_chapter(chap_num)
translated_html, image_translations = process_chapter_images(
c["body"],
actual_num,
image_translator,
check_stop
)
if image_translations:
print(f"✅ Translated {len(image_translations)} images")
# Step 2: Check for headers/titles that need translation
from bs4 import BeautifulSoup
soup = BeautifulSoup(c["body"], 'html.parser')
# Look for headers
headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title'])
# If we have headers, we should translate them even in "image-only" chapters
if headers and any(h.get_text(strip=True) for h in headers):
print(f"📝 Found headers to translate in image-only chapter")
# Create a minimal HTML with just the headers for translation
headers_html = ""
for header in headers:
if header.get_text(strip=True):
headers_html += str(header) + "\n"
if headers_html:
print(f"📤 Translating chapter headers...")
# Send just the headers for translation
header_msgs = base_msg + [{"role": "user", "content": headers_html}]
# Use the standard filename
fname = FileUtilities.create_chapter_filename(c, actual_num)
client.set_output_filename(fname)
# Simple API call for headers
header_result, _ = client.send(
header_msgs,
temperature=config.TEMP,
max_tokens=config.MAX_OUTPUT_TOKENS
)
if header_result:
# Clean the result
header_result = re.sub(r"^```(?:html)?\s*\n?", "", header_result, count=1, flags=re.MULTILINE)
header_result = re.sub(r"\n?```\s*$", "", header_result, count=1, flags=re.MULTILINE)
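# Example: a reply wrapped as ```html ... ``` has its opening and closing fences stripped by
# the two substitutions above, leaving only the inner HTML; unfenced replies pass through unchanged.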
# Parse both the translated headers and the original body
soup_headers = BeautifulSoup(header_result, 'html.parser')
soup_body = BeautifulSoup(translated_html, 'html.parser')
# Replace headers in the body with translated versions
translated_headers = soup_headers.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title'])
original_headers = soup_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title'])
# Match and replace headers
for orig, trans in zip(original_headers, translated_headers):
if trans and trans.get_text(strip=True):
orig.string = trans.get_text(strip=True)
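# zip() pairs original and translated headers in document order and stops at the shorter
# list, so any headers without a translated counterpart are left untouched.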
translated_html = str(soup_body)
print(f"✅ Headers translated successfully")
status = "completed"
else:
print(f"⚠️ Failed to translate headers")
status = "completed_image_only"
else:
status = "completed_image_only"
else:
print(f"ℹ️ No headers found to translate")
status = "completed_image_only"
# Step 3: Save with correct filename
fname = FileUtilities.create_chapter_filename(c, actual_num)
with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
f.write(translated_html)
print(f"[Chapter {idx+1}/{total_chapters}] ✅ Saved image-only chapter")
progress_manager.update(idx, actual_num, content_hash, fname, status=status)
progress_manager.save()
chapters_completed += 1
continue
else:
# Set default text to translate
text_to_translate = c["body"]
image_translations = {}
if is_mixed_content and image_translator and config.ENABLE_IMAGE_TRANSLATION:
print(f"🖼️ Processing {c.get('image_count', 0)} images first...")
print(f"[DEBUG] Content before image processing (first 200 chars):")
print(c["body"][:200])
print(f"[DEBUG] Has h1 tags: {'' in c['body']}")
print(f"[DEBUG] Has h2 tags: {'' in c['body']}")
image_translator.set_current_chapter(chap_num)
# Store the original body before processing
original_body = c["body"]
# Calculate original chapter tokens before modification
original_chapter_tokens = chapter_splitter.count_tokens(original_body)
# Process images and get body with translations
body_with_images, image_translations = process_chapter_images(
c["body"],
actual_num,
image_translator,
check_stop
)
if image_translations:
print(f"✅ Translated {len(image_translations)} images")
# Store the body with images for later merging
c["body_with_images"] = c["body"]
# For chapters with only images and title, we still need to translate the title
# Extract clean text for translation from ORIGINAL body
from bs4 import BeautifulSoup
soup_clean = BeautifulSoup(original_body, 'html.parser')
# Remove images from the original to get pure text
for img in soup_clean.find_all('img'):
img.decompose()
# Set clean text for translation - use prettify() or str() on the full document
c["body"] = str(soup_clean) if soup_clean.body else original_body
# If there's no meaningful text content after removing images,
# the text translation will just translate the title, which is correct
print(f" 📝 Clean text for translation: {len(c['body'])} chars")
# Update text_size to reflect actual text to translate
text_size = len(c["body"])
# Recalculate the actual token count for clean text
actual_text_tokens = chapter_splitter.count_tokens(c["body"])
print(f" 📊 Actual text tokens: {actual_text_tokens} (was counting {original_chapter_tokens} with images)")
else:
print(f"ℹ️ No translatable text found in images")
# Keep original body if no image translations
c["body"] = original_body
print(f"📖 Translating text content ({text_size} characters)")
progress_manager.update(idx, actual_num, content_hash, output_file=None, status="in_progress")
progress_manager.save()
# Apply ignore filtering to the content before chunk splitting
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
if (ignore_title_tag or ignore_header_tags) and c["body"]:
from bs4 import BeautifulSoup
content_soup = BeautifulSoup(c["body"], 'html.parser')
# Remove title tags if ignored
if ignore_title_tag:
for title_tag in content_soup.find_all('title'):
title_tag.decompose()
# Remove header tags if ignored
if ignore_header_tags:
for header_tag in content_soup.find_all(['h1', 'h2', 'h3']):
header_tag.decompose()
c["body"] = str(content_soup) # Update the chapter body
# Check if this chapter is already a chunk from text file splitting
if c.get('is_chunk', False):
# This is already a pre-split chunk, but still check if it needs further splitting
# Calculate based on OUTPUT limit only
max_output_tokens = config.MAX_OUTPUT_TOKENS
safety_margin_output = 500
# CJK to English typically compresses to 0.7-0.9x
compression_factor = config.COMPRESSION_FACTOR
available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
# Ensure minimum
available_tokens = max(available_tokens, 1000)
print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")
chapter_tokens = chapter_splitter.count_tokens(c["body"])
if chapter_tokens > available_tokens:
# Even pre-split chunks might need further splitting
chunks = chapter_splitter.split_chapter(c["body"], available_tokens)
print(f"📄 Section {c['num']} (pre-split from text file) needs further splitting into {len(chunks)} chunks")
else:
chunks = [(c["body"], 1, 1)]
print(f"📄 Section {c['num']} (pre-split from text file)")
else:
# Normal splitting logic for non-text files
# Calculate based on OUTPUT limit only
max_output_tokens = config.MAX_OUTPUT_TOKENS
safety_margin_output = 500
# CJK to English typically compresses to 0.7-0.9x
compression_factor = config.COMPRESSION_FACTOR
available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
# Ensure minimum
available_tokens = max(available_tokens, 1000)
print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")
chunks = chapter_splitter.split_chapter(c["body"], available_tokens)
# Use consistent terminology
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
print(f"📄 {terminology} will be processed in {len(chunks)} chunk(s)")
# Recalculate tokens on the actual text to be translated
actual_chapter_tokens = chapter_splitter.count_tokens(c["body"])
if len(chunks) > 1:
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
print(f" ℹ️ {terminology} size: {actual_chapter_tokens:,} tokens (limit: {available_tokens:,} tokens per chunk)")
else:
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
print(f" ℹ️ {terminology} size: {actual_chapter_tokens:,} tokens (within limit of {available_tokens:,} tokens)")
chapter_key_str = str(idx)
if chapter_key_str not in progress_manager.prog["chapter_chunks"]:
progress_manager.prog["chapter_chunks"][chapter_key_str] = {
"total": len(chunks),
"completed": [],
"chunks": {}
}
progress_manager.prog["chapter_chunks"][chapter_key_str]["total"] = len(chunks)
translated_chunks = []
for chunk_html, chunk_idx, total_chunks in chunks:
chapter_key_str = content_hash
old_key_str = str(idx)
if chapter_key_str not in progress_manager.prog.get("chapter_chunks", {}) and old_key_str in progress_manager.prog.get("chapter_chunks", {}):
progress_manager.prog["chapter_chunks"][chapter_key_str] = progress_manager.prog["chapter_chunks"][old_key_str]
del progress_manager.prog["chapter_chunks"][old_key_str]
#print(f"[PROGRESS] Migrated chunks for chapter {chap_num} to new tracking system")
if chapter_key_str not in progress_manager.prog["chapter_chunks"]:
progress_manager.prog["chapter_chunks"][chapter_key_str] = {
"total": len(chunks),
"completed": [],
"chunks": {}
}
progress_manager.prog["chapter_chunks"][chapter_key_str]["total"] = len(chunks)
# Get chapter status to check for qa_failed
chapter_info = progress_manager.prog["chapters"].get(chapter_key_str, {})
chapter_status = chapter_info.get("status")
if chapter_status == "qa_failed":
# Force retranslation of qa_failed chapters
print(f" [RETRY] Chunk {chunk_idx}/{total_chunks} - retranslating due to QA failure")
if config.CONTEXTUAL and history_manager.will_reset_on_next_append(config.HIST_LIMIT):
print(f" 📌 History will reset after this chunk (current: {len(history_manager.load_history())//2}/{config.HIST_LIMIT} exchanges)")
if check_stop():
print(f"❌ Translation stopped during chapter {actual_num}, chunk {chunk_idx}")
return
current_chunk_number += 1
progress_percent = (current_chunk_number / total_chunks_needed) * 100 if total_chunks_needed > 0 else 0
if chunks_completed > 0:
elapsed_time = time.time() - translation_start_time
avg_time_per_chunk = elapsed_time / chunks_completed
remaining_chunks = total_chunks_needed - current_chunk_number + 1
eta_seconds = remaining_chunks * avg_time_per_chunk
eta_hours = int(eta_seconds // 3600)
eta_minutes = int((eta_seconds % 3600) // 60)
eta_str = f"{eta_hours}h {eta_minutes}m" if eta_hours > 0 else f"{eta_minutes}m"
else:
eta_str = "calculating..."
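# Worked example (illustrative numbers): 40 chunks finished in 1800s gives ~45s per chunk;
# with 120 chunks counted as remaining, eta_seconds = 120 * 45 = 5400, shown as "1h 30m".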
if total_chunks > 1:
print(f" 🔄 Translating chunk {chunk_idx}/{total_chunks} for #{idx+1} (Overall: {current_chunk_number}/{total_chunks_needed} - {progress_percent:.1f}% - ETA: {eta_str})")
print(f" ⏳ Chunk size: {len(chunk_html):,} characters (~{chapter_splitter.count_tokens(chunk_html):,} tokens)")
else:
# Determine terminology and file reference
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
# Consistent file reference
if c.get('is_chunk', False):
file_ref = f"Section_{c['num']}"
else:
file_ref = c.get('original_basename', f'{terminology}_{actual_num}')
print(f" 📄 Translating {terminology.lower()} content (Overall: {current_chunk_number}/{total_chunks_needed} - {progress_percent:.1f}% - ETA: {eta_str}) [File: {file_ref}]")
print(f" 📊 {terminology} {actual_num} size: {len(chunk_html):,} characters (~{chapter_splitter.count_tokens(chunk_html):,} tokens)")
print(f" ℹ️ This may take 30-60 seconds. Stop will take effect after completion.")
if log_callback:
if hasattr(log_callback, '__self__') and hasattr(log_callback.__self__, 'append_chunk_progress'):
if total_chunks == 1:
# Determine terminology based on source type
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
log_callback.__self__.append_chunk_progress(
1, 1, "text",
f"{terminology} {actual_num}",
overall_current=current_chunk_number,
overall_total=total_chunks_needed,
extra_info=f"{len(chunk_html):,} chars"
)
else:
log_callback.__self__.append_chunk_progress(
chunk_idx,
total_chunks,
"text",
f"{terminology} {actual_num}",
overall_current=current_chunk_number,
overall_total=total_chunks_needed
)
else:
# Determine terminology based on source type
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
terminology_lower = "section" if is_text_source else "chapter"
if total_chunks == 1:
log_callback(f"📄 Processing {terminology} {actual_num} ({chapters_completed + 1}/{chapters_to_process}) - {progress_percent:.1f}% complete")
else:
log_callback(f"📄 processing chunk {chunk_idx}/{total_chunks} for {terminology_lower} {actual_num} - {progress_percent:.1f}% complete")
# Get custom chunk prompt template from environment
chunk_prompt_template = os.getenv("TRANSLATION_CHUNK_PROMPT", "[PART {chunk_idx}/{total_chunks}]\n{chunk_html}")
if total_chunks > 1:
user_prompt = chunk_prompt_template.format(
chunk_idx=chunk_idx,
total_chunks=total_chunks,
chunk_html=chunk_html
)
else:
user_prompt = chunk_html
if config.CONTEXTUAL:
history = history_manager.load_history()
trimmed = history[-config.HIST_LIMIT*2:]
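# Each history exchange is stored as a user/assistant message pair, so HIST_LIMIT exchanges
# correspond to HIST_LIMIT*2 list entries; the slice keeps only the most recent exchanges.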
chunk_context = chunk_context_manager.get_context_messages(limit=2)
else:
history = [] # Set empty history when not contextual
trimmed = []
chunk_context = []
# Build the current system prompt from the original each time, and append the last summary block if present
current_system_content = original_system_prompt
if config.USE_ROLLING_SUMMARY and last_summary_block_text:
current_system_content = (
current_system_content
+ "\n\n[Rolling Summary of Previous Chapter]\n"
+ "(For AI: Use as context only; do not include in output)\n"
+ last_summary_block_text
+ "\n[End of Rolling Summary]"
)
current_base = [{"role": "system", "content": current_system_content}]
# If we have a prepared rolling summary from previous chapter, include it as a separate message (do NOT mutate system prompt)
summary_msgs_list = []
if config.USE_ROLLING_SUMMARY and last_summary_block_text:
summary_msgs_list = [{
"role": os.getenv("SUMMARY_ROLE", "user"),
"content": (
"CONTEXT ONLY - DO NOT INCLUDE IN TRANSLATION:\n"
"[MEMORY] Previous context summary:\n\n"
f"{last_summary_block_text}\n\n"
"[END MEMORY]\n"
"END OF CONTEXT - BEGIN ACTUAL CONTENT TO TRANSLATE:"
)
}]
msgs = current_base + summary_msgs_list + chunk_context + trimmed + [{"role": "user", "content": user_prompt}]
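# Final message order: system prompt (with the rolling summary appended when enabled),
# the standalone rolling-summary context message, recent chunk context, the trimmed
# translation history, and finally the current chunk as the user prompt.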
c['__index'] = idx
c['__progress'] = progress_manager.prog
c['history_manager'] = history_manager
result, finish_reason = translation_processor.translate_with_retry(
msgs, chunk_html, c, chunk_idx, total_chunks
)
if result is None:
progress_manager.update(idx, actual_num, content_hash, output_file=None, status="failed")
progress_manager.save()
continue
if config.REMOVE_AI_ARTIFACTS:
result = ContentProcessor.clean_ai_artifacts(result, True)
if config.EMERGENCY_RESTORE:
result = ContentProcessor.emergency_restore_paragraphs(result, chunk_html)
if config.REMOVE_AI_ARTIFACTS:
lines = result.split('\n')
json_line_count = 0
for i, line in enumerate(lines[:5]):
if line.strip() and any(pattern in line for pattern in [
'"role":', '"content":', '"messages":',
'{"role"', '{"content"', '[{', '}]'
]):
json_line_count = i + 1
else:
break
if json_line_count > 0 and json_line_count < len(lines):
remaining = '\n'.join(lines[json_line_count:])
if remaining.strip() and len(remaining) > 100:
result = remaining
print(f"✂️ Removed {json_line_count} lines of JSON artifacts")
result = re.sub(r'\[PART \d+/\d+\]\s*', '', result, flags=re.IGNORECASE)
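# Example: if the model echoes the chunk header, e.g. "[PART 2/5] <p>...</p>", the marker is
# stripped here so merged chapter text reads cleanly.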
translated_chunks.append((result, chunk_idx, total_chunks))
chunk_context_manager.add_chunk(user_prompt, result, chunk_idx, total_chunks)
progress_manager.prog["chapter_chunks"][chapter_key_str]["completed"].append(chunk_idx)
progress_manager.prog["chapter_chunks"][chapter_key_str]["chunks"][str(chunk_idx)] = result
progress_manager.save()
chunks_completed += 1
will_reset = history_manager.will_reset_on_next_append(
config.HIST_LIMIT if config.CONTEXTUAL else 0,
config.TRANSLATION_HISTORY_ROLLING
)
history = history_manager.append_to_history(
user_prompt,
result,
config.HIST_LIMIT if config.CONTEXTUAL else 0,
reset_on_limit=True,
rolling_window=config.TRANSLATION_HISTORY_ROLLING
)
if chunk_idx < total_chunks:
# Handle float delays while checking for stop
full_seconds = int(config.DELAY)
fractional_second = config.DELAY - full_seconds
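# Example (illustrative): DELAY=2.5 yields two 1-second sleeps, each preceded by a stop
# check, followed by a single 0.5-second sleep for the fractional remainder below.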
# Check stop signal every second for full seconds
for i in range(full_seconds):
if check_stop():
print("❌ Translation stopped during delay")
return
time.sleep(1)
# Handle the fractional part if any
if fractional_second > 0:
if check_stop():
print("❌ Translation stopped during delay")
return
time.sleep(fractional_second)
if check_stop():
print(f"❌ Translation stopped before saving chapter {actual_num}")
return
if len(translated_chunks) > 1:
print(f" 📎 Merging {len(translated_chunks)} chunks...")
translated_chunks.sort(key=lambda x: x[1])
merged_result = chapter_splitter.merge_translated_chunks(translated_chunks)
else:
merged_result = translated_chunks[0][0] if translated_chunks else ""
if config.CONTEXTUAL and len(translated_chunks) > 1:
user_summary, assistant_summary = chunk_context_manager.get_summary_for_history()
if user_summary and assistant_summary:
history_manager.append_to_history(
user_summary,
assistant_summary,
config.HIST_LIMIT,
reset_on_limit=False,
rolling_window=config.TRANSLATION_HISTORY_ROLLING
)
print(f" 📝 Added chapter summary to history")
chunk_context_manager.clear()
# For text file chunks, ensure we pass the decimal number
if is_text_file and c.get('is_chunk', False) and isinstance(c.get('num'), float):
fname = FileUtilities.create_chapter_filename(c, c['num']) # Use the decimal num directly
else:
fname = FileUtilities.create_chapter_filename(c, actual_num)
client.set_output_filename(fname)
cleaned = re.sub(r"^```(?:html)?\s*\n?", "", merged_result, count=1, flags=re.MULTILINE)
cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE)
cleaned = ContentProcessor.clean_ai_artifacts(cleaned, remove_artifacts=config.REMOVE_AI_ARTIFACTS)
if is_mixed_content and image_translations:
print(f"🔀 Merging {len(image_translations)} image translations with text...")
from bs4 import BeautifulSoup
# Parse the translated text (which has the translated title/header)
soup_translated = BeautifulSoup(cleaned, 'html.parser')
# For each image translation, insert it into the document
for img_path, translation_html in image_translations.items():
if translation_html:
if i > 0:
combined.write(f"\n\n{'='*50}\n\n")
# Write the original chapter title (without Part X/Y suffix)
original_title = chapter_data['title']
# Remove the (Part X/Y) suffix if present
if ' (Part ' in original_title:
original_title = original_title.split(' (Part ')[0]
combined.write(f"{original_title}\n\n")
# Add the chunk content
combined.write(content)
# Add spacing between chunks of the same chapter
if chunk_idx < total_chunks:
combined.write("\n\n")
else:
# This is a standalone chapter
current_main_chapter = chapter_data['num']
# Add separator if not first chapter
if i > 0:
combined.write(f"\n\n{'='*50}\n\n")
# Write the chapter title
combined.write(f"{chapter_data['title']}\n\n")
# Add the content
combined.write(content)
print(f" • Combined file with preserved sections: {combined_path}")
total_time = time.time() - translation_start_time
hours = int(total_time // 3600)
minutes = int((total_time % 3600) // 60)
seconds = int(total_time % 60)
print(f"\n⏱️ Total translation time: {hours}h {minutes}m {seconds}s")
print(f"📊 Chapters completed: {chapters_completed}")
print(f"✅ Text file translation complete!")
if log_callback:
log_callback(f"✅ Text file translation complete! Created {combined_path}")
except Exception as e:
print(f"❌ Error creating combined text file: {e}")
if log_callback:
log_callback(f"❌ Error creating combined text file: {e}")
else:
print("🔍 Checking for translated chapters...")
# Respect retain extension toggle: if enabled, don't look for response_ prefix
if should_retain_source_extension():
response_files = [f for f in os.listdir(out) if f.endswith('.html') and not f.startswith('chapter_')]
else:
response_files = [f for f in os.listdir(out) if f.startswith('response_') and f.endswith('.html')]
chapter_files = [f for f in os.listdir(out) if f.startswith('chapter_') and f.endswith('.html')]
if not response_files and chapter_files:
if should_retain_source_extension():
print(f"⚠️ No translated files found, but {len(chapter_files)} original chapters exist")
print("ℹ️ Retain-source-extension mode is ON: skipping placeholder creation and using original files for EPUB compilation.")
else:
print(f"⚠️ No translated files found, but {len(chapter_files)} original chapters exist")
print("📝 Creating placeholder response files for EPUB compilation...")
for chapter_file in chapter_files:
response_file = chapter_file.replace('chapter_', 'response_', 1)
src = os.path.join(out, chapter_file)
dst = os.path.join(out, response_file)
try:
with open(src, 'r', encoding='utf-8') as f:
content = f.read()
soup = BeautifulSoup(content, 'html.parser')
notice = soup.new_tag('p')
notice.string = "[Note: This chapter could not be translated - showing original content]"
notice['style'] = "color: red; font-style: italic;"
if soup.body:
soup.body.insert(0, notice)
with open(dst, 'w', encoding='utf-8') as f:
f.write(str(soup))
except Exception as e:
print(f"⚠️ Error processing {chapter_file}: {e}")
try:
shutil.copy2(src, dst)
except Exception:
pass
print(f"✅ Created {len(chapter_files)} placeholder response files")
print("⚠️ Note: The EPUB will contain untranslated content")
print("📘 Building final EPUB…")
try:
from epub_converter import fallback_compile_epub
fallback_compile_epub(out, log_callback=log_callback)
print("✅ All done: your final EPUB is in", out)
total_time = time.time() - translation_start_time
hours = int(total_time // 3600)
minutes = int((total_time % 3600) // 60)
seconds = int(total_time % 60)
print(f"\n📊 Translation Statistics:")
print(f" • Total chunks processed: {chunks_completed}")
print(f" • Total time: {hours}h {minutes}m {seconds}s")
if chunks_completed > 0:
avg_time = total_time / chunks_completed
print(f" • Average time per chunk: {avg_time:.1f} seconds")
stats = progress_manager.get_stats(out)
print(f"\n📊 Progress Tracking Summary:")
print(f" • Total chapters tracked: {stats['total_tracked']}")
print(f" • Successfully completed: {stats['completed']}")
print(f" • Missing files: {stats['missing_files']}")
print(f" • In progress: {stats['in_progress']}")
except Exception as e:
print("❌ EPUB build failed:", e)
print("TRANSLATION_COMPLETE_SIGNAL")
if __name__ == "__main__":
main()