Toadoum committed
Commit dbd7e17 · verified · 1 parent: ed81e05

Update app.py: switch translation from the hand-rolled NLLB-style generate() path to the transformers translation pipeline, simplify the chunk cache, and keep the previous implementation commented out at the top of the file.

Files changed (1)
  1. app.py +698 -170
app.py CHANGED
@@ -1,3 +1,614 @@
 import os
 import io
 import re
@@ -5,7 +616,7 @@ from typing import List, Tuple, Dict
 
 import torch
 import gradio as gr
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 # --- NEW: docs ---
 import docx
@@ -17,16 +628,13 @@ import fitz # PyMuPDF
 from reportlab.lib.pagesizes import A4
 from reportlab.lib.styles import getSampleStyleSheet
 from reportlab.lib.enums import TA_JUSTIFY
- from reportlab.platypus import SimpleDocTemplate, Paragraph as RLParagraph, Spacer, PageBreak
 from reportlab.lib.units import cm
 
 # ================= CONFIG =================
 MODEL_REPO = "Toadoum/ngambay-fr-v1"
-
- # Use the lang tokens that actually exist in your tokenizer.
- # Switch FR_CODE to "fra_Latn" only if your tokenizer truly has it.
- FR_CODE = "fr_Latn"   # Français (source)
- NG_CODE = "sba_Latn"  # Ngambay (cible)
 
 # Inference
 MAX_NEW_TOKENS = 256
@@ -34,125 +642,41 @@ TEMPERATURE = 0.0
 NUM_BEAMS = 1
 
 # Performance knobs
- MAX_SRC_TOKENS = 420  # per chunk
- BATCH_SIZE_DEFAULT = 12  # base batch size (autoscaled below)
-
- # ================= Helpers =================
- def auto_batch_size(default=BATCH_SIZE_DEFAULT):
-     if not torch.cuda.is_available():
-         return max(2, min(6, default))  # CPU
-     try:
-         free, total = torch.cuda.mem_get_info()
-         gb = free / (1024**3)
-         if gb < 2: return 2
-         if gb < 4: return 6
-         if gb < 8: return 10
-         return default
-     except Exception:
-         return default
-
- BATCH_SIZE = auto_batch_size()
-
- # -------- Load model & tokenizer (meta-safe) --------
- USE_CUDA = torch.cuda.is_available()
-
- tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO, trust_remote_code=True)
-
- model = AutoModelForSeq2SeqLM.from_pretrained(
-     MODEL_REPO,
-     device_map="auto" if USE_CUDA else None,  # let Accelerate place weights if GPU
-     torch_dtype=torch.float16 if USE_CUDA else torch.float32,
-     low_cpu_mem_usage=False,
-     trust_remote_code=True,
- )
 
- # --- Ensure pad/eos/bos exist and are INTS (not tensors) ---
- def _to_int_or_list(x):
-     if isinstance(x, torch.Tensor):
-         return int(x.item()) if x.numel() == 1 else [int(v) for v in x.tolist()]
-     if isinstance(x, (list, tuple)):
-         return [int(v) for v in x]
-     return int(x) if x is not None else None
-
- # Safeguard pad token
- if tokenizer.pad_token is None and tokenizer.eos_token is not None:
-     tokenizer.pad_token = tokenizer.eos_token
- elif tokenizer.pad_token is None:
-     tokenizer.add_special_tokens({"pad_token": "<pad>"})
-     model.resize_token_embeddings(len(tokenizer))
-
- # Normalize generation config + mirror on model.config
- gc = model.generation_config
- for attr in ["pad_token_id", "eos_token_id", "bos_token_id", "decoder_start_token_id"]:
-     tok_val = getattr(tokenizer, attr, None)
-     cfg_val = getattr(gc, attr, None)
-     val = tok_val if tok_val is not None else cfg_val
-     if val is not None:
-         setattr(gc, attr, _to_int_or_list(val))
-     # mirror on model.config
-     val2 = getattr(model.generation_config, attr, None)
-     if val2 is not None:
-         setattr(model.config, attr, _to_int_or_list(val2))
-
- # ================= Low-level NLLB-style generation =================
- def _forced_bos_id(lang_code: str):
-     # Try common mappings first
-     if hasattr(tokenizer, "lang_code_to_id") and isinstance(tokenizer.lang_code_to_id, dict):
-         if lang_code in tokenizer.lang_code_to_id:
-             return int(tokenizer.lang_code_to_id[lang_code])
-     # Fallback: treat lang code as a token
-     try:
-         tok_id = tokenizer.convert_tokens_to_ids(lang_code)
-         if isinstance(tok_id, int) and tok_id != tokenizer.unk_token_id:
-             return tok_id
-     except Exception:
-         pass
-     # Final fallback: keep whatever the model already has
-     return model.generation_config.forced_bos_token_id
-
- def _encode(texts: List[str], src_lang: str):
-     # NLLB/M2M-style: set source lang on tokenizer if supported
-     if hasattr(tokenizer, "src_lang"):
-         tokenizer.src_lang = src_lang
-     return tokenizer(
-         texts,
-         return_tensors="pt",
-         padding=True,
-         truncation=True,
-         add_special_tokens=True,
-     )
 
- def _generate_batch(texts: List[str], src_lang: str, tgt_lang: str) -> List[str]:
-     if not texts:
-         return []
-     inputs = _encode(texts, src_lang)
-
-     # NOTE: Do NOT move inputs; with device_map="auto" the hooks handle it.
-     # Keep tensors on CPU; accelerate offloads as needed.
-
-     forced_bos = _forced_bos_id(tgt_lang)
-     gen_kwargs = dict(
-         max_new_tokens=MAX_NEW_TOKENS,
-         do_sample=False,
-         num_beams=NUM_BEAMS,
-         eos_token_id=model.generation_config.eos_token_id,
-         pad_token_id=model.generation_config.pad_token_id,
-         forced_bos_token_id=forced_bos,
-     )
 
-     with torch.no_grad():
-         output_ids = model.generate(**inputs, **gen_kwargs)
-     return tokenizer.batch_decode(output_ids, skip_special_tokens=True)
 
- # ================= Simple text translation =================
 def translate_text_simple(text: str) -> str:
     if not text or not text.strip():
         return ""
-     return _generate_batch([text], FR_CODE, NG_CODE)[0]
 
- # ================= Chunking + Batched Translation + Cache =================
 def tokenize_len(s: str) -> int:
-     return tokenizer(s, add_special_tokens=False, return_length=True)["length"][0]
 
 def chunk_text_for_translation(text: str, max_src_tokens: int = MAX_SRC_TOKENS) -> List[str]:
     """Split text by sentence-ish boundaries and merge under token limit."""
@@ -179,37 +703,35 @@ def chunk_text_for_translation(text: str, max_src_tokens: int = MAX_SRC_TOKENS)
         chunks.append(current.strip())
     return chunks
 
- # Small bounded cache (LRU-like using dict + cap)
 TRANSLATION_CACHE: Dict[str, str] = {}
- CACHE_CAP = 20000
-
- def _cache_set(k: str, v: str):
-     if len(TRANSLATION_CACHE) >= CACHE_CAP:
-         # drop ~5% oldest items
-         for i, key in enumerate(list(TRANSLATION_CACHE.keys())):
-             del TRANSLATION_CACHE[key]
-             if i > CACHE_CAP // 20:
-                 break
-     TRANSLATION_CACHE[k] = v
 
 def translate_chunks_list(chunks: List[str], batch_size: int = BATCH_SIZE) -> List[str]:
     """
     Translate a list of chunks with de-dup + batching.
     Returns translations in the same order as input.
     """
     norm_chunks = [c.strip() for c in chunks]
-     unique_to_translate = []
-     seen = set()
     for c in norm_chunks:
-         if c and c not in TRANSLATION_CACHE and c not in seen:
-             seen.add(c)
-             unique_to_translate.append(c)
 
-     for i in range(0, len(unique_to_translate), batch_size):
-         batch = unique_to_translate[i:i + batch_size]
-         outs = _generate_batch(batch, FR_CODE, NG_CODE)
-         for src, o in zip(batch, outs):
-             _cache_set(src, o)
 
     return [TRANSLATION_CACHE.get(c, "") for c in norm_chunks]
 
@@ -219,15 +741,15 @@ def translate_long_text(text: str) -> str:
     if not chs:
         return ""
     trs = translate_chunks_list(chs)
     return " ".join(trs).strip()
 
- # ================= DOCX helpers =================
 def is_heading(par: Paragraph) -> Tuple[bool, int]:
-     # Works with English and French Word styles
-     name = (par.style.name or "").lower()
-     if any(c in name for c in ["heading", "title", "titre"]):
         for lvl in range(1, 10):
-             if str(lvl) in name:
                 return True, lvl
         return True, 1
     return False, 0
@@ -253,6 +775,7 @@ def translate_docx_bytes(file_bytes: bytes) -> bytes:
 
         is_head, lvl = is_heading(par)
         if is_head:
             work.append({"kind": "heading", "level": min(max(lvl, 1), 9), "range": (len(all_chunks), len(all_chunks)+1)})
             all_chunks.append(txt.strip())
         else:
@@ -265,11 +788,11 @@ def translate_docx_bytes(file_bytes: bytes) -> bytes:
                 work.append({"kind": "blank"})
 
     # tables
-     for table in src_doc.tables:
         t_desc = {"kind": "table", "rows": len(table.rows), "cols": len(table.columns), "cells": []}
-         for row in table.rows:
             row_cells = []
-             for cell in row.cells:
                 cell_text = "\n".join([p.text for p in cell.paragraphs]).strip()
                 if cell_text:
                     chs = chunk_text_for_translation(cell_text)
@@ -285,17 +808,23 @@ def translate_docx_bytes(file_bytes: bytes) -> bytes:
         work.append(t_desc)
 
     # 2) Translate all chunks at once (de-dup + batching)
-     translated_all = translate_chunks_list(all_chunks) if all_chunks else []
 
     # 3) Rebuild new document with justified paragraphs
     new_doc = docx.Document()
 
     def join_range(rng: Tuple[int, int]) -> str:
         if rng is None:
             return ""
         s, e = rng
         return " ".join(translated_all[s:e]).strip()
 
     for item in work:
         if item["kind"] == "blank":
             new_doc.add_paragraph("")
@@ -321,7 +850,7 @@ def translate_docx_bytes(file_bytes: bytes) -> bytes:
     new_doc.save(out)
     return out.getvalue()
 
- # ================= PDF helpers =================
 def extract_pdf_text_blocks(pdf_bytes: bytes) -> List[List[str]]:
     """
     Returns list of pages, each a list of block texts (visual order).
@@ -342,8 +871,7 @@ def extract_pdf_text_blocks(pdf_bytes: bytes) -> List[List[str]]:
 
 def build_pdf_from_blocks(translated_pages: List[List[str]]) -> bytes:
     """
-     Build a clean paginated PDF with justified paragraphs.
-     Keeps one translated page per original page via PageBreak.
     """
     buf = io.BytesIO()
     doc = SimpleDocTemplate(
@@ -358,9 +886,11 @@ def build_pdf_from_blocks(translated_pages: List[List[str]]) -> bytes:
     body.leading = 14
 
     story = []
-     for p_idx, blocks in enumerate(translated_pages):
-         if p_idx > 0:
-             story.append(PageBreak())
         for blk in blocks:
             story.append(RLParagraph(blk.replace("\n", "<br/>"), body))
             story.append(Spacer(1, 0.35*cm))
@@ -370,7 +900,7 @@ def build_pdf_from_blocks(translated_pages: List[List[str]]) -> bytes:
 
 def translate_pdf_bytes(file_bytes: bytes) -> bytes:
     """
-     Read PDF → collect ALL block chunks across pages → single batched translation → rebuild PDF.
     """
     pages_blocks = extract_pdf_text_blocks(file_bytes)
 
@@ -406,7 +936,7 @@ def translate_pdf_bytes(file_bytes: bytes) -> bytes:
 
     return build_pdf_from_blocks(translated_pages)
 
- # ================= Gradio file handler =================
 def translate_document(file_obj):
     """
     Accepts gr.File input (NamedString, filepath str, or dict with binary).
@@ -443,6 +973,9 @@ def translate_document(file_obj):
         if data is None:
             return None, "Impossible de lire le fichier sélectionné."
 
         if name.lower().endswith(".docx"):
             out_bytes = translate_docx_bytes(data)
             out_path = "translated_ngambay.docx"
@@ -475,9 +1008,9 @@ theme = gr.themes.Soft(
 
 CUSTOM_CSS = """
 .gradio-container {max-width: 980px !important;}
- .header-card {
-   background: linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%);
-   color: white; padding: 22px; border-radius: 18px;
   box-shadow: 0 10px 30px rgba(79,70,229,.25);
   transition: transform .2s ease;
 }
@@ -485,9 +1018,9 @@ CUSTOM_CSS = """
 .header-title { font-size: 26px; font-weight: 800; margin: 0 0 6px 0; letter-spacing: .2px; }
 .header-sub { opacity: .98; font-size: 14px; }
 .brand { display:flex; align-items:center; gap:10px; justify-content:space-between; flex-wrap:wrap; }
- .badge {
-   display:inline-block; background: rgba(255,255,255,.18);
-   padding: 4px 10px; border-radius: 999px; font-size: 12px;
   border: 1px solid rgba(255,255,255,.25);
 }
 .footer-note {
@@ -561,7 +1094,7 @@ with gr.Blocks(
                         interactive=False,
                         show_copy_button=True
                     )
-                     gr.Markdown('<div class="footer-note">Astuce : collez un paragraphe complet pour un meilleur contexte. Les noms propres et sigles peuvent nécessiter une relecture humaine.</div>')
 
         # -------- Tab 2: Documents --------
         with gr.Tab("Traduction de document (.docx / .pdf)"):
@@ -575,13 +1108,9 @@ with gr.Blocks(
                     run_doc = gr.Button("Traduire le document", variant="primary")
                 with gr.Column(scale=5):
                     doc_out = gr.File(label="Fichier traduit (télécharger)")
-                     doc_status = gr.Markdown(visible=False)
-
-             def _wrap_translate_document(f):
-                 path, msg = translate_document(f)
-                 return path, gr.update(value=msg, visible=True)
 
-             run_doc.click(_wrap_translate_document, inputs=doc_inp, outputs=[doc_out, doc_status])
 
     # Contribution banner
     gr.HTML(
@@ -605,5 +1134,4 @@ with gr.Blocks(
     clear_btn.click(lambda: ("", ""), outputs=[src, tgt])
 
 if __name__ == "__main__":
-     # No .to(...) anywhere; model stays where Accelerate placed it (or CPU).
-     demo.queue(default_concurrency_limit=4).launch(share=True)
 
+ # import os
+ # import io
+ # import re
+ # from typing import List, Tuple, Dict
+
+ # import torch
+ # import gradio as gr
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ # # --- NEW: docs ---
+ # import docx
+ # from docx.enum.text import WD_ALIGN_PARAGRAPH
+ # from docx.text.paragraph import Paragraph
+
+ # # PDF read & write
+ # import fitz  # PyMuPDF
+ # from reportlab.lib.pagesizes import A4
+ # from reportlab.lib.styles import getSampleStyleSheet
+ # from reportlab.lib.enums import TA_JUSTIFY
+ # from reportlab.platypus import SimpleDocTemplate, Paragraph as RLParagraph, Spacer, PageBreak
+ # from reportlab.lib.units import cm
+
+ # # ================= CONFIG =================
+ # MODEL_REPO = "Toadoum/ngambay-fr-v1"
+
+ # # Use the lang tokens that actually exist in your tokenizer.
+ # # Switch FR_CODE to "fra_Latn" only if your tokenizer truly has it.
+ # FR_CODE = "sba_Latn"  # Français (source)
+ # NG_CODE = "fr_Latn"   # Ngambay (cible)
+
+ # # Inference
+ # MAX_NEW_TOKENS = 256
+ # TEMPERATURE = 0.0
+ # NUM_BEAMS = 1
+
+ # # Performance knobs
+ # MAX_SRC_TOKENS = 420  # per chunk
+ # BATCH_SIZE_DEFAULT = 12  # base batch size (autoscaled below)
+
+ # # ================= Helpers =================
+ # def auto_batch_size(default=BATCH_SIZE_DEFAULT):
+ #     if not torch.cuda.is_available():
+ #         return max(2, min(6, default))  # CPU
+ #     try:
+ #         free, total = torch.cuda.mem_get_info()
+ #         gb = free / (1024**3)
+ #         if gb < 2: return 2
+ #         if gb < 4: return 6
+ #         if gb < 8: return 10
+ #         return default
+ #     except Exception:
+ #         return default
+
+ # BATCH_SIZE = auto_batch_size()
+
+ # # -------- Load model & tokenizer (meta-safe) --------
+ # USE_CUDA = torch.cuda.is_available()
+
+ # tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO, trust_remote_code=True)
+
+ # model = AutoModelForSeq2SeqLM.from_pretrained(
+ #     MODEL_REPO,
+ #     device_map="auto" if USE_CUDA else None,  # let Accelerate place weights if GPU
+ #     torch_dtype=torch.float16 if USE_CUDA else torch.float32,
+ #     low_cpu_mem_usage=False,
+ #     trust_remote_code=True,
+ # )
+
+ # # --- Ensure pad/eos/bos exist and are INTS (not tensors) ---
+ # def _to_int_or_list(x):
+ #     if isinstance(x, torch.Tensor):
+ #         return int(x.item()) if x.numel() == 1 else [int(v) for v in x.tolist()]
+ #     if isinstance(x, (list, tuple)):
+ #         return [int(v) for v in x]
+ #     return int(x) if x is not None else None
+
+ # # Safeguard pad token
+ # if tokenizer.pad_token is None and tokenizer.eos_token is not None:
+ #     tokenizer.pad_token = tokenizer.eos_token
+ # elif tokenizer.pad_token is None:
+ #     tokenizer.add_special_tokens({"pad_token": "<pad>"})
+ #     model.resize_token_embeddings(len(tokenizer))
+
+ # # Normalize generation config + mirror on model.config
+ # gc = model.generation_config
+ # for attr in ["pad_token_id", "eos_token_id", "bos_token_id", "decoder_start_token_id"]:
+ #     tok_val = getattr(tokenizer, attr, None)
+ #     cfg_val = getattr(gc, attr, None)
+ #     val = tok_val if tok_val is not None else cfg_val
+ #     if val is not None:
+ #         setattr(gc, attr, _to_int_or_list(val))
+ #     # mirror on model.config
+ #     val2 = getattr(model.generation_config, attr, None)
+ #     if val2 is not None:
+ #         setattr(model.config, attr, _to_int_or_list(val2))
+
+ # # ================= Low-level NLLB-style generation =================
+ # def _forced_bos_id(lang_code: str):
+ #     # Try common mappings first
+ #     if hasattr(tokenizer, "lang_code_to_id") and isinstance(tokenizer.lang_code_to_id, dict):
+ #         if lang_code in tokenizer.lang_code_to_id:
+ #             return int(tokenizer.lang_code_to_id[lang_code])
+ #     # Fallback: treat lang code as a token
+ #     try:
+ #         tok_id = tokenizer.convert_tokens_to_ids(lang_code)
+ #         if isinstance(tok_id, int) and tok_id != tokenizer.unk_token_id:
+ #             return tok_id
+ #     except Exception:
+ #         pass
+ #     # Final fallback: keep whatever the model already has
+ #     return model.generation_config.forced_bos_token_id
+
+ # def _encode(texts: List[str], src_lang: str):
+ #     # NLLB/M2M-style: set source lang on tokenizer if supported
+ #     if hasattr(tokenizer, "src_lang"):
+ #         tokenizer.src_lang = src_lang
+ #     return tokenizer(
+ #         texts,
+ #         return_tensors="pt",
+ #         padding=True,
+ #         truncation=True,
+ #         add_special_tokens=True,
+ #     )
+
+ # def _generate_batch(texts: List[str], src_lang: str, tgt_lang: str) -> List[str]:
+ #     if not texts:
+ #         return []
+ #     inputs = _encode(texts, src_lang)
+
+ #     # NOTE: Do NOT move inputs; with device_map="auto" the hooks handle it.
+ #     # Keep tensors on CPU; accelerate offloads as needed.
+
+ #     forced_bos = _forced_bos_id(tgt_lang)
+ #     gen_kwargs = dict(
+ #         max_new_tokens=MAX_NEW_TOKENS,
+ #         do_sample=False,
+ #         num_beams=NUM_BEAMS,
+ #         eos_token_id=model.generation_config.eos_token_id,
+ #         pad_token_id=model.generation_config.pad_token_id,
+ #         forced_bos_token_id=forced_bos,
+ #     )
+
+ #     with torch.no_grad():
+ #         output_ids = model.generate(**inputs, **gen_kwargs)
+ #     return tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+
+ # # ================= Simple text translation =================
+ # def translate_text_simple(text: str) -> str:
+ #     if not text or not text.strip():
+ #         return ""
+ #     return _generate_batch([text], FR_CODE, NG_CODE)[0]
+
+ # # ================= Chunking + Batched Translation + Cache =================
+ # def tokenize_len(s: str) -> int:
+ #     return tokenizer(s, add_special_tokens=False, return_length=True)["length"][0]
+
+ # def chunk_text_for_translation(text: str, max_src_tokens: int = MAX_SRC_TOKENS) -> List[str]:
+ #     """Split text by sentence-ish boundaries and merge under token limit."""
+ #     if not text.strip():
+ #         return []
+ #     parts = re.split(r'(\s*[\.\!\?…:;]\s+)', text)
+ #     sentences = []
+ #     for i in range(0, len(parts), 2):
+ #         s = parts[i]
+ #         p = parts[i+1] if i+1 < len(parts) else ""
+ #         unit = (s + (p or "")).strip()
+ #         if unit:
+ #             sentences.append(unit)
+
+ #     chunks, current = [], ""
+ #     for sent in sentences:
+ #         candidate = (current + " " + sent).strip() if current else sent
+ #         if current and tokenize_len(candidate) > max_src_tokens:
+ #             chunks.append(current.strip())
+ #             current = sent
+ #         else:
+ #             current = candidate
+ #     if current.strip():
+ #         chunks.append(current.strip())
+ #     return chunks
+
+ # # Small bounded cache (LRU-like using dict + cap)
+ # TRANSLATION_CACHE: Dict[str, str] = {}
+ # CACHE_CAP = 20000
+
+ # def _cache_set(k: str, v: str):
+ #     if len(TRANSLATION_CACHE) >= CACHE_CAP:
+ #         # drop ~5% oldest items
+ #         for i, key in enumerate(list(TRANSLATION_CACHE.keys())):
+ #             del TRANSLATION_CACHE[key]
+ #             if i > CACHE_CAP // 20:
+ #                 break
+ #     TRANSLATION_CACHE[k] = v
+
+ # def translate_chunks_list(chunks: List[str], batch_size: int = BATCH_SIZE) -> List[str]:
+ #     """
+ #     Translate a list of chunks with de-dup + batching.
+ #     Returns translations in the same order as input.
+ #     """
+ #     norm_chunks = [c.strip() for c in chunks]
+ #     unique_to_translate = []
+ #     seen = set()
+ #     for c in norm_chunks:
+ #         if c and c not in TRANSLATION_CACHE and c not in seen:
+ #             seen.add(c)
+ #             unique_to_translate.append(c)
+
+ #     for i in range(0, len(unique_to_translate), batch_size):
+ #         batch = unique_to_translate[i:i + batch_size]
+ #         outs = _generate_batch(batch, FR_CODE, NG_CODE)
+ #         for src, o in zip(batch, outs):
+ #             _cache_set(src, o)
+
+ #     return [TRANSLATION_CACHE.get(c, "") for c in norm_chunks]
+
+ # def translate_long_text(text: str) -> str:
+ #     """Chunk → batch translate → rejoin for one paragraph/block."""
+ #     chs = chunk_text_for_translation(text)
+ #     if not chs:
+ #         return ""
+ #     trs = translate_chunks_list(chs)
+ #     return " ".join(trs).strip()
+
+ # # ================= DOCX helpers =================
+ # def is_heading(par: Paragraph) -> Tuple[bool, int]:
+ #     # Works with English and French Word styles
+ #     name = (par.style.name or "").lower()
+ #     if any(c in name for c in ["heading", "title", "titre"]):
+ #         for lvl in range(1, 10):
+ #             if str(lvl) in name:
+ #                 return True, lvl
+ #         return True, 1
+ #     return False, 0
+
+ # def translate_docx_bytes(file_bytes: bytes) -> bytes:
+ #     """
+ #     Read .docx → collect ALL chunks (paras + table cells) → single batched translation → rebuild .docx.
+ #     Paragraphs and table cell paragraphs are justified; headings kept as headings.
+ #     """
+ #     f = io.BytesIO(file_bytes)
+ #     src_doc = docx.Document(f)
+
+ #     # 1) Collect work units
+ #     work = []  # list of dict entries describing items with ranges into all_chunks
+ #     all_chunks: List[str] = []
+
+ #     # paragraphs
+ #     for par in src_doc.paragraphs:
+ #         txt = par.text
+ #         if not txt.strip():
+ #             work.append({"kind": "blank"})
+ #             continue
+
+ #         is_head, lvl = is_heading(par)
+ #         if is_head:
+ #             work.append({"kind": "heading", "level": min(max(lvl, 1), 9), "range": (len(all_chunks), len(all_chunks)+1)})
+ #             all_chunks.append(txt.strip())
+ #         else:
+ #             chs = chunk_text_for_translation(txt)
+ #             if chs:
+ #                 start = len(all_chunks)
+ #                 all_chunks.extend(chs)
+ #                 work.append({"kind": "para", "range": (start, start+len(chs))})
+ #             else:
+ #                 work.append({"kind": "blank"})
+
+ #     # tables
+ #     for table in src_doc.tables:
+ #         t_desc = {"kind": "table", "rows": len(table.rows), "cols": len(table.columns), "cells": []}
+ #         for row in table.rows:
+ #             row_cells = []
+ #             for cell in row.cells:
+ #                 cell_text = "\n".join([p.text for p in cell.paragraphs]).strip()
+ #                 if cell_text:
+ #                     chs = chunk_text_for_translation(cell_text)
+ #                     if chs:
+ #                         start = len(all_chunks)
+ #                         all_chunks.extend(chs)
+ #                         row_cells.append({"range": (start, start+len(chs))})
+ #                     else:
+ #                         row_cells.append({"range": None})
+ #                 else:
+ #                     row_cells.append({"range": None})
+ #             t_desc["cells"].append(row_cells)
+ #         work.append(t_desc)
+
+ #     # 2) Translate all chunks at once (de-dup + batching)
+ #     translated_all = translate_chunks_list(all_chunks) if all_chunks else []
+
+ #     # 3) Rebuild new document with justified paragraphs
+ #     new_doc = docx.Document()
+
+ #     def join_range(rng: Tuple[int, int]) -> str:
+ #         if rng is None:
+ #             return ""
+ #         s, e = rng
+ #         return " ".join(translated_all[s:e]).strip()
+
+ #     for item in work:
+ #         if item["kind"] == "blank":
+ #             new_doc.add_paragraph("")
+ #         elif item["kind"] == "heading":
+ #             text = join_range(item["range"])
+ #             new_doc.add_heading(text, level=item["level"])
+ #         elif item["kind"] == "para":
+ #             text = join_range(item["range"])
+ #             p = new_doc.add_paragraph(text)
+ #             p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
+ #         elif item["kind"] == "table":
+ #             tbl = new_doc.add_table(rows=item["rows"], cols=item["cols"])
+ #             for r_idx in range(item["rows"]):
+ #                 for c_idx in range(item["cols"]):
+ #                     cell_info = item["cells"][r_idx][c_idx]
+ #                     txt = join_range(cell_info["range"])
+ #                     tgt_cell = tbl.cell(r_idx, c_idx)
+ #                     tgt_cell.text = txt
+ #                     for p in tgt_cell.paragraphs:
+ #                         p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
+
+ #     out = io.BytesIO()
+ #     new_doc.save(out)
+ #     return out.getvalue()
+
+ # # ================= PDF helpers =================
+ # def extract_pdf_text_blocks(pdf_bytes: bytes) -> List[List[str]]:
+ #     """
+ #     Returns list of pages, each a list of block texts (visual order).
+ #     """
+ #     pages_blocks: List[List[str]] = []
+ #     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+ #     for page in doc:
+ #         blocks = page.get_text("blocks")
+ #         blocks.sort(key=lambda b: (round(b[1], 1), round(b[0], 1)))
+ #         page_texts = []
+ #         for b in blocks:
+ #             text = b[4].strip()
+ #             if text:
+ #                 page_texts.append(text)
+ #         pages_blocks.append(page_texts)
+ #     doc.close()
+ #     return pages_blocks
+
+ # def build_pdf_from_blocks(translated_pages: List[List[str]]) -> bytes:
+ #     """
+ #     Build a clean paginated PDF with justified paragraphs.
+ #     Keeps one translated page per original page via PageBreak.
+ #     """
+ #     buf = io.BytesIO()
+ #     doc = SimpleDocTemplate(
+ #         buf, pagesize=A4,
+ #         rightMargin=2*cm, leftMargin=2*cm,
+ #         topMargin=2*cm, bottomMargin=2*cm
+ #     )
+
+ #     styles = getSampleStyleSheet()
+ #     body = styles["BodyText"]
+ #     body.alignment = TA_JUSTIFY
+ #     body.leading = 14
+
+ #     story = []
+ #     for p_idx, blocks in enumerate(translated_pages):
+ #         if p_idx > 0:
+ #             story.append(PageBreak())
+ #         for blk in blocks:
+ #             story.append(RLParagraph(blk.replace("\n", "<br/>"), body))
+ #             story.append(Spacer(1, 0.35*cm))
+
+ #     doc.build(story)
+ #     return buf.getvalue()
+
+ # def translate_pdf_bytes(file_bytes: bytes) -> bytes:
+ #     """
+ #     Read PDF → collect ALL block chunks across pages → single batched translation → rebuild PDF.
+ #     """
+ #     pages_blocks = extract_pdf_text_blocks(file_bytes)
+
+ #     # 1) collect chunks for the entire PDF
+ #     all_chunks: List[str] = []
+ #     plan = []  # list of pages, each a list of ranges for blocks
+ #     for blocks in pages_blocks:
+ #         page_plan = []
+ #         for blk in blocks:
+ #             chs = chunk_text_for_translation(blk)
+ #             if chs:
+ #                 start = len(all_chunks)
+ #                 all_chunks.extend(chs)
+ #                 page_plan.append((start, start + len(chs)))
+ #             else:
+ #                 page_plan.append(None)
+ #         plan.append(page_plan)
+
+ #     # 2) translate all chunks at once
+ #     translated_all = translate_chunks_list(all_chunks) if all_chunks else []
+
+ #     # 3) reconstruct per block
+ #     translated_pages: List[List[str]] = []
+ #     for page_plan in plan:
+ #         page_out = []
+ #         for rng in page_plan:
+ #             if rng is None:
+ #                 page_out.append("")
+ #             else:
+ #                 s, e = rng
+ #                 page_out.append(" ".join(translated_all[s:e]).strip())
+ #         translated_pages.append(page_out)
+
+ #     return build_pdf_from_blocks(translated_pages)
+
+ # # ================= Gradio file handler =================
+ # def translate_document(file_obj):
+ #     """
+ #     Accepts gr.File input (NamedString, filepath str, or dict with binary).
+ #     Returns (output_file_path, status_message).
+ #     """
+ #     if file_obj is None:
+ #         return None, "Veuillez sélectionner un fichier .docx ou .pdf"
+
+ #     try:
+ #         name = "document"
+ #         data = None
+
+ #         # Case A: plain filepath string
+ #         if isinstance(file_obj, str):
+ #             name = os.path.basename(file_obj)
+ #             with open(file_obj, "rb") as f:
+ #                 data = f.read()
+
+ #         # Case B: Gradio NamedString with .name (orig name) and .value (temp path)
+ #         elif hasattr(file_obj, "name") and hasattr(file_obj, "value"):
+ #             name = os.path.basename(file_obj.name or "document")
+ #             with open(file_obj.value, "rb") as f:
+ #                 data = f.read()
+
+ #         # Case C: dict (type="binary")
+ #         elif isinstance(file_obj, dict) and "name" in file_obj and "data" in file_obj:
+ #             name = os.path.basename(file_obj["name"] or "document")
+ #             d = file_obj["data"]
+ #             data = d.read() if hasattr(d, "read") else d
+
+ #         else:
+ #             return None, "Type d'entrée fichier non supporté (filepath/binaire)."
+
+ #         if data is None:
+ #             return None, "Impossible de lire le fichier sélectionné."
+
+ #         if name.lower().endswith(".docx"):
+ #             out_bytes = translate_docx_bytes(data)
+ #             out_path = "translated_ngambay.docx"
+ #             with open(out_path, "wb") as f:
+ #                 f.write(out_bytes)
+ #             return out_path, "✅ Traduction DOCX terminée (paragraphes justifiés)."
+
+ #         elif name.lower().endswith(".pdf"):
+ #             out_bytes = translate_pdf_bytes(data)
+ #             out_path = "translated_ngambay.pdf"
+ #             with open(out_path, "wb") as f:
+ #                 f.write(out_bytes)
+ #             return out_path, "✅ Traduction PDF terminée (paragraphes justifiés)."
+
+ #         else:
+ #             return None, "Type de fichier non supporté. Choisissez .docx ou .pdf"
+
+ #     except Exception as e:
+ #         return None, f"❌ Erreur pendant la traduction: {e}"
+
+ # # ================== UI ==================
+ # theme = gr.themes.Soft(
+ #     primary_hue="indigo",
+ #     radius_size="lg",
+ #     font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"]
+ # ).set(
+ #     body_background_fill="#f7f7fb",
+ #     button_primary_text_color="#ffffff"
+ # )
+
+ # CUSTOM_CSS = """
+ # .gradio-container {max-width: 980px !important;}
+ # .header-card {
+ #   background: linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%);
+ #   color: white; padding: 22px; border-radius: 18px;
+ #   box-shadow: 0 10px 30px rgba(79,70,229,.25);
+ #   transition: transform .2s ease;
+ # }
+ # .header-card:hover { transform: translateY(-1px); }
+ # .header-title { font-size: 26px; font-weight: 800; margin: 0 0 6px 0; letter-spacing: .2px; }
+ # .header-sub { opacity: .98; font-size: 14px; }
+ # .brand { display:flex; align-items:center; gap:10px; justify-content:space-between; flex-wrap:wrap; }
+ # .badge {
+ #   display:inline-block; background: rgba(255,255,255,.18);
+ #   padding: 4px 10px; border-radius: 999px; font-size: 12px;
+ #   border: 1px solid rgba(255,255,255,.25);
+ # }
+ # .footer-note {
+ #   margin-top: 8px; color: #64748b; font-size: 12px; text-align: center;
+ # }
+ # .support-banner {
+ #   margin-top: 14px;
+ #   border-radius: 14px;
+ #   padding: 14px 16px;
+ #   background: linear-gradient(135deg, rgba(79,70,229,.08), rgba(124,58,237,.08));
+ #   border: 1px solid rgba(99,102,241,.25);
+ #   box-shadow: 0 6px 18px rgba(79,70,229,.08);
+ # }
+ # .support-title { font-weight: 700; font-size: 16px; margin-bottom: 4px; }
+ # .support-text { font-size: 13px; color: #334155; line-height: 1.5; }
+ # .support-contacts { display: flex; gap: 10px; flex-wrap: wrap; margin-top: 8px; }
+ # .support-chip {
+ #   display:inline-block; padding: 6px 10px; border-radius: 999px;
+ #   background: white; border: 1px dashed rgba(79,70,229,.45);
+ #   font-size: 12px; color: #3730a3;
+ # }
+ # """
+
+ # with gr.Blocks(
+ #     title="Français → Ngambay · Toadoum/ngambay-fr-v1",
+ #     theme=theme,
+ #     css=CUSTOM_CSS,
+ #     fill_height=True,
+ # ) as demo:
+ #     with gr.Group(elem_classes=["header-card"]):
+ #         gr.HTML(
+ #             """
+ #             <div class="brand">
+ #               <div>
+ #                 <div class="header-title">Français → Ngambay (v1)</div>
+ #                 <div class="header-sub">🚀 Version bêta · Merci de tester et partager vos retours pour améliorer la qualité de traduction.</div>
+ #               </div>
+ #               <span class="badge">Modèle&nbsp;: Toadoum/ngambay-fr-v1</span>
+ #             </div>
+ #             """
+ #         )
+
+ #     with gr.Tabs():
+ #         # -------- Tab 1: Texte --------
+ #         with gr.Tab("Traduction de texte"):
+ #             with gr.Row():
+ #                 with gr.Column(scale=5):
+ #                     src = gr.Textbox(
+ #                         label="Texte source (Français)",
+ #                         placeholder="Saisissez votre texte en français…",
+ #                         lines=8,
+ #                         autofocus=True
+ #                     )
+ #                     with gr.Row():
+ #                         btn = gr.Button("Traduire", variant="primary", scale=3)
+ #                         clear_btn = gr.Button("Effacer", scale=1)
+ #                     gr.Examples(
+ #                         examples=[
+ #                             ["Bonjour, comment allez-vous aujourd’hui ?"],
+ #                             ["La réunion de sensibilisation aura lieu demain au centre communautaire."],
+ #                             ["Merci pour votre participation et votre soutien."],
+ #                             ["Veuillez suivre les recommandations de santé pour protéger votre famille."]
+ #                         ],
+ #                         inputs=[src],
+ #                         label="Exemples (cliquez pour remplir)"
+ #                     )
+ #                 with gr.Column(scale=5):
+ #                     tgt = gr.Textbox(
+ #                         label="Traduction (Ngambay)",
+ #                         lines=8,
+ #                         interactive=False,
+ #                         show_copy_button=True
+ #                     )
+ #                     gr.Markdown('<div class="footer-note">Astuce : collez un paragraphe complet pour un meilleur contexte. Les noms propres et sigles peuvent nécessiter une relecture humaine.</div>')
+
+ #         # -------- Tab 2: Documents --------
+ #         with gr.Tab("Traduction de document (.docx / .pdf)"):
+ #             with gr.Row():
+ #                 with gr.Column(scale=5):
+ #                     doc_inp = gr.File(
+ #                         label="Sélectionnez un document (.docx ou .pdf)",
+ #                         file_types=[".docx", ".pdf"],
+ #                         type="filepath"  # ensures a temp filepath; handler also supports binary
+ #                     )
+ #                     run_doc = gr.Button("Traduire le document", variant="primary")
+ #                 with gr.Column(scale=5):
+ #                     doc_out = gr.File(label="Fichier traduit (télécharger)")
+ #                     doc_status = gr.Markdown(visible=False)
+
+ #             def _wrap_translate_document(f):
+ #                 path, msg = translate_document(f)
+ #                 return path, gr.update(value=msg, visible=True)
+
+ #             run_doc.click(_wrap_translate_document, inputs=doc_inp, outputs=[doc_out, doc_status])
+
+ #     # Contribution banner
+ #     gr.HTML(
+ #         """
+ #         <div class="support-banner">
+ #           <div class="support-title">💙 Contribuer au projet (recrutement de linguistes)</div>
+ #           <div class="support-text">
+ #             Nous cherchons à <b>recruter des linguistes</b> pour renforcer la construction de données Ngambay.
+ #             Si vous souhaitez soutenir financièrement ou en tant que bénévole, contactez-nous :
+ #           </div>
+ #           <div class="support-contacts">
+ #             <span class="support-chip">📱 WhatsApp, Airtel Money&nbsp;: <b>+235&nbsp;66&nbsp;04&nbsp;90&nbsp;94</b></span>
+ #             <span class="support-chip">✉️ Email&nbsp;: <a href="mailto:[email protected]">[email protected]</a></span>
+ #           </div>
+ #         </div>
+ #         """
+ #     )
+
+ #     # Text actions
+ #     btn.click(translate_text_simple, inputs=src, outputs=tgt)
+ #     clear_btn.click(lambda: ("", ""), outputs=[src, tgt])
+
+ # if __name__ == "__main__":
+ #     # No .to(...) anywhere; model stays where Accelerate placed it (or CPU).
+ #     demo.queue(default_concurrency_limit=4).launch(share=True)
+
+
 import os
 import io
 import re
 
 import torch
 import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 
 # --- NEW: docs ---
 import docx
 
 from reportlab.lib.pagesizes import A4
 from reportlab.lib.styles import getSampleStyleSheet
 from reportlab.lib.enums import TA_JUSTIFY
+ from reportlab.platypus import SimpleDocTemplate, Paragraph as RLParagraph, Spacer
 from reportlab.lib.units import cm
 
 # ================= CONFIG =================
 MODEL_REPO = "Toadoum/ngambay-fr-v1"
+ FR_CODE = "sba_Latn"  # Français (source)
+ NG_CODE = "fr_Latn"   # Ngambay (cible)
 
 # Inference
 MAX_NEW_TOKENS = 256
 
 NUM_BEAMS = 1
 
 # Performance knobs
+ MAX_SRC_TOKENS = 420  # per chunk; reduce to ~320 if you want even faster
+ BATCH_SIZE = 12  # number of chunks per model call (tune for your hardware)
 
+ # Device selection
+ device = 0 if torch.cuda.is_available() else -1  # set -1 on Spaces CPU if needed
 
+ # Load model & tokenizer once
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
+ model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_REPO)
 
+ translator = pipeline(
+     task="translation",
+     model=model,
+     tokenizer=tokenizer,
+     device=device,
+ )
 
+ # Simple text box translation (kept)
 def translate_text_simple(text: str) -> str:
     if not text or not text.strip():
         return ""
+     with torch.no_grad():
+         out = translator(
+             text,
+             src_lang=FR_CODE,
+             tgt_lang=NG_CODE,
+             max_new_tokens=MAX_NEW_TOKENS,
+             do_sample=False,
+             num_beams=NUM_BEAMS,
+         )
+     return out[0]["translation_text"]
 
+ # ---------- Chunking + Batched Translation + Cache ----------
 def tokenize_len(s: str) -> int:
+     return len(tokenizer.encode(s, add_special_tokens=False))
 
 def chunk_text_for_translation(text: str, max_src_tokens: int = MAX_SRC_TOKENS) -> List[str]:
     """Split text by sentence-ish boundaries and merge under token limit."""
 
         chunks.append(current.strip())
     return chunks
 
+ # module-level cache: identical chunks translated once
 TRANSLATION_CACHE: Dict[str, str] = {}
 
 def translate_chunks_list(chunks: List[str], batch_size: int = BATCH_SIZE) -> List[str]:
     """
     Translate a list of chunks with de-dup + batching.
     Returns translations in the same order as input.
     """
+     # Normalize & collect unique chunks to translate
     norm_chunks = [c.strip() for c in chunks]
+     to_translate = []
     for c in norm_chunks:
+         if c and c not in TRANSLATION_CACHE:
+             to_translate.append(c)
 
+     # Batched calls
+     with torch.no_grad():
+         for i in range(0, len(to_translate), batch_size):
+             batch = to_translate[i:i + batch_size]
+             outs = translator(
+                 batch,
+                 src_lang=FR_CODE,
+                 tgt_lang=NG_CODE,
+                 max_new_tokens=MAX_NEW_TOKENS,
+                 do_sample=False,
+                 num_beams=NUM_BEAMS,
+             )
+             for src, o in zip(batch, outs):
+                 TRANSLATION_CACHE[src] = o["translation_text"]
 
     return [TRANSLATION_CACHE.get(c, "") for c in norm_chunks]
 
     if not chs:
         return ""
     trs = translate_chunks_list(chs)
+     # join with space to reconstruct paragraph smoothly
     return " ".join(trs).strip()
 
+ # ---------- DOCX helpers (now fully batched across the whole doc) ----------
 def is_heading(par: Paragraph) -> Tuple[bool, int]:
+     style = (par.style.name or "").lower()
+     if "heading" in style:
         for lvl in range(1, 10):
+             if str(lvl) in style:
                 return True, lvl
         return True, 1
     return False, 0
 
         is_head, lvl = is_heading(par)
         if is_head:
+             # treat as single chunk (usually short)
             work.append({"kind": "heading", "level": min(max(lvl, 1), 9), "range": (len(all_chunks), len(all_chunks)+1)})
             all_chunks.append(txt.strip())
         else:
 
                 work.append({"kind": "blank"})
 
     # tables
+     for t_idx, table in enumerate(src_doc.tables):
         t_desc = {"kind": "table", "rows": len(table.rows), "cols": len(table.columns), "cells": []}
+         for r_idx, row in enumerate(table.rows):
             row_cells = []
+             for c_idx, cell in enumerate(row.cells):
                 cell_text = "\n".join([p.text for p in cell.paragraphs]).strip()
                 if cell_text:
                     chs = chunk_text_for_translation(cell_text)
 
         work.append(t_desc)
 
     # 2) Translate all chunks at once (de-dup + batching)
+     if all_chunks:
+         translated_all = translate_chunks_list(all_chunks)
+     else:
+         translated_all = []
 
     # 3) Rebuild new document with justified paragraphs
     new_doc = docx.Document()
+     cursor = 0  # index into translated_all
 
+     # helper to consume a range and join back
     def join_range(rng: Tuple[int, int]) -> str:
         if rng is None:
             return ""
         s, e = rng
         return " ".join(translated_all[s:e]).strip()
 
+     # rebuild paragraphs
     for item in work:
         if item["kind"] == "blank":
             new_doc.add_paragraph("")
 
     new_doc.save(out)
     return out.getvalue()
 
+ # ---------- PDF helpers (batched across the whole PDF) ----------
 def extract_pdf_text_blocks(pdf_bytes: bytes) -> List[List[str]]:
     """
     Returns list of pages, each a list of block texts (visual order).
 
 def build_pdf_from_blocks(translated_pages: List[List[str]]) -> bytes:
     """
+     Build a clean paginated PDF with justified paragraphs (not exact original layout).
     """
     buf = io.BytesIO()
     doc = SimpleDocTemplate(
 
     body.leading = 14
 
     story = []
+     first = True
+     for blocks in translated_pages:
+         if not first:
+             story.append(Spacer(1, 0.1*cm))  # page break trigger
+         first = False
         for blk in blocks:
             story.append(RLParagraph(blk.replace("\n", "<br/>"), body))
             story.append(Spacer(1, 0.35*cm))
 
 def translate_pdf_bytes(file_bytes: bytes) -> bytes:
     """
+     Read PDF → collect ALL block chunks across pages → single batched translation → rebuild simple justified PDF.
     """
     pages_blocks = extract_pdf_text_blocks(file_bytes)
 
     return build_pdf_from_blocks(translated_pages)
 
+ # ---------- Gradio file handler (robust) ----------
 def translate_document(file_obj):
     """
     Accepts gr.File input (NamedString, filepath str, or dict with binary).
 
         if data is None:
             return None, "Impossible de lire le fichier sélectionné."
 
+         # Clear cache per document to keep memory predictable (optional)
+         # TRANSLATION_CACHE.clear()
+
         if name.lower().endswith(".docx"):
             out_bytes = translate_docx_bytes(data)
             out_path = "translated_ngambay.docx"
 
 CUSTOM_CSS = """
 .gradio-container {max-width: 980px !important;}
+ .header-card {
+   background: linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%);
+   color: white; padding: 22px; border-radius: 18px;
   box-shadow: 0 10px 30px rgba(79,70,229,.25);
   transition: transform .2s ease;
 }
 
 .header-title { font-size: 26px; font-weight: 800; margin: 0 0 6px 0; letter-spacing: .2px; }
 .header-sub { opacity: .98; font-size: 14px; }
 .brand { display:flex; align-items:center; gap:10px; justify-content:space-between; flex-wrap:wrap; }
+ .badge {
+   display:inline-block; background: rgba(255,255,255,.18);
+   padding: 4px 10px; border-radius: 999px; font-size: 12px;
   border: 1px solid rgba(255,255,255,.25);
 }
 .footer-note {
 
                         interactive=False,
                         show_copy_button=True
                     )
+                     gr.Markdown('<div class="footer-note">Astuce : collez un paragraphe complet pour un meilleur contexte.</div>')
 
         # -------- Tab 2: Documents --------
         with gr.Tab("Traduction de document (.docx / .pdf)"):
 
                     run_doc = gr.Button("Traduire le document", variant="primary")
                 with gr.Column(scale=5):
                     doc_out = gr.File(label="Fichier traduit (télécharger)")
+                     doc_status = gr.Markdown("")
 
+             run_doc.click(translate_document, inputs=doc_inp, outputs=[doc_out, doc_status])
 
     # Contribution banner
     gr.HTML(
 
     clear_btn.click(lambda: ("", ""), outputs=[src, tgt])
 
 if __name__ == "__main__":
+     demo.queue(default_concurrency_limit=4).launch(analytics_enabled=False)
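For quick reference, below is a minimal standalone sketch of the pipeline path this commit switches to. It condenses the new code in the diff into one runnable snippet; MODEL_REPO, the language codes (including the committed pairing of FR_CODE with "sba_Latn"), and the generation settings are copied from the diff, while the example sentence is hypothetical.

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

MODEL_REPO = "Toadoum/ngambay-fr-v1"
FR_CODE = "sba_Latn"  # value as committed above
NG_CODE = "fr_Latn"   # value as committed above

# Load once, then reuse the pipeline for every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_REPO)
translator = pipeline(
    "translation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # GPU 0 if present, else CPU
)

with torch.no_grad():
    out = translator(
        "Bonjour, comment allez-vous ?",  # hypothetical example input
        src_lang=FR_CODE,
        tgt_lang=NG_CODE,
        max_new_tokens=256,
        do_sample=False,   # greedy decoding, matching the diff's settings
        num_beams=1,
    )
print(out[0]["translation_text"])

As in the committed code, do_sample=False with num_beams=1 keeps decoding deterministic, and passing a list instead of a single string batches several chunks through one model call.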