Toadoum committed
Commit ed81e05 · verified · 1 Parent(s): 4eecf5e

Update app.py

Files changed (1)
  1. app.py +562 -86
app.py CHANGED
@@ -1,43 +1,469 @@
+import os
+import io
+import re
+from typing import List, Tuple, Dict
+
 import torch
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+# --- NEW: docs ---
+import docx
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.text.paragraph import Paragraph
+
+# PDF read & write
+import fitz  # PyMuPDF
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.lib.enums import TA_JUSTIFY
+from reportlab.platypus import SimpleDocTemplate, Paragraph as RLParagraph, Spacer, PageBreak
+from reportlab.lib.units import cm

-# --- Config ---
+# ================= CONFIG =================
 MODEL_REPO = "Toadoum/ngambay-fr-v1"
-FR_CODE = "fra_Latn"  # French
-NG_CODE = "sba_Latn"  # Ngambay (Saba) Latin

-# --- Inference params (fixed for the user) ---
+# Use the lang tokens that actually exist in your tokenizer.
+# Switch FR_CODE to "fra_Latn" only if your tokenizer truly has it.
+FR_CODE = "fr_Latn"   # French (source)
+NG_CODE = "sba_Latn"  # Ngambay (target)
+
+# Inference
 MAX_NEW_TOKENS = 256
 TEMPERATURE = 0.0
+NUM_BEAMS = 1
+
+# Performance knobs
+MAX_SRC_TOKENS = 420        # per chunk
+BATCH_SIZE_DEFAULT = 12     # base batch size (autoscaled below)

-# --- Device selection ---
-device = 0 if torch.cuda.is_available() else -1
+# ================= Helpers =================
+def auto_batch_size(default=BATCH_SIZE_DEFAULT):
+    if not torch.cuda.is_available():
+        return max(2, min(6, default))  # CPU
+    try:
+        free, total = torch.cuda.mem_get_info()
+        gb = free / (1024**3)
+        if gb < 2: return 2
+        if gb < 4: return 6
+        if gb < 8: return 10
+        return default
+    except Exception:
+        return default

-# --- Load model & tokenizer once ---
-tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
-model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_REPO)
+BATCH_SIZE = auto_batch_size()

-translator = pipeline(
-    task="translation",
-    model=model,
-    tokenizer=tokenizer,
-    device=device,
+# -------- Load model & tokenizer (meta-safe) --------
+USE_CUDA = torch.cuda.is_available()
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO, trust_remote_code=True)
+
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    MODEL_REPO,
+    device_map="auto" if USE_CUDA else None,   # let Accelerate place weights if GPU
+    torch_dtype=torch.float16 if USE_CUDA else torch.float32,
+    low_cpu_mem_usage=False,
+    trust_remote_code=True,
 )

-def translate_fr_to_ng(text: str) -> str:
+# --- Ensure pad/eos/bos exist and are INTS (not tensors) ---
+def _to_int_or_list(x):
+    if isinstance(x, torch.Tensor):
+        return int(x.item()) if x.numel() == 1 else [int(v) for v in x.tolist()]
+    if isinstance(x, (list, tuple)):
+        return [int(v) for v in x]
+    return int(x) if x is not None else None
+
+# Safeguard pad token
+if tokenizer.pad_token is None and tokenizer.eos_token is not None:
+    tokenizer.pad_token = tokenizer.eos_token
+elif tokenizer.pad_token is None:
+    tokenizer.add_special_tokens({"pad_token": "<pad>"})
+    model.resize_token_embeddings(len(tokenizer))
+
+# Normalize generation config + mirror on model.config
+gc = model.generation_config
+for attr in ["pad_token_id", "eos_token_id", "bos_token_id", "decoder_start_token_id"]:
+    tok_val = getattr(tokenizer, attr, None)
+    cfg_val = getattr(gc, attr, None)
+    val = tok_val if tok_val is not None else cfg_val
+    if val is not None:
+        setattr(gc, attr, _to_int_or_list(val))
+    # mirror on model.config
+    val2 = getattr(model.generation_config, attr, None)
+    if val2 is not None:
+        setattr(model.config, attr, _to_int_or_list(val2))
+
+# ================= Low-level NLLB-style generation =================
+def _forced_bos_id(lang_code: str):
+    # Try common mappings first
+    if hasattr(tokenizer, "lang_code_to_id") and isinstance(tokenizer.lang_code_to_id, dict):
+        if lang_code in tokenizer.lang_code_to_id:
+            return int(tokenizer.lang_code_to_id[lang_code])
+    # Fallback: treat lang code as a token
+    try:
+        tok_id = tokenizer.convert_tokens_to_ids(lang_code)
+        if isinstance(tok_id, int) and tok_id != tokenizer.unk_token_id:
+            return tok_id
+    except Exception:
+        pass
+    # Final fallback: keep whatever the model already has
+    return model.generation_config.forced_bos_token_id
+
+def _encode(texts: List[str], src_lang: str):
+    # NLLB/M2M-style: set source lang on tokenizer if supported
+    if hasattr(tokenizer, "src_lang"):
+        tokenizer.src_lang = src_lang
+    return tokenizer(
+        texts,
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        add_special_tokens=True,
+    )
+
+def _generate_batch(texts: List[str], src_lang: str, tgt_lang: str) -> List[str]:
+    if not texts:
+        return []
+    inputs = _encode(texts, src_lang)
+
+    # NOTE: Do NOT move inputs; with device_map="auto" the hooks handle it.
+    # Keep tensors on CPU; accelerate offloads as needed.
+
+    forced_bos = _forced_bos_id(tgt_lang)
+    gen_kwargs = dict(
+        max_new_tokens=MAX_NEW_TOKENS,
+        do_sample=False,
+        num_beams=NUM_BEAMS,
+        eos_token_id=model.generation_config.eos_token_id,
+        pad_token_id=model.generation_config.pad_token_id,
+        forced_bos_token_id=forced_bos,
+    )
+
+    with torch.no_grad():
+        output_ids = model.generate(**inputs, **gen_kwargs)
+    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+
+# ================= Simple text translation =================
+def translate_text_simple(text: str) -> str:
     if not text or not text.strip():
         return ""
-    out = translator(
-        text,
-        src_lang=FR_CODE,
-        tgt_lang=NG_CODE,
-        max_new_tokens=MAX_NEW_TOKENS,
-        do_sample=False,  # TEMPERATURE=0.0 -> deterministic
+    return _generate_batch([text], FR_CODE, NG_CODE)[0]
+
+# ================= Chunking + Batched Translation + Cache =================
+def tokenize_len(s: str) -> int:
+    return tokenizer(s, add_special_tokens=False, return_length=True)["length"][0]
+
+def chunk_text_for_translation(text: str, max_src_tokens: int = MAX_SRC_TOKENS) -> List[str]:
+    """Split text by sentence-ish boundaries and merge under token limit."""
+    if not text.strip():
+        return []
+    parts = re.split(r'(\s*[\.\!\?…:;]\s+)', text)
+    sentences = []
+    for i in range(0, len(parts), 2):
+        s = parts[i]
+        p = parts[i+1] if i+1 < len(parts) else ""
+        unit = (s + (p or "")).strip()
+        if unit:
+            sentences.append(unit)
+
+    chunks, current = [], ""
+    for sent in sentences:
+        candidate = (current + " " + sent).strip() if current else sent
+        if current and tokenize_len(candidate) > max_src_tokens:
+            chunks.append(current.strip())
+            current = sent
+        else:
+            current = candidate
+    if current.strip():
+        chunks.append(current.strip())
+    return chunks
+
+# Small bounded cache (LRU-like using dict + cap)
+TRANSLATION_CACHE: Dict[str, str] = {}
+CACHE_CAP = 20000
+
+def _cache_set(k: str, v: str):
+    if len(TRANSLATION_CACHE) >= CACHE_CAP:
+        # drop ~5% oldest items
+        for i, key in enumerate(list(TRANSLATION_CACHE.keys())):
+            del TRANSLATION_CACHE[key]
+            if i > CACHE_CAP // 20:
+                break
+    TRANSLATION_CACHE[k] = v
+
+def translate_chunks_list(chunks: List[str], batch_size: int = BATCH_SIZE) -> List[str]:
+    """
+    Translate a list of chunks with de-dup + batching.
+    Returns translations in the same order as input.
+    """
+    norm_chunks = [c.strip() for c in chunks]
+    unique_to_translate = []
+    seen = set()
+    for c in norm_chunks:
+        if c and c not in TRANSLATION_CACHE and c not in seen:
+            seen.add(c)
+            unique_to_translate.append(c)
+
+    for i in range(0, len(unique_to_translate), batch_size):
+        batch = unique_to_translate[i:i + batch_size]
+        outs = _generate_batch(batch, FR_CODE, NG_CODE)
+        for src, o in zip(batch, outs):
+            _cache_set(src, o)
+
+    return [TRANSLATION_CACHE.get(c, "") for c in norm_chunks]
+
+def translate_long_text(text: str) -> str:
+    """Chunk → batch translate → rejoin for one paragraph/block."""
+    chs = chunk_text_for_translation(text)
+    if not chs:
+        return ""
+    trs = translate_chunks_list(chs)
+    return " ".join(trs).strip()
+
+# ================= DOCX helpers =================
+def is_heading(par: Paragraph) -> Tuple[bool, int]:
+    # Works with English and French Word styles
+    name = (par.style.name or "").lower()
+    if any(c in name for c in ["heading", "title", "titre"]):
+        for lvl in range(1, 10):
+            if str(lvl) in name:
+                return True, lvl
+        return True, 1
+    return False, 0
+
+def translate_docx_bytes(file_bytes: bytes) -> bytes:
+    """
+    Read .docx → collect ALL chunks (paras + table cells) → single batched translation → rebuild .docx.
+    Paragraphs and table cell paragraphs are justified; headings kept as headings.
+    """
+    f = io.BytesIO(file_bytes)
+    src_doc = docx.Document(f)
+
+    # 1) Collect work units
+    work = []  # list of dict entries describing items with ranges into all_chunks
+    all_chunks: List[str] = []
+
+    # paragraphs
+    for par in src_doc.paragraphs:
+        txt = par.text
+        if not txt.strip():
+            work.append({"kind": "blank"})
+            continue
+
+        is_head, lvl = is_heading(par)
+        if is_head:
+            work.append({"kind": "heading", "level": min(max(lvl, 1), 9), "range": (len(all_chunks), len(all_chunks)+1)})
+            all_chunks.append(txt.strip())
+        else:
+            chs = chunk_text_for_translation(txt)
+            if chs:
+                start = len(all_chunks)
+                all_chunks.extend(chs)
+                work.append({"kind": "para", "range": (start, start+len(chs))})
+            else:
+                work.append({"kind": "blank"})
+
+    # tables
+    for table in src_doc.tables:
+        t_desc = {"kind": "table", "rows": len(table.rows), "cols": len(table.columns), "cells": []}
+        for row in table.rows:
+            row_cells = []
+            for cell in row.cells:
+                cell_text = "\n".join([p.text for p in cell.paragraphs]).strip()
+                if cell_text:
+                    chs = chunk_text_for_translation(cell_text)
+                    if chs:
+                        start = len(all_chunks)
+                        all_chunks.extend(chs)
+                        row_cells.append({"range": (start, start+len(chs))})
+                    else:
+                        row_cells.append({"range": None})
+                else:
+                    row_cells.append({"range": None})
+            t_desc["cells"].append(row_cells)
+        work.append(t_desc)
+
+    # 2) Translate all chunks at once (de-dup + batching)
+    translated_all = translate_chunks_list(all_chunks) if all_chunks else []
+
+    # 3) Rebuild new document with justified paragraphs
+    new_doc = docx.Document()
+
+    def join_range(rng: Tuple[int, int]) -> str:
+        if rng is None:
+            return ""
+        s, e = rng
+        return " ".join(translated_all[s:e]).strip()
+
+    for item in work:
+        if item["kind"] == "blank":
+            new_doc.add_paragraph("")
+        elif item["kind"] == "heading":
+            text = join_range(item["range"])
+            new_doc.add_heading(text, level=item["level"])
+        elif item["kind"] == "para":
+            text = join_range(item["range"])
+            p = new_doc.add_paragraph(text)
+            p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
+        elif item["kind"] == "table":
+            tbl = new_doc.add_table(rows=item["rows"], cols=item["cols"])
+            for r_idx in range(item["rows"]):
+                for c_idx in range(item["cols"]):
+                    cell_info = item["cells"][r_idx][c_idx]
+                    txt = join_range(cell_info["range"])
+                    tgt_cell = tbl.cell(r_idx, c_idx)
+                    tgt_cell.text = txt
+                    for p in tgt_cell.paragraphs:
+                        p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
+
+    out = io.BytesIO()
+    new_doc.save(out)
+    return out.getvalue()
+
+# ================= PDF helpers =================
+def extract_pdf_text_blocks(pdf_bytes: bytes) -> List[List[str]]:
+    """
+    Returns list of pages, each a list of block texts (visual order).
+    """
+    pages_blocks: List[List[str]] = []
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    for page in doc:
+        blocks = page.get_text("blocks")
+        blocks.sort(key=lambda b: (round(b[1], 1), round(b[0], 1)))
+        page_texts = []
+        for b in blocks:
+            text = b[4].strip()
+            if text:
+                page_texts.append(text)
+        pages_blocks.append(page_texts)
+    doc.close()
+    return pages_blocks
+
+def build_pdf_from_blocks(translated_pages: List[List[str]]) -> bytes:
+    """
+    Build a clean paginated PDF with justified paragraphs.
+    Keeps one translated page per original page via PageBreak.
+    """
+    buf = io.BytesIO()
+    doc = SimpleDocTemplate(
+        buf, pagesize=A4,
+        rightMargin=2*cm, leftMargin=2*cm,
+        topMargin=2*cm, bottomMargin=2*cm
     )
-    return out[0]["translation_text"]

+    styles = getSampleStyleSheet()
+    body = styles["BodyText"]
+    body.alignment = TA_JUSTIFY
+    body.leading = 14
+
+    story = []
+    for p_idx, blocks in enumerate(translated_pages):
+        if p_idx > 0:
+            story.append(PageBreak())
+        for blk in blocks:
+            story.append(RLParagraph(blk.replace("\n", "<br/>"), body))
+            story.append(Spacer(1, 0.35*cm))
+
+    doc.build(story)
+    return buf.getvalue()
+
+def translate_pdf_bytes(file_bytes: bytes) -> bytes:
+    """
+    Read PDF → collect ALL block chunks across pages → single batched translation → rebuild PDF.
+    """
+    pages_blocks = extract_pdf_text_blocks(file_bytes)
+
+    # 1) collect chunks for the entire PDF
+    all_chunks: List[str] = []
+    plan = []  # list of pages, each a list of ranges for blocks
+    for blocks in pages_blocks:
+        page_plan = []
+        for blk in blocks:
+            chs = chunk_text_for_translation(blk)
+            if chs:
+                start = len(all_chunks)
+                all_chunks.extend(chs)
+                page_plan.append((start, start + len(chs)))
+            else:
+                page_plan.append(None)
+        plan.append(page_plan)
+
+    # 2) translate all chunks at once
+    translated_all = translate_chunks_list(all_chunks) if all_chunks else []
+
+    # 3) reconstruct per block
+    translated_pages: List[List[str]] = []
+    for page_plan in plan:
+        page_out = []
+        for rng in page_plan:
+            if rng is None:
+                page_out.append("")
+            else:
+                s, e = rng
+                page_out.append(" ".join(translated_all[s:e]).strip())
+        translated_pages.append(page_out)
+
+    return build_pdf_from_blocks(translated_pages)
+
+# ================= Gradio file handler =================
+def translate_document(file_obj):
+    """
+    Accepts gr.File input (NamedString, filepath str, or dict with binary).
+    Returns (output_file_path, status_message).
+    """
+    if file_obj is None:
+        return None, "Veuillez sélectionner un fichier .docx ou .pdf"
+
+    try:
+        name = "document"
+        data = None
+
+        # Case A: plain filepath string
+        if isinstance(file_obj, str):
+            name = os.path.basename(file_obj)
+            with open(file_obj, "rb") as f:
+                data = f.read()
+
+        # Case B: Gradio NamedString with .name (orig name) and .value (temp path)
+        elif hasattr(file_obj, "name") and hasattr(file_obj, "value"):
+            name = os.path.basename(file_obj.name or "document")
+            with open(file_obj.value, "rb") as f:
+                data = f.read()
+
+        # Case C: dict (type="binary")
+        elif isinstance(file_obj, dict) and "name" in file_obj and "data" in file_obj:
+            name = os.path.basename(file_obj["name"] or "document")
+            d = file_obj["data"]
+            data = d.read() if hasattr(d, "read") else d
+
+        else:
+            return None, "Type d'entrée fichier non supporté (filepath/binaire)."
+
+        if data is None:
+            return None, "Impossible de lire le fichier sélectionné."
+
+        if name.lower().endswith(".docx"):
+            out_bytes = translate_docx_bytes(data)
+            out_path = "translated_ngambay.docx"
+            with open(out_path, "wb") as f:
+                f.write(out_bytes)
+            return out_path, "✅ Traduction DOCX terminée (paragraphes justifiés)."
+
+        elif name.lower().endswith(".pdf"):
+            out_bytes = translate_pdf_bytes(data)
+            out_path = "translated_ngambay.pdf"
+            with open(out_path, "wb") as f:
+                f.write(out_bytes)
+            return out_path, "✅ Traduction PDF terminée (paragraphes justifiés)."
+
+        else:
+            return None, "Type de fichier non supporté. Choisissez .docx ou .pdf"
+
+    except Exception as e:
+        return None, f"❌ Erreur pendant la traduction: {e}"
+
+# ================== UI ==================
 theme = gr.themes.Soft(
     primary_hue="indigo",
     radius_size="lg",
@@ -49,22 +475,40 @@ theme = gr.themes.Soft(

 CUSTOM_CSS = """
 .gradio-container {max-width: 980px !important;}
-.header-card {
-  background: linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%);
-  color: white; padding: 22px; border-radius: 18px;
+.header-card {
+  background: linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%);
+  color: white; padding: 22px; border-radius: 18px;
   box-shadow: 0 10px 30px rgba(79,70,229,.25);
+  transition: transform .2s ease;
 }
-.header-title { font-size: 26px; font-weight: 800; margin: 0 0 6px 0; }
-.header-sub { opacity: .95; font-size: 14px; }
+.header-card:hover { transform: translateY(-1px); }
+.header-title { font-size: 26px; font-weight: 800; margin: 0 0 6px 0; letter-spacing: .2px; }
+.header-sub { opacity: .98; font-size: 14px; }
 .brand { display:flex; align-items:center; gap:10px; justify-content:space-between; flex-wrap:wrap; }
-.badge {
-  display:inline-block; background: rgba(255,255,255,.18);
-  padding: 4px 10px; border-radius: 999px; font-size: 12px;
+.badge {
+  display:inline-block; background: rgba(255,255,255,.18);
+  padding: 4px 10px; border-radius: 999px; font-size: 12px;
   border: 1px solid rgba(255,255,255,.25);
 }
 .footer-note {
   margin-top: 8px; color: #64748b; font-size: 12px; text-align: center;
 }
+.support-banner {
+  margin-top: 14px;
+  border-radius: 14px;
+  padding: 14px 16px;
+  background: linear-gradient(135deg, rgba(79,70,229,.08), rgba(124,58,237,.08));
+  border: 1px solid rgba(99,102,241,.25);
+  box-shadow: 0 6px 18px rgba(79,70,229,.08);
+}
+.support-title { font-weight: 700; font-size: 16px; margin-bottom: 4px; }
+.support-text { font-size: 13px; color: #334155; line-height: 1.5; }
+.support-contacts { display: flex; gap: 10px; flex-wrap: wrap; margin-top: 8px; }
+.support-chip {
+  display:inline-block; padding: 6px 10px; border-radius: 999px;
+  background: white; border: 1px dashed rgba(79,70,229,.45);
+  font-size: 12px; color: #3730a3;
+}
 """

 with gr.Blocks(
@@ -72,62 +516,94 @@ with gr.Blocks(
     theme=theme,
     css=CUSTOM_CSS,
     fill_height=True,
-    analytics_enabled=False
 ) as demo:
-    with gr.Column():
-        with gr.Group(elem_classes=["header-card"]):
-            gr.HTML(
-                """
-                <div class="brand">
-                  <div>
-                    <div class="header-title">Français Ngambay (v1)</div>
-                    <div class="header-sub">Traduction rapide et fidèle pour la langue la plus parlée au Tchad.</div>
-                  </div>
-                  <span class="badge">Modèle&nbsp;: Toadoum/ngambay-fr-v1</span>
+    with gr.Group(elem_classes=["header-card"]):
+        gr.HTML(
+            """
+            <div class="brand">
+              <div>
+                <div class="header-title">Français → Ngambay (v1)</div>
+                <div class="header-sub">🚀 Version bêta · Merci de tester et partager vos retours pour améliorer la qualité de traduction.</div>
                </div>
-                """
-            )
-
-        with gr.Row():
-            with gr.Column(scale=5):
-                src = gr.Textbox(
-                    label="Texte source (Français)",
-                    placeholder="Saisissez votre texte en français…",
-                    lines=8,
-                    autofocus=True
-                )
-                with gr.Row():
-                    btn = gr.Button("Traduire", variant="primary", scale=3)
-                    clear_btn = gr.Button("Effacer", scale=1)
-
-                gr.Examples(
-                    examples=[
-                        ["Bonjour, comment allez-vous aujourd’hui ?"],
-                        ["La réunion de sensibilisation aura lieu demain au centre communautaire."],
-                        ["Merci pour votre participation et votre soutien."],
-                        ["Veuillez suivre les recommandations de santé pour protéger votre famille."]
-                    ],
-                    inputs=[src],
-                    label="Exemples (cliquez pour remplir)"
-                )
-
-            with gr.Column(scale=5):
-                tgt = gr.Textbox(
-                    label="Traduction (Ngambay)",
-                    lines=8,
-                    interactive=False,
-                    show_copy_button=True
-                )
-                gr.Markdown(
-                    f"**Paramètres** : `max_new_tokens={MAX_NEW_TOKENS}`, `temperature={TEMPERATURE}` · "
-                    f"`src_lang={FR_CODE}` → `tgt_lang={NG_CODE}`"
-                )
-
-        gr.Markdown('<div class="footer-note">Astuce : collez un paragraphe complet pour un meilleur contexte.</div>')
-
-        btn.click(translate_fr_to_ng, inputs=src, outputs=tgt)
+              <span class="badge">Modèle&nbsp;: Toadoum/ngambay-fr-v1</span>
+            </div>
+            """
+        )
+
+    with gr.Tabs():
+        # -------- Tab 1: Text --------
+        with gr.Tab("Traduction de texte"):
+            with gr.Row():
+                with gr.Column(scale=5):
+                    src = gr.Textbox(
+                        label="Texte source (Français)",
+                        placeholder="Saisissez votre texte en français…",
+                        lines=8,
+                        autofocus=True
+                    )
+                    with gr.Row():
+                        btn = gr.Button("Traduire", variant="primary", scale=3)
+                        clear_btn = gr.Button("Effacer", scale=1)
+                    gr.Examples(
+                        examples=[
+                            ["Bonjour, comment allez-vous aujourd’hui ?"],
+                            ["La réunion de sensibilisation aura lieu demain au centre communautaire."],
+                            ["Merci pour votre participation et votre soutien."],
+                            ["Veuillez suivre les recommandations de santé pour protéger votre famille."]
+                        ],
+                        inputs=[src],
+                        label="Exemples (cliquez pour remplir)"
+                    )
+                with gr.Column(scale=5):
+                    tgt = gr.Textbox(
+                        label="Traduction (Ngambay)",
+                        lines=8,
+                        interactive=False,
+                        show_copy_button=True
+                    )
+            gr.Markdown('<div class="footer-note">Astuce : collez un paragraphe complet pour un meilleur contexte. Les noms propres et sigles peuvent nécessiter une relecture humaine.</div>')
+
+        # -------- Tab 2: Documents --------
+        with gr.Tab("Traduction de document (.docx / .pdf)"):
+            with gr.Row():
+                with gr.Column(scale=5):
+                    doc_inp = gr.File(
+                        label="Sélectionnez un document (.docx ou .pdf)",
+                        file_types=[".docx", ".pdf"],
+                        type="filepath"  # ensures a temp filepath; handler also supports binary
+                    )
+                    run_doc = gr.Button("Traduire le document", variant="primary")
+                with gr.Column(scale=5):
+                    doc_out = gr.File(label="Fichier traduit (télécharger)")
+                    doc_status = gr.Markdown(visible=False)
+
+            def _wrap_translate_document(f):
+                path, msg = translate_document(f)
+                return path, gr.update(value=msg, visible=True)
+
+            run_doc.click(_wrap_translate_document, inputs=doc_inp, outputs=[doc_out, doc_status])
+
+    # Contribution banner
+    gr.HTML(
+        """
+        <div class="support-banner">
+          <div class="support-title">💙 Contribuer au projet (recrutement de linguistes)</div>
+          <div class="support-text">
+            Nous cherchons à <b>recruter des linguistes</b> pour renforcer la construction de données Ngambay.
+            Si vous souhaitez soutenir financièrement ou en tant que bénévole, contactez-nous :
+          </div>
+          <div class="support-contacts">
+            <span class="support-chip">📱 WhatsApp, Airtel Money&nbsp;: <b>+235&nbsp;66&nbsp;04&nbsp;90&nbsp;94</b></span>
+            <span class="support-chip">✉️ Email&nbsp;: <a href="mailto:[email protected]">[email protected]</a></span>
+          </div>
+        </div>
+        """
+    )
+
+    # Text actions
+    btn.click(translate_text_simple, inputs=src, outputs=tgt)
     clear_btn.click(lambda: ("", ""), outputs=[src, tgt])

 if __name__ == "__main__":
-    # Gradio 4.x: concurrency is controlled via default_concurrency_limit
-    demo.queue(default_concurrency_limit=4).launch()
+    # No .to(...) anywhere; model stays where Accelerate placed it (or CPU).
+    demo.queue(default_concurrency_limit=4).launch(share=True)
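
The core of this commit is a single chunk, de-duplicate, batch, cache path that every feature funnels through (chunk_text_for_translation → translate_chunks_list → _cache_set, with _generate_batch doing the model call). Below is a minimal sketch of that flow that runs without the model; the word-reversing stub_translate, the 12-word budget, and all names here are illustrative stand-ins, not code from the commit.

import re
from typing import Dict, List

BUDGET = 12                 # tiny stand-in for MAX_SRC_TOKENS
CACHE: Dict[str, str] = {}  # stand-in for TRANSLATION_CACHE

def stub_translate(batch: List[str]) -> List[str]:
    # placeholder for _generate_batch(): reverses words instead of calling the model
    return [" ".join(reversed(s.split())) for s in batch]

def chunk(text: str) -> List[str]:
    # split on sentence-ish boundaries, then greedily merge under the budget
    # (word count stands in for the commit's tokenize_len)
    parts = re.split(r'(\s*[\.\!\?…:;]\s+)', text)
    units = [(parts[i] + (parts[i + 1] if i + 1 < len(parts) else "")).strip()
             for i in range(0, len(parts), 2)]
    chunks, cur = [], ""
    for u in (u for u in units if u):
        cand = f"{cur} {u}".strip() if cur else u
        if cur and len(cand.split()) > BUDGET:
            chunks.append(cur)
            cur = u
        else:
            cur = cand
    return chunks + [cur] if cur else chunks

def translate_chunks(chunks: List[str], batch_size: int = 4) -> List[str]:
    # translate each distinct unseen chunk once, in batches; read results back from the cache
    todo = [c for c in dict.fromkeys(c.strip() for c in chunks) if c and c not in CACHE]
    for i in range(0, len(todo), batch_size):
        batch = todo[i:i + batch_size]
        for src, out in zip(batch, stub_translate(batch)):
            CACHE[src] = out
    return [CACHE.get(c.strip(), "") for c in chunks]

for para in ["Le chat dort. " * 4, "Le chat dort. " * 4 + "Le chien court."]:
    print(translate_chunks(chunk(para)))
print(len(CACHE), "unique chunks sent to the 'model'")

The second paragraph re-emits the first paragraph's chunk verbatim, so only two unique chunks ever reach the stub; in app.py the same property means a sentence repeated across DOCX paragraphs, table cells, or PDF blocks is translated exactly once.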
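On the generation path, _forced_bos_id() and _encode() exist because NLLB/M2M-style checkpoints select the output language by forcing the decoder's first token, and recent transformers releases dropped the lang_code_to_id mapping that translation pipelines used to rely on; hence the commit's dict-lookup-then-convert_tokens_to_ids fallback. A sketch of the underlying mechanics, using the public facebook/nllb-200-distilled-600M checkpoint and a fra→eng pair as stand-ins (nothing on this page confirms Toadoum/ngambay-fr-v1 shares NLLB's language inventory):

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

name = "facebook/nllb-200-distilled-600M"  # public stand-in checkpoint, not the Space's model
tok = AutoTokenizer.from_pretrained(name, src_lang="fra_Latn")  # source tag is prepended at encode time
mdl = AutoModelForSeq2SeqLM.from_pretrained(name)

batch = tok(["Bonjour, comment allez-vous ?"], return_tensors="pt", padding=True)
ids = mdl.generate(
    **batch,
    # force the decoder's first token to the target-language tag; this is the id
    # that _forced_bos_id() resolves, via dict lookup first and then
    # convert_tokens_to_ids as the version-proof fallback
    forced_bos_token_id=tok.convert_tokens_to_ids("eng_Latn"),
    max_new_tokens=48,
    do_sample=False,
)
print(tok.batch_decode(ids, skip_special_tokens=True)[0])  # e.g. "Hello, how are you?"

If a language tag is absent from a tokenizer, convert_tokens_to_ids returns the unk id and generation silently degrades, which is why the helper in this commit checks the result against tokenizer.unk_token_id before trusting the fallback.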