"""Mini NER labelling app: upload a CSV, label tokens, export or push to the Hub."""
import tempfile
from pathlib import Path

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, Repository

# Allowed tags
LABELS = {"PER", "ORG", "LOC", "O"}
# Column layout shared by the token table, the exports and the global store.
TOKEN_COLUMNS = ["example_id", "token", "label"]
token_df = pd.DataFrame(columns=TOKEN_COLUMNS)  # global store


# ───────────────────────── token explode ───────────────────────
def explode(df: pd.DataFrame) -> pd.DataFrame:
    """Return DataFrame(example_id, token, label='O').

    Each input row becomes one example; its text is split on whitespace and
    every token gets its own output row labelled ``"O"``.

    Args:
        df: Frame with either a ``text`` column or both ``user`` and
            ``assistant`` columns.

    Returns:
        A frame with columns ``example_id``, ``token`` and ``label``. The
        columns are present even when no tokens were produced.
    """
    # Missing cells are treated as empty text instead of the literal "nan".
    if "text" in df.columns:
        lines = df["text"].fillna("").astype(str)
    else:  # user / assistant dialogs
        user = df["user"].fillna("").astype(str)
        assistant = df["assistant"].fillna("").astype(str)
        lines = "User: " + user + " Assistant: " + assistant

    rows = []
    for sid, line in enumerate(lines):  # ensure unique 0,1,2,...
        for tok in line.split():
            rows.append({"example_id": sid, "token": tok, "label": "O"})
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=TOKEN_COLUMNS)


# ───────────────────────── callbacks ───────────────────────────
def _load_failure(msg: str) -> tuple:
    """Build the five-output response for a failed load (everything hidden)."""
    return None, msg, *(gr.update(visible=False),) * 3


def load_csv(file):
    """Read the uploaded CSV and fill the global token table.

    Args:
        file: Value of the ``gr.File`` input. May be ``None`` (nothing
            uploaded), a path string, or an object exposing ``.name``.

    Returns:
        Five values for (token table, status text, button row, tokens
        download, IOB download).
    """
    global token_df
    if file is None:
        return _load_failure("❌ Please upload a CSV file first.")

    # Depending on the Gradio version the upload is a path or a file wrapper.
    path = file if isinstance(file, (str, Path)) else file.name
    try:
        df = pd.read_csv(path)
    except (OSError, ValueError) as err:  # pandas parser errors are ValueErrors
        return _load_failure(f"❌ Could not read CSV: {err}")

    valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
    if not valid:
        msg = "❌ CSV needs a `text` column **or** both `user` and `assistant` columns."
        return _load_failure(msg)

    token_df = explode(df)
    return (
        gr.update(value=token_df, visible=True, row_count=len(token_df)),
        f"✅ {len(df)} rows → {len(token_df)} tokens.",
        gr.update(visible=True),   # show buttons
        gr.update(visible=False),  # reset download links
        gr.update(visible=False),
    )


def save_table(tbl):
    """Store the edited table globally and report labels outside ``LABELS``.

    Args:
        tbl: Current contents of the token table component.

    Returns:
        A status message: a confirmation, or a warning listing unknown labels.
    """
    global token_df
    token_df = pd.DataFrame(tbl, columns=TOKEN_COLUMNS)
    bad = token_df.loc[~token_df["label"].isin(LABELS), "label"].unique()
    if bad.size == 0:
        return "💾 Saved."
    # Cleared or numeric cells are not strings, so convert before joining.
    return f"⚠️ Unknown label(s): {', '.join(map(str, bad))}"


def export_tokens():
    """Write the token table to ``raw_tokens.csv`` and reveal the download."""
    fname = "raw_tokens.csv"
    token_df.to_csv(fname, index=False)
    return gr.update(value=fname, visible=True)


def export_iob():
    """Write the token table plus an ``iob`` column to ``ner_iob.csv``.

    A non-``O`` label is prefixed with ``I-`` when the previous token of the
    same example carried the same label, otherwise with ``B-``.
    """
    iob = []
    prev = {}  # example_id -> label of that example's previous token
    for sid, lbl in zip(token_df["example_id"], token_df["label"]):
        if lbl == "O":
            iob.append("O")
            prev[sid] = None
        else:
            iob.append(("I-" if prev.get(sid) == lbl else "B-") + lbl)
            prev[sid] = lbl

    out = token_df.copy()
    out["iob"] = iob
    fname = "ner_iob.csv"
    out.to_csv(fname, index=False)
    return gr.update(value=fname, visible=True)


def push_to_hub(repo_id, token):
    """Push the token table as ``data.csv`` to a Hub dataset repository.

    Args:
        repo_id: Target dataset repository id.
        token: Access token used to create and push to the repository.

    Returns:
        The dataset URL on success, otherwise a message starting with ``❌``.
    """
    repo_id = (repo_id or "").strip()
    if not repo_id:
        return "❌ Please enter a repository id."
    try:
        HfApi().create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
        # Clone into a throwaway directory: a leftover clone contains a `.git`
        # folder that file-by-file deletion cannot remove, and the path must
        # not be derived from user input.
        with tempfile.TemporaryDirectory() as tmp:
            local = Path(tmp) / "repo"
            repo = Repository(str(local), clone_from=repo_id,
                              repo_type="dataset", use_auth_token=token)
            token_df.to_csv(local / "data.csv", index=False)
            repo.push_to_hub("Add annotated NER data")
        return f"🚀 https://huggingface.co/datasets/{repo_id}"
    except Exception as e:  # UI boundary: show the failure instead of crashing
        return f"❌ {e}"


# ───────────────────────── UI ──────────────────────────────────
with gr.Blocks() as demo:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1** – upload CSV (columns: `text` **or** `user`+`assistant`).")

    with gr.Row():
        csv_file = gr.File(file_types=[".csv"])
        load_btn = gr.Button("Load")

    status = gr.Textbox(interactive=False)
    tok_table = gr.Dataframe(headers=["example_id", "token", "label"],
                             datatype=["number", "str", "str"],
                             visible=False)

    # Hidden until a CSV has been loaded successfully.
    with gr.Row(visible=False) as buttons:
        save_btn = gr.Button("💾 Save")
        tok_btn = gr.Button("⬇︎ Tokens CSV")
        iob_btn = gr.Button("⬇︎ IOB CSV")

    file_tok = gr.File(visible=False)
    file_iob = gr.File(visible=False)

    with gr.Accordion("📦 Push to Hugging Face Hub", open=False, visible=False) as acc:
        repo_in = gr.Textbox(label="repo")
        token_in = gr.Textbox(label="token", type="password")
        push_btn = gr.Button("Push")
        push_out = gr.Textbox(interactive=False)

    # wiring
    load_btn.click(load_csv, csv_file, [tok_table, status, buttons, file_tok, file_iob])
    load_btn.click(lambda: gr.update(visible=True), None, acc)
    save_btn.click(save_table, tok_table, status)
    tok_btn.click(export_tokens, outputs=file_tok)
    iob_btn.click(export_iob, outputs=file_iob)
    push_btn.click(push_to_hub, [repo_in, token_in], push_out)

    gr.Markdown("**Step 2** – label tokens (`PER`, `ORG`, `LOC`, `O`) ➜ Save ➜ Download / Push.")

# Launch only when run as a script so importing the module has no side effect.
if __name__ == "__main__":
    demo.launch()