Spaces:

konkani
/

Goan-information

Running

App Files Files Community

Reubencf commited on 6 days ago

Commit

f98a7e0

verified ·

1 Parent(s): 16bcde3

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -271

app.py CHANGED Viewed

@@ -1,299 +1,111 @@
-# app.py — Hugging Face Space ready (LoRA adapter, Gradio compat)
 # ---------------------------------------------------------------
-# What changed vs your script
-# - Removed ChatInterface args that broke on old Gradio (retry_btn, undo_btn)
-# - No interactive input() for merging (Spaces are non-interactive). Use MERGE_LORA env var.
-# - Secrets: read HF token from env (Settings → Secrets → HF_TOKEN), never hardcode.
-# - Token passing works across transformers/peft versions (token/use_auth_token fallback).
-# - Optional 8-bit via USE_8BIT=1 (GPU only). Safe CPU defaults.
-# - Robust theme/queue/launch for mixed Gradio versions.
 import os
-import gc
-import warnings
-from typing import List, Tuple
 import torch
 import gradio as gr
-warnings.filterwarnings("ignore")
-os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
-try:
-    from peft import PeftConfig, PeftModel
-    from transformers import (
-        AutoTokenizer,
-        AutoModelForCausalLM,
-        BitsAndBytesConfig,
-    )
-    IMPORTS_OK = True
-except Exception as e:
-    IMPORTS_OK = False
-    print(f"Missing dependencies: {e}")
-    print("Install: pip install --upgrade 'transformers>=4.41' peft accelerate gradio torch bitsandbytes")
 # ── Configuration ──────────────────────────────────────────────────────────────
-HF_TOKEN = os.getenv("HF_TOKEN")  # set in Space Settings → Secrets → HF_TOKEN
-# LoRA adapter repo (must be compatible with BASE_MODEL_ID)
-ADAPTER_ID = os.getenv("ADAPTER_ID", "Reubencf/gemma3-goan-finetuned")
-# Base model used during fine-tuning (should match adapter's base)
-BASE_MODEL_ID_DEFAULT = os.getenv("BASE_MODEL_ID", "google/gemma-3-4b-it")
-# Quantization toggle (GPU only): set USE_8BIT=1 in Space variables
-USE_8BIT = os.getenv("USE_8BIT", "0").lower() in {"1", "true", "yes", "y"}
-# Merge LoRA into the base for faster inference: MERGE_LORA=1/0
-MERGE_LORA = os.getenv("MERGE_LORA", "1").lower() in {"1", "true", "yes", "y"}
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 TITLE = "🌴 Gemma Goan Q&A Bot"
-DESCRIPTION_TMPL = (
-    "Gemma base model + LoRA adapter fine-tuned on a Goan Q&A dataset.\n"
-    "Ask about Goa, Konkani culture, or general topics!\n\n"
-    "**Status**: {}"
 )
-# ── Helpers ───────────────────────────────────────────────────────────────────
-def call_with_token(fn, *args, **kwargs):
-    """Call HF/Transformers/PEFT functions with token OR use_auth_token for
-    broad version compatibility."""
-    if HF_TOKEN:
-        try:
-            return fn(*args, token=HF_TOKEN, **kwargs)
-        except TypeError:
-            return fn(*args, use_auth_token=HF_TOKEN, **kwargs)
-    return fn(*args, **kwargs)
-# ── Load model + tokenizer ─────────────────────────────────────────────────────
-def load_model_and_tokenizer():
-    if not IMPORTS_OK:
-        raise ImportError("Required packages not installed.")
-    print("[Init] Starting model load…")
-    print(f"[Config] Device: {DEVICE}")
-    # GC + VRAM cleanup
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    # Step 1: Confirm base model from the adapter's config if possible
-    actual_base_model = BASE_MODEL_ID_DEFAULT
-    try:
-        print(f"[Load] Reading adapter config: {ADAPTER_ID}")
-        peft_cfg = call_with_token(PeftConfig.from_pretrained, ADAPTER_ID)
-        if getattr(peft_cfg, "base_model_name_or_path", None):
-            actual_base_model = peft_cfg.base_model_name_or_path
-            print(f"[Load] Adapter expects base model: {actual_base_model}")
-        else:
-            print("[Warn] Adapter did not expose base_model_name_or_path; using configured base.")
-    except Exception as e:
-        print(f"[Warn] Could not read adapter config ({e}); using configured base: {actual_base_model}")
-    # Step 2: Load base model (optionally quantized on GPU)
-    print(f"[Load] Loading base model: {actual_base_model}")
-    quant_cfg = None
-    if USE_8BIT and torch.cuda.is_available():
-        print("[Load] Enabling 8-bit quantization (bitsandbytes)")
-        quant_cfg = BitsAndBytesConfig(load_in_8bit=True, bnb_8bit_compute_dtype=torch.float16)
-    base_model = call_with_token(
-        AutoModelForCausalLM.from_pretrained,
-        actual_base_model,
-        trust_remote_code=True,
-        quantization_config=quant_cfg,
-        low_cpu_mem_usage=True,
-        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-        device_map="auto" if torch.cuda.is_available() else None,
-    )
-    if DEVICE == "cpu" and not torch.cuda.is_available():
-        base_model = base_model.to("cpu")
-        print("[Load] Model on CPU")
-    print("[Load] Base model loaded ✔")
-    # Step 3: Tokenizer
-    print("[Load] Loading tokenizer…")
-    tokenizer = call_with_token(
-        AutoTokenizer.from_pretrained,
-        actual_base_model,
-        use_fast=True,
-        trust_remote_code=True,
-    )
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "left"
-    # Step 4: Apply LoRA adapter
-    status = ""
-    model = base_model
-    try:
-        print(f"[Load] Applying LoRA adapter: {ADAPTER_ID}")
-        model = call_with_token(PeftModel.from_pretrained, base_model, ADAPTER_ID)
-        if MERGE_LORA:
-            print("[Load] Merging adapter into base (merge_and_unload)…")
-            model = model.merge_and_unload()
-            status = f"✅ Using fine-tuned model (merged): {ADAPTER_ID}"
-        else:
-            status = f"✅ Using fine-tuned model via adapter: {ADAPTER_ID}"
-    except FileNotFoundError as e:
-        print(f"[Error] Adapter files not found: {e}")
-        status = f"⚠️ Adapter not found. Using base only: {actual_base_model}"
-    except Exception as e:
-        print(f"[Error] Failed to load adapter: {e}")
-        status = f"⚠️ Could not load adapter. Using base only: {actual_base_model}"
-    model.eval()
-    print(f"[Load] Model ready on {DEVICE} ✔")
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    return model, tokenizer, status
-# Global load at import time (Space-friendly)
 try:
-    model, tokenizer, STATUS_MSG = load_model_and_tokenizer()
     MODEL_LOADED = True
-    DESCRIPTION = DESCRIPTION_TMPL.format(STATUS_MSG)
 except Exception as e:
-    print(f"[Fatal] Could not load model: {e}")
     MODEL_LOADED = False
-    model = tokenizer = None
-    DESCRIPTION = DESCRIPTION_TMPL.format(f"❌ Model failed to load: {str(e)[:140]}")
-# ── Generation ────────────────────────────────────────────────────────────────
-def generate_response(
-    message: str,
-    history: List[Tuple[str, str]],
-    temperature: float = 0.7,
-    max_new_tokens: int = 256,
-    top_p: float = 0.95,
-    repetition_penalty: float = 1.1,
-) -> str:
     if not MODEL_LOADED:
-        return "⚠️ Model failed to load. Check Space logs."
-    try:
-        # Build short chat history
-        conversation = []
-        if history:
-            for u, a in history[-3:]:
-                if u:
-                    conversation.append({"role": "user", "content": u})
-                if a:
-                    conversation.append({"role": "assistant", "content": a})
-        conversation.append({"role": "user", "content": message})
-        # Try the tokenizer's chat template first
-        try:
-            input_ids = tokenizer.apply_chat_template(
-                conversation,
-                add_generation_prompt=True,
-                return_tensors="pt",
-            )
-        except Exception as e:
-            print(f"[Warn] chat_template failed: {e}; using manual format")
-            prompt_text = "".join(
-                [
-                    ("User: " + m["content"] + "\n") if m["role"] == "user" else ("Assistant: " + m["content"] + "\n")
-                    for m in conversation
-                ]
-            ) + "Assistant: "
-            input_ids = tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=1024).input_ids
-        input_ids = input_ids.to(model.device if hasattr(model, "device") else DEVICE)
-        with torch.no_grad():
-            out = model.generate(
-                input_ids=input_ids,
-                max_new_tokens=max(1, min(int(max_new_tokens), 512)),
-                temperature=float(temperature),
-                top_p=float(top_p),
-                repetition_penalty=float(repetition_penalty),
-                do_sample=True,
-                pad_token_id=tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                use_cache=True,
-            )
-        gen = out[0][input_ids.shape[-1]:]
-        text = tokenizer.decode(gen, skip_special_tokens=True).strip()
-        # Cleanup
-        del out, input_ids, gen
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        return text or "(no output)"
-    except Exception as e:
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        return f"⚠️ Error generating response: {e}"
 # ── UI ────────────────────────────────────────────────────────────────────────
 examples = [
-    ["What is the capital of Goa?", 0.7, 256, 0.95, 1.1],
-    ["Tell me about the Konkani language.", 0.7, 256, 0.95, 1.1],
-    ["What are famous beaches in Goa?", 0.7, 256, 0.95, 1.1],
-    ["Describe Goan fish curry.", 0.7, 256, 0.95, 1.1],
-    ["What is the history of Old Goa?", 0.7, 256, 0.95, 1.1],
 ]
-# Best-effort theme across versions
-try:
-    THEME = gr.themes.Soft()
-except Exception:
-    THEME = None
-if MODEL_LOADED:
-    demo = gr.ChatInterface(
-        fn=generate_response,
-        title=TITLE,
-        description=DESCRIPTION,
-        examples=examples,
-        additional_inputs=[
-            gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
-            gr.Slider(minimum=32, maximum=512, value=256, step=16, label="Max new tokens"),
-            gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
-            gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition penalty"),
-        ],
-        theme=THEME,
-    )
-else:
-    demo = gr.Interface(
-        fn=lambda x: "Model failed to load. Check Space logs.",
-        inputs=gr.Textbox(label="Message"),
-        outputs=gr.Textbox(label="Response"),
-        title=TITLE,
-        description=DESCRIPTION,
-        theme=THEME,
-    )
-# Queue — keep params minimal for cross-version compat
-try:
-    demo.queue()
-except Exception:
-    pass
 if __name__ == "__main__":
-    print("\n" + "=" * 60)
-    print(f"🚀 Starting Gradio app on {DEVICE} …")
-    print(f"📍 Base model: {BASE_MODEL_ID_DEFAULT}")
-    print(f"🔧 LoRA adapter: {ADAPTER_ID}")
-    print(f"🧩 Merge LoRA: {MERGE_LORA}")
-    print("=" * 60 + "\n")
-    # On Spaces, just calling launch() is fine.
     demo.launch()

+# app.py — Simplified for Hugging Face Spaces
+# ---------------------------------------------------------------
+# This version uses the high-level `pipeline` from transformers
+# for a much simpler and cleaner implementation.
 # ---------------------------------------------------------------
 import os
 import torch
 import gradio as gr
+from transformers import pipeline
 # ── Configuration ──────────────────────────────────────────────────────────────
+# Set the model repository ID
+MODEL_ID = "Reubencf/gemma3-goan-finetuned"
+HF_TOKEN = os.getenv("HF_TOKEN") # Optional: for private models
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 TITLE = "🌴 Gemma Goan Q&A Bot"
+DESCRIPTION = (
+    "This is a simple Gradio chat interface for the Gemma model fine-tuned on a Goan Q&A dataset.\n"
+    "Ask about Goa, Konkani culture, or general topics!"
 )
+# ── Load Model Pipeline ─────────────────────────────────────────────────────
+# We load the model and tokenizer into a pipeline object.
+# This is done only once when the app starts.
+# `device_map="auto"` ensures the model is placed on a GPU if available.
+print(f"[Init] Loading model pipeline: {MODEL_ID} on {DEVICE}...")
 try:
+    pipe = pipeline(
+        "text-generation",
+        model=MODEL_ID,
+        torch_dtype=torch.bfloat16, # Use bfloat16 for better performance
+        device_map="auto",
+        token=HF_TOKEN,
+    )
     MODEL_LOADED = True
+    print("[Init] Model pipeline loaded successfully.")
 except Exception as e:
     MODEL_LOADED = False
+    DESCRIPTION = f"❌ Model failed to load: {e}"
+    print(f"[Fatal] Could not load model: {e}")
+# ── Generation Function ──────────────────────────────────────────────────────
+def generate_response(message, history):
+    """
+    This function is called for each user message.
+    It takes the user's message and the conversation history,
+    formats them for the model, and returns the model's response.
+    """
     if not MODEL_LOADED:
+        return "⚠️ Model is not available. Please check the Space logs for errors."
+    # Format the conversation history into the format expected by the model
+    # The model expects a list of dictionaries with "role" and "content" keys
+    conversation = []
+    for user_msg, assistant_msg in history:
+        conversation.append({"role": "user", "content": user_msg})
+        if assistant_msg:
+            conversation.append({"role": "assistant", "content": assistant_msg})
+    # Add the current user's message
+    conversation.append({"role": "user", "content": message})
+    # Use the pipeline's tokenizer to apply the chat template
+    # This correctly formats the input for the conversational model
+    prompt = pipe.tokenizer.apply_chat_template(
+        conversation,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    # Generate the response using the pipeline
+    outputs = pipe(
+        prompt,
+        do_sample=True,
+        temperature=0.7,
+        top_k=50,
+        top_p=0.95
+    )
+    # The pipeline output includes the entire conversation history (prompt).
+    # We need to extract only the newly generated text from the assistant.
+    response = outputs[0]["generated_text"]
+    # Slice the response to get only the new part
+    new_response = response[len(prompt):].strip()
+    return new_response
 # ── UI ────────────────────────────────────────────────────────────────────────
+# Define some example questions to display in the UI
 examples = [
+    "What is bebinca?",
+    "Tell me about the history of Feni.",
+    "Suggest a good, quiet beach in South Goa.",
+    "Describe Goan fish curry.",
 ]
+# Create the Gradio ChatInterface
+demo = gr.ChatInterface(
+    fn=generate_response,
+    title=TITLE,
+    description=DESCRIPTION,
+    examples=examples,
+    theme="soft",
+)
+# ── Launch ────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
+    print("🚀 Starting Gradio app...")
     demo.launch()