Update app.py
app.py CHANGED
@@ -12,17 +12,97 @@ import spaces
 import torch
 from loguru import logger
 from PIL import Image
-from transformers import AutoProcessor,
+from transformers import AutoProcessor, TextIteratorStreamer
+
+# ─────────────────────────────────────────────────────────────────────
+# Model & processor
+# ─────────────────────────────────────────────────────────────────────
+MODEL_ID = os.getenv("MODEL_ID", "rmdhirr/Kenanga-11B-IT")
+processor = AutoProcessor.from_pretrained(MODEL_ID, padding_side="left")
+
+# Try Gemma-3 vision first; if it fails, fall back to Llama 3.2 Vision (Mllama)
+model = None
+_last_load_error = None
+try:
+    from transformers import Gemma3ForConditionalGeneration
+    model = Gemma3ForConditionalGeneration.from_pretrained(
+        MODEL_ID, device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager"
+    )
+except Exception as e:
+    _last_load_error = e
+    try:
+        from transformers import MllamaForConditionalGeneration
+        model = MllamaForConditionalGeneration.from_pretrained(
+            MODEL_ID, device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager"
+        )
+    except Exception as e2:
+        raise RuntimeError(
+            f"Failed to load model as Gemma3 and Mllama.\nGemma3 error: {type(_last_load_error).__name__}: {_last_load_error}\n"
+            f"Mllama error: {type(e2).__name__}: {e2}"
+        )
+
+MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
 
-
-
-
-
+# ─────────────────────────────────────────────────────────────────────
+# Identity controls (System Prompt + Stream Sanitizer + Optional Logit Ban)
+# ─────────────────────────────────────────────────────────────────────
+IDENTITY_PROMPT = (
+    "You are Kenanga, an Indonesian multimodal LVLM adapted for Sundanese and Javanese.\n"
+    "Identity rules:\n"
+    "• When referring to yourself, always say “Kenanga”.\n"
+    "• Never claim to be Gemma/Llama or any base model. If asked about your base, reply briefly: "
+    "“I’m Kenanga (locally adapted); please refer to me as Kenanga.”\n"
+    "• Stay helpful, concise, and safe."
 )
 
-
+BAN_BASE_NAMES = os.getenv("BAN_BASE_NAMES", "0") == "1"
+
+def _make_bad_words_ids(words):
+    toks = processor.tokenizer
+    ids = []
+    for w in words:
+        for variant in {w, w.lower(), w.upper(), w.title(), " " + w, " " + w.lower()}:
+            enc = toks(variant, add_special_tokens=False).input_ids
+            if enc:
+                ids.append(enc)
+    # dedupe
+    uniq, seen = [], set()
+    for seq in ids:
+        t = tuple(seq)
+        if t and t not in seen:
+            uniq.append(seq)
+            seen.add(t)
+    return uniq
+
+BAD_WORDS_IDS = _make_bad_words_ids([
+    "Gemma", "Gemma-3", "Gemma 3", "Gemma3",
+    # Uncomment to ban base model family self-calls entirely:
+    # "Llama", "LLaMA", "Llama 3", "Llama 3.2", "Llama3", "Llama3.2",
+])
+
+# Only rewrite self-identity claims; allow legitimate mentions in analysis/comparison text
+SELF_REF_PAT = re.compile(
+    r"\b(?:(?:I\s*am|I'm|This\s+is|You'?re\s+chatting\s+with)\s+)(Gemma(?:[-\s]?3)?|LLa?ma(?:\s*3(?:\.2)?)?)\b",
+    flags=re.IGNORECASE,
+)
+AS_MODEL_PAT = re.compile(
+    r"\bAs\s+(?:an?\s+)?(Gemma(?:[-\s]?3)?|LLa?ma(?:\s*3(?:\.2)?)?)\b",
+    flags=re.IGNORECASE,
+)
+THIS_MODEL_IS_PAT = re.compile(
+    r"\b(This\s+model\s+is)\s+(Gemma(?:[-\s]?3)?|LLa?ma(?:\s*3(?:\.2)?)?)\b",
+    flags=re.IGNORECASE,
+)
 
+def sanitize_identity(text: str) -> str:
+    text = SELF_REF_PAT.sub("I am Kenanga", text)
+    text = AS_MODEL_PAT.sub("As Kenanga", text)
+    text = THIS_MODEL_IS_PAT.sub(r"\1 Kenanga", text)
+    return text
 
+# ─────────────────────────────────────────────────────────────────────
+# Media utilities
+# ─────────────────────────────────────────────────────────────────────
 def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
     image_count = 0
     video_count = 0
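A quick standalone check of the identity sanitizer introduced in this hunk. This is a minimal sketch: the pattern is copied verbatim from the diff, and the sample strings are invented for illustration.

import re

SELF_REF_PAT = re.compile(
    r"\b(?:(?:I\s*am|I'm|This\s+is|You'?re\s+chatting\s+with)\s+)(Gemma(?:[-\s]?3)?|LLa?ma(?:\s*3(?:\.2)?)?)\b",
    flags=re.IGNORECASE,
)

# Self-identity claims are rewritten to Kenanga...
print(SELF_REF_PAT.sub("I am Kenanga", "Hi! I'm Gemma-3, happy to help."))
# -> Hi! I am Kenanga, happy to help.

# ...while plain comparative mentions are left untouched.
print(SELF_REF_PAT.sub("I am Kenanga", "Gemma 3 and Llama 3.2 are the base model families."))
# -> unchanged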
@@ -33,7 +113,6 @@ def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
             image_count += 1
     return image_count, video_count
 
-
 def count_files_in_history(history: list[dict]) -> tuple[int, int]:
     image_count = 0
     video_count = 0
@@ -46,7 +125,6 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
             image_count += 1
     return image_count, video_count
 
-
 def validate_media_constraints(message: dict, history: list[dict]) -> bool:
     new_image_count, new_video_count = count_files_in_new_message(message["files"])
     history_image_count, history_video_count = count_files_in_history(history)
@@ -70,19 +148,15 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
         return False
     return True
 
-
 def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
     vidcap = cv2.VideoCapture(video_path)
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-
     frame_interval = max(total_frames // MAX_NUM_IMAGES, 1)
     frames: list[tuple[Image.Image, float]] = []
-
     for i in range(0, min(total_frames, MAX_NUM_IMAGES * frame_interval), frame_interval):
         if len(frames) >= MAX_NUM_IMAGES:
             break
-
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
         if success:
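For reference, the sampling arithmetic in downsample_video works out as follows. A minimal sketch with illustrative numbers (a 10-second clip at 30 fps and the default MAX_NUM_IMAGES of 5), independent of OpenCV:

# Illustrative values only: a 300-frame clip at 30 fps, MAX_NUM_IMAGES = 5.
total_frames, fps, MAX_NUM_IMAGES = 300, 30.0, 5

frame_interval = max(total_frames // MAX_NUM_IMAGES, 1)                        # 60
indices = range(0, min(total_frames, MAX_NUM_IMAGES * frame_interval), frame_interval)
print(list(indices))                          # [0, 60, 120, 180, 240] -> 5 evenly spaced frames
print([round(i / fps, 2) for i in indices])   # timestamps in seconds: [0.0, 2.0, 4.0, 6.0, 8.0]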
@@ -90,16 +164,13 @@ def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
-
     vidcap.release()
     return frames
 
-
 def process_video(video_path: str) -> list[dict]:
     content = []
     frames = downsample_video(video_path)
-    for frame in frames:
-        pil_image, timestamp = frame
+    for pil_image, timestamp in frames:
         with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
             pil_image.save(temp_file.name)
             content.append({"type": "text", "text": f"Frame {timestamp}:"})
@@ -107,12 +178,10 @@ def process_video(video_path: str) -> list[dict]:
     logger.debug(f"{content=}")
     return content
 
-
 def process_interleaved_images(message: dict) -> list[dict]:
     logger.debug(f"{message['files']=}")
     parts = re.split(r"(<image>)", message["text"])
     logger.debug(f"{parts=}")
-
     content = []
     image_index = 0
     for part in parts:
@@ -128,23 +197,18 @@ def process_interleaved_images(message: dict) -> list[dict]:
     logger.debug(f"{content=}")
     return content
 
-
 def process_new_user_message(message: dict) -> list[dict]:
     if not message["files"]:
         return [{"type": "text", "text": message["text"]}]
-
     if message["files"][0].endswith(".mp4"):
         return [{"type": "text", "text": message["text"]}, *process_video(message["files"][0])]
-
     if "<image>" in message["text"]:
         return process_interleaved_images(message)
-
     return [
         {"type": "text", "text": message["text"]},
         *[{"type": "image", "url": path} for path in message["files"]],
     ]
 
-
 def process_history(history: list[dict]) -> list[dict]:
     messages = []
     current_user_content: list[dict] = []
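For context, the message dict handled in this hunk is what gr.MultimodalTextbox produces. The sketch below uses hypothetical file paths and shows the plain-attachment branch whose return value is spelled out above; the .mp4 and <image> branches delegate to process_video and process_interleaved_images.

# Hypothetical input from gr.MultimodalTextbox: free text plus attached file paths.
message = {"text": "What is in this photo?", "files": ["/tmp/photo.png"]}

# process_new_user_message(message) would return the chat-template content list:
# [{"type": "text", "text": "What is in this photo?"},
#  {"type": "image", "url": "/tmp/photo.png"}]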
@@ -162,16 +226,19 @@ def process_history(history: list[dict]) -> list[dict]:
             current_user_content.append({"type": "image", "url": content[0]})
     return messages
 
-
+# ─────────────────────────────────────────────────────────────────────
+# Generation
+# ─────────────────────────────────────────────────────────────────────
 @spaces.GPU(duration=120)
 def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
     if not validate_media_constraints(message, history):
         yield ""
         return
 
+    effective_sys = IDENTITY_PROMPT if not system_prompt else (IDENTITY_PROMPT + "\n\n" + system_prompt)
+
     messages = []
-
-    messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
+    messages.append({"role": "system", "content": [{"type": "text", "text": effective_sys}]})
     messages.extend(process_history(history))
     messages.append({"role": "user", "content": process_new_user_message(message)})
 
@@ -183,22 +250,30 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tok
         return_tensors="pt",
     ).to(device=model.device, dtype=torch.bfloat16)
 
-    streamer = TextIteratorStreamer(
+    streamer = TextIteratorStreamer(
+        processor.tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
+    )
+
     generate_kwargs = dict(
         inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         disable_compile=True,
     )
+    if BAN_BASE_NAMES and BAD_WORDS_IDS:
+        generate_kwargs["bad_words_ids"] = BAD_WORDS_IDS
+
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
     output = ""
     for delta in streamer:
         output += delta
-        yield output
-
+        yield sanitize_identity(output)
 
+# ─────────────────────────────────────────────────────────────────────
+# Demo UI
+# ─────────────────────────────────────────────────────────────────────
 examples = [
     [
         {
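A note on the optional logit ban wired in above: the bad_words_ids argument of generate() expects a list of token-id sequences, which is what _make_bad_words_ids builds from the name variants. A minimal standalone sketch follows; the gpt2 tokenizer is used purely for illustration, while the Space itself uses processor.tokenizer.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")   # illustrative tokenizer only
banned = ["Gemma", " Gemma", "GEMMA"]         # casing / leading-space variants tokenize differently
bad_words_ids = [tok(w, add_special_tokens=False).input_ids for w in banned]
print(bad_words_ids)   # one token-id sequence per banned string, e.g. [[...], [...], [...]]

# model.generate(..., bad_words_ids=bad_words_ids) blocks exactly these sequences,
# which is why _make_bad_words_ids enumerates several surface forms of each name.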
@@ -321,11 +396,10 @@ examples = [
     ],
 ]
 
-
 DESCRIPTION = """\
 <img src='https://huggingface.co/spaces/huggingface-projects/gemma-3-12b-it/resolve/main/assets/logo.png' id='logo' />
 <div align='center'>
-This is a demo of Kenanga 11B IT, a multimodal Large Vision-Language Model (LVLM) adapted for Sundanese and Javanese support
+This is a demo of Kenanga 11B IT, a multimodal Large Vision-Language Model (LVLM) adapted for Sundanese and Javanese support.<br/>
 You can upload images, as well as interleaved images and videos. Video input is limited to single-turn conversations and must be in MP4 format.
 </div>
 """
@@ -337,7 +411,7 @@ demo = gr.ChatInterface(
     textbox=gr.MultimodalTextbox(file_types=["image", ".mp4"], file_count="multiple", autofocus=True),
     multimodal=True,
     additional_inputs=[
-        gr.Textbox(label="System Prompt", value=
+        gr.Textbox(label="System Prompt", value=IDENTITY_PROMPT),
         gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
     ],
     stop_btn=False,