Spaces: Running on A100

Update app.py
app.py CHANGED
@@ -9,13 +9,13 @@ from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, Aut
 MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-# Latency/quality knobs (tuned for A100)
+# Latency/quality knobs (tuned for A100-80GB)
 TEMP = 0.1  # per model docs
-MAX_NEW_TOKENS = 384  # fast + sufficient for schema (raise to 512/768
-VISION_LONG_SIDE = 896  # matches
+MAX_NEW_TOKENS = 384  # fast + sufficient for schema (raise to 512/768 if needed)
+VISION_LONG_SIDE = 896  # matches vision_config.image_size
 DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 
-# ===== Prompts (
+# ===== Prompts (schema-only; no example output) =====
 SYSTEM_PROMPT = (
     "You are an image annotation API trained to analyze YouTube video keyframes. "
     "You will be given instructions on the output format, what to caption, and how to perform your job. "
@@ -50,20 +50,21 @@ Rules:
 - Return an empty array for 'logos' if none are present.
 - Always output strictly valid JSON with proper escaping.
 - Output **only the JSON**, no extra text or explanation.
-- Do not
+- Do **not** copy any example strings from the instructions or use ellipses ('...'). Produce concrete values drawn from the image only.
 """
 
 # ===== Utils =====
-def extract_top_level_json(s: str):
-    """
-
-
-
-
+def extract_last_json(s: str):
+    """
+    Return the last balanced {...} JSON object found in the string.
+    This avoids grabbing the schema block from the prompt if it echoes.
+    """
+    last = None
     start, depth = None, 0
     for i, ch in enumerate(s):
         if ch == '{':
-            if depth == 0:
+            if depth == 0:
+                start = i
             depth += 1
         elif ch == '}':
             if depth > 0:
@@ -71,12 +72,13 @@ def extract_top_level_json(s: str):
                 if depth == 0 and start is not None:
                     chunk = s[start:i+1]
                     try:
-
+                        last = json.loads(chunk)
                     except Exception:
-
-
+                        pass
+                    start = None
+    return last
 
-def build_messages(image):
+def build_messages(image: Image.Image):
     return [
         {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
         {"role": "user", "content": [{"type": "image", "image": image},
@@ -100,18 +102,24 @@ try:
     if "clip" in cfg.__class__.__name__.lower():
         raise RuntimeError(f"MODEL_ID '{MODEL_ID}' is a CLIP/encoder repo; need a causal VLM.")
 
+    print("[boot] loading processor…", flush=True)
     processor = AutoProcessor.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
+
+    print("[boot] loading model…", flush=True)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         token=HF_TOKEN,
         device_map="cuda",  # keep on A100
         torch_dtype=DTYPE,
         trust_remote_code=True,
-        # quantization_config=None,  # uncomment to force full precision if you removed quant
+        # quantization_config=None,  # uncomment to force full precision if you removed quant in repo
     )
+
     tokenizer = getattr(processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
         MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
     )
+    print("[boot] ready.", flush=True)
+
 except Exception as e:
     LOAD_ERROR = f"{e}\n\n{traceback.format_exc()}"
 
@@ -124,25 +132,39 @@ def generate(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
 
     image = resize_to_vision(image, VISION_LONG_SIDE)
 
-    #
+    # Build chat prompt
     if hasattr(processor, "apply_chat_template"):
         prompt = processor.apply_chat_template(build_messages(image), add_generation_prompt=True, tokenize=False)
     else:
         prompt = USER_PROMPT
 
+    # Tokenize with vision
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    eos = getattr(model.config, "eos_token_id", None)
 
+    def _decode_only_new(out_ids):
+        """
+        Decode only the newly generated tokens (exclude prompt tokens),
+        so we don't accidentally parse the schema block from the prompt.
+        """
+        input_len = inputs["input_ids"].shape[1]
+        gen_ids = out_ids[0][input_len:]
+        # Prefer processor.decode if available (some VLMs customize decoding)
+        if hasattr(processor, "decode"):
+            return processor.decode(gen_ids, skip_special_tokens=True)
+        return tokenizer.decode(gen_ids, skip_special_tokens=True)
+
     tried = []
 
     # (1) Greedy (fast, stable)
     try:
         g = dict(do_sample=False, max_new_tokens=MAX_NEW_TOKENS)
-        if eos is not None:
+        if eos is not None:
+            g["eos_token_id"] = eos
         with torch.inference_mode():
             out = model.generate(**inputs, **g)
-        text =
-        parsed =
+        text = _decode_only_new(out)
+        parsed = extract_last_json(text)
         if isinstance(parsed, dict) and "..." not in json.dumps(parsed):
             return json.dumps(parsed, indent=2), parsed, True
         tried.append(("greedy", "parse-failed-or-ellipses"))
@@ -152,11 +174,12 @@ def generate(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
     # (2) Short sampled retry
     try:
         g = dict(do_sample=True, temperature=TEMP, max_new_tokens=MAX_NEW_TOKENS)
-        if eos is not None:
+        if eos is not None:
+            g["eos_token_id"] = eos
         with torch.inference_mode():
             out = model.generate(**inputs, **g)
-        text =
-        parsed =
+        text = _decode_only_new(out)
+        parsed = extract_last_json(text)
         if isinstance(parsed, dict) and "..." not in json.dumps(parsed):
             return json.dumps(parsed, indent=2), parsed, True
         tried.append(("sample_t0.1", "parse-failed-or-ellipses"))
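A quick standalone illustration of the parsing change: the sketch below mirrors the new extract_last_json helper from the diff, and the raw string is an invented example, not real model output. Keeping only the last balanced {...} object means that even if the model echoes the schema block from the prompt, the echoed object gets overwritten by the actual answer.

import json

def extract_last_json(s: str):
    """Return the last balanced {...} object in s, or None (same scan as in the diff)."""
    last = None
    start, depth = None, 0
    for i, ch in enumerate(s):
        if ch == '{':
            if depth == 0:
                start = i
            depth += 1
        elif ch == '}':
            if depth > 0:
                depth -= 1
                if depth == 0 and start is not None:
                    chunk = s[start:i+1]
                    try:
                        last = json.loads(chunk)
                    except Exception:
                        pass
                    start = None
    return last

# Invented sample: the schema is echoed first, then the real annotation follows.
raw = (
    'Here is the schema: {"description": "...", "logos": []}\n'
    '{"description": "A cyclist passes a cafe at dusk.", "logos": []}'
)
print(extract_last_json(raw))
# {'description': 'A cyclist passes a cafe at dusk.', 'logos': []}

Together with _decode_only_new, which drops the prompt tokens before decoding, this is why the retry logic can detect a bad generation ('...' or no parsable object) instead of accidentally returning the schema itself.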