trojblue committed
Commit 1412dfd · Parent: d0dae1b

adding test space demo

Files changed (4)
  1. .gitignore +5 -0
  2. app.py +211 -0
  3. handler.py +215 -0
  4. requirements.txt +7 -0
.gitignore ADDED
+ _test/
+ __pycache__/
+ .venv/
+ .ruff_cache/
+ assets/
app.py ADDED
@@ -0,0 +1,211 @@
+ import os
+ import time
+ import shutil
+ from pathlib import Path
+ from typing import Optional
+
+ import gradio as gr
+ from huggingface_hub import snapshot_download
+ from PIL import Image
+
+ # Import your existing inference endpoint implementation
+ from handler import EndpointHandler
+
+
+ # ------------------------------------------------------------------------------
+ # Asset setup: download weights/tags/mapping so local filenames are unchanged
+ # ------------------------------------------------------------------------------
+
+ REPO_ID = os.environ.get("ASSETS_REPO_ID", "pixai-labs/pixai-tagger-v0.9")
+ REVISION = os.environ.get("ASSETS_REVISION")  # optional pin, e.g. "main" or a commit
+ MODEL_DIR = os.environ.get("MODEL_DIR", "./assets")  # where the handler will look
+
+ REQUIRED_FILES = [
+     "model_v0.9.pth",
+     "tags_v0.9_13k.json",
+     "char_ip_map.json",
+ ]
+
+
+ def ensure_assets(repo_id: str, revision: Optional[str], target_dir: str):
+     """
+     1) snapshot_download the upstream repo (cached by HF Hub)
+     2) copy the required files into `target_dir` with the exact filenames expected
+     """
+     target = Path(target_dir)
+     target.mkdir(parents=True, exist_ok=True)
+
+     # Only download if something is missing
+     missing = [f for f in REQUIRED_FILES if not (target / f).exists()]
+     if not missing:
+         return
+
+     # Download snapshot (optionally filtered to speed up)
+     snapshot_path = snapshot_download(
+         repo_id=repo_id,
+         revision=revision,
+         allow_patterns=REQUIRED_FILES,  # only pull what we need
+     )
+
+     # Copy files into target_dir with the required names
+     for fname in REQUIRED_FILES:
+         src = Path(snapshot_path) / fname
+         dst = target / fname
+         if not src.exists():
+             raise FileNotFoundError(
+                 f"Expected '{fname}' not found in snapshot for {repo_id} @ {revision or 'default'}"
+             )
+         shutil.copyfile(src, dst)
+
+
+ # Fetch assets (no-op if they already exist)
+ ensure_assets(REPO_ID, REVISION, MODEL_DIR)
+
+
+ # ------------------------------------------------------------------------------
+ # Initialize the handler
+ # ------------------------------------------------------------------------------
+
+ handler = EndpointHandler(MODEL_DIR)
+ DEVICE_LABEL = f"Device: {handler.device.upper()}"
+
+
+ # ------------------------------------------------------------------------------
+ # Gradio wiring
+ # ------------------------------------------------------------------------------
+
+ def run_inference(
+     source_choice: str,
+     image: Optional[Image.Image],
+     url: str,
+     general_threshold: float,
+     character_threshold: float,
+ ):
+     if source_choice == "Upload image":
+         if image is None:
+             raise gr.Error("Please upload an image.")
+         inputs = image
+     else:
+         if not url or not url.strip():
+             raise gr.Error("Please provide an image URL.")
+         inputs = {"url": url.strip()}
+
+     data = {
+         "inputs": inputs,
+         "parameters": {
+             "general_threshold": float(general_threshold),
+             "character_threshold": float(character_threshold),
+         },
+     }
+
+     started = time.time()
+     try:
+         out = handler(data)
+     except Exception as e:
+         raise gr.Error(f"Inference error: {e}") from e
+     latency = round(time.time() - started, 4)
+
+     features = ", ".join(sorted(out.get("feature", []))) or "—"
+     characters = ", ".join(sorted(out.get("character", []))) or "—"
+     ips = ", ".join(out.get("ip", [])) or "—"
+
+     meta = {
+         "device": handler.device,
+         "latency_s_total": latency,
+         **out.get("_timings", {}),
+     }
+
+     return features, characters, ips, meta, out
+
+
+ with gr.Blocks(title="PixAI Tagger v0.9 — Demo", fill_height=True) as demo:
+     gr.Markdown(
+         """
+         # PixAI Tagger v0.9 — Gradio Demo
+         Downloads model assets from **pixai-labs/pixai-tagger-v0.9** on first run,
+         then uses your imported `EndpointHandler` to predict **general**, **character**, and **IP** tags.
+
+         **Expected local filenames** (kept unchanged):
+         - `model_v0.9.pth`
+         - `tags_v0.9_13k.json`
+         - `char_ip_map.json`
+
+         Configure via env vars:
+         - `ASSETS_REPO_ID` (default: `pixai-labs/pixai-tagger-v0.9`)
+         - `ASSETS_REVISION` (optional)
+         - `MODEL_DIR` (default: `./assets`)
+         """
+     )
+     with gr.Row():
+         gr.Markdown(f"**{DEVICE_LABEL}**")
+
+     with gr.Row():
+         source_choice = gr.Radio(
+             choices=["Upload image", "From URL"],
+             value="Upload image",
+             label="Image source",
+         )
+
+     with gr.Row(variant="panel"):
+         with gr.Column(scale=2):
+             image = gr.Image(label="Upload image", type="pil", visible=True)
+             url = gr.Textbox(label="Image URL", placeholder="https://…", visible=False)
+
+             def toggle_inputs(choice):
+                 return (
+                     gr.update(visible=(choice == "Upload image")),
+                     gr.update(visible=(choice == "From URL")),
+                 )
+
+             source_choice.change(toggle_inputs, [source_choice], [image, url])
+
+         with gr.Column(scale=1):
+             general_threshold = gr.Slider(
+                 minimum=0.0, maximum=1.0, step=0.01, value=0.30, label="General threshold"
+             )
+             character_threshold = gr.Slider(
+                 minimum=0.0, maximum=1.0, step=0.01, value=0.85, label="Character threshold"
+             )
+             run_btn = gr.Button("Run", variant="primary")
+             clear_btn = gr.Button("Clear")
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### Predicted Tags")
+             features_out = gr.Textbox(label="General tags", lines=4)
+             characters_out = gr.Textbox(label="Character tags", lines=4)
+             ip_out = gr.Textbox(label="IP tags", lines=2)
+
+         with gr.Column():
+             gr.Markdown("### Metadata & Raw Output")
+             meta_out = gr.JSON(label="Timings/Device")
+             raw_out = gr.JSON(label="Raw JSON")
+
+     examples = gr.Examples(
+         label="Examples (URL mode)",
+         examples=[
+             ["From URL", None, "https://cdn.donmai.us/sample/50/b7/__komeiji_koishi_touhou_drawn_by_cui_ying__sample-50b7006f16e0144d5b5db44cadc2d22f.jpg", 0.30, 0.85],
+         ],
+         inputs=[source_choice, image, url, general_threshold, character_threshold],
+         cache_examples=False,
+     )
+
+     def clear():
+         return (None, "", 0.30, 0.85, "", "", "", {}, {})
+
+     run_btn.click(
+         run_inference,
+         inputs=[source_choice, image, url, general_threshold, character_threshold],
+         outputs=[features_out, characters_out, ip_out, meta_out, raw_out],
+         api_name="predict",
+     )
+     clear_btn.click(
+         clear,
+         inputs=None,
+         outputs=[
+             image, url, general_threshold, character_threshold,
+             features_out, characters_out, ip_out, meta_out, raw_out
+         ],
+     )
+
+ if __name__ == "__main__":
+     demo.queue(max_size=8).launch()
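
Since `run_btn.click(..., api_name="predict")` exposes a named endpoint, the demo can also be driven programmatically. Below is a minimal sketch using `gradio_client`; the Space id and sample URL are placeholders, and the argument order mirrors the `inputs` list above.

from gradio_client import Client

# Hypothetical Space id; substitute the actual <owner>/<space> once deployed.
client = Client("your-username/pixai-tagger-demo")

# Arguments follow the order of `inputs` in run_btn.click above.
features, characters, ips, meta, raw = client.predict(
    "From URL",                        # source_choice
    None,                              # image (unused in URL mode)
    "https://example.com/sample.jpg",  # url (placeholder)
    0.30,                              # general_threshold
    0.85,                              # character_threshold
    api_name="/predict",
)
print(characters, ips)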
handler.py ADDED
@@ -0,0 +1,215 @@
+ import base64
+ import io
+ import json
+ import logging
+ import time
+ from pathlib import Path
+ from typing import Any
+
+ import requests
+ import timm
+ import torch
+ import torchvision.transforms as transforms
+ from PIL import Image
+
+
+ class TaggingHead(torch.nn.Module):
+     def __init__(self, input_dim, num_classes):
+         super().__init__()
+         self.input_dim = input_dim
+         self.num_classes = num_classes
+         self.head = torch.nn.Sequential(torch.nn.Linear(input_dim, num_classes))
+
+     def forward(self, x):
+         logits = self.head(x)
+         probs = torch.nn.functional.sigmoid(logits)
+         return probs
+
+
+ def get_tags(tags_file: Path) -> tuple[dict[str, int], int, int]:
+     with tags_file.open("r", encoding="utf-8") as f:
+         tag_info = json.load(f)
+     tag_map = tag_info["tag_map"]
+     tag_split = tag_info["tag_split"]
+     gen_tag_count = tag_split["gen_tag_count"]
+     character_tag_count = tag_split["character_tag_count"]
+     return tag_map, gen_tag_count, character_tag_count
+
+
+ def get_character_ip_mapping(mapping_file: Path):
+     with mapping_file.open("r", encoding="utf-8") as f:
+         mapping = json.load(f)
+     return mapping
+
+
+ def get_encoder():
+     base_model_repo = "hf_hub:SmilingWolf/wd-eva02-large-tagger-v3"
+     encoder = timm.create_model(base_model_repo, pretrained=False)
+     encoder.reset_classifier(0)
+     return encoder
+
+
+ def get_decoder():
+     decoder = TaggingHead(1024, 13461)
+     return decoder
+
+
+ def get_model():
+     encoder = get_encoder()
+     decoder = get_decoder()
+     model = torch.nn.Sequential(encoder, decoder)
+     return model
+
+
+ def load_model(weights_file, device):
+     model = get_model()
+     states_dict = torch.load(weights_file, map_location=device, weights_only=True)
+     model.load_state_dict(states_dict)
+     model.to(device)
+     model.eval()
+     return model
+
+
+ def pure_pil_alpha_to_color_v2(
+     image: Image.Image, color: tuple[int, int, int] = (255, 255, 255)
+ ) -> Image.Image:
+     """
+     Convert a PIL image with an alpha channel to an RGB image.
+     The model expects an RGB image, but the input may carry an alpha channel.
+     This function composites the image onto a background of the given color.
+     The alpha channel is the 4th channel of the image.
+     """
+     image.load()  # needed for split()
+     background = Image.new("RGB", image.size, color)
+     background.paste(image, mask=image.split()[3])  # 3 is the alpha channel
+     return background
+
+
+ def pil_to_rgb(image: Image.Image) -> Image.Image:
+     if image.mode == "RGBA":
+         image = pure_pil_alpha_to_color_v2(image)
+     elif image.mode == "P":
+         image = pure_pil_alpha_to_color_v2(image.convert("RGBA"))
+     else:
+         image = image.convert("RGB")
+     return image
+
+
+ class EndpointHandler:
+     def __init__(self, path: str):
+         repo_path = Path(path)
+         assert repo_path.is_dir(), f"Model directory not found: {repo_path}"
+         weights_file = repo_path / "model_v0.9.pth"
+         tags_file = repo_path / "tags_v0.9_13k.json"
+         mapping_file = repo_path / "char_ip_map.json"
+         if not weights_file.exists():
+             raise FileNotFoundError(f"Model file not found: {weights_file}")
+         if not tags_file.exists():
+             raise FileNotFoundError(f"Tags file not found: {tags_file}")
+         if not mapping_file.exists():
+             raise FileNotFoundError(f"Mapping file not found: {mapping_file}")
+
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.model = load_model(str(weights_file), self.device)
+         self.transform = transforms.Compose(
+             [
+                 transforms.Resize((448, 448)),
+                 transforms.ToTensor(),
+                 transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+             ]
+         )
+         self.fetch_image_timeout = 5.0
+         self.default_general_threshold = 0.3
+         self.default_character_threshold = 0.85
+
+         tag_map, self.gen_tag_count, self.character_tag_count = get_tags(tags_file)
+
+         # Invert the tag_map for efficient index-to-tag lookups
+         self.index_to_tag_map = {v: k for k, v in tag_map.items()}
+
+         self.character_ip_mapping = get_character_ip_mapping(mapping_file)
+
+     def __call__(self, data: dict[str, Any]) -> dict[str, Any]:
+         inputs = data.pop("inputs", data)
+
+         fetch_start_time = time.time()
+         if isinstance(inputs, Image.Image):
+             image = inputs
+         elif image_url := inputs.pop("url", None):
+             with requests.get(
+                 image_url, stream=True, timeout=self.fetch_image_timeout
+             ) as res:
+                 res.raise_for_status()
+                 # Read the full payload before the connection closes;
+                 # PIL decodes lazily and res.raw is not seekable.
+                 image = Image.open(io.BytesIO(res.content))
+         elif image_base64_encoded := inputs.pop("image", None):
+             image = Image.open(io.BytesIO(base64.b64decode(image_base64_encoded)))
+         else:
+             raise ValueError(f"No image or url provided: {data}")
+         # remove alpha channel if it exists
+         image = pil_to_rgb(image)
+         fetch_time = time.time() - fetch_start_time
+
+         parameters = data.pop("parameters", {})
+         general_threshold = parameters.pop(
+             "general_threshold", self.default_general_threshold
+         )
+         character_threshold = parameters.pop(
+             "character_threshold", self.default_character_threshold
+         )
+
+         inference_start_time = time.time()
+         with torch.inference_mode():
+             # Preprocess image on CPU; pin memory only when a GPU is present,
+             # since Tensor.pin_memory() raises on CPU-only machines
+             image_tensor = self.transform(image).unsqueeze(0)
+             if self.device == "cuda":
+                 image_tensor = image_tensor.pin_memory()
+
+             # Asynchronously move image to GPU (no-op on CPU)
+             image_tensor = image_tensor.to(self.device, non_blocking=True)
+
+             # Run model on GPU
+             probs = self.model(image_tensor)[0]  # Get probs for the single image
+
+             # Perform thresholding directly on the GPU
+             general_mask = probs[: self.gen_tag_count] > general_threshold
+             character_mask = probs[self.gen_tag_count :] > character_threshold
+
+             # Get the indices of positive tags on the GPU
+             general_indices = general_mask.nonzero(as_tuple=True)[0]
+             character_indices = (
+                 character_mask.nonzero(as_tuple=True)[0] + self.gen_tag_count
+             )
+
+             # Combine indices and move the small result tensor to the CPU
+             combined_indices = torch.cat((general_indices, character_indices)).cpu()
+
+         inference_time = time.time() - inference_start_time
+
+         post_process_start_time = time.time()
+
+         cur_gen_tags = []
+         cur_char_tags = []
+
+         # Use the efficient pre-computed map for lookups
+         for i in combined_indices:
+             idx = i.item()
+             tag = self.index_to_tag_map[idx]
+             if idx < self.gen_tag_count:
+                 cur_gen_tags.append(tag)
+             else:
+                 cur_char_tags.append(tag)
+
+         ip_tags = []
+         for tag in cur_char_tags:
+             if tag in self.character_ip_mapping:
+                 ip_tags.extend(self.character_ip_mapping[tag])
+         ip_tags = sorted(set(ip_tags))
+         post_process_time = time.time() - post_process_start_time
+
+         logging.info(
+             f"Timing - Fetch: {fetch_time:.3f}s, Inference: {inference_time:.3f}s, "
+             f"Post-process: {post_process_time:.3f}s, "
+             f"Total: {fetch_time + inference_time + post_process_time:.3f}s"
+         )
+
+         return {
+             "feature": cur_gen_tags,
+             "character": cur_char_tags,
+             "ip": ip_tags,
+         }
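
For reference, handler.py can also be exercised without the Gradio UI. The sketch below assumes the assets already sit in `./assets` and uses a hypothetical local file; it mirrors the payload shape `__call__` expects (`inputs` as a PIL image, `{"url": ...}`, or `{"image": <base64>}`, plus optional `parameters`).

import base64
from handler import EndpointHandler

handler = EndpointHandler("./assets")  # directory holding the three asset files

with open("sample.jpg", "rb") as f:  # hypothetical local image
    b64 = base64.b64encode(f.read()).decode("utf-8")

out = handler({
    "inputs": {"image": b64},
    "parameters": {"general_threshold": 0.3, "character_threshold": 0.85},
})
print(out["feature"], out["character"], out["ip"])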
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio>=4.31.0
+ huggingface_hub>=0.24.0
+ torch
+ torchvision
+ timm
+ pillow
+ requests