cyber-tagger

Sleeping

App Files Files Community

CyberWaifu commited on Mar 10

Commit

19b8151

verified ·

1 Parent(s): 5bc4dea

Try to add the image preprocessing adapted from the original code.

Browse files

Files changed (1) hide show

app.py +55 -20

app.py CHANGED Viewed

@@ -4,12 +4,12 @@ import numpy as np
 from PIL import Image
 import json
 from huggingface_hub import hf_hub_download
 # Constants
 MODEL_REPO = "AngelBottomless/camie-tagger-onnxruntime"
 MODEL_FILE = "camie_tagger_initial.onnx"
 META_FILE = "metadata.json"
-IMAGE_SIZE = (512, 512)
 DEFAULT_THRESHOLD = 0.35  # Default value if slider is not used
 # Download model and metadata from Hugging Face Hub
@@ -26,16 +26,51 @@ def escape_tag(tag: str) -> str:
     return tag.replace("_", " ").replace("(", r"\\(").replace(")", r"\\)")
 def preprocess_image(pil_image: Image.Image) -> np.ndarray:
-    """Convert image to RGB, resize, normalize, and rearrange dimensions."""
-    img = pil_image.convert("RGB").resize(IMAGE_SIZE)
-    arr = np.array(img).astype(np.float32) / 255.0
-    arr = np.transpose(arr, (2, 0, 1))
-    return np.expand_dims(arr, 0)
 def run_inference(pil_image: Image.Image) -> np.ndarray:
     """
     Preprocess the image and run the ONNX model inference.
     Returns the refined logits as a numpy array.
     """
     input_tensor = preprocess_image(pil_image)
@@ -47,7 +82,7 @@ def run_inference(pil_image: Image.Image) -> np.ndarray:
 def get_tags(refined_logits: np.ndarray, metadata: dict, default_threshold: float):
     """
     Compute probabilities from logits and collect tag predictions.
     Returns:
         results_by_cat: Dictionary mapping each category to a list of (tag, probability) above its threshold.
         prompt_tags_by_cat: Dictionary for prompt-style output (character, general).
@@ -79,7 +114,7 @@ def format_prompt_tags(prompt_tags_by_cat: dict, all_artist_tags: list) -> str:
     """
     Format the tags for prompt-style output.
     Only the top artist tag is shown (regardless of threshold), and all character and general tags are shown.
     Returns a comma-separated string of escaped tags.
     """
     # Always select the best artist tag from all_artist_tags, regardless of threshold.
@@ -87,26 +122,26 @@ def format_prompt_tags(prompt_tags_by_cat: dict, all_artist_tags: list) -> str:
     if all_artist_tags:
         best_artist = max(all_artist_tags, key=lambda item: item[1])
         best_artist_tag = escape_tag(best_artist[0])
     # Sort character and general tags by probability (descending)
     for cat in prompt_tags_by_cat:
         prompt_tags_by_cat[cat].sort(key=lambda x: x[1], reverse=True)
     character_tags = [escape_tag(tag) for tag, _ in prompt_tags_by_cat.get("character", [])]
     general_tags = [escape_tag(tag) for tag, _ in prompt_tags_by_cat.get("general", [])]
     prompt_tags = []
     if best_artist_tag:
         prompt_tags.append(best_artist_tag)
     prompt_tags.extend(character_tags)
     prompt_tags.extend(general_tags)
     return ", ".join(prompt_tags) if prompt_tags else "No tags predicted."
 def format_detailed_output(results_by_cat: dict, all_artist_tags: list) -> str:
     """
     Format the tags for detailed output.
     Returns a Markdown-formatted string listing tags by category.
     """
     if not results_by_cat:
@@ -116,7 +151,7 @@ def format_detailed_output(results_by_cat: dict, all_artist_tags: list) -> str:
     if "artist" not in results_by_cat and all_artist_tags:
         best_artist_tag, best_artist_prob = max(all_artist_tags, key=lambda item: item[1])
         results_by_cat["artist"] = [(best_artist_tag, best_artist_prob)]
     lines = ["**Predicted Tags by Category:**  \n"]
     for cat, tag_list in results_by_cat.items():
         tag_list.sort(key=lambda x: x[1], reverse=True)
@@ -129,15 +164,15 @@ def format_detailed_output(results_by_cat: dict, all_artist_tags: list) -> str:
 def tag_image(pil_image: Image.Image, output_format: str, threshold: float) -> str:
     """
     Run inference on the image and return formatted tags based on the chosen output format.
     The slider value (threshold) overrides the default threshold for tag selection.
     """
     if pil_image is None:
         return "Please upload an image."
     refined_logits = run_inference(pil_image)
     results_by_cat, prompt_tags_by_cat, all_artist_tags = get_tags(refined_logits, metadata, default_threshold=threshold)
     if output_format == "Prompt-style Tags":
         return format_prompt_tags(prompt_tags_by_cat, all_artist_tags)
     else:
@@ -177,7 +212,7 @@ with demo:
     # Pass the threshold_slider value into the tag_image function
     tag_button.click(fn=tag_image, inputs=[image_in, format_choice, threshold_slider], outputs=output_box)
     gr.Markdown(
         "----\n"
         "**Model:** [Camie Tagger ONNX](https://huggingface.co/AngelBottomless/camie-tagger-onnxruntime)   •   "
@@ -187,4 +222,4 @@ with demo:
     )
 if __name__ == "__main__":
-    demo.launch()

 from PIL import Image
 import json
 from huggingface_hub import hf_hub_download
+import torchvision.transforms as transforms
 # Constants
 MODEL_REPO = "AngelBottomless/camie-tagger-onnxruntime"
 MODEL_FILE = "camie_tagger_initial.onnx"
 META_FILE = "metadata.json"
 DEFAULT_THRESHOLD = 0.35  # Default value if slider is not used
 # Download model and metadata from Hugging Face Hub
     return tag.replace("_", " ").replace("(", r"\\(").replace(")", r"\\)")
 def preprocess_image(pil_image: Image.Image) -> np.ndarray:
+    """Process an image for inference using same preprocessing as training"""
+    image_size=512
+    # Initialize the same transform used during training
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+    ])
+    img = pil_image # Use the PIL image directly
+    # Convert RGBA or Palette images to RGB
+    if img.mode in ('RGBA', 'P'):
+        img = img.convert('RGB')
+    # Get original dimensions
+    width, height = img.size
+    aspect_ratio = width / height
+    # Calculate new dimensions to maintain aspect ratio
+    if aspect_ratio > 1:
+        new_width = image_size
+        new_height = int(new_width / aspect_ratio)
+    else:
+        new_height = image_size
+        new_width = int(new_height * aspect_ratio)
+    # Resize with LANCZOS filter
+    img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+    # Create new image with padding
+    new_image = Image.new('RGB', (image_size, image_size), (0, 0, 0))
+    paste_x = (image_size - new_width) // 2
+    paste_y = (image_size - new_height) // 2
+    new_image.paste(img, (paste_x, paste_y))
+    # Apply transforms (without normalization)
+    img_tensor = transform(new_image)
+    return img_tensor.numpy() # Convert the PyTorch tensor to NumPy array
 def run_inference(pil_image: Image.Image) -> np.ndarray:
     """
     Preprocess the image and run the ONNX model inference.
     Returns the refined logits as a numpy array.
     """
     input_tensor = preprocess_image(pil_image)
 def get_tags(refined_logits: np.ndarray, metadata: dict, default_threshold: float):
     """
     Compute probabilities from logits and collect tag predictions.
     Returns:
         results_by_cat: Dictionary mapping each category to a list of (tag, probability) above its threshold.
         prompt_tags_by_cat: Dictionary for prompt-style output (character, general).
     """
     Format the tags for prompt-style output.
     Only the top artist tag is shown (regardless of threshold), and all character and general tags are shown.
     Returns a comma-separated string of escaped tags.
     """
     # Always select the best artist tag from all_artist_tags, regardless of threshold.
     if all_artist_tags:
         best_artist = max(all_artist_tags, key=lambda item: item[1])
         best_artist_tag = escape_tag(best_artist[0])
     # Sort character and general tags by probability (descending)
     for cat in prompt_tags_by_cat:
         prompt_tags_by_cat[cat].sort(key=lambda x: x[1], reverse=True)
     character_tags = [escape_tag(tag) for tag, _ in prompt_tags_by_cat.get("character", [])]
     general_tags = [escape_tag(tag) for tag, _ in prompt_tags_by_cat.get("general", [])]
     prompt_tags = []
     if best_artist_tag:
         prompt_tags.append(best_artist_tag)
     prompt_tags.extend(character_tags)
     prompt_tags.extend(general_tags)
     return ", ".join(prompt_tags) if prompt_tags else "No tags predicted."
 def format_detailed_output(results_by_cat: dict, all_artist_tags: list) -> str:
     """
     Format the tags for detailed output.
     Returns a Markdown-formatted string listing tags by category.
     """
     if not results_by_cat:
     if "artist" not in results_by_cat and all_artist_tags:
         best_artist_tag, best_artist_prob = max(all_artist_tags, key=lambda item: item[1])
         results_by_cat["artist"] = [(best_artist_tag, best_artist_prob)]
     lines = ["**Predicted Tags by Category:**  \n"]
     for cat, tag_list in results_by_cat.items():
         tag_list.sort(key=lambda x: x[1], reverse=True)
 def tag_image(pil_image: Image.Image, output_format: str, threshold: float) -> str:
     """
     Run inference on the image and return formatted tags based on the chosen output format.
     The slider value (threshold) overrides the default threshold for tag selection.
     """
     if pil_image is None:
         return "Please upload an image."
     refined_logits = run_inference(pil_image)
     results_by_cat, prompt_tags_by_cat, all_artist_tags = get_tags(refined_logits, metadata, default_threshold=threshold)
     if output_format == "Prompt-style Tags":
         return format_prompt_tags(prompt_tags_by_cat, all_artist_tags)
     else:
     # Pass the threshold_slider value into the tag_image function
     tag_button.click(fn=tag_image, inputs=[image_in, format_choice, threshold_slider], outputs=output_box)
     gr.Markdown(
         "----\n"
         "**Model:** [Camie Tagger ONNX](https://huggingface.co/AngelBottomless/camie-tagger-onnxruntime)   •   "
     )
 if __name__ == "__main__":
+    demo.launch()