Spaces:

Hcompany
/

Holo1-Localization

Running on Zero

App Files Files Community

multimodalart HF Staff committed on 26 days ago

Commit

c301058

verified ·

1 Parent(s): a8e6bb8

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -7

app.py CHANGED Viewed

@@ -65,8 +65,6 @@ def get_localization_prompt(pil_image: Image.Image, instruction: str) -> List[di
 @spaces.GPU(duration=120)
 def run_inference_localization(
-    current_model: AutoModelForImageTextToText,
-    current_processor: AutoProcessor,
     messages_for_template: List[dict[str, Any]],
     pil_image_for_processing: Image.Image
 ) -> str:
@@ -80,7 +78,7 @@ def run_inference_localization(
     """
     # 1. Apply chat template to messages. This will create the text part of the prompt,
     #    including image tags if the image was part of `messages_for_template`.
-    text_prompt = current_processor.apply_chat_template(
         messages_for_template,
         tokenize=False,
         add_generation_prompt=True
@@ -93,11 +91,11 @@ def run_inference_localization(
         padding=True,
         return_tensors="pt",
     )
-    inputs = inputs.to(current_model.device)
     # 3. Generate response
     # Using do_sample=False for more deterministic output, as in the model card's structured output example
-    generated_ids = current_model.generate(**inputs, max_new_tokens=128, do_sample=False)
     # 4. Trim input_ids from generated_ids to get only the generated part
     generated_ids_trimmed = [
@@ -105,7 +103,7 @@ def run_inference_localization(
     ]
     # 5. Decode the generated tokens
-    decoded_output = current_processor.batch_decode(
         generated_ids_trimmed,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False
@@ -152,7 +150,7 @@ def predict_click_location(input_pil_image: Image.Image, instruction: str):
     #    Pass `messages` (which includes the image object for template processing)
     #    and `resized_image` (for actual tensor conversion).
     try:
-        coordinates_str = run_inference_localization(model, processor, messages, resized_image)
     except Exception as e:
         print(f"Error during model inference: {e}")
         return f"Error during model inference: {e}", resized_image.copy().convert("RGB")

 @spaces.GPU(duration=120)
 def run_inference_localization(
     messages_for_template: List[dict[str, Any]],
     pil_image_for_processing: Image.Image
 ) -> str:
     """
     # 1. Apply chat template to messages. This will create the text part of the prompt,
     #    including image tags if the image was part of `messages_for_template`.
+    text_prompt = processor.apply_chat_template(
         messages_for_template,
         tokenize=False,
         add_generation_prompt=True
         padding=True,
         return_tensors="pt",
     )
+    inputs = inputs.to(model.device)
     # 3. Generate response
     # Using do_sample=False for more deterministic output, as in the model card's structured output example
+    generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
     # 4. Trim input_ids from generated_ids to get only the generated part
     generated_ids_trimmed = [
     ]
     # 5. Decode the generated tokens
+    decoded_output = processor.batch_decode(
         generated_ids_trimmed,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False
     #    Pass `messages` (which includes the image object for template processing)
     #    and `resized_image` (for actual tensor conversion).
     try:
+        coordinates_str = run_inference_localization(messages, resized_image)
     except Exception as e:
         print(f"Error during model inference: {e}")
         return f"Error during model inference: {e}", resized_image.copy().convert("RGB")