Spaces:

rinkhanh000
/

test_demo

Build error

App Files Files Community

rinkhanh000 committed on Jun 15

Commit

3f1b507

1 Parent(s): 69f57e9

Add application file

Browse files

Files changed (2) hide show

app.py +89 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import gradio as gr
+from transformers import AutoProcessor, AutoTokenizer
+from qwen_vl_utils import process_vision_info
+from transformers import Qwen2_5_VLForConditionalGeneration
+import torch
+from PIL import Image
+# ImageNet normalization statistics. Nothing in this script reads them;
+# they are retained purely as a reference.
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+# Checkpoint identifier shared by the model and the processor.
+model_name = "rinkhanh000/Qwen2.5VL-7B_ViMemeCap"
+# Load the weights in float32 for CPU execution; no device_map is passed
+# and nothing is moved to CUDA.
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    model_name, torch_dtype=torch.float32, trust_remote_code=True
+)
+# Switch to evaluation mode (eval() returns the module itself).
+model = model.eval()
+# The processor prepares both the chat text and the image inputs.
+processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+# Prediction function
+def predict_from_prompt_and_image(prompt, image):
+    """Generate a caption for ``image`` guided by the text ``prompt``.
+
+    Args:
+        prompt: Text typed by the user; an empty or missing value is rejected.
+        image: Image handed over by the Gradio image input (declared with
+            ``type="pil"`` in the interface); ``None`` is treated as missing.
+
+    Returns:
+        The decoded model response as a string. When an input is missing or
+        processing fails, a human-readable ``"Error: ..."`` string is
+        returned instead, because the interface shows the result in a
+        Textbox and a dict would be displayed as its raw repr.
+    """
+    # Check the image with an explicit None test rather than relying on the
+    # truthiness of an image object.
+    if not prompt or image is None:
+        return "Error: Please provide both a prompt and an image"
+    try:
+        # Single-turn chat message carrying the image followed by the text.
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": image  # PIL image from Gradio
+                    },
+                    {
+                        "type": "text",
+                        "text": prompt  # User's text input
+                    }
+                ]
+            }
+        ]
+        # Prepare inputs for inference
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt"
+        )
+        # No .to("cuda") - keep on CPU
+        # Generate response
+        generation_config = {
+            "max_new_tokens": 512,
+            "do_sample": False,  # No sampling: decoding is deterministic
+            "num_beams": 3,      # Beam search with 3 beams
+            "repetition_penalty": 3.5
+        }
+        generated_ids = model.generate(**inputs, **generation_config)
+        # generate() returns prompt + continuation; drop the prompt tokens so
+        # only the newly generated part is decoded.
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        response = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        return response
+    except Exception as e:
+        # UI boundary: report the failure in the output box instead of
+        # letting the request crash.
+        return f"Error: Failed to process: {e}"
+# Gradio interface: a prompt box and an image upload feed the prediction
+# function, whose result is shown in a single text box.
+prompt_input = gr.Textbox(label="Enter Prompt")
+image_input = gr.Image(label="Upload Image", type="pil")
+caption_output = gr.Textbox(label="Generated Caption")
+demo = gr.Interface(
+    fn=predict_from_prompt_and_image,
+    inputs=[prompt_input, image_input],
+    outputs=caption_output,
+    title="ViMemeCap",
+    allow_flagging="never",
+)
+# Start serving the interface
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio
+transformers
+torch
+pillow
+torchvision
+qwen-vl-utils