Spaces:

ytu-ce-cosmos
/

CosmosLLaVA

Runtime error

App Files Community

erndgn committed 13 days ago

Commit

144aa8f

verified ·

1 Parent(s): 41d619b

Upload 2 files

Browse files

Files changed (2) hide show

app.py +76 -84
requirements.txt +6 -4

app.py CHANGED Viewed

@@ -1,38 +1,21 @@
 import spaces
-import time
 from threading import Thread
 import gradio as gr
 import torch
 from PIL import Image
-from transformers import AutoProcessor
-from llava.constants import (
-    IMAGE_TOKEN_INDEX,
-    DEFAULT_IMAGE_TOKEN,
-    DEFAULT_IM_START_TOKEN,
-    DEFAULT_IM_END_TOKEN,
-    IMAGE_PLACEHOLDER,
-)
-from llava.model.builder import load_pretrained_model
-from llava.utils import disable_torch_init
-from llava.mm_utils import (
-    process_images,
-    tokenizer_image_token,
-    get_model_name_from_path,
-)
 from io import BytesIO
 import requests
 import os
-from conversation import Conversation, SeparatorStyle
 model_id = "ytu-ce-cosmos/Turkish-LLaVA-v0.1"
-disable_torch_init()
-model_name = get_model_name_from_path(model_id)
-tokenizer, model, image_processor, context_len = load_pretrained_model(
-    model_id, None, model_name
-)
 def load_image(image_file):
     if image_file.startswith("http") or image_file.startswith("https"):
@@ -44,63 +27,13 @@ def load_image(image_file):
         raise FileNotFoundError(f"Görüntü dosyası {image_file} bulunamadı.")
     return image
-def infer_single_image(model_id, image_file, prompt):
-    image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
-    if IMAGE_PLACEHOLDER in prompt:
-        if model.config.mm_use_im_start_end:
-            prompt = re.sub(IMAGE_PLACEHOLDER, image_token_se, prompt)
-        else:
-            prompt = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, prompt)
-    else:
-        if model.config.mm_use_im_start_end:
-            prompt = image_token_se + "\n" + prompt
-        else:
-            prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt
-    conv = Conversation(
-        system="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSen bir yapay zeka asistanısın. Kullanıcı sana bir görev verecek. Amacın görevi olabildiğince sadık bir şekilde tamamlamak. Görevi yerine getirirken adım adım düşün ve adımlarını gerekçelendir.""",
-        roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
-        version="llama3",
-        messages=[],
-        offset=0,
-        sep_style=SeparatorStyle.MPT,
-        sep="<|eot_id|>",
-    )
-    conv.append_message(conv.roles[0], prompt)
-    conv.append_message(conv.roles[1], None)
-    full_prompt = conv.get_prompt()
-    print("full prompt: ", full_prompt)
-    image = load_image(image_file)
-    image_tensor = process_images(
-        [image],
-        image_processor,
-        model.config
-    ).to(model.device, dtype=torch.float16)
-    input_ids = (
-        tokenizer_image_token(full_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
-        .unsqueeze(0)
-        .cuda()
-    )
-    with torch.inference_mode():
-        output_ids = model.generate(
-            input_ids,
-            images=image_tensor,
-            image_sizes=[image.size],
-            do_sample=False,
-            max_new_tokens=512,
-            use_cache=True,
-        )
-    output = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
-    return output
 @spaces.GPU
 def bot_streaming(message, history):
     print(message)
     if message["files"]:
         if type(message["files"][-1]) == dict:
             image = message["files"][-1]["path"]
@@ -110,19 +43,78 @@ def bot_streaming(message, history):
         for hist in history:
             if type(hist[0]) == tuple:
                 image = hist[0][0]
     try:
         if image is None:
-            gr.Error("LLaVA'nın çalışması için bir resim yüklemeniz gerekir.")
     except NameError:
-        gr.Error("LLaVA'nın çalışması için bir resim yüklemeniz gerekir.")
-    prompt = message['text']
-    result = infer_single_image(model_id, image, prompt)
-    print(result)
-    yield result
 chatbot = gr.Chatbot(scale=1)
 chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Mesaj girin veya dosya yükleyin...", show_label=False)

 import spaces
 from threading import Thread
 import gradio as gr
 import torch
 from PIL import Image
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, LlavaForConditionalGeneration, TextIteratorStreamer
+import torchvision.transforms.functional as TVF
 from io import BytesIO
 import requests
 import os
 model_id = "ytu-ce-cosmos/Turkish-LLaVA-v0.1"
+tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Expected PreTrainedTokenizer, got {type(tokenizer)}"
+model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
+assert isinstance(model, LlavaForConditionalGeneration), f"Expected LlavaForConditionalGeneration, got {type(model)}"
 def load_image(image_file):
     if image_file.startswith("http") or image_file.startswith("https"):
         raise FileNotFoundError(f"Görüntü dosyası {image_file} bulunamadı.")
     return image
 @spaces.GPU
+@torch.no_grad()
 def bot_streaming(message, history):
     print(message)
+    torch.cuda.empty_cache()
+    image = None
     if message["files"]:
         if type(message["files"][-1]) == dict:
             image = message["files"][-1]["path"]
         for hist in history:
             if type(hist[0]) == tuple:
                 image = hist[0][0]
     try:
         if image is None:
+            yield "LLaVA'nın çalışması için bir resim yüklemeniz gerekir."
+            return
     except NameError:
+        yield "LLaVA'nın çalışması için bir resim yüklemeniz gerekir."
+        return
+    prompt = message['text'].strip()
+    image_pil = load_image(image)
+    if image_pil.size != (336, 336):
+        image_pil = image_pil.resize((336, 336), Image.LANCZOS)
+    image_pil = image_pil.convert("RGB")
+    pixel_values = TVF.pil_to_tensor(image_pil)
+    pixel_values = pixel_values.unsqueeze(0).to("cuda")
+    pixel_values = pixel_values / 255.0
+    pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
+    pixel_values = pixel_values.to(torch.bfloat16)
+    convo = [
+        {
+            "role": "system",
+            "content": "Sen bir yapay zeka asistanısın. Kullanıcı sana bir görev verecek. Amacın görevi olabildiğince sadık bir şekilde tamamlamak. Görevi yerine getirirken adım adım düşün ve adımlarını gerekçelendir."
+        },
+        {
+            "role": "user",
+            "content": prompt,
+        },
+    ]
+    convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
+    convo_tokens = tokenizer.encode(convo_string, add_special_tokens=False, truncation=False)
+    input_tokens = []
+    for token in convo_tokens:
+        if hasattr(model.config, 'image_token_index') and token == model.config.image_token_index:
+            seq_length = getattr(model.config, 'image_seq_length', 576)
+            input_tokens.extend([model.config.image_token_index] * seq_length)
+        else:
+            input_tokens.append(token)
+    input_ids = torch.tensor(input_tokens, dtype=torch.long)
+    attention_mask = torch.ones_like(input_ids)
+    input_ids = input_ids.unsqueeze(0).to("cuda")
+    attention_mask = attention_mask.unsqueeze(0).to("cuda")
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        pixel_values=pixel_values,
+        attention_mask=attention_mask,
+        max_new_tokens=512,
+        do_sample=False,
+        suppress_tokens=None,
+        use_cache=True,
+        streamer=streamer,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
 chatbot = gr.Chatbot(scale=1)
 chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Mesaj girin veya dosya yükleyin...", show_label=False)

requirements.txt CHANGED Viewed

@@ -1,4 +1,6 @@
-llava-torch
-spaces
-torch
-torchvision

+huggingface_hub==0.30.1
+accelerate
+torch
+transformers==4.51.0
+sentencepiece
+torchvision