Spaces:

VDebugger
/

VDebugger-generalist-for-VQA

Runtime error

App Files Files Community

Xueqing Wu committed on Jul 22, 2024

Commit

e20ef71

0 Parent(s):

init

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +34 -0
.gitignore +3 -0
Dockerfile +62 -0
README.md +15 -0
app.py +333 -0
app.sh +5 -0
examples/n111074.jpg +0 -0
examples/n113863.jpg +0 -0
examples/n11399.jpg +0 -0
examples/n115850.jpg +0 -0
examples/n116797.jpg +0 -0
examples/n116868.jpg +0 -0
examples/n132998.jpg +0 -0
examples/n137739.jpg +0 -0
examples/n140477.jpg +0 -0
examples/n14897.jpg +0 -0
examples/n151233.jpg +0 -0
examples/n154501.jpg +0 -0
examples/n155638.jpg +0 -0
examples/n168871.jpg +0 -0
examples/n173361.jpg +0 -0
examples/n173931.jpg +0 -0
examples/n176076.jpg +0 -0
examples/n177259.jpg +0 -0
examples/n177566.jpg +0 -0
examples/n178654.jpg +0 -0
examples/n179572.jpg +0 -0
examples/n183744.jpg +0 -0
examples/n188669.jpg +0 -0
examples/n193989.jpg +0 -0
examples/n194711.jpg +0 -0
examples/n196522.jpg +0 -0
examples/n209769.jpg +0 -0
examples/n210898.jpg +0 -0
examples/n222443.jpg +0 -0
examples/n2381.jpg +0 -0
examples/n238886.jpg +0 -0
examples/n241130.jpg +0 -0
examples/n241451.jpg +0 -0
examples/n241713.jpg +0 -0
examples/n24680.jpg +0 -0
examples/n249342.jpg +0 -0
examples/n25398.jpg +0 -0
examples/n256710.jpg +0 -0
examples/n272929.jpg +0 -0
examples/n278426.jpg +0 -0
examples/n279408.jpg +0 -0
examples/n282460.jpg +0 -0
examples/n288083.jpg +0 -0
examples/n291937.jpg +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+__pycache__/
+*.pyc
+pretrained_models

Dockerfile ADDED Viewed

	@@ -0,0 +1,62 @@

+# syntax=docker/dockerfile:1
+# The frontend is declared explicitly because this file uses `RUN --mount` and `COPY --link`.
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
+# DEBIAN_FRONTEND is only needed while building, so it is an ARG and does not leak into the runtime env
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Europe/Paris
+# Install some basic utilities (package list kept sorted for readable diffs)
+RUN rm -f /etc/apt/sources.list.d/*.list && \
+    apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    ffmpeg \
+    git \
+    libsm6 \
+    libxext6 \
+    sudo \
+    wget \
+ && rm -rf /var/lib/apt/lists/*
+# Create a working directory
+WORKDIR /app
+# Create a non-root user and switch to it
+RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
+    && chown -R user:user /app \
+    && echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
+USER user
+# All users can use /home/user as their home directory
+ENV HOME=/home/user \
+    CONDA_AUTO_UPDATE_CONDA=false
+ENV PATH=$HOME/miniconda/bin:$PATH
+# Install Miniconda from the official host; `-f` makes curl fail on an HTTP error
+# instead of saving the error page as the installer script
+RUN mkdir $HOME/.cache $HOME/.config \
+    && chmod -R 777 $HOME \
+    && curl -fsSLo ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py310_24.5.0-0-Linux-x86_64.sh \
+    && chmod +x ~/miniconda.sh \
+    && ~/miniconda.sh -b -p ~/miniconda \
+    && rm ~/miniconda.sh \
+    && conda clean -ya
+# From here are my stuff
+# Download models (GLIP checkpoint + config, X-VLM retrieval checkpoint) into /app/pretrained_models
+RUN pip install --no-cache-dir gdown && \
+    mkdir -p ./pretrained_models/GLIP/checkpoints && \
+    mkdir -p ./pretrained_models/GLIP/configs && \
+    mkdir -p ./pretrained_models/xvlm && \
+    wget -nc -q -P ./pretrained_models/GLIP/checkpoints https://huggingface.co/GLIPModel/GLIP/resolve/main/glip_large_model.pth && \
+    wget -nc -q -P ./pretrained_models/GLIP/configs https://raw.githubusercontent.com/microsoft/GLIP/main/configs/pretrain/glip_Swin_L.yaml && \
+    gdown "https://drive.google.com/u/0/uc?id=1bv6_pZOsXW53EhlwU0ZgSk03uzFI61pN" -O ./pretrained_models/xvlm/retrieval_mscoco_checkpoint_9.pth
+# Python packages (requirements.txt is bind-mounted from the build context, so it is not copied into a layer)
+RUN --mount=target=requirements.txt,source=requirements.txt \
+    pip install --no-cache-dir torch torchvision && \
+    pip install --no-cache-dir git+https://github.com/openai/CLIP.git && \
+    pip install --no-cache-dir -r requirements.txt
+# Fetch the three LLM checkpoints at build time, one layer per model, so they are not downloaded on first request
+RUN python -c "from transformers import AutoModel; _ = AutoModel.from_pretrained('codellama/CodeLlama-7b-Python-hf')"
+RUN python -c "from transformers import AutoModel; _ = AutoModel.from_pretrained('VDebugger/VDebugger-critic-generalist-7B')"
+RUN python -c "from transformers import AutoModel; _ = AutoModel.from_pretrained('VDebugger/VDebugger-refiner-generalist-7B')"
+# Download GLIP dependencies, but unfortunately don't install yet...
+# (app.sh builds and installs the clone when the container starts)
+RUN git clone https://github.com/sachit-menon/GLIP
+# Run gradio
+COPY --link --chown=1000 ./ /app
+EXPOSE 7860
+ENV GRADIO_SERVER_NAME="0.0.0.0"
+CMD ["bash", "app.sh"]

README.md ADDED Viewed

	@@ -0,0 +1,15 @@

+---
+title: VDebugger generalist for VQA
+emoji: 💬
+colorFrom: yellow
+colorTo: purple
+sdk: docker
+sdk_version: 4.36.1
+app_file: app.py
+pinned: false
+license: apache-2.0
+models:
+- codellama/CodeLlama-7b-Python-hf
+- VDebugger/VDebugger-critic-generalist-7B
+- VDebugger/VDebugger-refiner-generalist-7B
+---

app.py ADDED Viewed

	@@ -0,0 +1,333 @@

+import inspect
+import json
+import os
+import random
+from typing import Literal, cast
+import gradio as gr
+import torch
+from PIL import Image
+from gradio.data_classes import InterfaceTypes
+from gradio.flagging import CSVLogger
+from torchvision import transforms
+from transformers import AutoTokenizer, LlamaForCausalLM
+from trace_exec import run_program_with_trace, CompileTimeError
+from vision_processes import load_models
+# Load the models provided by vision_processes once, at import time, before the UI is built.
+print("-" * 10, "Loading models...")
+load_models()
+# Prompt template for program generation; predict() fills in the INSERT_QUERY_HERE placeholder.
+with open('joint.prompt') as f:
+    prompt_template = f.read().strip()
+# Input/output type names of the generated program: used to build SIGNATURE and
+# passed through to run_program_with_trace().
+INPUT_TYPE = 'image'
+OUTPUT_TYPE = 'str'
+# Function header that every generated program starts with.
+SIGNATURE = f'def execute_command({INPUT_TYPE}) -> {OUTPUT_TYPE}:'
+def generate(model, input_text):
+    torch.cuda.empty_cache()
+    print("-" * 10, "Before loading LLM:")
+    print(torch.cuda.memory_summary())
+    dtype = os.environ.get("CODELLAMA_DTYPE")
+    assert dtype in ['bfloat16', '8bit', '4bit', ]
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    model = LlamaForCausalLM.from_pretrained(
+        model,
+        device_map="auto",
+        load_in_8bit=dtype == "8bit",
+        load_in_4bit=dtype == "4bit",
+        torch_dtype=torch.bfloat16 if dtype == "bfloat16" else None,
+    )
+    print("-" * 10, "LLM loaded:")
+    print(model)
+    print(torch.cuda.memory_summary())
+    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+    generated_ids = model.generate(
+        input_ids.to('cuda'), max_new_tokens=256, stop_strings=["\n\n"], do_sample=False, tokenizer=tokenizer
+    )
+    generated_ids = generated_ids[0][input_ids.shape[1]:]
+    text = tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+    del model
+    torch.cuda.empty_cache()
+    print("-" * 10, "After loading LLM:")
+    print(torch.cuda.memory_summary())
+    return text
+def to_custom_trace(result, error, traced):
+    """Format an execution outcome as "-> <result>", a "--- Trace" header and the trace text.
+    When no trace is available, ``error`` must be a CompileTimeError and the
+    trace text becomes 'Compile Error'.
+    """
+    trace_text = traced
+    if trace_text is None:
+        assert isinstance(error, CompileTimeError)
+        trace_text = 'Compile Error'
+    return f"-> {result}\n\n--- Trace\n\n{trace_text}"
+def answer_from_trace(x):
+    assert x.startswith("->")
+    return x[2:].splitlines()[0].strip()
+def debug(image, question, code, traced_info):
+    # critic
+    prompt = f"# Given an image: {question}\n{code}\n\n{traced_info}\n\n# Program is"
+    print("--- For debug: critic prompt is ---")
+    print(prompt)
+    print("---\n")
+    critic_out = generate("VDebugger/VDebugger-critic-generalist-7B", prompt)
+    incorrect = critic_out.strip().startswith('wrong')
+    critic_out = "# Program is" + critic_out
+    if not incorrect:
+        yield code, traced_info, critic_out, "N/A", "N/A", answer_from_trace(traced_info)
+        return
+    else:
+        yield code, traced_info, critic_out, "RUNNING IN PROGRESS...", "", ""
+    # refiner
+    critic_code = ('def execute_command' + critic_out.split('def execute_command')[1]).strip()
+    if '# Program is' in code:
+        critic_code = critic_code.split("# Program is")[0].strip()  # errr, an awkward fix
+    prompt = f"# Given an image: {question}\n{critic_code}\n\n{traced_info}\n\n# Correction"
+    print("--- For debug: refiner prompt is ---")
+    print(prompt)
+    print("---\n")
+    refiner_out = generate("VDebugger/VDebugger-refiner-generalist-7B", prompt).strip()
+    yield code, traced_info, critic_out, refiner_out, "RUNNING IN PROGRESS...", ""
+    # execute (again)
+    result, error, traced = run_program_with_trace(refiner_out, image, INPUT_TYPE, OUTPUT_TYPE)
+    traced_info_2 = to_custom_trace(result, error, traced)
+    yield code, traced_info, critic_out, refiner_out, traced_info_2, answer_from_trace(traced_info_2)
+def predict(image, question):
+    if image is None:
+        gr.Warning("Please provide an image", duration=5)
+        return
+    image = transforms.Compose([transforms.ToTensor()])(image)
+    question = question.strip()
+    if question == "":
+        gr.Warning("Please provide a question", duration=5)
+        return
+    # codellama
+    prompt = prompt_template.replace("INSERT_QUERY_HERE", f"Given an image: {question}\n{SIGNATURE}")
+    code = generate("codellama/CodeLlama-7b-Python-hf", prompt)
+    code = (SIGNATURE + code).strip()
+    yield code, "RUNNING IN PROGRESS...", "", "", "", ""
+    # execute
+    result, error, traced = run_program_with_trace(code, image, INPUT_TYPE, OUTPUT_TYPE)
+    traced_info = to_custom_trace(result, error, traced)
+    yield code, traced_info, "RUNNING IN PROGRESS...", "", "", ""
+    for tup in debug(image, question, code, traced_info):
+        yield tup
+    return
+def re_debug(image, question, code, traced_info):
+    if code is None or code == "" or traced_info is None or traced_info == "":
+        gr.Warning("No prior debugging round", duration=5)
+        return
+    yield code, traced_info, "RUNNING IN PROGRESS...", "", "", ""
+    for tup in debug(image, question, code, traced_info):
+        yield tup
+    return
+# Markdown text for the page header; MyInterface assigns it to its `description` attribute.
+DESCRIPTION = """# VDebugger
+| [Paper](https://arxiv.org/abs/2406.13444) | [Project](https://shirley-wu.github.io/vdebugger/) | [Code](https://github.com/shirley-wu/vdebugger/) | [Models and Data](https://huggingface.co/VDebugger) |
+**VDebugger** is a novel critic-refiner framework trained to localize and debug *visual programs* by tracking execution step by step. In this demo, we show the visual programs, the outputs from both the critic and the refiner, as well as the final result.
+**Warning:** Reduced performance and accuracy may be observed. Due to resource limitation of huggingface spaces, this demo runs Llama inference in 4-bit quantization and uses smaller foundation VLMs. For full capacity, please use the original code."""
+class MyInterface(gr.Interface):
+    def __init__(self):
+        super(gr.Interface, self).__init__(
+            title=None,
+            theme=None,
+            analytics_enabled=None,
+            mode="tabbed_interface",
+            css=None,
+            js=None,
+            head=None,
+        )
+        self.interface_type = InterfaceTypes.STANDARD
+        self.description = DESCRIPTION
+        self.cache_examples = None
+        self.examples_per_page = 5
+        self.example_labels = None
+        self.batch = False
+        self.live = False
+        self.api_name = "predict"
+        self.max_batch_size = 4
+        self.concurrency_limit = 'default'
+        self.show_progress = "full"
+        self.allow_flagging = 'auto'
+        self.flagging_options = [("Flag", ""), ]
+        self.flagging_callback = CSVLogger()
+        self.flagging_dir = 'flagged'
+        # Load examples
+        with open('examples/questions.json') as f:
+            example_questions = json.load(f)
+        self.examples = []
+        for question in example_questions:
+            self.examples.append([
+                Image.open('examples/{}.jpg'.format(question['imageId'])), question['question'],
+            ])
+        def load_random_example():
+            image, question = random.choice(self.examples)
+            return image, question, "", "", "", "", "", ""
+        # Render the Gradio UI
+        with self:
+            self.render_title_description()
+            with gr.Row():
+                image = gr.Image(label="Image", type="pil", width="30%", scale=1)
+                question = gr.Textbox(label="Question", scale=2)
+            with gr.Row():
+                _clear_btn = gr.ClearButton(value="Clear", variant="secondary")
+                _random_eg_btn = gr.Button("Random Example Input")
+                _submit_btn = gr.Button("Submit", variant="primary")
+                if inspect.isgeneratorfunction(predict) or inspect.isasyncgenfunction(predict):
+                    _stop1_btn = gr.Button("Stop", variant="stop", visible=False)
+                _redebug_btn = gr.Button("Debug for Another Round", variant="primary")
+                if inspect.isgeneratorfunction(re_debug) or inspect.isasyncgenfunction(re_debug):
+                    _stop2_btn = gr.Button("Stop", variant="stop", visible=False)
+            with gr.Row():
+                o1 = gr.Textbox(label="No debugging: program")
+                o2 = gr.Textbox(label="No debugging: execution")
+            with gr.Row():
+                o3 = gr.Textbox(label="VDebugger: critic")
+                o4 = gr.Textbox(label="VDebugger: refiner")
+            with gr.Row():
+                o5 = gr.Textbox(label="VDebugger: execution")
+                o6 = gr.Textbox(label="VDebugger: final answer")
+            question.submit(fn=predict, inputs=[image, question], outputs=[o1, o2, o3, o4, o5, o6])
+            _random_eg_btn.click(fn=load_random_example, outputs=[image, question, o1, o2, o3, o4, o5, o6])
+            async def cleanup():
+                return [gr.Button(visible=True), gr.Button(visible=False)]
+            # Setup redebug event
+            triggers = [_redebug_btn.click, ]
+            extra_output = [_redebug_btn, _stop2_btn]
+            predict_event = gr.on(
+                triggers,
+                gr.utils.async_lambda(
+                    lambda: (
+                        gr.Button(visible=False),
+                        gr.Button(visible=True),
+                    )
+                ),
+                inputs=None,
+                outputs=[_redebug_btn, _stop2_btn],
+                queue=False,
+                show_api=False,
+            ).then(
+                re_debug,
+                [image, question, o4, o5],
+                [o1, o2, o3, o4, o5, o6],
+                api_name=self.api_name,
+                scroll_to_output=False,
+                preprocess=not (self.api_mode),
+                postprocess=not (self.api_mode),
+                batch=self.batch,
+                max_batch_size=self.max_batch_size,
+                concurrency_limit=self.concurrency_limit,
+                show_progress=cast(
+                    Literal["full", "minimal", "hidden"], self.show_progress
+                ),
+            )
+            redebug_event = predict_event.then(
+                cleanup,
+                inputs=None,
+                outputs=extra_output,  # type: ignore
+                queue=False,
+                show_api=False,
+            )
+            _stop2_btn.click(
+                cleanup,
+                inputs=None,
+                outputs=[_redebug_btn, _stop2_btn],
+                cancels=predict_event,
+                queue=False,
+                show_api=False,
+            )
+            # Setup submit event
+            triggers = [_submit_btn.click, question.submit, ]
+            extra_output = [_submit_btn, _stop1_btn]
+            predict_event = gr.on(
+                triggers,
+                gr.utils.async_lambda(
+                    lambda: (
+                        gr.Button(visible=False),
+                        gr.Button(visible=True),
+                    )
+                ),
+                inputs=None,
+                outputs=[_submit_btn, _stop1_btn],
+                queue=False,
+                show_api=False,
+            ).then(
+                predict,
+                [image, question],
+                [o1, o2, o3, o4, o5, o6],
+                api_name=self.api_name,
+                scroll_to_output=False,
+                preprocess=not (self.api_mode),
+                postprocess=not (self.api_mode),
+                batch=self.batch,
+                max_batch_size=self.max_batch_size,
+                concurrency_limit=self.concurrency_limit,
+                show_progress=cast(
+                    Literal["full", "minimal", "hidden"], self.show_progress
+                ),
+            )
+            submit_event = predict_event.then(
+                cleanup,
+                inputs=None,
+                outputs=extra_output,  # type: ignore
+                queue=False,
+                show_api=False,
+            )
+            _stop1_btn.click(
+                cleanup,
+                inputs=None,
+                outputs=[_submit_btn, _stop1_btn],
+                cancels=predict_event,
+                queue=False,
+                show_api=False,
+            )
+            # Finally borrow Interface stuff
+            self.input_components = [image, question]
+            self.output_components = [o1, o2, o3, o4, o5, o6]
+            self.fn = predict
+            self.attach_clear_events(_clear_btn, None)
+            self.render_examples()
+if __name__ == "__main__":
+    MyInterface().launch(share=os.environ.get("SHARE", '') != "")

app.sh ADDED Viewed

	@@ -0,0 +1,5 @@

+cd GLIP
+python setup.py clean --all build develop --user
+cd ../
+python -c "import maskrcnn_benchmark"  # check successfully installed
+python app.py