Ovis-U1-3B / app.py
Flourish's picture
Upload 12 files
ff3266f verified
import os
import subprocess
# Install flash-attn at startup, before the model code that needs it is imported.
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE is passed to the installer through the
# environment. The flag is merged into a copy of the current environment
# instead of replacing it: passing a bare dict as `env` would drop PATH, HOME,
# virtualenv and proxy variables, so `pip` could resolve to a different
# interpreter (or not at all).
subprocess.run(
    'pip install flash-attn==2.6.3 --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)
import random
import spaces
import numpy as np
import torch
from PIL import Image
import gradio as gr
from transformers import AutoModelForCausalLM
from test_img_edit import pipe_img_edit
from test_img_to_txt import pipe_txt_gen
from test_txt_to_img import pipe_t2i
# Constants
# Largest seed value offered by the UI sliders and by random seed selection.
MAX_SEED = 10000
HUB_MODEL_ID = "AIDC-AI/Ovis-U1-3B"
# Access token read from the environment; None when HF_TOKEN is not set.
hf_token = os.getenv("HF_TOKEN")

# Load the checkpoint once at import time so every request handler shares it.
# output_loading_info=True makes from_pretrained return a (model, info) pair.
model, loading_info = AutoModelForCausalLM.from_pretrained(
    HUB_MODEL_ID,
    token=hf_token,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    output_loading_info=True,
)
print(f'Loading info of Ovis-U1:\n{loading_info}')

# Inference mode first, then move to the GPU, then cast to bfloat16.
model = model.eval()
model = model.to("cuda").to(torch.bfloat16)
def set_global_seed(seed: int = 42):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Value handed unchanged to every random number generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
def randomize_seed_fn(seed: int, randomize: bool) -> int:
    """Pick the seed to use for the next run.

    Args:
        seed: Seed currently selected by the caller.
        randomize: When truthy, ignore `seed` and draw a fresh one.

    Returns:
        `seed` unchanged, or a random integer in [0, MAX_SEED] (both ends
        inclusive) when `randomize` is truthy.
    """
    if not randomize:
        return seed
    return random.randint(0, MAX_SEED)
@spaces.GPU
def process_txt_to_img(
    prompt: str,
    height: int,
    width: int,
    steps: int,
    final_seed: int,
    guidance_scale: float,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> list[Image.Image]:
    """Generate images from a text prompt with the module-level model.

    Seeds the global random number generators with `final_seed`, then
    delegates to `pipe_t2i`, forwarding `guidance_scale` as `cfg` and
    `final_seed` as `seed`.

    Args:
        prompt: Text description of the desired image.
        height: Requested output height.
        width: Requested output width.
        steps: Step count forwarded to the pipeline.
        final_seed: Seed used both globally and by the pipeline.
        guidance_scale: Guidance value forwarded as `cfg`.
        progress: Gradio progress tracker; not used directly in the body.

    Returns:
        Whatever `pipe_t2i` returns (annotated as a list of PIL images).
    """
    set_global_seed(final_seed)
    return pipe_t2i(model, prompt, height, width, steps, cfg=guidance_scale, seed=final_seed)
@spaces.GPU
def process_img_to_txt(
    prompt: str,
    img: Image.Image,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> str:
    """Answer a text prompt about an image with the module-level model.

    Note that `pipe_txt_gen` takes the image before the prompt, the reverse
    of this function's own parameter order.

    Args:
        prompt: Question or instruction about the image.
        img: Image to analyse.
        progress: Gradio progress tracker; not used directly in the body.

    Returns:
        Whatever `pipe_txt_gen` returns (annotated as a string).
    """
    return pipe_txt_gen(model, img, prompt)
@spaces.GPU
def process_img_txt_to_img(
    prompt: str,
    img: Image.Image,
    steps: int,
    final_seed: int,
    txt_cfg: float,
    img_cfg: float,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> list[Image.Image]:
    """Edit an image according to a text instruction with the module-level model.

    Seeds the global random number generators with `final_seed`, then
    delegates to `pipe_img_edit`, which takes the image before the prompt.

    Args:
        prompt: Editing instruction.
        img: Image to edit.
        steps: Step count forwarded to the pipeline.
        final_seed: Seed used both globally and by the pipeline.
        txt_cfg: Text guidance value forwarded to the pipeline.
        img_cfg: Image guidance value forwarded to the pipeline.
        progress: Gradio progress tracker; not used directly in the body.

    Returns:
        Whatever `pipe_img_edit` returns (annotated as a list of PIL images).
    """
    set_global_seed(final_seed)
    return pipe_img_edit(model, img, prompt, steps, txt_cfg, img_cfg, seed=final_seed)
# Gradio UI
# Builds the demo page: a title followed by input tabs for image editing,
# text-to-image generation and image understanding. The resulting Blocks
# object is bound to `demo`.
with gr.Blocks(title="Ovis-U1-3B") as demo:
gr.Markdown('''# Ovis-U1-3B
''')
with gr.Row():
with gr.Column():
with gr.Tabs():
# --- Tab: edit an uploaded image according to a text instruction ---
with gr.TabItem("Image + Text → Image"):
# type="pil" asks Gradio to hand the upload to handlers as a PIL image.
edit_image_input = gr.Image(label="Input Image", type="pil")
with gr.Row():
# Single-line, label-less prompt box next to a non-growing Run button.
edit_prompt_input = gr.Textbox(
label="Prompt",
show_label=False,
placeholder="Describe the editing instruction...",
container=False,
lines=1
)
run_edit_image_btn = gr.Button("Run", scale=0)
# Tuning controls, collapsed by default (open=False).
with gr.Accordion("Advanced Settings", open=False):
with gr.Row():
# Image guidance: 1.0-10.0 in steps of 0.1, default 1.5.
edit_img_guidance_slider = gr.Slider(
label="Image Guidance Scale",
minimum=1.0, maximum=10.0,
step=0.1, value=1.5
)
# Text guidance: 1.0-30.0 in steps of 0.5, default 6.0.
edit_txt_guidance_slider = gr.Slider(
label="Text Guidance Scale",
minimum=1.0, maximum=30.0,
step=0.5, value=6.0
)
# Step count: 40-100, default 50.
edit_num_steps_slider = gr.Slider(
label='Steps',
minimum=40, maximum=100,
value=50, step=1
)
# Seed: 0..MAX_SEED, default 42. Also used as an output of the run chain,
# so a randomized seed is written back here.
edit_seed_slider = gr.Slider(
label="Seed",
minimum=0, maximum=int(MAX_SEED),
step=1, value=42
)
# When checked, the run chain replaces the seed with a random one.
edit_randomize_checkbox = gr.Checkbox(
label="Randomize seed", value=False
)
# Each example row is [image path, prompt], matching the `inputs` order below.
img_edit_examples_data = [
["imgs/train.png", "Modify this image in a Ghibli style. "],
["imgs/chair.png", "Transfer the image into a faceted low-poly 3-D render style."],
["imgs/car.png", "Replace the tiny house on wheels in the image with a vintage car."],
]
# cache_examples=False: example outputs are not precomputed.
gr.Examples(
examples=img_edit_examples_data,
inputs=[edit_image_input, edit_prompt_input],
cache_examples=False,
label="Image Editing Examples"
)
# --- Tab: generate an image from a text prompt ---
with gr.TabItem("Text → Image"):
with gr.Row():
# Single-line, label-less prompt box next to a non-growing Run button.
prompt_gen_input = gr.Textbox(
label="Prompt",
show_label=False,
placeholder="Describe the image you want...",
container=False,
lines=1
)
run_image_gen_btn = gr.Button("Run", scale=0)
# Tuning controls, collapsed by default (open=False).
with gr.Accordion("Advanced Settings", open=False):
with gr.Row():
# Output height: 256-1536 in steps of 32, default 1024.
height_slider = gr.Slider(
label='height',
minimum=256, maximum=1536,
value=1024, step=32
)
# Output width: 256-1536 in steps of 32, default 1024.
width_slider = gr.Slider(
label='width',
minimum=256, maximum=1536,
value=1024, step=32
)
# Guidance scale: 1.0-30.0 in steps of 0.5, default 5.0.
guidance_slider = gr.Slider(
label="Guidance Scale",
minimum=1.0, maximum=30.0,
step=0.5, value=5.0
)
# Step count: 40-100, default 50.
num_steps_slider = gr.Slider(
label='Steps',
minimum=40, maximum=100,
value=50, step=1
)
# Seed: 0..MAX_SEED, default 42. Also used as an output of the run chain,
# so a randomized seed is written back here.
seed_slider = gr.Slider(
label="Seed",
minimum=0, maximum=int(MAX_SEED),
step=1, value=42
)
# When checked, the run chain replaces the seed with a random one.
randomize_checkbox = gr.Checkbox(
label="Randomize seed", value=False
)
# Each example row holds a single prompt, matching the one-element `inputs` below.
text_gen_examples_data = [
["A breathtaking fairy with teal wings sits gracefully on a lotus flower in a serene pond, exuding elegance."],
["A winter mountain landscape at deep night with snowy terrain and colorful flowers, under beautiful clouds and no people, portrayed as an anime background illustration with intricate detail and sharp focus."],
["A photo of a pug wearing a cowboy hat and bandana, sitting on a hay bale."]
]
# cache_examples=False: example outputs are not precomputed.
gr.Examples(
examples=text_gen_examples_data,
inputs=[prompt_gen_input],
cache_examples=False,
label="Image Generation Examples"
)
# --- Tab: answer a text question about an uploaded image ---
with gr.TabItem("Image → Text"):
# type="pil" asks Gradio to hand the upload to handlers as a PIL image.
image_understand_input = gr.Image(label="Input Image", type="pil")
with gr.Row():
# Single-line, label-less prompt box next to a non-growing Run button.
prompt_understand_input = gr.Textbox(
label="Prompt",
show_label=False,
placeholder="Describe the question about image...",
container=False,
lines=1
)
run_image_understand_btn = gr.Button("Run", scale=0)
# Each example row is [image path, question], matching the `inputs` order below.
image_understanding_examples_data = [
["imgs/table.webp", "In what scenario does this picture take place?"],
["imgs/count.png", "How many broccoli are there in the picture?"],
["imgs/foot.webp", "Where is this picture located?"],
]
# cache_examples=False: example outputs are not precomputed.
gr.Examples(
examples=image_understanding_examples_data,
inputs=[image_understand_input, prompt_understand_input],
cache_examples=False,
label="Image Understanding Examples"
)
# Resets every input of all three tabs and both outputs (wired to clean_all_fn).
clean_btn = gr.Button("Clear All Inputs/Outputs")
with gr.Column():
# Output widgets shared by all three tabs: the handlers toggle `visible` so
# that either the gallery (image results) or the textbox (text results and
# validation messages) is shown.
output_gallery = gr.Gallery(label="Generated Images", columns=2, visible=True) # Default to visible, content will control
# Hidden until a handler has text to show; read-only for the user.
output_text = gr.Textbox(label="Generated Text", visible=False, lines=5, interactive=False)
@spaces.GPU
def run_img_txt_to_img_tab(prompt, img, steps, seed, txt_cfg, img_cfg, progress=gr.Progress(track_tqdm=True)):
    """Handle a run request from the image-editing tab.

    `seed` is used as given: the event chain runs `randomize_seed_fn` before
    this handler, so the value is already final.

    Returns:
        A `(gallery_update, text_update)` pair. Without an input image the
        gallery is hidden and an upload reminder is shown in the textbox;
        otherwise the edited images are shown and the textbox is cleared
        and hidden.
    """
    if img is not None:
        edited = process_img_txt_to_img(prompt, img, steps, seed, txt_cfg, img_cfg, progress=progress)
        gallery_update = gr.update(value=edited, visible=True)
        text_update = gr.update(value="", visible=False)
    else:
        gallery_update = gr.update(value=[], visible=False)
        text_update = gr.update(value="Please upload an image for editing.", visible=True)
    return gallery_update, text_update
@spaces.GPU
def run_txt_to_img_tab(prompt, height, width, steps, seed, guidance, progress=gr.Progress(track_tqdm=True)):
    """Handle a run request from the text-to-image tab.

    `seed` is used as given: the event chain runs `randomize_seed_fn` before
    this handler, so the value is already final.

    Returns:
        A `(gallery_update, text_update)` pair that shows the generated
        images and clears and hides the textbox.
    """
    generated = process_txt_to_img(prompt, height, width, steps, seed, guidance, progress=progress)
    show_gallery = gr.update(value=generated, visible=True)
    hide_text = gr.update(value="", visible=False)
    return show_gallery, hide_text
@spaces.GPU
def run_img_to_txt_tab(img, prompt, progress=gr.Progress(track_tqdm=True)):
    """Handle a run request from the image-understanding tab.

    Returns:
        A `(gallery_update, text_update)` pair. The gallery is always emptied
        and hidden; the textbox shows either the generated answer or, when no
        image was supplied, an upload reminder.
    """
    if img is None:
        message = "Please upload an image for understanding."
    else:
        message = process_img_to_txt(prompt, img, progress=progress)
    hide_gallery = gr.update(value=[], visible=False)
    show_text = gr.update(value=message, visible=True)
    return hide_gallery, show_text
def clean_all_fn():
    """Build the updates that reset every input and both outputs.

    The order of the returned updates must match the `outputs` list of the
    clear button's click handler.

    Returns:
        A tuple of 18 `gr.update` objects: 7 for the editing tab, 7 for the
        generation tab, 2 for the understanding tab, then the gallery
        (emptied, kept visible) and the output textbox (emptied, hidden).
    """
    # image, prompt, image guidance, text guidance, steps, seed, randomize
    edit_tab_defaults = (None, "", 1.5, 6.0, 50, 42, False)
    # prompt, height, width, guidance, steps, seed, randomize
    gen_tab_defaults = ("", 1024, 1024, 5.0, 50, 42, False)
    # image, prompt
    understand_tab_defaults = (None, "")

    input_resets = tuple(
        gr.update(value=default)
        for default in (*edit_tab_defaults, *gen_tab_defaults, *understand_tab_defaults)
    )
    output_resets = (
        gr.update(value=[], visible=True),   # output_gallery: ready for the next generation
        gr.update(value="", visible=False),  # output_text
    )
    return input_resets + output_resets
# Event listeners for Image + Text -> Image
# Clicking Run and submitting the prompt box start the same two-step chain:
# first finalize the seed (writing it back to the slider), then run the edit.
edit_inputs = [edit_prompt_input, edit_image_input, edit_num_steps_slider, edit_seed_slider, edit_txt_guidance_slider, edit_img_guidance_slider]
for edit_trigger in (run_edit_image_btn.click, edit_prompt_input.submit):
    edit_seed_step = edit_trigger(
        fn=randomize_seed_fn,
        inputs=[edit_seed_slider, edit_randomize_checkbox],
        outputs=[edit_seed_slider],
    )
    edit_seed_step.then(
        fn=run_img_txt_to_img_tab,
        inputs=edit_inputs,
        outputs=[output_gallery, output_text],
    )
# Event listeners for Text -> Image
# Clicking Run and submitting the prompt box start the same two-step chain:
# first finalize the seed (writing it back to the slider), then generate.
gen_inputs = [prompt_gen_input, height_slider, width_slider, num_steps_slider, seed_slider, guidance_slider]
for gen_trigger in (run_image_gen_btn.click, prompt_gen_input.submit):
    gen_seed_step = gen_trigger(
        fn=randomize_seed_fn,
        inputs=[seed_slider, randomize_checkbox],
        outputs=[seed_slider],
    )
    gen_seed_step.then(
        fn=run_txt_to_img_tab,
        inputs=gen_inputs,
        outputs=[output_gallery, output_text],
    )
# Event listeners for Image -> Text
# No seed step here: clicking Run and submitting the prompt box call the
# handler directly.
understand_inputs = [image_understand_input, prompt_understand_input]
for understand_trigger in (run_image_understand_btn.click, prompt_understand_input.submit):
    understand_trigger(
        fn=run_img_to_txt_tab,
        inputs=understand_inputs,
        outputs=[output_gallery, output_text],
    )
# Components reset by the clear button, grouped per tab. The order must match
# the tuple returned by clean_all_fn.
edit_tab_components = [
    edit_image_input, edit_prompt_input, edit_img_guidance_slider, edit_txt_guidance_slider,
    edit_num_steps_slider, edit_seed_slider, edit_randomize_checkbox,
]
gen_tab_components = [
    prompt_gen_input, height_slider, width_slider, guidance_slider,
    num_steps_slider, seed_slider, randomize_checkbox,
]
understand_tab_components = [image_understand_input, prompt_understand_input]
shared_output_components = [output_gallery, output_text]
clean_btn.click(
    fn=clean_all_fn,
    inputs=[],
    outputs=edit_tab_components + gen_tab_components + understand_tab_components + shared_output_components,
)
# Start the web server only when this file is executed as a script;
# share=True additionally requests a public share link from Gradio.
if __name__ == "__main__":
    demo.launch(share=True)