Spaces:

AIDC-AI
/

Ovis-U1-3B

Running on Zero

App Files Files Community

Ovis-U1-3B / app.py

Flourish

Upload 12 files

ff3266f verified 1 day ago

raw

history blame contribute delete

14 kB

	import os
	import subprocess
	# Install the pinned flash-attn wheel at startup, before the model code is imported.
	# FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells flash-attn's setup to skip
	# compiling CUDA kernels - TODO confirm against the flash-attn build docs.
	# The flag is merged into a copy of the current environment: passing a bare
	# dict as `env` replaces the child's entire environment, which drops PATH,
	# HOME, proxy settings and any virtualenv variables that pip relies on.
	subprocess.run(
	    'pip install flash-attn==2.6.3 --no-build-isolation',
	    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
	    shell=True,
	)
	import random
	import spaces
	import numpy as np
	import torch
	from PIL import Image
	import gradio as gr
	from transformers import AutoModelForCausalLM
	from test_img_edit import pipe_img_edit
	from test_img_to_txt import pipe_txt_gen
	from test_txt_to_img import pipe_t2i


	# Constants
	# Largest seed value: used as the inclusive upper bound for random seeds and
	# as the maximum of the seed sliders in the UI.
	MAX_SEED = 10000

	# Access token read from the environment; None when HF_TOKEN is not set.
	hf_token = os.getenv("HF_TOKEN")

	# Load the model once at import time so every request handler shares it.
	HUB_MODEL_ID = "AIDC-AI/Ovis-U1-3B"
	model, loading_info = AutoModelForCausalLM.from_pretrained(
	    HUB_MODEL_ID,
	    trust_remote_code=True,
	    token=hf_token,
	    torch_dtype=torch.bfloat16,
	    output_loading_info=True,
	)
	print(f'Loading info of Ovis-U1:\n{loading_info}')

	# Switch to inference mode, move to the GPU, then cast to bfloat16.
	model = model.eval()
	model = model.to("cuda")
	model = model.to(torch.bfloat16)

	def set_global_seed(seed: int = 42):
	    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

	    Args:
	        seed: Value handed unchanged to every seeding call. Defaults to 42.
	    """
	    # Order matches the original: stdlib, NumPy, torch CPU, torch CUDA.
	    seeders = (
	        random.seed,
	        np.random.seed,
	        torch.manual_seed,
	        torch.cuda.manual_seed_all,
	    )
	    for apply_seed in seeders:
	        apply_seed(seed)

	def randomize_seed_fn(seed: int, randomize: bool) -> int:
	    """Pick the seed to use for the next run.

	    Args:
	        seed: Seed currently selected by the caller.
	        randomize: When truthy, ignore ``seed`` and draw a new one.

	    Returns:
	        ``seed`` unchanged when ``randomize`` is falsy, otherwise a random
	        integer between 0 and ``MAX_SEED`` (both ends included).
	    """
	    if not randomize:
	        return seed
	    return random.randint(0, MAX_SEED)

	@spaces.GPU
	def process_txt_to_img(prompt: str, height: int, width: int, steps: int, final_seed: int, guidance_scale: float, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> list[Image.Image]:
	    """Generate images from a text prompt with the shared model.

	    Seeds every RNG with ``final_seed`` first, then forwards the arguments to
	    ``pipe_t2i``; ``guidance_scale`` is passed as its ``cfg`` keyword.
	    ``progress`` is not read in the body; it is declared with
	    ``track_tqdm=True`` in the signature.

	    Returns:
	        Whatever ``pipe_t2i`` returns (annotated as a list of PIL images).
	    """
	    set_global_seed(final_seed)
	    sampling_options = {"cfg": guidance_scale, "seed": final_seed}
	    return pipe_t2i(model, prompt, height, width, steps, **sampling_options)

	@spaces.GPU
	def process_img_to_txt(prompt: str, img: Image.Image, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> str:
	    """Answer a text prompt about an image with the shared model.

	    Note the argument order flips on the way down: this function takes
	    ``(prompt, img)`` while ``pipe_txt_gen`` is called with ``(model, img, prompt)``.
	    ``progress`` is not read in the body.

	    Returns:
	        The result of ``pipe_txt_gen`` (annotated as a string).
	    """
	    return pipe_txt_gen(model, img, prompt)

	@spaces.GPU
	def process_img_txt_to_img(prompt: str, img: Image.Image, steps: int, final_seed: int, txt_cfg: float, img_cfg: float, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> list[Image.Image]:
	    """Edit an image according to a text instruction with the shared model.

	    Seeds every RNG with ``final_seed`` first, then calls ``pipe_img_edit``
	    with the image ahead of the prompt, followed by the step count and the
	    text and image guidance values. ``progress`` is not read in the body.

	    Returns:
	        Whatever ``pipe_img_edit`` returns (annotated as a list of PIL images).
	    """
	    set_global_seed(final_seed)
	    edit_args = (model, img, prompt, steps, txt_cfg, img_cfg)
	    return pipe_img_edit(*edit_args, seed=final_seed)

	# Gradio UI
	# Declares the Blocks app bound to `demo`: input controls grouped under three
	# tabs, a "Clear All Inputs/Outputs" button, and two output components.
	# NOTE(review): this copy of the file has lost its indentation, so the nesting
	# of the `with` contexts below cannot be read from whitespace - restore it
	# (and confirm which components sit inside which Row/Accordion) before running.
	with gr.Blocks(title="Ovis-U1-3B") as demo:
	gr.Markdown('''# Ovis-U1-3B
	''')

	with gr.Row():
	with gr.Column():
	with gr.Tabs():
	# Tab "Image + Text → Image": an input image plus an editing instruction.
	with gr.TabItem("Image + Text → Image"):
	# type="pil" asks Gradio to hand the upload to callbacks as a PIL image.
	edit_image_input = gr.Image(label="Input Image", type="pil")
	with gr.Row():
	edit_prompt_input = gr.Textbox(
	label="Prompt",
	show_label=False,
	placeholder="Describe the editing instruction...",
	container=False,
	lines=1
	)
	run_edit_image_btn = gr.Button("Run", scale=0)

	# Editing controls; their initial values (1.5 / 6.0 / 50 / 42 / unchecked)
	# are the same values clean_all_fn resets them to.
	with gr.Accordion("Advanced Settings", open=False):

	with gr.Row():

	edit_img_guidance_slider = gr.Slider(
	label="Image Guidance Scale",
	minimum=1.0, maximum=10.0,
	step=0.1, value=1.5
	)

	edit_txt_guidance_slider = gr.Slider(
	label="Text Guidance Scale",
	minimum=1.0, maximum=30.0,
	step=0.5, value=6.0
	)

	edit_num_steps_slider = gr.Slider(
	label='Steps',
	minimum=40, maximum=100,
	value=50, step=1
	)
	edit_seed_slider = gr.Slider(
	label="Seed",
	minimum=0, maximum=int(MAX_SEED),
	step=1, value=42
	)
	edit_randomize_checkbox = gr.Checkbox(
	label="Randomize seed", value=False
	)

	# Each example row is [image path, prompt], in the same order as `inputs` below.
	img_edit_examples_data = [
	["imgs/train.png", "Modify this image in a Ghibli style. "],
	["imgs/chair.png", "Transfer the image into a faceted low-poly 3-D render style."],
	["imgs/car.png", "Replace the tiny house on wheels in the image with a vintage car."],
	]
	gr.Examples(
	examples=img_edit_examples_data,
	inputs=[edit_image_input, edit_prompt_input],
	cache_examples=False,
	label="Image Editing Examples"
	)

	# Tab "Text → Image": a prompt only, no input image.
	with gr.TabItem("Text → Image"):
	with gr.Row():
	prompt_gen_input = gr.Textbox(
	label="Prompt",
	show_label=False,
	placeholder="Describe the image you want...",
	container=False,
	lines=1
	)
	run_image_gen_btn = gr.Button("Run", scale=0)

	# Generation controls; height and width move in steps of 32 between 256 and 1536.
	with gr.Accordion("Advanced Settings", open=False):
	with gr.Row():
	height_slider = gr.Slider(
	label='height',
	minimum=256, maximum=1536,
	value=1024, step=32
	)
	width_slider = gr.Slider(
	label='width',
	minimum=256, maximum=1536,
	value=1024, step=32
	)

	guidance_slider = gr.Slider(
	label="Guidance Scale",
	minimum=1.0, maximum=30.0,
	step=0.5, value=5.0
	)

	num_steps_slider = gr.Slider(
	label='Steps',
	minimum=40, maximum=100,
	value=50, step=1
	)
	seed_slider = gr.Slider(
	label="Seed",
	minimum=0, maximum=int(MAX_SEED),
	step=1, value=42
	)
	randomize_checkbox = gr.Checkbox(
	label="Randomize seed", value=False
	)

	# Each example row holds a single prompt, matching the one-element `inputs` list.
	text_gen_examples_data = [
	["A breathtaking fairy with teal wings sits gracefully on a lotus flower in a serene pond, exuding elegance."],
	["A winter mountain landscape at deep night with snowy terrain and colorful flowers, under beautiful clouds and no people, portrayed as an anime background illustration with intricate detail and sharp focus."],
	["A photo of a pug wearing a cowboy hat and bandana, sitting on a hay bale."]
	]
	gr.Examples(
	examples=text_gen_examples_data,
	inputs=[prompt_gen_input],
	cache_examples=False,
	label="Image Generation Examples"
	)

	# Tab "Image → Text": an input image plus a question about it.
	with gr.TabItem("Image → Text"):
	image_understand_input = gr.Image(label="Input Image", type="pil")
	with gr.Row():
	prompt_understand_input = gr.Textbox(
	label="Prompt",
	show_label=False,
	placeholder="Describe the question about image...",
	container=False,
	lines=1
	)
	run_image_understand_btn = gr.Button("Run", scale=0)

	# Each example row is [image path, question], in the same order as `inputs` below.
	image_understanding_examples_data = [
	["imgs/table.webp", "In what scenario does this picture take place?"],
	["imgs/count.png", "How many broccoli are there in the picture?"],
	["imgs/foot.webp", "Where is this picture located?"],
	]
	gr.Examples(
	examples=image_understanding_examples_data,
	inputs=[image_understand_input, prompt_understand_input],
	cache_examples=False,
	label="Image Understanding Examples"
	)

	# Resets every input and both outputs; wired to clean_all_fn further down.
	clean_btn = gr.Button("Clear All Inputs/Outputs")

	# Output components shared by all three tabs: the callbacks toggle their
	# `visible` flag so that only the relevant one is shown. The text box starts hidden.
	with gr.Column():
	output_gallery = gr.Gallery(label="Generated Images", columns=2, visible=True) # Default to visible, content will control
	output_text = gr.Textbox(label="Generated Text", visible=False, lines=5, interactive=False)

	@spaces.GPU
	def run_img_txt_to_img_tab(prompt, img, steps, seed, txt_cfg, img_cfg, progress=gr.Progress(track_tqdm=True)):
	    """Callback for the image-editing tab.

	    Returns a pair of ``gr.update`` objects: the first carries the gallery
	    value, the second the text value. With no image, the gallery is emptied
	    and hidden and a prompt to upload one is shown in the text update.
	    Otherwise the edited images are shown and the text is cleared and hidden.
	    """
	    if img is not None:
	        # Seed is already finalized by the randomize_seed_fn in the click chain
	        edited = process_img_txt_to_img(prompt, img, steps, seed, txt_cfg, img_cfg, progress=progress)
	        gallery_update = gr.update(value=edited, visible=True)
	        text_update = gr.update(value="", visible=False)
	    else:
	        gallery_update = gr.update(value=[], visible=False)
	        text_update = gr.update(value="Please upload an image for editing.", visible=True)
	    return gallery_update, text_update

	@spaces.GPU
	def run_txt_to_img_tab(prompt, height, width, steps, seed, guidance, progress=gr.Progress(track_tqdm=True)):
	    """Callback for the text-to-image tab.

	    Returns a pair of ``gr.update`` objects: the first shows the generated
	    images, the second clears and hides the text value.
	    """
	    # Seed is already finalized by the randomize_seed_fn in the click chain
	    generated = process_txt_to_img(prompt, height, width, steps, seed, guidance, progress=progress)
	    gallery_update = gr.update(value=generated, visible=True)
	    text_update = gr.update(value="", visible=False)
	    return gallery_update, text_update

	@spaces.GPU
	def run_img_to_txt_tab(img, prompt, progress=gr.Progress(track_tqdm=True)):
	    """Callback for the image-understanding tab.

	    Returns a pair of ``gr.update`` objects. The first always empties and
	    hides the gallery value. The second shows either the generated answer
	    or, when no image was given, a prompt to upload one.
	    """
	    if img is None:
	        message = "Please upload an image for understanding."
	    else:
	        message = process_img_to_txt(prompt, img, progress=progress)
	    gallery_update = gr.update(value=[], visible=False)
	    text_update = gr.update(value=message, visible=True)
	    return gallery_update, text_update

	def clean_all_fn():
	    """Build the updates that reset every input and both outputs.

	    Returns:
	        A tuple of 18 ``gr.update`` objects. The order is positional and must
	        stay aligned with the ``outputs`` list of the ``clean_btn.click`` call.
	    """
	    input_defaults = (
	        # Tab 1 inputs
	        None,   # edit_image_input
	        "",     # edit_prompt_input
	        1.5,    # edit_img_guidance_slider
	        6.0,    # edit_txt_guidance_slider
	        50,     # edit_num_steps_slider
	        42,     # edit_seed_slider
	        False,  # edit_randomize_checkbox
	        # Tab 2 inputs
	        "",     # prompt_gen_input
	        1024,   # height_slider
	        1024,   # width_slider
	        5.0,    # guidance_slider
	        50,     # num_steps_slider
	        42,     # seed_slider
	        False,  # randomize_checkbox
	        # Tab 3 inputs
	        None,   # image_understand_input
	        "",     # prompt_understand_input
	    )
	    resets = [gr.update(value=default) for default in input_defaults]
	    # Outputs
	    resets.append(gr.update(value=[], visible=True))   # output_gallery (reset and keep visible for next gen)
	    resets.append(gr.update(value="", visible=False))  # output_text (reset and hide)
	    return tuple(resets)

	# Event listeners for Image + Text -> Image
	edit_inputs = [
	    edit_prompt_input,
	    edit_image_input,
	    edit_num_steps_slider,
	    edit_seed_slider,
	    edit_txt_guidance_slider,
	    edit_img_guidance_slider,
	]

	# The Run button and the prompt's submit event start the same two-step chain:
	# first write the final seed back to the slider, then run the edit with it.
	for edit_trigger in (run_edit_image_btn.click, edit_prompt_input.submit):
	    edit_trigger(
	        fn=randomize_seed_fn,
	        inputs=[edit_seed_slider, edit_randomize_checkbox],
	        outputs=[edit_seed_slider],
	    ).then(
	        fn=run_img_txt_to_img_tab,
	        inputs=edit_inputs,
	        outputs=[output_gallery, output_text],
	    )

	# Event listeners for Text -> Image
	gen_inputs = [
	    prompt_gen_input,
	    height_slider,
	    width_slider,
	    num_steps_slider,
	    seed_slider,
	    guidance_slider,
	]

	# Same seed-then-run chain as above, for the generation tab.
	for gen_trigger in (run_image_gen_btn.click, prompt_gen_input.submit):
	    gen_trigger(
	        fn=randomize_seed_fn,
	        inputs=[seed_slider, randomize_checkbox],
	        outputs=[seed_slider],
	    ).then(
	        fn=run_txt_to_img_tab,
	        inputs=gen_inputs,
	        outputs=[output_gallery, output_text],
	    )

	# Event listeners for Image -> Text
	understand_inputs = [image_understand_input, prompt_understand_input]

	# No seed step here: both triggers call the understanding callback directly.
	for understand_trigger in (run_image_understand_btn.click, prompt_understand_input.submit):
	    understand_trigger(
	        fn=run_img_to_txt_tab,
	        inputs=understand_inputs,
	        outputs=[output_gallery, output_text],
	    )

	# The order of this list must match the tuple returned by clean_all_fn.
	clean_outputs = [
	    edit_image_input, edit_prompt_input, edit_img_guidance_slider, edit_txt_guidance_slider,
	    edit_num_steps_slider, edit_seed_slider, edit_randomize_checkbox,
	    prompt_gen_input, height_slider, width_slider, guidance_slider, num_steps_slider, seed_slider, randomize_checkbox,
	    image_understand_input, prompt_understand_input,
	    output_gallery, output_text,
	]
	clean_btn.click(fn=clean_all_fn, inputs=[], outputs=clean_outputs)

	# Launch the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    launch_options = {"share": True}
	    demo.launch(**launch_options)