Spaces:

linoyts
/

Qwen-Image-Edit-Inpaint

Running on Zero

App Files Files Community

Qwen-Image-Edit-Inpaint / app.py

linoyts HF Staff

Update app.py

a2b63da verified about 2 months ago

raw

history blame

14.7 kB

	import gradio as gr
	import numpy as np
	import spaces
	import torch
	import random
	import os

	# from diffusers import QwenImageEditInpaintPipeline
	from optimization import optimize_pipeline_
	from diffusers.utils import load_image
	from diffusers import FlowMatchEulerDiscreteScheduler
	from qwenimage.pipeline_qwenimage_edit_inpaint import QwenImageEditInpaintPipeline
	from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
	from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
	import math
	from huggingface_hub import InferenceClient

	from PIL import Image

	# Set environment variable for parallel loading
	# os.environ["HF_ENABLE_PARALLEL_LOADING"] = "YES"

	# --- Prompt Enhancement using Hugging Face InferenceClient ---
	def _extract_rewritten(reply_text):
	    """Pull the rewritten instruction out of a raw model reply.

	    Replies containing the marker ``{"Rewritten"`` are treated as JSON
	    (optionally wrapped in Markdown code fences) and the string stored
	    under ``"Rewritten"`` is returned. A reply without the marker is
	    returned untouched; a reply with the marker that cannot be parsed is
	    returned with the code fences stripped.
	    """
	    # Imported locally so JSON parsing works even though the module header
	    # has no ``import json``.
	    import json

	    if '{"Rewritten"' not in reply_text:
	        return reply_text

	    # Strip Markdown code fences the model may wrap around the JSON.
	    unfenced = reply_text.replace('```json', '').replace('```', '')
	    try:
	        payload = json.loads(unfenced)
	    except ValueError:  # json.JSONDecodeError is a ValueError subclass
	        return unfenced

	    if not isinstance(payload, dict):
	        return unfenced
	    rewritten = payload.get('Rewritten', unfenced)
	    return rewritten if isinstance(rewritten, str) else unfenced


	def polish_prompt_hf(original_prompt, system_prompt):
	    """
	    Rewrites the prompt using a Hugging Face InferenceClient.

	    Args:
	        original_prompt: User message sent to the chat model; also the
	            value returned whenever enhancement cannot be performed.
	        system_prompt: System message sent ahead of the user message.

	    Returns:
	        The model's reply with surrounding whitespace stripped and newlines
	        replaced by spaces, or ``original_prompt`` unchanged when the
	        ``HF_TOKEN`` environment variable is unset/empty or any step of
	        the request raises.
	    """
	    # Ensure HF_TOKEN is set
	    api_key = os.environ.get("HF_TOKEN")
	    if not api_key:
	        print("Warning: HF_TOKEN not set. Falling back to original prompt.")
	        return original_prompt

	    try:
	        client = InferenceClient(
	            provider="cerebras",
	            api_key=api_key,
	        )
	        completion = client.chat.completions.create(
	            model="Qwen/Qwen3-235B-A22B-Instruct-2507",
	            messages=[
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": original_prompt},
	            ],
	        )
	        reply_text = completion.choices[0].message.content
	        return _extract_rewritten(reply_text).strip().replace("\n", " ")
	    except Exception as e:
	        # Deliberate best-effort boundary: prompt enhancement is optional,
	        # so any failure falls back to the caller's prompt.
	        print(f"Error during API call to Hugging Face: {e}")
	        return original_prompt


	def polish_prompt(prompt, img):
	    """
	    Rewrite an edit instruction through the HF-inference prompt polisher.

	    Args:
	        prompt: The user's raw edit instruction.
	        img: Accepted only to keep the call signature stable; it is not
	            read anywhere in this function.

	    Returns:
	        Whatever ``polish_prompt_hf`` returns for the assembled message.

	    Note:
	        The rewriting rules are sent twice: once as the system message and
	        once as the leading part of the user message.
	    """
	    rewrite_rules = '''
	# Edit Instruction Rewriter
	You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.
	Please strictly follow the rewriting rules below:
	## 1. General Principles
	- Keep the rewritten prompt concise. Avoid overly long sentences and reduce unnecessary descriptive language.
	- If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
	- Keep the core intention of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
	- All added objects or modifications must align with the logic and style of the edited input image's overall scene.
	## 2. Task Type Handling Rules
	### 1. Add, Delete, Replace Tasks
	- If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
	- If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
	> Original: "Add an animal"
	> Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
	- Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
	- For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.
	### 2. Text Editing Tasks
	- All text content must be enclosed in English double quotes " ". Do not translate or alter the original language of the text, and do not change the capitalization.
	- For text replacement tasks, always use the fixed template:
	- Replace "xx" to "yy".
	- Replace the xx bounding box to "yy".
	- If the user does not specify text content, infer and add concise text based on the instruction and the input image's context. For example:
	> Original: "Add a line of text" (poster)
	> Rewritten: "Add text "LIMITED EDITION" at the top center with slight shadow"
	- Specify text position, color, and layout in a concise way.
	### 3. Human Editing Tasks
	- Maintain the person's core visual consistency (ethnicity, gender, age, hairstyle, expression, outfit, etc.).
	- If modifying appearance (e.g., clothes, hairstyle), ensure the new element is consistent with the original style.
	- For expression changes, they must be natural and subtle, never exaggerated.
	- If deletion is not specifically emphasized, the most important subject in the original image (e.g., a person, an animal) should be preserved.
	- For background change tasks, emphasize maintaining subject consistency at first.
	- Example:
	> Original: "Change the person's hat"
	> Rewritten: "Replace the man's hat with a dark brown beret; keep smile, short hair, and gray jacket unchanged"
	### 4. Style Transformation or Enhancement Tasks
	- If a style is specified, describe it concisely with key visual traits. For example:
	> Original: "Disco style"
	> Rewritten: "1970s disco: flashing lights, disco ball, mirrored walls, colorful tones"
	- If the instruction says "use reference style" or "keep current style," analyze the input image, extract main features (color, composition, texture, lighting, art style), and integrate them concisely.
	- For coloring tasks, including restoring old photos, always use the fixed template: "Restore old photograph, remove scratches, reduce noise, enhance details, high resolution, realistic, natural skin tones, clear facial features, no distortion, vintage photo restoration"
	- If there are other changes, place the style description at the end.
	## 3. Rationality and Logic Checks
	- Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" should be logically corrected.
	- Add missing key information: if position is unspecified, choose a reasonable area based on composition (near subject, empty space, center/edges).
	# Output Format
	Return only the rewritten instruction text directly, without JSON formatting or any other wrapper.
	'''

	    # The image is not forwarded to the HF endpoint; only text is sent.
	    message_parts = (
	        rewrite_rules,
	        f"User Input: {prompt}",
	        "Rewritten Prompt:",
	    )
	    user_message = "\n\n".join(message_parts)

	    return polish_prompt_hf(user_message, rewrite_rules)


	# Size limit constant; not referenced elsewhere in the visible code.
	MAX_IMAGE_SIZE = 2048

	# Seeds are drawn from, and the seed slider is capped at, the largest
	# signed 32-bit integer.
	_INT32_LIMITS = np.iinfo(np.int32)
	MAX_SEED = _INT32_LIMITS.max

	# --- Helper functions for reuse feature ---
	def clear_result():
	    """Build a Gradio update that resets the target component's value to None."""
	    blank_update = gr.update(value=None)
	    return blank_update

	def use_output_as_input(output_image):
	    """Feed the generated image back in as the next input image.

	    ``output_image`` is the result component's value; element ``[1]`` is
	    used as the new input. When there is no result yet (``None``) a no-op
	    update is returned so the input is left untouched.
	    """
	    if output_image is None:
	        return gr.update()

	    generated = output_image[1]
	    return gr.update(value=generated)

	# Initialize Qwen Image Edit pipeline
	# Scheduler configuration for Lightning
	# Both shift bounds use the same value, so the scheduler's shift is
	# effectively pinned to ln(3).
	_LIGHTNING_SHIFT = math.log(3)

	scheduler_config = dict(
	    base_image_seq_len=256,
	    base_shift=_LIGHTNING_SHIFT,
	    invert_sigmas=False,
	    max_image_seq_len=8192,
	    max_shift=_LIGHTNING_SHIFT,
	    num_train_timesteps=1000,
	    shift=1.0,
	    shift_terminal=None,
	    stochastic_sampling=False,
	    time_shift_type="exponential",
	    use_beta_sigmas=False,
	    use_dynamic_shifting=True,
	    use_exponential_sigmas=False,
	    use_karras_sigmas=False,
	)

	# Initialize scheduler with Lightning config
	scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)

	# Load the base edit model in bfloat16 with the custom scheduler, then
	# move it onto the GPU.
	pipe = QwenImageEditInpaintPipeline.from_pretrained(
	    "Qwen/Qwen-Image-Edit",
	    scheduler=scheduler,
	    torch_dtype=torch.bfloat16,
	)
	pipe = pipe.to("cuda")

	# Load the 8-step Lightning LoRA and fuse it into the pipeline.
	pipe.load_lora_weights(
	    "lightx2v/Qwen-Image-Lightning",
	    weight_name="Qwen-Image-Lightning-8steps-V1.1.safetensors",
	)
	pipe.fuse_lora()

	# pipe.transformer.__class__ = QwenImageTransformer2DModel
	# Swap in the FA3 attention processor on the transformer.
	fa3_processor = QwenDoubleStreamAttnProcessorFA3()
	pipe.transformer.set_attn_processor(fa3_processor)


	# dummy_mask = load_image("https://github.com/Trgtuan10/Image_storage/blob/main/mask_cat.png?raw=true")

	# # --- Ahead-of-time compilation ---
	# optimize_pipeline_(pipe, image=Image.new("RGB", (1328, 1328)), prompt="prompt", mask_image=dummy_mask)

	@spaces.GPU(duration=120)  # presumably a per-call GPU time budget in seconds; confirm
	def infer(edit_images,
	          prompt,
	          negative_prompt="",
	          seed=42,
	          randomize_seed=False,
	          strength=1.0,
	          num_inference_steps=8,
	          true_cfg_scale=1.0,
	          rewrite_prompt=True,
	          progress=gr.Progress(track_tqdm=True)):
	    """Run one inpainting pass over the masked region of the uploaded image.

	    Args:
	        edit_images: Image editor value; ``"background"`` is read as the
	            source image and ``"layers"[0]`` as the mask.
	        prompt: Edit instruction; rewritten via ``polish_prompt`` when
	            ``rewrite_prompt`` is true.
	        negative_prompt: Forwarded to the pipeline unchanged.
	        seed: Seed for the CUDA generator; ignored when ``randomize_seed``.
	        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` instead.
	        strength: Forwarded to the pipeline unchanged.
	        num_inference_steps: Number of denoising steps.
	        true_cfg_scale: Forwarded to the pipeline unchanged.
	        rewrite_prompt: Whether to polish the prompt before inference.
	        progress: Not referenced in the body; declared with
	            ``track_tqdm=True``.

	    Returns:
	        ``([source_image, result_image], seed)`` where ``seed`` is the
	        value actually used.

	    Raises:
	        gr.Error: If no image was uploaded or no mask layer is present.
	    """
	    # Fail with a readable UI message instead of a raw TypeError/IndexError
	    # when the editor is empty or nothing has been drawn.
	    image = edit_images.get("background") if edit_images else None
	    if image is None:
	        raise gr.Error("Please upload an image before running.")

	    layers = edit_images.get("layers") or []
	    if not layers or layers[0] is None:
	        raise gr.Error("Please draw a mask over the area you want to inpaint.")
	    mask = layers[0]

	    if randomize_seed:
	        seed = random.randint(0, MAX_SEED)
	    # Sliders may deliver floats; manual_seed and the step count need ints.
	    seed = int(seed)
	    num_inference_steps = int(num_inference_steps)

	    if rewrite_prompt:
	        prompt = polish_prompt(prompt, image)
	        print(f"Rewritten Prompt: {prompt}")

	    # Generate image using Qwen pipeline
	    result_image = pipe(
	        prompt=prompt,
	        negative_prompt=negative_prompt,
	        image=image,
	        mask_image=mask,
	        strength=strength,
	        num_inference_steps=num_inference_steps,
	        true_cfg_scale=true_cfg_scale,
	        generator=torch.Generator(device="cuda").manual_seed(seed),
	    ).images[0]

	    return [image, result_image], seed

	# Sample edit instructions.
	# NOTE(review): `examples` is not referenced by any component in the
	# visible UI code - confirm whether a gr.Examples block was intended.
	examples = [
	"change the hat to red",
	"make the background a beautiful sunset",
	"replace the object with a flower vase",
	]

	# Stylesheet handed to gr.Blocks(css=css). `#col-container` matches the
	# outer column's elem_id and `#logo-title` matches the div in the header
	# HTML; no element with id `edit_text` appears in the visible code.
	css = """
	#col-container {
	margin: 0 auto;
	max-width: 1024px;
	}
	#logo-title {
	text-align: center;
	}
	#logo-title img {
	width: 400px;
	}
	#edit_text{margin-top: -62px !important}
	"""


	# Gradio UI: image editor + prompt on the left, before/after slider on the
	# right, advanced settings below, then the event wiring.
	# NOTE(review): component nesting was reconstructed from a source whose
	# indentation had been stripped - confirm the layout against the live app.
	with gr.Blocks(css=css) as demo:

	    with gr.Column(elem_id="col-container"):
	        gr.HTML("""
	<div id="logo-title">
	<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png" alt="Qwen-Image Edit Logo" width="400" style="display: block; margin: 0 auto;">
	<h2 style="font-style: italic;color: #5b47d1;margin-top: -27px !important;margin-left: 133px;">Inpaint</h2>
	</div>
	""")
	        gr.Markdown("""

	Inpaint images with Qwen Image Edit. [Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series.

	This demo uses the [Qwen-Image-Lightning](https://huggingface.co/lightx2v/Qwen-Image-Lightning) LoRA with FA3 for accelerated 8-step inference.
	Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image-Edit) to run locally with ComfyUI or diffusers.
	""")
	        with gr.Row():
	            with gr.Column():
	                # Single fixed white brush: the drawn strokes are used as
	                # the inpainting mask.
	                edit_image = gr.ImageEditor(
	                    label='Upload and draw mask for inpainting',
	                    type='pil',
	                    sources=["upload", "webcam"],
	                    image_mode='RGB',
	                    layers=False,
	                    brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"),
	                    height=600
	                )
	                prompt = gr.Text(
	                    label="Prompt",
	                    show_label=False,
	                    max_lines=1,
	                    placeholder="Enter your prompt (e.g., 'change the hat to red')",
	                    container=False,
	                )
	                # Hidden field; its empty default is still passed to infer.
	                negative_prompt = gr.Text(
	                    label="Negative Prompt",
	                    show_label=True,
	                    max_lines=1,
	                    placeholder="Enter what you don't want (optional)",
	                    container=False,
	                    value="",
	                    visible=False
	                )
	                run_button = gr.Button("Run")

	            with gr.Column():
	                result = gr.ImageSlider(label="Result", show_label=False, interactive=False)

	                # Starts hidden; revealed after the first successful run.
	                use_as_input_button = gr.Button("🔄 Use as Input Image", visible=False, variant="secondary")

	        with gr.Accordion("Advanced Settings", open=False):

	            seed = gr.Slider(
	                label="Seed",
	                minimum=0,
	                maximum=MAX_SEED,
	                step=1,
	                value=42,
	            )

	            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

	            with gr.Row():
	                strength = gr.Slider(
	                    label="Strength",
	                    minimum=0.0,
	                    maximum=1.0,
	                    step=0.1,
	                    value=1.0,
	                    info="Controls how much the inpainted region should change"
	                )

	                true_cfg_scale = gr.Slider(
	                    label="True CFG Scale",
	                    minimum=1.0,
	                    maximum=10.0,
	                    step=0.5,
	                    value=1.0,
	                    info="Classifier-free guidance scale"
	                )

	                num_inference_steps = gr.Slider(
	                    label="Number of inference steps",
	                    minimum=1,
	                    maximum=50,
	                    step=1,
	                    value=8,
	                )
	                rewrite_prompt = gr.Checkbox(
	                    label="Enhance prompt (using HF Inference)",
	                    value=True
	                )

	    # Event handlers for reuse functionality
	    use_as_input_button.click(
	        fn=use_output_as_input,
	        inputs=[result],
	        outputs=[edit_image],
	        show_api=False
	    )

	    # Main generation pipeline with result clearing and button visibility
	    gr.on(
	        triggers=[run_button.click, prompt.submit],
	        fn=clear_result,
	        inputs=None,
	        outputs=result,
	        show_api=False
	    ).then(
	        fn=infer,
	        inputs=[edit_image, prompt, negative_prompt, seed, randomize_seed, strength, num_inference_steps, true_cfg_scale, rewrite_prompt],
	        outputs=[result, seed]
	    ).success(
	        # .success (not .then) so the reuse button is only revealed when
	        # infer actually produced a result.
	        fn=lambda: gr.update(visible=True),
	        inputs=None,
	        outputs=use_as_input_button,
	        show_api=False
	    )

	demo.launch()