ideprado committed on
Commit
44d2e01
·
1 Parent(s): 0b36562

Prompt enhancement

Browse files
Files changed (2) hide show
  1. app.py +154 -52
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,7 +1,11 @@
 
1
  import gradio as gr
2
  import numpy as np
3
  import random
4
  import json
 
 
 
5
 
6
  import spaces
7
  import torch
@@ -12,6 +16,16 @@ from diffusers.pipelines.pipeline_loading_utils import LOADABLE_CLASSES, ALL_IMP
12
  LOADABLE_CLASSES["pikigen"] = LOADABLE_CLASSES["pikigen.model"] = {"DiT": ["save_pretrained", "from_pretrained"]}
13
  ALL_IMPORTABLE_CLASSES["DiT"] = ["save_pretrained", "from_pretrained"]
14
 
 
 
 
 
 
 
 
 
 
 
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
  model_repo_id = "Freepik/Pikigen-test"
17
 
@@ -46,17 +60,70 @@ RESOLUTIONS = {
46
  "square": [
47
  {"width": 1216, "height": 1216, "label": "1216×1216"},
48
  {"width": 1024, "height": 1024, "label": "1024×1024"}
49
- ],
50
- "default": {"width": 1024, "height": 1024, "label": "1024×1024"}
51
  }
52
 
 
 
 
53
  # Create flattened options for the dropdown
54
  resolution_options = []
55
  for category, resolutions in RESOLUTIONS.items():
56
- if category != "default":
57
- for res in resolutions:
58
- resolution_options.append(f"{category.capitalize()} - {res['label']}")
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  @spaces.GPU(duration=120)
62
  def infer(
@@ -68,15 +135,26 @@ def infer(
68
  height,
69
  guidance_scale,
70
  num_inference_steps,
 
71
  progress=gr.Progress(track_tqdm=True),
72
  ):
 
 
 
 
 
 
 
 
 
 
73
  if randomize_seed:
74
  seed = random.randint(0, MAX_SEED)
75
 
76
  generator = torch.Generator().manual_seed(seed)
77
 
78
  image = pipe(
79
- prompt=prompt,
80
  negative_prompt=negative_prompt,
81
  guidance_scale=guidance_scale,
82
  num_inference_steps=num_inference_steps,
@@ -84,33 +162,23 @@ def infer(
84
  height=height,
85
  generator=generator,
86
  ).images[0]
87
-
88
- return image, seed
89
-
90
-
91
- def update_resolution(resolution_option):
92
- """Updates width and height based on selected resolution option"""
93
- if not resolution_option:
94
- # Use default resolution
95
- return RESOLUTIONS["default"]["width"], RESOLUTIONS["default"]["height"]
96
 
97
- # Parse the resolution option format: "Category - WidthxHeight"
98
- try:
99
- category, label = resolution_option.split(" - ")
100
- category = category.lower()
101
-
102
- for res in RESOLUTIONS[category]:
103
- if res["label"] == label:
104
- return res["width"], res["height"]
105
- except:
106
- pass
107
-
108
- # Fallback to default
109
- return RESOLUTIONS["default"]["width"], RESOLUTIONS["default"]["height"]
110
 
 
 
 
 
 
 
 
 
111
 
112
  examples = [
113
- "A photorealistic 3D render of a charming, mischievous young boy, approximately eight years old, possessing the endearingly unusual features of long, floppy donkey ears that droop playfully over his shoulders and a surprisingly small, pink pig nose that twitches slightly. His eyes, a sparkling, intelligent hazel, are wide with a hint of playful mischief, framed by slightly unruly, sandy-brown hair that falls in tousled waves across his forehead. He's dressed in a simple, slightly oversized, worn denim shirt and patched-up corduroy trousers, hinting at a life spent playing outdoors. The lighting is soft and natural, casting gentle shadows that highlight the texture of his skin – slightly freckled and sun-kissed, suggesting time spent in the sun. His expression is one of curious anticipation, his lips slightly parted as if hes about to speak or perhaps is listening intently. The background is a subtly blurred pastoral scene, perhaps a sun-dappled meadow with wildflowers, enhancing the overall whimsical and slightly surreal nature of the character. The overall style aims for a blend of realistic rendering with a touch of whimsical cartoonishness, capturing the unique juxtaposition of the boy's human features and his animalistic ears and nose.",
114
  "Two white swans with long necks, gracefully swimming in a still body of water. The swans are positioned in a heart shape, with their necks intertwined, creating a romantic and elegant scene. The water is calm and reflective, reflecting the soft, golden light of the setting sun. The background is a blur of soft, golden hues, suggesting a peaceful and serene environment. The image is likely a photograph, captured with a shallow depth of field, which emphasizes the swans and creates a sense of intimacy. The soft lighting and the gentle curves of the swans create a sense of tranquility and beauty. The overall mood of the image is one of love, peace, and serenity.",
115
  """A watercolor painting of the American flag waving in the wind. The flag is painted in a vibrant red, white, and blue, with the stars in the blue field appearing slightly blurred, creating a sense of motion. The red stripes are painted with a slightly textured brushstroke, giving the flag a realistic and weathered look. The flag is positioned diagonally across the image, with the top left corner extending beyond the frame. The background is a simple white, allowing the flag to be the focal point. Below the flag, in bold red letters, is the text "PRESIDENTS DAY," with "21 FEBRUARY" in blue text above it. Below the text, in black, is "UNITED STATES OF AMERICA." The overall style of the image is patriotic and celebratory, with the watercolor technique adding a touch of artistic flair. The image evokes a sense of pride and national unity, making it a fitting tribute to Presidents Day.""",
116
  "A captivating photo, shot with a shallow depth of field, of a stunning blonde woman with cascading waves of platinum blonde hair that fall past her shoulders, catching the light. Her eyes, a striking shade of emerald green, are intensely focused on something just off-camera, creating a sense of intrigue. Sunlight streams softly onto her face, highlighting the delicate curve of her cheekbones and the subtle freckles scattered across her nose. She's wearing a flowing, bohemian-style maxi dress, the fabric a deep sapphire blue that complements her hair and eyes beautifully. The dress is adorned with intricate embroidery along the neckline and sleeves, adding a touch of elegance. The background is intentionally blurred, suggesting a sun-drenched garden setting with hints of vibrant flowers and lush greenery, drawing the viewer's eye to the woman's captivating features. The overall mood is serene yet captivating, evoking a feeling of summer warmth and quiet contemplation. The image should have a natural, slightly ethereal quality, with soft, diffused lighting that enhances her beauty without harsh shadows.",
@@ -119,35 +187,71 @@ examples = [
119
  css = """
120
  #col-container {
121
  margin: 0 auto;
122
- max-width: 640px;
 
 
 
 
123
  }
124
  """
125
 
126
- with gr.Blocks(css=css) as demo:
127
  with gr.Column(elem_id="col-container"):
128
  gr.Markdown(" # F-lite Text-to-Image Demo")
129
 
130
- with gr.Row():
131
  prompt = gr.Text(
132
  label="Prompt",
133
  show_label=False,
134
  max_lines=1,
135
  placeholder="Enter your prompt",
136
  container=False,
 
 
 
 
 
 
 
 
 
137
  )
138
 
139
- run_button = gr.Button("Run", scale=0, variant="primary")
140
 
141
  result = gr.Image(label="Result", show_label=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  with gr.Accordion("Advanced Settings", open=False):
 
 
 
 
 
 
 
 
 
144
  with gr.Tabs() as resolution_tabs:
145
  with gr.TabItem("Preset Resolutions"):
146
  resolution_dropdown = gr.Dropdown(
147
- label="Select Resolution",
148
- choices=[""] + resolution_options,
149
- value="",
150
- allow_custom_value=False,
151
  )
152
 
153
  with gr.TabItem("Custom Resolution"):
@@ -157,7 +261,7 @@ with gr.Blocks(css=css) as demo:
157
  minimum=256,
158
  maximum=MAX_IMAGE_SIZE,
159
  step=32,
160
- value=RESOLUTIONS["default"]["width"],
161
  )
162
 
163
  height = gr.Slider(
@@ -165,25 +269,18 @@ with gr.Blocks(css=css) as demo:
165
  minimum=256,
166
  maximum=MAX_IMAGE_SIZE,
167
  step=32,
168
- value=RESOLUTIONS["default"]["height"],
169
  )
170
 
171
- negative_prompt = gr.Text(
172
- label="Negative prompt",
173
- max_lines=1,
174
- placeholder="Enter a negative prompt",
175
- visible=True,
176
- )
177
-
178
  seed = gr.Slider(
179
  label="Seed",
180
  minimum=0,
181
  maximum=MAX_SEED,
182
  step=1,
183
- value=0,
184
  )
185
 
186
- randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
187
 
188
  with gr.Row():
189
  guidance_scale = gr.Slider(
@@ -203,15 +300,18 @@ with gr.Blocks(css=css) as demo:
203
  )
204
 
205
  # Examples should explicitly target only the prompt input
206
- gr.Examples(examples=examples, inputs=prompt, example_labels=[ex[:120] + "..." if len(ex) > 120 else ex for ex in examples])
 
207
 
208
- # Update width and height when resolution is selected from dropdown
209
  resolution_dropdown.change(
210
  fn=update_resolution,
211
- inputs=[resolution_dropdown],
212
  outputs=[width, height]
213
  )
214
 
 
 
215
  gr.on(
216
  triggers=[run_button.click, prompt.submit],
217
  fn=infer,
@@ -224,8 +324,10 @@ with gr.Blocks(css=css) as demo:
224
  height,
225
  guidance_scale,
226
  num_inference_steps,
 
227
  ],
228
- outputs=[result, seed],
 
229
  )
230
 
231
  if __name__ == "__main__":
 
1
+ from dotenv import load_dotenv
2
  import gradio as gr
3
  import numpy as np
4
  import random
5
  import json
6
+ import os
7
+ import logging
8
+ import google.generativeai as genai
9
 
10
  import spaces
11
  import torch
 
16
  LOADABLE_CLASSES["pikigen"] = LOADABLE_CLASSES["pikigen.model"] = {"DiT": ["save_pretrained", "from_pretrained"]}
17
  ALL_IMPORTABLE_CLASSES["DiT"] = ["save_pretrained", "from_pretrained"]
18
 
19
+ load_dotenv()
20
+
21
+ # Initialize Gemini API if API key is available
22
+ if os.getenv("GEMINI_API_KEY"):
23
+ gemini_available = True
24
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
25
+ else:
26
+ gemini_available = False
27
+ logging.warning("GEMINI_API_KEY not found in environment variables. Prompt enrichment will not work.")
28
+
29
  device = "cuda" if torch.cuda.is_available() else "cpu"
30
  model_repo_id = "Freepik/Pikigen-test"
31
 
 
60
  "square": [
61
  {"width": 1216, "height": 1216, "label": "1216×1216"},
62
  {"width": 1024, "height": 1024, "label": "1024×1024"}
63
+ ]
 
64
  }
65
 
66
+ # Default resolution
67
+ DEFAULT_RESOLUTION = {"width": 1024, "height": 1024, "label": "1024×1024"}
68
+
69
  # Create flattened options for the dropdown
70
  resolution_options = []
71
  for category, resolutions in RESOLUTIONS.items():
72
+ resolution_options.append([f"{category.capitalize()}", None]) # Category header
73
+ for res in resolutions:
74
+ resolution_options.append([f" {res['label']}", f"{category}:{res['width']}:{res['height']}"])
75
 
76
def enrich_prompt_with_gemini(prompt, max_tokens=1024):
    """
    Expand a short image prompt into a detailed description using Gemini.

    Args:
        prompt: The original prompt to enrich.
        max_tokens: Maximum number of output tokens requested from the model.

    Returns:
        tuple: ``(enriched_prompt, error_message)``. On success the error is
        ``None``; on any failure the enriched prompt is ``None`` and the
        error message describes what went wrong. This function never raises.
    """
    try:
        if not os.getenv("GEMINI_API_KEY"):
            return None, "GEMINI_API_KEY not found in environment variables. Please add it to your .env file."

        # Nothing to enrich: skip the API call rather than letting the model
        # invent a description for an empty prompt.
        if not prompt or not prompt.strip():
            return None, "Cannot enrich an empty prompt."

        model = genai.GenerativeModel('gemini-1.5-flash')

        enrichment_prompt = f"""
        You are a prompt enhancer for image generation.
        Take the following basic prompt and make it longer, very descriptive, and detailed.
        Write the description in a paragraph, avoiding bullet points.

        Original prompt: {prompt}

        Enhanced prompt:
        """

        response = model.generate_content(enrichment_prompt, generation_config={
            "max_output_tokens": max_tokens,
            "temperature": 1,
        })

        # NOTE(review): accessing response.text may itself raise when the model
        # returned no usable text (e.g. a blocked response) - confirm against
        # the SDK docs; either way the handler below reports it.
        enriched_prompt = response.text.strip()
        if not enriched_prompt:
            # Report an empty reply explicitly instead of returning ("", None),
            # which callers cannot distinguish from "nothing happened".
            return None, "Gemini returned an empty response."
        return enriched_prompt, None

    except Exception as e:
        # Enrichment is best-effort: report the failure to the caller instead
        # of propagating it, but keep the traceback in the logs.
        error_message = f"Error enriching prompt: {str(e)}"
        logging.exception(error_message)
        return None, error_message
115
+
116
+ # Function to update width and height based on selected resolution
117
def update_resolution(resolution_value):
    """Translate a preset-resolution dropdown value into slider dimensions.

    Args:
        resolution_value: Dropdown value encoded as ``"category:width:height"``
            (for example ``"square:1024:1024"``). Falsy values such as ``None``
            or ``""`` select the default resolution.

    Returns:
        tuple: ``(width, height)`` as integers. ``DEFAULT_RESOLUTION`` is used
        when the value is missing or cannot be parsed.
    """
    if not resolution_value:
        return DEFAULT_RESOLUTION["width"], DEFAULT_RESOLUTION["height"]

    try:
        # The category prefix is only part of the encoding; it is not needed here.
        _category, width, height = resolution_value.split(":")
        return int(width), int(height)
    except (AttributeError, TypeError, ValueError):
        # Malformed value (not a string, wrong number of fields, or non-numeric
        # dimensions): fall back to the default. Only parsing errors are caught
        # so that KeyboardInterrupt/SystemExit are not swallowed.
        return DEFAULT_RESOLUTION["width"], DEFAULT_RESOLUTION["height"]
127
 
128
  @spaces.GPU(duration=120)
129
  def infer(
 
135
  height,
136
  guidance_scale,
137
  num_inference_steps,
138
+ use_prompt_enrichment,
139
  progress=gr.Progress(track_tqdm=True),
140
  ):
141
+ enriched_prompt_str = None
142
+ error_message_str = None
143
+ generation_prompt = prompt # Default to original prompt
144
+
145
+ if use_prompt_enrichment and gemini_available:
146
+ enriched_prompt_str, error_message_str = enrich_prompt_with_gemini(prompt)
147
+ if enriched_prompt_str:
148
+ generation_prompt = enriched_prompt_str # Use enriched prompt if successful
149
+ # If enrichment fails, generation_prompt remains the original prompt
150
+
151
  if randomize_seed:
152
  seed = random.randint(0, MAX_SEED)
153
 
154
  generator = torch.Generator().manual_seed(seed)
155
 
156
  image = pipe(
157
+ prompt=generation_prompt,
158
  negative_prompt=negative_prompt,
159
  guidance_scale=guidance_scale,
160
  num_inference_steps=num_inference_steps,
 
162
  height=height,
163
  generator=generator,
164
  ).images[0]
 
 
 
 
 
 
 
 
 
165
 
166
+ # Prepare Gradio updates for the enriched prompt display
167
+ enriched_prompt_display_update = gr.update(visible=False)
168
+ enriched_prompt_text_update = gr.update(value="")
169
+ enrichment_error_update = gr.update(visible=False, value="")
 
 
 
 
 
 
 
 
 
170
 
171
+ if enriched_prompt_str:
172
+ enriched_prompt_display_update = gr.update(visible=True)
173
+ enriched_prompt_text_update = gr.update(value=enriched_prompt_str)
174
+ elif error_message_str:
175
+ enriched_prompt_display_update = gr.update(visible=True)
176
+ enrichment_error_update = gr.update(visible=True, value=error_message_str)
177
+
178
+ return image, seed, enriched_prompt_display_update, enriched_prompt_text_update, enrichment_error_update
179
 
180
  examples = [
181
+ "A photorealistic 3D render of a charming, mischievous young boy, approximately eight years old, possessing the endearingly unusual features of long, floppy donkey ears that droop playfully over his shoulders and a surprisingly small, pink pig nose that twitches slightly. His eyes, a sparkling, intelligent hazel, are wide with a hint of playful mischief, framed by slightly unruly, sandy-brown hair that falls in tousled waves across his forehead. He's dressed in a simple, slightly oversized, worn denim shirt and patched-up corduroy trousers, hinting at a life spent playing outdoors. The lighting is soft and natural, casting gentle shadows that highlight the texture of his skin – slightly freckled and sun-kissed, suggesting time spent in the sun. His expression is one of curious anticipation, his lips slightly parted as if he's about to speak or perhaps is listening intently. The background is a subtly blurred pastoral scene, perhaps a sun-dappled meadow with wildflowers, enhancing the overall whimsical and slightly surreal nature of the character. The overall style aims for a blend of realistic rendering with a touch of whimsical cartoonishness, capturing the unique juxtaposition of the boy's human features and his animalistic ears and nose.",
182
  "Two white swans with long necks, gracefully swimming in a still body of water. The swans are positioned in a heart shape, with their necks intertwined, creating a romantic and elegant scene. The water is calm and reflective, reflecting the soft, golden light of the setting sun. The background is a blur of soft, golden hues, suggesting a peaceful and serene environment. The image is likely a photograph, captured with a shallow depth of field, which emphasizes the swans and creates a sense of intimacy. The soft lighting and the gentle curves of the swans create a sense of tranquility and beauty. The overall mood of the image is one of love, peace, and serenity.",
183
  """A watercolor painting of the American flag waving in the wind. The flag is painted in a vibrant red, white, and blue, with the stars in the blue field appearing slightly blurred, creating a sense of motion. The red stripes are painted with a slightly textured brushstroke, giving the flag a realistic and weathered look. The flag is positioned diagonally across the image, with the top left corner extending beyond the frame. The background is a simple white, allowing the flag to be the focal point. Below the flag, in bold red letters, is the text "PRESIDENTS DAY," with "21 FEBRUARY" in blue text above it. Below the text, in black, is "UNITED STATES OF AMERICA." The overall style of the image is patriotic and celebratory, with the watercolor technique adding a touch of artistic flair. The image evokes a sense of pride and national unity, making it a fitting tribute to Presidents Day.""",
184
  "A captivating photo, shot with a shallow depth of field, of a stunning blonde woman with cascading waves of platinum blonde hair that fall past her shoulders, catching the light. Her eyes, a striking shade of emerald green, are intensely focused on something just off-camera, creating a sense of intrigue. Sunlight streams softly onto her face, highlighting the delicate curve of her cheekbones and the subtle freckles scattered across her nose. She's wearing a flowing, bohemian-style maxi dress, the fabric a deep sapphire blue that complements her hair and eyes beautifully. The dress is adorned with intricate embroidery along the neckline and sleeves, adding a touch of elegance. The background is intentionally blurred, suggesting a sun-drenched garden setting with hints of vibrant flowers and lush greenery, drawing the viewer's eye to the woman's captivating features. The overall mood is serene yet captivating, evoking a feeling of summer warmth and quiet contemplation. The image should have a natural, slightly ethereal quality, with soft, diffused lighting that enhances her beauty without harsh shadows.",
 
187
  css = """
188
  #col-container {
189
  margin: 0 auto;
190
+ max-width: 1024px;
191
+ }
192
+ .prompt-row > .gr-form {
193
+ gap: 0.5rem !important; /* Reduce gap between checkbox and button */
194
+ align-items: center; /* Align items vertically */
195
  }
196
  """
197
 
198
+ with gr.Blocks(css=css, theme="ParityError/Interstellar") as demo:
199
  with gr.Column(elem_id="col-container"):
200
  gr.Markdown(" # F-lite Text-to-Image Demo")
201
 
202
+ with gr.Row(elem_classes="prompt-row"):
203
  prompt = gr.Text(
204
  label="Prompt",
205
  show_label=False,
206
  max_lines=1,
207
  placeholder="Enter your prompt",
208
  container=False,
209
+ scale=6 # Give prompt more space
210
+ )
211
+
212
+ use_prompt_enrichment = gr.Checkbox(
213
+ label="Enrich",
214
+ value=True if gemini_available else False,
215
+ visible=gemini_available, # Hide checkbox if Gemini not available
216
+ scale=1, # Give checkbox some space
217
+ min_width=100 # Ensure label isn't cut off
218
  )
219
 
220
+ run_button = gr.Button("Run", scale=1, variant="primary", min_width=100)
221
 
222
  result = gr.Image(label="Result", show_label=False)
223
+
224
+ # Enriched prompt display (outside Advanced Settings)
225
+ enriched_prompt_display = gr.Accordion("Enriched Prompt", open=False, visible=False)
226
+ with enriched_prompt_display:
227
+ enriched_prompt_text = gr.Textbox(
228
+ label="Enriched Prompt",
229
+ interactive=False,
230
+ lines=8
231
+ )
232
+ enrichment_error = gr.Textbox(
233
+ label="Error",
234
+ visible=False,
235
+ interactive=False,
236
+ )
237
 
238
  with gr.Accordion("Advanced Settings", open=False):
239
+ negative_prompt = gr.Text(
240
+ label="Negative prompt",
241
+ max_lines=1,
242
+ placeholder="Enter a negative prompt",
243
+ visible=True,
244
+ )
245
+
246
+ # Removed checkbox and enriched prompt display from here
247
+
248
  with gr.Tabs() as resolution_tabs:
249
  with gr.TabItem("Preset Resolutions"):
250
  resolution_dropdown = gr.Dropdown(
251
+ label="Resolution",
252
+ choices=resolution_options,
253
+ value="square:1024:1024",
254
+ type="value"
255
  )
256
 
257
  with gr.TabItem("Custom Resolution"):
 
261
  minimum=256,
262
  maximum=MAX_IMAGE_SIZE,
263
  step=32,
264
+ value=DEFAULT_RESOLUTION["width"],
265
  )
266
 
267
  height = gr.Slider(
 
269
  minimum=256,
270
  maximum=MAX_IMAGE_SIZE,
271
  step=32,
272
+ value=DEFAULT_RESOLUTION["height"],
273
  )
274
 
 
 
 
 
 
 
 
275
  seed = gr.Slider(
276
  label="Seed",
277
  minimum=0,
278
  maximum=MAX_SEED,
279
  step=1,
280
+ value=42,
281
  )
282
 
283
+ randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
284
 
285
  with gr.Row():
286
  guidance_scale = gr.Slider(
 
300
  )
301
 
302
  # Examples should explicitly target only the prompt input
303
+ max_length = 180
304
+ gr.Examples(examples=examples, inputs=prompt, example_labels=[ex[:max_length] + "..." if len(ex) > max_length else ex for ex in examples])
305
 
306
+ # Update width and height when resolution dropdown changes
307
  resolution_dropdown.change(
308
  fn=update_resolution,
309
+ inputs=resolution_dropdown,
310
  outputs=[width, height]
311
  )
312
 
313
+ # Removed separate function and gr.on for enriched prompt display update
314
+
315
  gr.on(
316
  triggers=[run_button.click, prompt.submit],
317
  fn=infer,
 
324
  height,
325
  guidance_scale,
326
  num_inference_steps,
327
+ use_prompt_enrichment,
328
  ],
329
+ # Outputs now include updates for the enriched prompt display components
330
+ outputs=[result, seed, enriched_prompt_display, enriched_prompt_text, enrichment_error],
331
  )
332
 
333
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -6,4 +6,6 @@ protobuf
6
  rich
7
  sentencepiece
8
  torch
9
- transformers
 
 
 
6
  rich
7
  sentencepiece
8
  torch
9
+ transformers
10
+ google-generativeai
11
+ python-dotenv