Spaces:

ShinnosukeU
/

diffusion-prompt-generator

Sleeping

@@ -1,12 +1,6 @@
 ---
-title: Diffusion Prompt Generator
-emoji: 📈
-colorFrom: indigo
-colorTo: blue
 sdk: gradio
 sdk_version: 5.45.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: diffusion-prompt-generator
+app_file: app.py
 sdk: gradio
 sdk_version: 5.45.0
 ---

app.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import base64
+from io import BytesIO
+from textwrap import dedent
+import gradio as gr
+import jinja2
+from openai import OpenAI
+client = OpenAI()
+GENERAL_PROMPT_TEMPLATE = jinja2.Template("""You are an expert prompt engineer for cinematic-style image generation.
+Transform the user's simple prompt into a highly descriptive paragraph that produces a visually striking image. The photo of the user will be provided to you, so you should use it to infer the subject's appearance and incorporate accurate descriptors.
+Emphasizing dynamic and engaging editorial posing. Integrate secondary subjects, environmental elements, and leading lines naturally into the scene to direct attention toward the main subject—examples like architecture, nature, lighting, or abstract forms can inspire but do not need to be used literally. Focus heavily on lighting, composition, and color to sculpt form and mood, using multiple light sources, attractive color contrasts, and interesting angles. Choose the artistic style, color grading, and atmosphere that best enhance the subject and context of the prompt, creating a cohesive and visually compelling image without being constrained to any particular existing style. Use a photorealistic style. Make sure that the background is very cool and suits the prompt. Make sure that the prompt is very aesthetic, creative and vivid.
+Tips:
+- Make sure prompt is not too long.
+- Only include facial features of the subject in the prompt from the photo. Ignore the background or the clothes of the subject in the photo.
+- Use dynamic camera angles and poses if appropriate.
+Examples:
+Input: A photo of me in a race bib
+Input photo: Black man
+Output prompt: A stylized, cinematic portrait of a Black man captured from the chest up, set against a
+glowing deep red background. The image is tightly framed in vertical format, emphasizing his
+upper torso, neck, and face in moody, directional light. He wears a torn black tank top with
+rugged edges and a marathon race bib pinned to the front. Around his neck hangs a thin silver chain. His hair is
+styled in tight braids, and he wears futuristic wraparound sunglasses in metallic blue, engraved across the lens — subtly visible in the reflections. The lighting is
+soft but focused, casting strong shadow contours along his collarbone and highlighting the
+reflective elements of both glasses and sweat on his skin. The mood is intense and editorial
+— a blend of raw athleticism and streetwear elegance, evoking focus, style, and subtle
+rebellion. The torn shirt and race bib hint at exertion and context, while the engraved
+eyewear and red glow turn the portrait into a branded fashion statement.
+Why the output is good:
+- The detailed styling (torn tank top, race bib, metallic sunglasses)
+- Specific lighting directions (soft but focused, shadow contours) shape the mood.
+Input: A photo of me in a pool
+Input photo: A muscular man
+Output prompt: A top-down editorial photo of a muscular man falling off a bright pink inflatable pool float,
+mid-fall with his body twisting toward the water. He wears black swim shorts and silver
+Oakley sunglasses. His arms are flailing slightly, and water droplets hang frozen in the air
+around him, hit by harsh flash. The float is distorted by motion, and splash trails from his legs
+as they hit the surface. The pool is a sunlit turquoise, with subtle tile reflection and lens
+specks near the corners. There's bloom from the water highlights, and the entire shot has an
+analog, fashion-campaign feel with no visible grain. Use a Photorealistic Style. Resolution
+1792x1024. Fisheye! Motion blur
+Why the output is good:
+- Unique perspective (top-down) combined with dynamic action (falling off,
+mid-fall, twisting, flailing).
+- Specifies analog, fashion-campaign feel but requests no visible grain, guiding the texture.
+- Adding Fisheye and Motion blur at the end reinforces these key elements.
+Input: A photo of me as Batman
+Input photo: Asian man
+Portrait of asian man as Batman in the style of Rembrandt black and white, chiaroscuro lighting, deep shadows, and luminous highlights. His face emerges from darkness, one eye catching a sliver of light, the other lost in shadow. The cowl is rendered like aged leather, with thick, textured brushstrokes and visible impasto. The Batsymbol is faint, almost erased, as if worn by time. Background: void of form, only grain and darkness. Style: baroque oil painting translated to monochrome — dramatic, emotional
+Why the output is good:
+- The overall style fits the theme of the Batman.
+HERE is the user's prompt:
+{{ user_prompt }}
+""")
+def process_prompt(image, target_label, user_prompt, style):
+    image_url = None
+    buffer = BytesIO()
+    image.convert("RGB").save(buffer, format="JPEG", quality=90)
+    b64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
+    image_url = f"data:image/jpeg;base64,{b64_image}"
+    if style == "Chromatic Cinematic":
+        system_content = """You are an expert prompt engineer for chromatic cinematic-style image generation. Transform the user's simple prompt into a highly descriptive paragraph that produces a visually striking image with strong contrast and aesthetic color grading such as Wes Anderson. Frame close to the camera so the subject is immediately recognizable, emphasizing dynamic and exaggerated editorial posing. Integrate secondary subjects, environmental elements, and leading lines naturally into the scene to direct attention toward the main subject—examples like architectural beams, diagonal staircases, waves, or shadows can inspire but do not need to be used literally. Focus heavily on lighting to sculpt the form and mood, using two lighting sources from different directions, attractive color combinations, and interesting lighting angles (e.g., dramatic diagonal or overhead from the top-left corner). When referencing a style like Wes Anderson, describe the scene, composition, or color grading (e.g., bold symmetry, saturated pastels) without simply copying his visuals. Use a photorealistic style. Resolution 1792x1024."""
+        user_content = (
+            f"Use the uploaded image to infer the subject's appearance attribtues. Instead of refercing pronouns in the prompt (i.e. me/she siting on a chair), use the attributes to describe the subjet (i.e. the man with the glasses sitting on the chair). "
+            f"Then transform this prompt into a detailed chromatic cinematic style description: User's prompt: {user_prompt}"
+        )
+    elif style == "Film Noir":
+        system_content = "You are an expert prompt engineer for cinematic-style image generation in the film noir aesthetic. Transform the user's simple prompt into a highly descriptive paragraph that produces a visually striking image with high contrast, deep shadows, and moody lighting characteristic of classic noir. Frame close to the camera so the subject is immediately recognizable, emphasizing tense, dramatic, or expressive editorial posing. Integrate secondary subjects, environmental elements, and leading lines naturally into the scene to direct attention toward the main subject—examples like rain-slicked streets, lampposts casting long shadows, Venetian blinds, or fog can inspire but do not need to be used literally. Focus heavily on lighting to sculpt form and mood, using harsh key lights, soft fill lights, and strong directional shadows to create tension and depth. When referencing a style like film noir, describe the scene, composition, or tonal contrasts (e.g., stark black-and-white contrasts, smoky atmospheres, reflective wet surfaces) without simply copying existing visuals. Use a photorealistic style. Resolution 1792x1024."
+        user_content = (
+            "Use the uploaded image to infer the subject's appearance and incorporate accurate descriptors. "
+            f"User's prompt: {user_prompt}"
+        )
+    elif style == "General":
+        system_content = "You are expert prompt engineer"
+        user_content = GENERAL_PROMPT_TEMPLATE.render(user_prompt=user_prompt)
+    response = client.responses.create(
+        model="gpt-5",
+        reasoning={"effort": "low"},
+        input=[
+            {
+                "role": "system",
+                "content": system_content
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "input_text", "text": user_content},
+                    {"type": "input_image", "image_url": image_url}
+                ]
+            }
+        ],
+    )
+    return f"{response.output_text} {target_label.strip()}"
+demo = gr.Interface(
+    fn=process_prompt,
+    inputs=[
+        gr.Image(
+            label="Upload reference image",
+            type="pil",
+        ),
+        gr.Textbox(
+            label="Enter target label",
+            placeholder="SMRA",
+        ),
+        gr.Textbox(
+            label="Enter your prompt",
+            placeholder="picture of me while sitting in a chair in the ocean",
+        ),
+        gr.Dropdown(
+            choices=["General"],
+            #choices=["Chromatic Cinematic", "Neon Noir", "General"],
+            label="Style Selection",
+            info="Choose the visual style for your enhanced prompt"
+        ),
+    ],
+    outputs=gr.Textbox(
+        label="Style Prompt",
+        lines=20,
+    ),
+)
+demo.launch()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,11 @@

+[project]
+name = "prompt-aesthics"
+version = "0.1.0"
+description = "Gradio app that turns a reference photo and a short prompt into a detailed image-generation prompt"
+readme = "README.md"
+requires-python = ">=3.12.7"
+dependencies = [
+    "gradio>=5.45.0",
+    "jinja2>=3.1.6",
+    "openai>=1.107.1",
+]

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff