Spaces:

CR7CAD
/

Assignment1

Sleeping

App Files Files Community

CR7CAD committed on Mar 9

Commit

1a8c2bf

verified ·

1 Parent(s): 4f45e40

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -21

app.py CHANGED Viewed

@@ -5,7 +5,6 @@ from PIL import Image
 import os
 import tempfile
 from gtts import gTTS
-import io
 # Preload and cache all models at app startup
 @st.cache_resource
@@ -13,23 +12,41 @@ def load_models():
     """Load all models and cache them for faster execution"""
     models = {
         "image_captioner": pipeline("image-to-text", model="sooh-j/blip-image-captioning-base"),
-        "story_generator": pipeline("text-generation", model="Qwen/Qwen2.5-1.5B-Instruct")
     }
     return models
 # Simple image-to-text function using cached model
 @st.cache_data
-def img2text(image, _models):
-    """Convert image to text with caching"""
-    result = _models["image_captioner"](image)
     return result[0]["generated_text"]
 @st.cache_data
 def text2story(caption, _models):
-    """Generate a short story from image caption"""
     story_generator = _models["story_generator"]
-    # Format prompt
     prompt = f"""<|system|>
 You are a creative short story writer. Write a brief, engaging story that expands on the given image caption.
 The story should be under 100 words and have a natural beginning, middle, and end.
@@ -38,19 +55,23 @@ Image caption: "{caption}"
 Create a short story that expands on this image caption and brings it to life.
 <|assistant|>"""
-    # Generate story
     response = story_generator(
         prompt,
-        max_new_tokens=100,
         do_sample=True,
-        temperature=0.7,
-        top_p=0.9,
-        repetition_penalty=1.2,
         eos_token_id=story_generator.tokenizer.eos_token_id
     )
-    # Extract just the assistant's response
-    story_text = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
     return story_text
 # Text-to-speech function
@@ -70,8 +91,9 @@ def text2audio(story_text):
     os.unlink(temp_filename)
     return audio_bytes
-# Load models at startup
 models = load_models()
 # Streamlit app interface
 st.title("Image to Audio Story")
@@ -80,23 +102,26 @@ st.title("Image to Audio Story")
 uploaded_file = st.file_uploader("Upload an image")
 if uploaded_file is not None:
-    # Display image at a smaller size (200px width instead of 300px)
     image = Image.open(uploaded_file)
-    st.image(image, caption="Uploaded Image", width=200)
     # Process image
     with st.spinner("Processing..."):
-        # Generate caption directly from the image (no need to convert to bytes)
-        caption = img2text(image, models)
         st.write(f"**Caption:** {caption}")
-        # Generate story
         story = text2story(caption, models)
         word_count = len(story.split())
         st.write(f"**Story ({word_count} words):**")
         st.write(story)
-        # Generate audio
         if 'audio' not in st.session_state:
             st.session_state.audio = text2audio(story)

 import os
 import tempfile
 from gtts import gTTS
 # Preload and cache all models at app startup
 @st.cache_resource
     """Load all models and cache them for faster execution"""
     models = {
         "image_captioner": pipeline("image-to-text", model="sooh-j/blip-image-captioning-base"),
+        "story_generator": pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
     }
     return models
+# Convert PIL Image to bytes for caching compatibility
+def get_image_bytes(pil_img):
+    """Convert PIL image to bytes for hashing"""
+    import io
+    buf = io.BytesIO()
+    pil_img.save(buf, format='JPEG')
+    return buf.getvalue()
 # Simple image-to-text function using cached model
 @st.cache_data
+def img2text(image_bytes, _models):
+    """Convert image to text with caching - using underscore for unhashable arg"""
+    import io
+    pil_img = Image.open(io.BytesIO(image_bytes))
+    result = _models["image_captioner"](pil_img)
     return result[0]["generated_text"]
 @st.cache_data
 def text2story(caption, _models):
+    """Generate a short story from image caption.
+    Args:
+        caption: Caption describing the image
+        _models: Dictionary containing loaded models
+    Returns:
+        A generated story that expands on the image caption
+    """
     story_generator = _models["story_generator"]
+    # Format prompt to ensure the story expands on the image caption
     prompt = f"""<|system|>
 You are a creative short story writer. Write a brief, engaging story that expands on the given image caption.
 The story should be under 100 words and have a natural beginning, middle, and end.
 Create a short story that expands on this image caption and brings it to life.
 <|assistant|>"""
+    # Generate story with parameters tuned for brevity and coherence
     response = story_generator(
         prompt,
+        max_new_tokens=100,  # Allow enough tokens for a complete story
         do_sample=True,
+        temperature=0.7,     # Balanced creativity
+        top_p=0.9,           # Focus on more likely tokens
+        repetition_penalty=1.2,  # Avoid repetitive patterns
         eos_token_id=story_generator.tokenizer.eos_token_id
     )
+    # Extract just the generated story text
+    raw_story = response[0]['generated_text']
+    # Parse out just the assistant's response from the conversation format
+    story_text = raw_story.split("<|assistant|>")[-1].strip()
     return story_text
 # Text-to-speech function
     os.unlink(temp_filename)
     return audio_bytes
+# Load models at startup - this happens before the app interface is displayed
 models = load_models()
+st.write("✅ Models loaded and cached!")
 # Streamlit app interface
 st.title("Image to Audio Story")
 uploaded_file = st.file_uploader("Upload an image")
 if uploaded_file is not None:
+    # Display image
     image = Image.open(uploaded_file)
+    st.image(image, caption="Uploaded Image", width=300)
     # Process image
     with st.spinner("Processing..."):
+        # Convert to bytes for caching
+        image_bytes = get_image_bytes(image)
+        # Generate caption
+        caption = img2text(image_bytes, models)
         st.write(f"**Caption:** {caption}")
+        # Generate story that expands on the caption
         story = text2story(caption, models)
         word_count = len(story.split())
         st.write(f"**Story ({word_count} words):**")
         st.write(story)
+        # Pre-generate audio
         if 'audio' not in st.session_state:
             st.session_state.audio = text2audio(story)