Spaces:

CR7CAD
/

Assignment1

Sleeping

App Files Files Community

CR7CAD committed on Mar 9

Commit

4e987e0

verified ·

1 Parent(s): 9287e9e

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -165

app.py CHANGED Viewed

@@ -1,86 +1,22 @@
-# Imports
 import streamlit as st
 from transformers import pipeline
 from PIL import Image
-import torch
 import os
 import tempfile
-import time
-import numpy as np
-# Use Streamlit's caching mechanisms to optimize model loading
 @st.cache_resource
-def load_image_to_text_pipeline():
-    """Load and cache the image-to-text model"""
-    return pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
-@st.cache_resource
-def load_text_generation_pipeline():
-    """Load and cache the text generation model"""
-    return pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-@st.cache_resource
-def load_tts_pipeline():
-    """Load and cache the text-to-speech pipeline as fallback"""
-    try:
-        return pipeline("text-to-speech", model="facebook/mms-tts-eng")
-    except:
-        # Return None if loading fails
-        return None
-# Initialize all models at app startup
-with st.spinner("Loading models (this may take a moment the first time)..."):
-    # Load all models at startup and cache them
-    img2text_model = load_image_to_text_pipeline()
-    story_generator_model = load_text_generation_pipeline()
-    tts_fallback_model = load_tts_pipeline()
-# For TTS, try multiple options in order of preference
-try:
-    # Try importing gTTS
-    from gtts import gTTS
-    has_gtts = True
-except ImportError:
-    has_gtts = False
-    if tts_fallback_model is None:
-        st.warning("No text-to-speech capability available. Audio generation will be disabled.")
-# Cache the text-to-audio conversion
-@st.cache_data
-def text2audio(story_text):
-    """Convert text to audio with caching to avoid regenerating the same audio"""
-    if has_gtts:
-        # Use gTTS
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
-        temp_filename = temp_file.name
-        temp_file.close()
-        # Use gTTS to convert text to speech
-        tts = gTTS(text=story_text, lang='en', slow=False)
-        tts.save(temp_filename)
-        # Read the audio file
-        with open(temp_filename, 'rb') as audio_file:
-            audio_bytes = audio_file.read()
-        # Clean up the temporary file
-        os.unlink(temp_filename)
-        return audio_bytes, 'audio/mp3'
-    elif tts_fallback_model is not None:
-        # Use transformers TTS
-        speech = tts_fallback_model(story_text)
-        # Return the audio data
-        if 'audio' in speech:
-            return speech['audio'], speech.get('sampling_rate', 16000)
-        elif 'audio_array' in speech:
-            return speech['audio_array'], speech.get('sampling_rate', 16000)
-    # If we got here, no TTS method worked
-    raise Exception("No text-to-speech capability available")
-# Convert PIL Image to bytes for hashing in cache
 def get_image_bytes(pil_img):
     """Convert PIL image to bytes for hashing"""
     import io
@@ -90,29 +26,21 @@ def get_image_bytes(pil_img):
 # Simple image-to-text function using cached model
 @st.cache_data
-def img2text(image_bytes):
-    """Convert image to text with caching - using bytes for caching compatibility"""
-    # Convert bytes back to PIL image for processing
     import io
-    from PIL import Image
     pil_img = Image.open(io.BytesIO(image_bytes))
-    # Process with the model
-    result = img2text_model(pil_img)
     return result[0]["generated_text"]
-# Helper function to count words
-def count_words(text):
-    return len(text.split())
-# Improved text-to-story function without "Once upon a time" constraint
 @st.cache_data
-def text2story(text):
-    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
     prompt = f"Write a short children's story based on this: {text}. The story should have a clear beginning, middle, and end. Keep it under 150 words. "
     # Generate a longer text to ensure we get a complete story
-    story_result = generator(
         prompt,
         max_length=300,
         num_return_sequences=1,
@@ -148,89 +76,57 @@ def text2story(text):
     # If no good ending is found, return as is
     return story_text
-# Function to reset progress when a new file is uploaded
-def reset_progress():
-    st.session_state.progress = {
-        'caption_generated': False,
-        'story_generated': False,
-        'audio_generated': False,
-        'caption': '',
-        'story': '',
-        'audio_data': None,
-        'audio_format': None
-    }
-# Basic Streamlit interface
-st.title("Image to Audio Story")
-# Add processing status indicator
-status_container = st.empty()
-# Initialize session state for tracking progress
-if 'progress' not in st.session_state:
-    st.session_state.progress = {
-        'caption_generated': False,
-        'story_generated': False,
-        'audio_generated': False,
-        'caption': '',
-        'story': '',
-        'audio_data': None,
-        'audio_format': None
-    }
 # File uploader
-uploaded_file = st.file_uploader("Upload an image", on_change=reset_progress)
-# Process the image if uploaded
 if uploaded_file is not None:
     # Display image
-    st.image(uploaded_file, caption="Uploaded Image")
-    # Convert to PIL Image
     image = Image.open(uploaded_file)
-    # Convert image to bytes for caching compatibility
-    image_bytes = get_image_bytes(image)
-    # Image to Text (if not already done)
-    if not st.session_state.progress['caption_generated']:
-        status_container.info("Generating caption...")
-        st.session_state.progress['caption'] = img2text(image_bytes)
-        st.session_state.progress['caption_generated'] = True
-    st.write(f"Caption: {st.session_state.progress['caption']}")
-    # Text to Story (if not already done)
-    if not st.session_state.progress['story_generated']:
-        status_container.info("Creating story...")
-        st.session_state.progress['story'] = text2story(st.session_state.progress['caption'])
-        st.session_state.progress['story_generated'] = True
-    # Display word count for transparency
-    word_count = count_words(st.session_state.progress['story'])
-    st.write(f"Story ({word_count} words):")
-    st.write(st.session_state.progress['story'])
-    # Pre-generate audio in background (if not already done)
-    if not st.session_state.progress['audio_generated'] and (has_gtts or tts_fallback_model is not None):
-        status_container.info("Pre-generating audio in background...")
-        try:
-            st.session_state.progress['audio_data'], st.session_state.progress['audio_format'] = text2audio(st.session_state.progress['story'])
-            st.session_state.progress['audio_generated'] = True
-            status_container.success("Ready to play audio!")
-        except Exception as e:
-            status_container.error(f"Error pre-generating audio: {e}")
-    # Button to play audio
     if st.button("Play the audio"):
-        if st.session_state.progress['audio_generated']:
-            # Display the audio player
-            if isinstance(st.session_state.progress['audio_format'], str) and st.session_state.progress['audio_format'].startswith('audio/'):
-                st.audio(st.session_state.progress['audio_data'], format=st.session_state.progress['audio_format'])
-            else:
-                st.audio(st.session_state.progress['audio_data'], sample_rate=st.session_state.progress['audio_format'])
-        else:
-            # Handle case where audio generation failed or is not available
-            st.error("Unable to play audio. Audio generation was not successful.")
-else:
-    status_container.info("Upload an image to begin")

+# Imports - just the essentials
 import streamlit as st
 from transformers import pipeline
 from PIL import Image
 import os
 import tempfile
+from gtts import gTTS
+# Preload and cache all models at app startup
 @st.cache_resource
+def load_models():
+    """Load all models and cache them for faster execution"""
+    models = {
+        "image_captioner": pipeline("image-to-text", model="sooh-j/blip-image-captioning-base"),
+        "story_generator": pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    }
+    return models
+# Convert PIL Image to bytes for caching compatibility
 def get_image_bytes(pil_img):
     """Convert PIL image to bytes for hashing"""
     import io
 # Simple image-to-text function using cached model
 @st.cache_data
+def img2text(image_bytes, models):
+    """Caption an image supplied as encoded bytes.
+
+    Args:
+        image_bytes: Encoded image data that PIL can open.
+        models: Mapping whose "image_captioner" entry is called with a PIL image.
+
+    Returns:
+        The "generated_text" field of the captioner's first result.
+    """
+    # NOTE(review): st.cache_data builds its cache key from every argument,
+    # `models` included - confirm the pipeline dict hashes cleanly and cheaply.
     import io
     pil_img = Image.open(io.BytesIO(image_bytes))
+    result = models["image_captioner"](pil_img)
     return result[0]["generated_text"]
+# Generate story from text - using your approach with caching
 @st.cache_data
+def text2story(text, models):
+    """Generate a story from text with sensible endings"""
     prompt = f"Write a short children's story based on this: {text}. The story should have a clear beginning, middle, and end. Keep it under 150 words. "
     # Generate a longer text to ensure we get a complete story
+    story_result = models["story_generator"](
         prompt,
         max_length=300,
         num_return_sequences=1,
     # If no good ending is found, return as is
     return story_text
+# Text-to-speech function
+@st.cache_data
+def text2audio(story_text):
+    """Convert text to audio"""
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
+    temp_filename = temp_file.name
+    temp_file.close()
+    tts = gTTS(text=story_text, lang='en')
+    tts.save(temp_filename)
+    with open(temp_filename, 'rb') as audio_file:
+        audio_bytes = audio_file.read()
+    os.unlink(temp_filename)
+    return audio_bytes
+# Load models at startup - this happens before the app interface is displayed
+models = load_models()
+st.write("✅ Models loaded and cached!")
+# Streamlit app interface
+st.title("Image to Audio Story")
 # File uploader
+uploaded_file = st.file_uploader("Upload an image")
 if uploaded_file is not None:
     # Display image
     image = Image.open(uploaded_file)
+    st.image(image, caption="Uploaded Image", width=300)
+    # Process image
+    with st.spinner("Processing..."):
+        # Convert to bytes for caching
+        image_bytes = get_image_bytes(image)
+        # Generate caption
+        caption = img2text(image_bytes, models)
+        st.write(f"**Caption:** {caption}")
+        # Generate story
+        story = text2story(caption, models)
+        word_count = len(story.split())
+        st.write(f"**Story ({word_count} words):**")
+        st.write(story)
+        # Pre-generate audio
+        if 'audio' not in st.session_state:
+            st.session_state.audio = text2audio(story)
+    # Play audio button
     if st.button("Play the audio"):
+        st.audio(st.session_state.audio, format="audio/mp3")