import gradio as gr
import torch
import moviepy.editor as mpe
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from min_dalle import MinDalle
from gtts import gTTS
from pydub import AudioSegment
import nltk
import textwrap
import os
import glob
import subprocess
import imageio_ffmpeg
# Define a fallback for environments without GPU
if os.environ.get("SPACES_ZERO_GPU") is not None:
    import spaces
else:
    class spaces:
        @staticmethod
        def GPU(func):
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)
            return wrapper
# Ensure 'punkt' is downloaded for nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
# Download FFmpeg using imageio_ffmpeg (more robust)
try:
    imageio_ffmpeg.get_ffmpeg_exe()
    print("FFmpeg downloaded successfully (if not already present).")
except Exception as e:
    print(f"Error downloading FFmpeg using imageio_ffmpeg: {e}")
    raise
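# Note (assumption): moviepy finds this binary through imageio-ffmpeg, but pydub
# looks for an `ffmpeg` executable on PATH. If mp3 decoding fails in this
# environment, pointing pydub at the downloaded binary should help:
# AudioSegment.converter = imageio_ffmpeg.get_ffmpeg_exe()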
description = "Video Story Generator with Audio\nVideo generation with AI using DALL-E Mini, DistilBART, and gTTS."
title = "Video Story Generator with Audio using DALL-E Mini, DistilBART, and gTTS"
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")
def get_output_video(text):
    print("Starting get_output_video function...")
    inputs = tokenizer(text,
                       max_length=1024,
                       truncation=True,
                       return_tensors="pt").to(device)
    summary_ids = model.generate(inputs["input_ids"])
    summary = tokenizer.batch_decode(summary_ids,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=False)
    plot = list(summary[0].split('.'))
    print(f"Summarized plot: {plot}")
    # @spaces.GPU(duration=60 * 3)
    def generate_image(
            is_mega: bool,
            text: str,
            seed: int,
            grid_size: int,
            top_k: int,
            image_path: str,
            models_root: str,
            fp16: bool,
    ):
        '''
        The required models are downloaded to models_root if they are not already there.
        Set dtype to torch.float16 to save GPU memory; on an Ampere-architecture GPU
        you can use torch.bfloat16 instead. Set the device to either "cuda" or "cpu".
        Once everything has finished initializing, float32 is faster than float16
        but uses more GPU memory.
        '''
        print(f"Generating image for: {text}")
        dalle_model = MinDalle(
            is_mega=is_mega,
            models_root=models_root,
            is_reusable=True,
            is_verbose=True,
            dtype=torch.float16 if fp16 else torch.float32,
            device=device
        )
        image = dalle_model.generate_image(
            text,
            seed,
            grid_size,
            top_k=top_k,
            is_verbose=True
        )
        print("Image generated successfully.")
        return image
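    # Note: generate_image instantiates MinDalle on every call, so the weights are
    # reloaded for each sentence; hoisting the model out of the function and reusing
    # it would likely be faster, at the cost of holding it in memory for the run.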
    generated_images = []
    for i, senten in enumerate(plot[:-1]):
        print(f"Generating image {i+1} of {len(plot)-1}...")
        try:
            image = generate_image(
                is_mega=True,
                text=senten,
                seed=1,
                grid_size=1,
                top_k=256,
                image_path='generated',
                models_root='pretrained',
                fp16=True,
            )
            generated_images.append(image)
            print(f"Image {i+1} generated and appended.")
        except Exception as e:
            print(f"Error generating image {i+1}: {e}")
            raise
    # Step 4 - Create the subtitles
    sentences = plot[:-1]
    num_sentences = len(sentences)
    assert len(generated_images) == num_sentences, 'Mismatch between generated images and sentences'
    # Generate the list of subtitles
    from nltk import tokenize
    sub_names = []
    for k in range(len(generated_images)):
        subtitles = tokenize.sent_tokenize(sentences[k])
        sub_names.append(subtitles)
        print(f"Subtitles generated for image {k+1}: {subtitles}")
    # Step 5 - Add subtitles to the images
    def draw_multiple_line_text(image, text, font, text_color, text_start_height):
        draw = ImageDraw.Draw(image)
        image_width, image_height = image.size
        y_text = text_start_height
        lines = textwrap.wrap(text, width=40)
        for line in lines:
            line_width, line_height = font.getbbox(line)[2:4]  # Use getbbox for better size calculation
            draw.text(((image_width - line_width) / 2, y_text),
                      line, font=font, fill=text_color)
            y_text += line_height
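    # draw_multiple_line_text mutates `image` in place and returns None;
    # callers keep working with the same PIL image object.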
    def add_text_to_img(text1, image_input):
        '''Draw the wrapped subtitle text onto the given PIL image and return it.'''
        image = image_input
        fontsize = 20
        path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
        if not os.path.exists(path_font):
            # Try an alternative location used on other systems
            path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
        if not os.path.exists(path_font):
            print("Font file not found. Subtitles might not be rendered correctly.")
            path_font = None
        if path_font is not None:
            try:
                font = ImageFont.truetype(path_font, fontsize)
                text_color = (255, 255, 0)
                text_start_height = 200
                draw_multiple_line_text(image, text1, font, text_color, text_start_height)
            except Exception as e:
                print(f"Error loading or using font: {e}")
        return image
    generated_images_sub = []
    for k in range(len(generated_images)):
        imagenes = generated_images[k].copy()
        text_to_add = sub_names[k][0]
        result = add_text_to_img(text_to_add, imagenes)
        generated_images_sub.append(result)
        print(f"Subtitles added to image {k+1}.")
    # Step 7 - Create the audio
    c = 0
    mp3_names = []
    mp3_lengths = []
    for k in range(len(generated_images)):
        text_to_add = sub_names[k][0]
        print(f"Generating audio for: {text_to_add}")
        f_name = 'audio_' + str(c) + '.mp3'
        mp3_names.append(f_name)
        # Convert the text to speech; slow=False keeps the normal speaking rate
        myobj = gTTS(text=text_to_add, lang='en', slow=False)
        # Save the converted audio as an mp3 file
        myobj.save(f_name)
        audio = AudioSegment.from_file(f_name, format="mp3")
        duration = len(audio) / 1000
        mp3_lengths.append(duration)
        print(f"Audio duration: {duration} seconds")
        c += 1
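    # The per-clip durations collected here drive the image clip lengths in Step 9,
    # keeping each image on screen for as long as its narration plays.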
    # Step 8 - Merge the audio files
    export_path = 'result.mp3'
    silence = AudioSegment.silent(duration=500)
    full_audio = AudioSegment.empty()
    for n, mp3_file in enumerate(mp3_names):
        print(f"Merging audio file: {mp3_file}")
        # Load the current mp3 and append it, followed by half a second of silence
        audio_segment = AudioSegment.from_mp3(mp3_file)
        full_audio += audio_segment + silence
        print(f'Merging audio {n+1} completed.')
    # Export the concatenated audio once all files have been merged
    full_audio.export(export_path, format='mp3')
    print('\nAudio merging done!')
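    # The 500 ms of silence inserted between tracks matches the extra 0.5 s added
    # to each image clip's duration below, so audio and video stay in sync.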
    # Step 9 - Create the video, with clip times matched to the audio
    file_names = []
    for c, img in enumerate(generated_images_sub):
        f_name = 'img_' + str(c) + '.jpg'
        file_names.append(f_name)
        img.save(f_name)
        print(f"Saving image: {f_name}")
    print(f"Image file names: {file_names}")
    clips = []
    for d, m in enumerate(file_names):
        duration = mp3_lengths[d]
        print(f"Creating video clip {d+1} with duration: {duration} seconds")
        clips.append(mpe.ImageClip(m).set_duration(duration + 0.5))
    concat_clip = mpe.concatenate_videoclips(clips, method="compose")
    concat_clip.write_videofile("result_new.mp4", fps=24)
    print("Video clips concatenated and saved as result_new.mp4")
    # Step 10 - Merge video and audio
    movie_name = 'result_new.mp4'
    movie_final = 'result_final.mp4'

    def combine_audio(vidname, audname, outname, fps=24):
        my_clip = mpe.VideoFileClip(vidname)
        audio_background = mpe.AudioFileClip(audname)
        final_clip = my_clip.set_audio(audio_background)
        final_clip.write_videofile(outname, fps=fps)

    combine_audio(movie_name, export_path, movie_final)  # create the final file
    print("Video and audio merged successfully!")
    # Clean up intermediate files
    for f in file_names:
        os.remove(f)
    for f in mp3_names:
        os.remove(f)
    os.remove("result_new.mp4")
    os.remove("result.mp3")
    print("Intermediate files cleaned up.")
    print("Finished get_output_video function.")
    return 'result_final.mp4'
text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake. Today is her birthday, and her friends are coming to her house to help her prepare it.'
demo = gr.Blocks()
with demo:
    gr.Markdown("# Video Generator from stories with Artificial Intelligence")
    gr.Markdown(
        "Enter a story below. It is summarized with the DistilBART model, images are generated for each sentence with DALL-E Mini, subtitles and audio are produced with gTTS, and everything is assembled into a video.")
    with gr.Row():
        # Left column (inputs)
        with gr.Column():
            input_start_text = gr.Textbox(value=text,
                                          label="Type your story here; a sample story is pre-filled.")
            with gr.Row():
                button_gen_video = gr.Button("Generate Video")
        # Right column (outputs)
        with gr.Column():
            output_interpolation = gr.Video(label="Generated Video")
    gr.Markdown("<h3>Future Works</h3>")
    gr.Markdown(
        "This text-to-video AI program generates videos from any prompt and could be used to build an AI art gallery. A future version will use DALL-E 2. For more info visit [ruslanmv.com](https://ruslanmv.com/).")
    button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)
demo.launch(debug=True)