import gradio as gr
import librosa
from PIL import Image, ImageDraw, ImageFont
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, APIC, TIT2, TPE1
import io
from colorthief import ColorThief
import colorsys
import math
import os
from multiprocessing import Pool, cpu_count
import tempfile
import ffmpeg
import subprocess
import traceback
import time
import shutil
import LRC
import LRC2SRT

path = ""  # Update with your path


def safe_read(i: int, a: list) -> int:
    """Return a[i], or the unsigned-8-bit PCM midpoint (128) when i runs past the end."""
    return a[i] if i < len(a) else 128


def getRenderCords(ta: list, idx: int, res: int = 1024, size: tuple = (1280, 720)) -> list:
    """Build the oscilloscope polyline for `res` samples centered on `idx`.

    Coordinates are in a center-origin space spanning 90% of the width and
    scaled so the 0-255 sample range maps to a small vertical band.
    """
    i = idx - res // 2
    x = size[0] * .9 / -2
    y = (safe_read(i, ta) - 128) * (size[1] / 2000) + (size[1] * .7 / -2)
    coords = []
    while i < idx + (res // 2):
        coords.append((x, y))
        i += 1
        y = (safe_read(i, ta) - 128) * (size[1] / 2000) + (size[1] * .7 / -2)
        x += (size[0] * .9) / res
    return coords


def center_to_top_left(coords, width=1280, height=720):
    """Convert a list of center-origin points to PIL's top-left origin."""
    return [totopleft(c, width=width, height=height) for c in coords]


def totopleft(coord, width=1280, height=720):
    """Convert one center-origin (x, y) point to top-left origin (y axis flipped)."""
    return coord[0] + width / 2, height / 2 - coord[1]


def getTrigger(ad: int, a: list, max: int = 1024) -> int:
    """Scan forward from `ad` for a falling edge to stabilize the oscilloscope.

    Gives up after `max` samples so a trigger is always found.
    NOTE: `max` shadows the builtin; kept for caller keyword compatibility.
    """
    i = ad
    while not (safe_read(i, a) < 126 and safe_read(i + 6, a) < 130 or i - ad > max):
        i += 1
    return i


def extract_cover_image(mp3_file):
    """Return the embedded APIC cover as a PIL Image, -1 if the file has no
    ID3 tags at all, or None if tags exist but contain no cover art."""
    audio = MP3(mp3_file, ID3=ID3)
    if audio.tags is None:
        return -1
    for tag in audio.tags.values():
        if isinstance(tag, APIC):
            return Image.open(io.BytesIO(tag.data))
    print("No cover image found in the MP3 file.")
    return None


def getTitleAndArtist(mp3_file):
    """Return (title, artist) from ID3 tags, empty strings when absent."""
    audio = MP3(mp3_file, ID3=ID3)
    title = audio.get('TIT2', TIT2(encoding=3, text='')).text[0]
    artist = audio.get('TPE1', TPE1(encoding=3, text='')).text[0]
    return title, artist


def getColour(img):
    """Return the dominant RGB colour of a PIL image via ColorThief.

    ColorThief needs a file path, so the image is round-tripped through a
    temporary PNG (delete=False so it can be re-opened on Windows).
    """
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
        img.save(tmpfile.name, format="PNG")
        dominant_color = ColorThief(tmpfile.name).get_color(quality=1)
    os.remove(tmpfile.name)
    return dominant_color


def clamp(number):
    """Clamp a float into [0, 1]."""
    return max(0, min(number, 1))


def normalizeColour(C) -> tuple[int, int, int]:
    """Boost saturation (x1.3) and pin brightness to 0.8 — background colour."""
    h, s, _v = colorsys.rgb_to_hsv(C[0] / 255, C[1] / 255, C[2] / 255)
    r, g, b = colorsys.hsv_to_rgb(h, clamp(1.3 * s), .8)
    return math.floor(r * 255), math.floor(g * 255), math.floor(b * 255)


def normalizeColourBar(C) -> tuple[int, int, int]:
    """Boost saturation (x1.4) and pin brightness to 0.6 — progress-bar colour."""
    h, s, _v = colorsys.rgb_to_hsv(C[0] / 255, C[1] / 255, C[2] / 255)
    r, g, b = colorsys.hsv_to_rgb(h, clamp(1.4 * s), .6)
    return math.floor(r * 255), math.floor(g * 255), math.floor(b * 255)


def stamp_text(draw, text, font, position, align='left'):
    """Draw white text vertically centered on `position` with the given alignment."""
    text_bbox = draw.textbbox((0, 0), text, font=font)
    text_width = text_bbox[2] - text_bbox[0]
    text_height = text_bbox[3] - text_bbox[1]
    x, y = position
    y -= text_height // 2
    if align == 'center':
        x -= text_width // 2
    elif align == 'right':
        x -= text_width
    draw.text((x, y), text, font=font, fill="#fff")


def linear_interpolate(start, stop, progress):
    """Linear interpolation between start and stop at `progress` in [0, 1]."""
    return start + progress * (stop - start)


def filecount(p):
    """Return the number of entries in directory `p`.

    Fixed: previously listed the current working directory, ignoring `p`.
    """
    return len(os.listdir(p))


def render_frame(params):
    """Render one video frame to out/<name>/<n>.png; returns 1 on success.

    Runs in a worker process, so all state arrives through `params`.
    """
    n, samples_array, cover_img, title, artist, dominant_color, width, height, fps, name, oscres, sr = params
    img = Image.new('RGB', (width, height), normalizeColour(dominant_color))
    d = ImageDraw.Draw(img)

    # First sample of this frame; bail out past the end of the audio.
    s = (sr // fps) * n
    if s > len(samples_array):
        return

    # Oscilloscope trace, trigger-aligned so the waveform doesn't jitter.
    trace = center_to_top_left(
        getRenderCords(samples_array, getTrigger(s, samples_array, max=oscres),
                       res=oscres, size=(width, height)),
        width=width, height=height)
    d.line(trace, fill='#fff', width=2)

    # Cover art, half the short edge, centered horizontally.
    cs = math.floor(min(width, height) / 2)
    cov = cover_img.resize((cs, cs))
    img.paste(cov, (((width // 2) - cs // 2), math.floor(height * .1)))

    # Fonts scale with resolution (sizes must be ints for Pillow).
    scale = min(width, height) / 720
    fontT = ImageFont.truetype(path + 'Lexend-Bold.ttf', int(50 * scale))
    fontA = ImageFont.truetype(path + 'Lexend-Bold.ttf', int(40 * scale))
    fontD = ImageFont.truetype(path + 'SpaceMono-Bold.ttf', int(30 * scale))

    stamp_text(d, title, fontT,
               totopleft((0, min(width, height) * .3 // -2), width=width, height=height), 'center')
    stamp_text(d, artist, fontA,
               totopleft((0, min(width, height) * .44 // -2), width=width, height=height), 'center')

    # Progress bar: full-width track, then the elapsed portion in white.
    d.line(center_to_top_left([(width * .96 // -2, height * .95 // -2),
                               (width * .96 // 2, height * .95 // -2)],
                              width=width, height=height),
           fill=normalizeColourBar(dominant_color), width=15 * height // 360)
    d.line(center_to_top_left([(width * .95 // -2, height * .95 // -2),
                               (linear_interpolate(width * .95 // -2, width * .95 // 2,
                                                   s / len(samples_array)),
                                height * .95 // -2)],
                              width=width, height=height),
           fill='#fff', width=10 * height // 360)

    img.save(path + f'out/{name}/{str(n)}.png', 'PNG',)
    return 1  # Indicate one frame processed


def RenderVid(af, n, fps=30):
    """Mux the rendered PNG sequence with audio file `af` into <n>.mp4.

    Fixed: ffmpeg-python streams have no `.input()` method — the two inputs
    must be created separately and combined in `output()`. The previous
    `gr.Interface.download` call did not exist in the gradio API and was removed.
    """
    video = ffmpeg.input(path + f'out/{n}/%d.png', framerate=fps)
    audio = ffmpeg.input(af)
    (ffmpeg
     .output(video, audio, n + '.mp4',
             vcodec='libx264', r=fps, pix_fmt='yuv420p', acodec='aac', shortest=None)
     .run())


invisible_chars = ["\u200B", "\uFEFF"]


def remove_bom(data: str) -> str:
    """Strip a leading UTF-8 byte-order mark, if present."""
    BOM = '\ufeff'
    return data.lstrip(BOM)


def stripinvisibles(s):
    """Remove BOM and zero-width characters that break lyric-format detection.

    Fixed: str.replace returns a new string; the result was previously discarded.
    """
    e = remove_bom(s)
    for ch in invisible_chars:
        e = e.replace(ch, "")
    return e


def main(file, name, fps=30, res: tuple = (1280, 720), oscres=512, sr=11025,
         lyrics=None, img=None, tit=None, ast=None):
    """Render the full visualization video for one MP3.

    file   -- path to the MP3
    name   -- base name for the output directory and MP4
    lyrics -- optional LRC or SRT file, converted/embedded as mov_text subtitles
    img/tit/ast -- fallback cover image, title and artist when tags are missing
    """
    p = gr.Progress()
    LRC2SRT.clear()
    if os.path.exists("out.srt"):
        os.remove("out.srt")

    haslyrics = False
    if lyrics:
        p(0.5, "parsing lyrics")
        try:
            sf = stripinvisibles(open(lyrics, encoding="UTF8").read())
            if sf and sf[0] == '[':
                gr.Info("Lyrics of LRC type was detected, converting to SRT")
                LRC2SRT.convert_to_srt(sf)
                with open("out.srt", mode="x", encoding="UTF8") as outf:
                    outf.write('\n'.join(LRC2SRT.SRT))
                haslyrics = True
            elif sf and sf[0].isdigit():
                with open("out.srt", mode="x", encoding="UTF8") as outf:
                    outf.write(sf)
                gr.Info("Lyrics of SRT type was detected")
                haslyrics = True
            else:
                gr.Warning("Lyrics file is invalid, skipping")
        except Exception:
            print(traceback.format_exc())
            gr.Warning("Failed to parse lyrics, ensure there are no blank lines in between")

    os.makedirs(path + f'out/{name}/', exist_ok=True)
    global iii
    iii = 0

    # Load and downsample the audio, then quantize to unsigned 8-bit samples.
    p(0.25, "loading file")
    y, sr = librosa.load(file, sr=sr)  # Resample to 11025 Hz
    samples_array = (y * 128 + 128).astype('uint8').tolist()

    p(0.5, "extracting metadata")
    cover_file = Image.open(img) if img else None
    cover_img = extract_cover_image(file)
    if cover_img is None:
        # Tags exist but hold no art — fall back to the uploaded image.
        if cover_file:
            cover_img = cover_file
        else:
            raise gr.Error("Mp3 must have a cover image, upload the image under the 'Metadata' section")
    elif cover_img == -1:
        # No tags at all: need manual metadata AND a cover image.
        # Fixed: previously the -1 sentinel leaked through and crashed later.
        if not (tit or ast):
            raise gr.Error("Mp3 is missing tags, add the info under the 'Metadata' section")
        if cover_file:
            cover_img = cover_file
        else:
            raise gr.Error("Mp3 must have a cover image, upload the image under the 'Metadata' section")

    title, artist = getTitleAndArtist(file)
    if title == '' or artist == '':
        if not (tit or ast):
            gr.Warning('Missing Title or Artist')
        else:
            title, artist = tit, ast

    dominant_color = getColour(cover_img)

    # Frame rendering parameters.
    width, height = res[0], res[1]
    num_frames = len(samples_array) // (sr // fps)
    params = [(n, samples_array, cover_img, title, artist, dominant_color,
               width, height, fps, name, oscres, sr) for n in range(num_frames)]

    try:
        with Pool(cpu_count()) as pool:
            # imap_unordered yields as frames finish, driving the progress bar.
            for _ in pool.imap_unordered(render_frame, params):
                iii += 1
                p((iii, num_frames), desc="Rendering Frames")
    except Exception:
        # Fixed: was a stray `gr.e` AttributeError; surface a real error instead.
        print('Ended in error: ' + traceback.format_exc(), iii)
        raise gr.Error("Frame rendering failed, check the console for details")

    p = gr.Progress()
    p(0.5, desc="Compiling video")
    print('FFMPEG')
    # Fixed: framerate was hard-coded to 30, ignoring the fps parameter.
    ffmpeg_cmd = [
        "ffmpeg",
        '-framerate', str(fps),
        '-i', path + f'out/{name}/%d.png',  # Input PNG images
        '-i', file,                          # Input MP3 audio
    ]
    if haslyrics:
        ffmpeg_cmd += ['-i', path + 'out.srt']  # Input SRT subtitles
    ffmpeg_cmd += ['-c:v', 'libx264', '-r', str(fps), '-pix_fmt', 'yuv420p', '-c:a', 'aac']
    if haslyrics:
        ffmpeg_cmd += ['-c:s', 'mov_text']  # Use mov_text codec for subtitles
    ffmpeg_cmd += ['-y', path + f'{name}.mp4']  # Output MP4 filename
    subprocess.run(ffmpeg_cmd)


def gradio_interface(audio_file, lyrics, output_name, fps=30, vidwidth=1280,
                     vidheight=720, oscres=512, img=None, tit=None, ast=None):
    """Gradio entry point: render the video, clean up frames, return the MP4 path."""
    res = (int(vidwidth), int(vidheight))
    main(audio_file, output_name, fps=fps, res=res, oscres=oscres,
         lyrics=lyrics, img=img, tit=tit, ast=ast)
    time.sleep(5)  # give ffmpeg's output a moment to flush before cleanup
    shutil.rmtree("out")
    return f"{output_name}.mp4"


# Define Gradio interface with accordions
inputs = [
    gr.Accordion(
        title="Audio Settings",
        items=[
            gr.components.File(label="Upload your MP3 file", file_count='single', file_types=['mp3']),
            gr.components.File(label="(Optional) Upload Lyrics as LRC or SRT",
                               file_count='single', file_types=['lrc', 'srt'])
        ],
        open=True
    ),
    gr.Accordion(
        title="Video Output Settings",
        items=[
            gr.components.Textbox(label="Output Video Name", value='video'),
            gr.components.Slider(label="Frames per Second", minimum=20, maximum=60, step=1, value=30),
            gr.components.Slider(label="Output Video Width", minimum=100, maximum=2000, value=1280, step=2),
            gr.components.Slider(label="Output Video Height", minimum=100, maximum=2000, value=720, step=2)
        ]
    ),
    gr.Accordion(
        title="Advanced Options",
        items=[
            gr.components.Slider(label="Number of Visualization Segments",
                                 minimum=256, maximum=2048, step=2, value=512),
        ]
    ),
    gr.Accordion(
        title="Mp3 Metadata (Use if mp3 does not have tags)",
        items=[
            gr.components.Image(label='Cover Art'),
            gr.components.Textbox(label='Title'),
            gr.components.Textbox(label='Artists')
        ]
    )
]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=inputs,
    outputs=gr.components.Video(label="Output"),
    title="MP3 to Video Visualization",
    description="""
    Upload an MP3 file and configure parameters to create a visualization video.
    Optionally upload a word or line synced lyric file
    Ensure a blank line at the end to avoid conversion errors"""
)

# Launch Gradio interface
iface.launch()