Spaces:

Ribot
/

PodMagic

Sleeping

App Files Files Community

Ribot committed on May 22

Commit

7c806f5

verified ·

1 Parent(s): c0153c3

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -96

app.py CHANGED Viewed

@@ -1,110 +1,85 @@
 import os
 import re
-import sys
-import subprocess
-import importlib.util
-import shutil
-import tempfile
-# === INSTALLATION AUTOMATIQUE DES DÉPENDANCES ===
-def install_if_missing(package_name, import_name=None):
-    import_name = import_name or package_name
-    if importlib.util.find_spec(import_name) is None:
-        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
-for package in [("requests",), ("bs4", "bs4"), ("gradio",)]:
-    install_if_missing(*package)
 import requests
-import gradio as gr
 from bs4 import BeautifulSoup
-# === UTILITAIRES ===
-def slugify(text, max_length=50):
-    text = text.lower()
-    text = re.sub(r'[^\w\s-]', '', text)
-    text = re.sub(r'[-\s]+', '_', text)
-    return text[:max_length].strip('_')
-def get_episode_pages(main_url):
-    response = requests.get(main_url)
-    response.raise_for_status()
-    soup = BeautifulSoup(response.text, 'html.parser')
-    episode_urls = []
-    for a in soup.find_all('a', href=True):
-        href = a['href']
-        if "/franceculture/podcasts/" in href and not href.endswith('/serie'):
-            full_url = href if href.startswith("http") else f"https://www.radiofrance.fr{href}"
-            episode_urls.append(full_url)
-    return list(dict.fromkeys(episode_urls))
-def get_mp3_link_with_ithema(page_url):
     try:
-        response = requests.get(page_url)
         response.raise_for_status()
-        html = response.text
-        matches = re.findall(r'https://[^"]*ithema[^"]*\.mp3', html)
-        return matches[0] if matches else None
-    except Exception:
-        return None
-def get_podcast_title(main_url):
-    try:
-        response = requests.get(main_url)
-        soup = BeautifulSoup(response.text, 'html.parser')
-        title_tag = soup.find('h1') or soup.find('title')
-        return slugify(title_tag.get_text()) if title_tag else "podcast"
-    except:
-        return "podcast"
-def download_and_zip_podcast(main_url):
-    try:
-        title = get_podcast_title(main_url)
-        episode_pages = get_episode_pages(main_url)
-        if not episode_pages:
-            return "Aucune page d’épisode trouvée.", None
-        mp3_links = []
-        for page in episode_pages:
-            mp3 = get_mp3_link_with_ithema(page)
-            if mp3:
-                mp3_links.append(mp3)
-        if not mp3_links:
-            return "Aucun fichier MP3 contenant 'ithema' trouvé.", None
-        temp_dir = tempfile.mkdtemp()
-        for i, mp3_url in enumerate(mp3_links, start=1):
-            filename = f"{title}_{i:02}.mp3"
-            filepath = os.path.join(temp_dir, filename)
-            with requests.get(mp3_url, stream=True) as r:
-                r.raise_for_status()
-                with open(filepath, 'wb') as f:
-                    for chunk in r.iter_content(chunk_size=8192):
-                        f.write(chunk)
-        zip_path = os.path.join(temp_dir, f"{title}.zip")
-        shutil.make_archive(zip_path.replace('.zip', ''), 'zip', temp_dir)
-        return f"{len(mp3_links)} fichier(s) téléchargé(s).", zip_path
     except Exception as e:
-        return f"Erreur : {str(e)}", None
-# === INTERFACE GRADIO ===
-with gr.Blocks() as app:
-    gr.Markdown("# Téléchargeur de Podcasts MP3 (France Culture)")
-    url_input = gr.Textbox(label="URL de la série", placeholder="https://www.radiofrance.fr/...")
-    download_button = gr.Button("Télécharger et compresser")
-    output_text = gr.Textbox(label="Message")
-    file_output = gr.File(label="Fichier ZIP", file_types=[".zip"])
-    def process(url):
-        msg, zip_path = download_and_zip_podcast(url)
-        return msg, zip_path
-    download_button.click(fn=process, inputs=[url_input], outputs=[output_text, file_output])
 if __name__ == "__main__":
-    app.launch(share=True)

 import os
 import re
+import zipfile
 import requests
+import tempfile
+import subprocess
 from bs4 import BeautifulSoup
+import gradio as gr
+# Installation automatique des dépendances (à lancer en local une seule fois)
+try:
+    import bs4
+except ImportError:
+    subprocess.run(["pip", "install", "-q", "gradio", "beautifulsoup4", "requests"])
def sanitize_filename(name, max_length=50):
    """Turn an arbitrary title into a string that is safe to use as a file name.

    Surrounding whitespace is stripped, every character that is not a word
    character, hyphen, underscore or dot is replaced by an underscore, and the
    result is truncated to ``max_length`` characters.

    Args:
        name: Raw title text.
        max_length: Maximum length of the returned string. Defaults to 50,
            the limit that used to be hard-coded.

    Returns:
        The sanitized name, or ``"untitled"`` when nothing is left after
        cleaning (empty or whitespace-only input), so callers never build a
        file name with an empty stem.
    """
    cleaned = re.sub(r"[^\w\-_.]", "_", name.strip())[:max_length]
    return cleaned or "untitled"
def extract_audio_links_from_html(html_text):
    """Return the distinct Radio France audio URLs found in ``html_text``.

    A URL is kept when it starts with
    ``https://media.radiofrance-podcast.net/``, contains ``ITEMA`` and ends in
    ``.mp3`` or ``.m4a``; a match never spans a double quote.

    Args:
        html_text: Raw page source to scan.

    Returns:
        A list of full URLs, de-duplicated, in order of first appearance.
    """
    # The extension group must be non-capturing: with a capturing group,
    # re.findall returns only the group ("mp3"/"m4a") instead of the URL.
    pattern = r'https://media\.radiofrance-podcast\.net/[^"]*ITEMA[^"]*\.(?:mp3|m4a)'
    # dict.fromkeys de-duplicates while preserving order, so the track
    # numbering derived from this list is stable between runs.
    return list(dict.fromkeys(re.findall(pattern, html_text)))
def extract_titles_and_links(html_text):
    """Extract ``(title, audio_url)`` pairs from data embedded in the page.

    Looks for segments of the form ``title:\\"...\\",url:\\"...\\"`` (quotes
    preceded by a backslash) whose URL points at
    ``https://media.radiofrance-podcast.net/``, contains ``ITEMA`` and ends in
    ``.mp3`` or ``.m4a``.

    Args:
        html_text: Raw page source to scan.

    Returns:
        A list of ``(sanitized_title, url)`` tuples in order of first
        appearance. Each audio URL appears at most once, so an episode that is
        embedded several times in the page is not downloaded repeatedly.
    """
    # Look for JSON-like segments carrying a title followed by its audio URL.
    pattern = r'title:\\"(.*?)\\",url:\\"(https://media\.radiofrance-podcast\.net/[^"]*ITEMA[^"]*\.(mp3|m4a))\\"'
    seen_urls = set()
    pairs = []
    # The third group is only the file extension; it is not needed here.
    for title, url, _ext in re.findall(pattern, html_text):
        if url in seen_urls:
            continue
        seen_urls.add(url)
        pairs.append((sanitize_filename(title), url))
    return pairs
def download_and_zip(url):
    """Download every audio file referenced by a page and bundle them in a ZIP.

    The page at ``url`` is fetched, ``(title, audio_url)`` pairs are extracted
    (falling back to bare URLs named ``track_NN`` when no titles are found),
    each audio file is downloaded, and all successful downloads are written to
    a single ``podcasts.zip``.

    Args:
        url: Address of the podcast page to scan.

    Returns:
        A ``(message, zip_path)`` tuple. ``zip_path`` is the path of the
        archive, or ``None`` when the page could not be fetched, no audio link
        was found, or no file could be downloaded.
    """
    # Fetch the HTML page.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except Exception as e:
        return f"Erreur de téléchargement : {e}", None
    html_text = response.text

    # Extract titles and links.
    titles_links = extract_titles_and_links(html_text)
    if not titles_links:
        # Raw fallback when titles cannot be extracted.
        urls = extract_audio_links_from_html(html_text)
        titles_links = [(f"track_{i+1:02d}", u) for i, u in enumerate(urls)]
    if not titles_links:
        return "Aucun fichier audio trouvé avec ITEMA dans l'URL", None

    # The archive must outlive this function because the caller reads it after
    # we return, so it goes in a directory that is NOT auto-deleted. Only the
    # per-episode staging files live in a self-cleaning temporary directory.
    out_dir = tempfile.mkdtemp(prefix="podcasts_")
    zip_path = os.path.join(out_dir, "podcasts.zip")
    total = len(titles_links)
    downloaded = 0
    with tempfile.TemporaryDirectory() as staging_dir, \
            zipfile.ZipFile(zip_path, "w") as zipf:
        for idx, (title, audio_url) in enumerate(titles_links, 1):
            ext = ".mp3" if ".mp3" in audio_url else ".m4a"
            filename = f"{idx:02d}-{title}{ext}"
            filepath = os.path.join(staging_dir, filename)
            try:
                # Stream to disk in chunks so a long episode is never held
                # entirely in memory; the timeout prevents a stalled server
                # from hanging the request forever.
                with requests.get(audio_url, stream=True, timeout=60) as audio_resp:
                    audio_resp.raise_for_status()
                    with open(filepath, "wb") as f:
                        for chunk in audio_resp.iter_content(chunk_size=65536):
                            f.write(chunk)
                zipf.write(filepath, arcname=filename)
                # Free disk space right away; the data is now in the archive.
                os.remove(filepath)
                downloaded += 1
            except Exception as e:
                # Best effort: one failing episode must not abort the others.
                print(f"Erreur téléchargement {audio_url} : {e}")

    if downloaded == 0:
        # Nothing usable was produced; do not hand back an empty archive.
        os.remove(zip_path)
        os.rmdir(out_dir)
        return "Aucun fichier audio n'a pu être téléchargé", None
    if downloaded < total:
        return (
            f"Téléchargement terminé : {downloaded}/{total} fichier(s) récupéré(s)",
            zip_path,
        )
    return "Téléchargement terminé avec succès", zip_path
def gradio_interface(url):
    """Gradio callback: run the download for ``url``.

    Returns the ``(message, zip_file)`` pair produced by ``download_and_zip``,
    in that order.
    """
    status_text, archive_path = download_and_zip(url)
    result = (status_text, archive_path)
    return result
# Gradio interface: one URL text box as input; a status message and the
# resulting ZIP file as outputs.
demo = gr.Interface(
    title="Téléchargement de Podcasts Radio France",
    description="Collez une URL vers un podcast de Radio France pour télécharger tous les épisodes (mp3/m4a) avec les bons noms.",
    fn=gradio_interface,
    inputs=gr.Textbox(
        placeholder="https://www.radiofrance.fr/franceculture/podcasts/...",
        label="URL de la page Radio France (Podcast)",
    ),
    outputs=[
        gr.Textbox(label="Message"),
        gr.File(label="Fichier ZIP des épisodes"),
    ],
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()