Spaces:
Runtime error
Runtime error
| import requests | |
| from bs4 import BeautifulSoup as bs | |
| import json | |
| import argparse | |
| from pathlib import Path | |
| import os | |
| import pandas as pd | |
| import re | |
| from tqdm import tqdm | |
def scrape_song_library(page_count=2054) -> pd.DataFrame:
    """Scrape the music4dance song index into a single DataFrame.

    Requests index pages 1 through ``page_count`` in order, extracts the
    songs embedded in each page with ``get_songs`` and returns all of them
    as rows of one table.

    Args:
        page_count: Number of index pages to fetch, starting at page 1.

    Returns:
        A DataFrame with one row per song and the columns listed in
        ``columns`` below. Features a song does not provide are left as
        missing values. With no songs the frame has the columns but no rows.

    Raises:
        requests.RequestException: If a page cannot be fetched, times out,
            or the server answers with an HTTP error status.
    """
    columns = [
        "Title",
        "Artist",
        "Length",
        "Tempo",
        "Beat",
        "Energy",
        "Danceability",
        "Valence",
        "Sample",
        "Tags",
        "DanceRating",
    ]
    # Collect plain dict rows and build the frame once at the end; calling
    # pd.concat per page re-copies everything gathered so far on every
    # iteration.
    rows = []
    for i in tqdm(range(1, page_count + 1), desc="Pages processed"):
        link = "https://www.music4dance.net/song/Index?filter=v2-Index&page=" + str(i)
        # Without a timeout a single stalled connection blocks the whole run.
        page = requests.get(link, timeout=30)
        # Fail here with the real HTTP error instead of later, when the
        # expected <script> payload is missing from an error page.
        page.raise_for_status()
        soup = bs(page.content, "html.parser")
        rows.extend(get_songs(soup))
    return pd.DataFrame(rows, columns=columns)
def get_songs(soup: "bs") -> list:
    """Extract the songs embedded in one parsed index page.

    The page carries its data as a JavaScript object literal inside a
    ``<script>`` element whose text contains ``"histories"``. The first such
    element is taken, the span from its first ``{`` to its last ``}`` is
    parsed as JSON, and every entry of ``histories`` is replayed into one
    song record.

    Args:
        soup: Parsed page; only ``soup.find_all("script")`` is used.

    Returns:
        A list with one dict per song. Each dict always has ``"Tags"`` (a set
        of tag values) and ``"DanceRating"`` (dance -> number of rating
        entries seen), plus whichever of the plain features in ``reset_keys``
        the song's properties supplied (the last value seen wins).

    Raises:
        IndexError: If no ``<script>`` element mentions ``"histories"``.
        AttributeError: If that script contains no ``{...}`` span.
        json.JSONDecodeError: If the extracted span is not valid JSON.
    """
    # DOTALL lets "." cross newlines; this matches the same greedy
    # first-"{"-to-last-"}" span as "{(.|\n)*}" without the slow
    # per-character alternation.
    js_obj = re.compile(r"\{.*\}", re.DOTALL)
    reset_keys = {
        "Title",
        "Artist",
        "Length",
        "Tempo",
        "Beat",
        "Energy",
        "Danceability",
        "Valence",
        "Sample",
    }
    song_text = [str(v) for v in soup.find_all("script") if "histories" in str(v)][0]
    songs_data = json.loads(js_obj.search(song_text).group(0))

    songs = []
    for song_data in songs_data["histories"]:
        song = {"Tags": set(), "DanceRating": {}}
        for feature in song_data["properties"]:
            # Properties without both a name and a value carry nothing usable.
            if "name" not in feature or "value" not in feature:
                continue
            key = feature["name"]
            value = feature["value"]
            if key in reset_keys:
                song[key] = value
            elif key == "Tag+":
                song["Tags"].add(value)
            elif key == "DeleteTag":
                # Deleting a tag that was never added is a no-op; discard()
                # expresses that without a catch-all except clause.
                song["Tags"].discard(value)
            elif key == "DanceRating":
                # Each rating entry counts once for its dance, keyed by the
                # value with the "+1" marker stripped.
                # NOTE(review): values with other suffixes are counted under
                # their unstripped text -- confirm against the site's format.
                dance = value.replace("+1", "")
                song["DanceRating"][dance] = song["DanceRating"].get(dance, 0) + 1
        songs.append(song)
    return songs
def scrape_dance_info() -> pd.DataFrame:
    """Scrape the per-dance metadata embedded in the song index page.

    Fetches the first index page, takes the first ``<script>`` element whose
    text contains ``"environment"``, parses the span from its first ``{`` to
    its last ``}`` as JSON and tabulates the entries under ``"dances"``.

    Returns:
        A DataFrame with one row per dance, restricted to the keys in
        ``wanted_keys`` that each dance entry actually provides.

    Raises:
        requests.RequestException: If the page cannot be fetched, times out,
            or the server answers with an HTTP error status.
        IndexError: If no ``<script>`` element mentions ``"environment"``.
    """
    # DOTALL lets "." cross newlines; same greedy first-"{"-to-last-"}" match
    # as "{(.|\n)*}" without the slow per-character alternation.
    js_obj = re.compile(r"\{.*\}", re.DOTALL)
    link = "https://www.music4dance.net/song/Index?filter=v2-Index"
    # A timeout keeps a stalled connection from hanging the caller forever.
    page = requests.get(link, timeout=30)
    # Surface HTTP failures directly rather than as a missing-script error.
    page.raise_for_status()
    soup = bs(page.content, "html.parser")
    dance_info_text = [str(v) for v in soup.find_all("script") if "environment" in str(v)][0]
    dance_info = json.loads(js_obj.search(dance_info_text).group(0))["dances"]
    wanted_keys = {"name", "id", "synonyms", "tempoRange", "songCount"}
    return pd.DataFrame(
        [
            {k: v for k, v in dance.items() if k in wanted_keys}
            for dance in dance_info
        ]
    )
if __name__ == "__main__":
    # Command-line entry point: scrape --page-count index pages and write the
    # resulting song table to the CSV file given by --out.
    parser = argparse.ArgumentParser()
    parser.add_argument("--page-count", default=2, type=int)
    parser.add_argument("--out", default="data/song.csv")
    args = parser.parse_args()
    out_path = Path(args.out)
    # Path.parent is "." for a bare filename, so "--out song.csv" is accepted;
    # os.path.dirname would return "" there, which never "exists".
    out_dir = out_path.parent
    if not out_dir.exists():
        # Stop before scraping: carrying on would download every page and
        # only then fail when the CSV cannot be written.
        raise SystemExit(f"Output location does not exist: {out_dir}")
    df = scrape_song_library(args.page_count)
    df.to_csv(out_path)