-
Notifications
You must be signed in to change notification settings - Fork 0
/
map_subgenres.py
84 lines (75 loc) · 3.39 KB
/
map_subgenres.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
def get_genre_info(subgenre_ids: list):
"""
looks up a genre_id in music-brains and extracts relevant genre information
:param subgenre_ids: list of tuples containing genre id and genre name
:return: no return
"""
url_template = "https://musicbrainz.org/genre/"
genre_collection = {}
for subgenre_id in tqdm(subgenre_ids):
url = url_template + subgenre_id[0] # concat url and genre id
response = requests.get(url)
if response.status_code == 200:
# parse html text
soup = BeautifulSoup(response.content, parser="html.parser", features="lxml")
# extract all tables to a list
tables = soup.select('table[class="details"]')
# check every found list
for table in tables:
# target table contains spans of class genrelink
if "genrelink" in str(table): # ugly, better change to check for classes
# prepare dict
genre_info = {
"genre": subgenre_id[0],
"name": subgenre_id[1],
"subgenre of": [],
"subgenres": [],
"has fusion genres": [],
"fusion of": [],
"influenced by": [],
"influenced genres": []
}
# the genre items are in tr tags
genre_items = table.select("tr")
for item in genre_items:
# entry type (subgenre of, subgenres, influenced by, ...) are in th tags
entry_type = item.select("th")
# always only one item in list
if entry_type[0].text:
# genres are in <a> tags with their genre id in href
genres = item.select('a[href]')
for genre in genres:
# remove leading /genre/
genre_id = genre.get("href").replace("/genre/", "")
info = {
"title": genre.text,
"id": genre_id
}
dict_name = entry_type[0].text[:-1] # remove colon at the end of dict key
genre_info[dict_name].append(info)
genre_collection[subgenre_id[0]] = genre_info
with open("data/genres.json", "w", encoding="utf-8") as file_out:
json.dump(genre_collection, file_out)
def count_file_lines(filename):
print("Counting lines...")
with open(filename, 'r') as file:
return sum(1 for _ in file)
genres_found = []
file_name = "/Volumes/Data/musicbrainz/artist/mbdump/artist"
get_total = count_file_lines(file_name)
with open(file_name) as f:
for line in tqdm(f, total=get_total):
json_data = json.loads(line)
genres_found.append(json_data)
ids = []
for genre_data in genres_found:
for k, v in tqdm(genre_data.items()):
for genre_dict in v["genres"]:
name_and_id = (genre_dict["genre-id"], genre_dict["genre-name"])
if name_and_id not in ids:
ids.append(name_and_id)
get_genre_info(ids)