-
Notifications
You must be signed in to change notification settings - Fork 0
/
desc_stats.py
186 lines (152 loc) · 6.57 KB
/
desc_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import json
import os
from PIL import Image
from icecream import ic
from tqdm import tqdm
import matplotlib.pyplot as plt
import squarify
import seaborn as sns
def get_genre_num(genre_type="genre", genres_path="data/genres.json"):
    """Count, print and return the number of genres or subgenres.

    An entry of the genre file is treated as a top-level genre when its
    "subgenre of" value is empty and as a subgenre when it is not.

    Args:
        genre_type: "genre" to count top-level genres, "subgenre" to count
            subgenres. Any other value matches nothing and yields 0.
        genres_path: JSON file mapping genre keys to genre records, each of
            which carries a "subgenre of" entry.

    Returns:
        The number of matching entries; the same number is printed.
    """
    with open(genres_path, encoding="utf-8") as f:
        genres = json.load(f)

    if genre_type == "genre":
        count = sum(1 for genre in genres.values() if not genre["subgenre of"])
    elif genre_type == "subgenre":
        count = sum(1 for genre in genres.values() if genre["subgenre of"])
    else:
        count = 0

    print(count)
    return count
def get_recording_num(artists_path="data/artists.json", releases_path="data/releases.json"):
    """Count releases per genre, labelling each release group with one genre.

    The artists file is read as JSON lines; every line is an object whose
    values are artist records with "genres" and "releases" lists. Each
    release group is assigned the genre with the highest "genre-count" of
    the first artist (in file order) that lists it and has any genres.

    The releases file is read as JSON lines too; every line contributes one
    count to the genre assigned to its "release-group-id".

    Args:
        artists_path: JSON-lines file with the artist records.
        releases_path: JSON-lines file with one release per line.

    Returns:
        A dict mapping genre name to the number of releases counted for it.
        Releases whose release group has no assigned genre are skipped.
    """
    genre_map = {}
    with open(artists_path, encoding="utf-8") as f:
        for line in f:
            for artist in json.loads(line).values():
                genres = artist["genres"]
                if not genres:
                    continue
                # max() returns the first maximal item, i.e. the same entry a
                # stable descending sort would put first.
                top_genre = max(genres, key=lambda g: g["genre-count"])["genre-name"]
                for rel in artist["releases"]:
                    # First artist wins: never overwrite an existing entry.
                    genre_map.setdefault(rel["release-group-id"], top_genre)

    genre_count = {}
    with open(releases_path, encoding="utf-8") as f:
        for line in f:
            genre = genre_map.get(json.loads(line)["release-group-id"])
            if genre is None:
                # Artists without genres never enter genre_map, so their
                # releases cannot be attributed to a genre.
                continue
            genre_count[genre] = genre_count.get(genre, 0) + 1
    return genre_count
def get_cover_num():
    """Print summary numbers for the cover files in both cover directories.

    Both directories are listed (ignoring ".DS_Store") and concatenated in
    order. Printed, one per line: the total number of file names, the number
    of distinct file names, the first name and the last name.
    """
    cover_locations = (
        "/Volumes/Data/covers/full_dataset/covers/500",
        "/Volumes/Data/covers/full_dataset/covers/large",
    )

    covers = []
    for location in cover_locations:
        for entry in os.listdir(location):
            if entry == ".DS_Store":
                continue
            covers.append(entry)

    total = len(covers)
    distinct = len(set(covers))
    print(total)
    print(distinct)
    print(covers[0])
    print(covers[-1])
def create_genre_map():
    """Group subgenres under their parent genre and preview one cover image.

    Step 1 loads the genre file and collects every entry whose "subgenre of"
    list is empty as a top-level genre. Every other entry is appended to the
    "subgenres" list of the genre named first in its "subgenre of" list,
    provided that genre is top-level.

    Step 2 scans the release dump line by line, stops at the first release
    whose "<id>.jpg" exists in the 500px cover directory, and opens that
    image in the default viewer. Nothing is shown when no release matches.

    Returns:
        The dict of top-level genres keyed by genre id, each with its
        collected "subgenres" list where any were found.
    """
    with open("/Users/nico/code/CulturalAnalytics-CoverPredictions/data/genres.json", encoding="utf-8") as f:
        genre_dict = json.load(f)

    genre_collection = {
        genre_id: genre_data
        for genre_id, genre_data in genre_dict.items()
        if not genre_data["subgenre of"]
    }
    for genre_data in genre_dict.values():
        parents = genre_data["subgenre of"]
        if not parents:
            continue
        parent_id = parents[0]["id"]
        if parent_id in genre_collection:
            # setdefault: the genre records are not guaranteed to ship with a
            # "subgenres" list already in place.
            genre_collection[parent_id].setdefault("subgenres", []).append(genre_data)
        # TODO: add subgenres that dont have the top level parent genre in their "subgenre of" list

    covers_path = "/Volumes/Data/covers/full_dataset/covers/500"
    # A set keeps the per-release membership test O(1).
    covers_500 = {cov for cov in os.listdir(covers_path) if cov != ".DS_Store"}

    file_path = "/Volumes/Data/covers/mb_db/release/mbdump/release"
    target = None
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            content = json.loads(line)
            cover_name = f"{content['id']}.jpg"
            if cover_name in covers_500:
                target = cover_name
                break

    if target is None:
        # No release in the dump has a downloaded cover: nothing to preview.
        return genre_collection

    # The context manager releases the file handle once the viewer is launched.
    with Image.open(f"{covers_path}/{target}") as img:
        img.show()
    return genre_collection
def count_genres():
    """Count artist genres over all releases that have a downloaded cover.

    For every release in the dump whose "<id>.jpg" exists in a cover
    directory, each genre name of the first credited artist is counted. The
    result is written to data/genre_count.json with sorted keys.

    The dump is read once. A release counts once per cover directory that
    contains its cover, which is what a separate pass per directory yields.
    NOTE(review): confirm that counting a cover present in both directories
    twice is intended.
    """
    covers_paths = [
        "/Volumes/Data/covers/full_dataset/covers/500",
        "/Volumes/Data/covers/full_dataset/covers/large",
    ]
    # One set of file names per directory keeps the per-release lookup O(1).
    cover_sets = [
        {cov for cov in os.listdir(covers_path) if cov != ".DS_Store"}
        for covers_path in covers_paths
    ]

    genre_collection = {}
    file_path = "/Volumes/Data/covers/mb_db/release/mbdump/release"
    with open(file_path, encoding="utf-8") as f:
        # No total is passed: the line count is unknown without an extra
        # pass, and peeking at the first line would drop that release.
        for line in tqdm(f):
            content = json.loads(line)
            cover_name = f"{content['id']}.jpg"
            hits = sum(cover_name in names for names in cover_sets)
            if not hits:
                continue
            for genre_option in content["artist-credit"][0]["artist"]["genres"]:
                name = genre_option["name"]
                genre_collection[name] = genre_collection.get(name, 0) + hits

    with open("data/genre_count.json", "w") as f:
        json.dump(genre_collection, f, indent=4, sort_keys=True)
def plot_genres():
    """Write sorted genre counts and draw one tree map per genre level.

    Reads data/genre_count.json (genre name -> count) and data/genres.json.
    Counts whose name belongs to an entry with an empty "subgenre of" list
    are treated as top-level genres; all remaining counts are treated as
    subgenres. Both groups are sorted by descending count and written to
    data/genre_count_sorted.json and data/subgenre_count_sorted.json.

    For each group a tree map is saved to vis/genre_tree_map.pdf or
    vis/subgenre_tree_map.pdf. A group whose counts sum to zero (including
    an empty group) is not plotted, because its shares are undefined.
    """
    with open("data/genre_count.json") as f:
        genre_collection = json.load(f)
    with open("data/genres.json") as f:
        genre_dict = json.load(f)

    # Set of top-level genre names for O(1) membership tests.
    top_level_names = {
        genre["name"] for genre in genre_dict.values() if not genre["subgenre of"]
    }

    genre_counts = {}
    subgenre_counts = {}
    for genre, count in genre_collection.items():
        if genre in top_level_names:
            genre_counts[genre] = count
        else:
            subgenre_counts[genre] = count

    sorted_genres = dict(sorted(genre_counts.items(), key=lambda item: item[1], reverse=True))
    sorted_subgenres = dict(sorted(subgenre_counts.items(), key=lambda item: item[1], reverse=True))

    with open("data/genre_count_sorted.json", "w") as f:
        json.dump(sorted_genres, f, indent=4)
    with open("data/subgenre_count_sorted.json", "w") as f:
        json.dump(sorted_subgenres, f, indent=4)

    plots = (
        (sorted_genres, "vis/genre_tree_map.pdf"),
        (sorted_subgenres, "vis/subgenre_tree_map.pdf"),
    )
    for counts, file_name in plots:
        releases = list(counts.values())
        total = sum(releases)
        if not total:
            # Shares (and the size normalisation) would divide by zero.
            continue

        # Normalize the release counts to fit the tree map
        norm_releases = squarify.normalize_sizes(releases, dx=100, dy=100)

        # Label only the tiles whose share of all releases exceeds 1 %
        shares = [rel / total * 100 for rel in releases]
        labels = [f"{round(share, 1)}%" if share > 1 else "" for share in shares]

        # Create a tree map with borders
        fig = plt.figure(figsize=(18, 8))
        colors = sns.color_palette("tab10", n_colors=len(releases))
        squarify.plot(sizes=norm_releases, label=labels, alpha=0.7, edgecolor='white', linewidth=2, color=colors)

        # Add title and labels
        plt.title('')
        plt.axis('off')

        # Save the plot as an image
        plt.savefig(file_name)
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
# Script entry: runs on import as well as on direct execution.
# count_genres() rewrites data/genre_count.json from the release dump; it is
# disabled here, so plot_genres() works from the existing count file.
#count_genres()
plot_genres()