-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
163 lines (124 loc) · 5.91 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/python3
from search_functions import *
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
# Script-wide logger; create_logger is presumably supplied by the
# search_functions star import above.
logger = create_logger()

# Dispatch table: short service name -> the function that searches that
# service for album matches. Comment an entry out to skip that service.
service_dict = {
    'amazon': find_matches_amazon,
    'apple': find_matches_apple,
    'gaana': find_matches_gaana,
    # 'hungama': find_matches_hungama,
    'jiosaavn': find_matches_jiosaavn,
    'spotify': find_matches_spotify,
    'wynk': find_matches_wynk,
    'ytmusic': find_matches_ytmusic,
}

# CSV holding the original album data (must have an 'index' column).
orig_data_path = 'data/all_data_v14.csv'

# Match grid carried over from the previous round.
# For the very first run of the code, set input_csv_path = None here.
input_csv_path = 'data/match_grid_expanded_sep_23_wynk_update_rights_holder_final.csv'
def create_holding_dict(service_dict, orig_data_path, input_csv_path):
    """Return, for every service, the albums that still lack a matched URL.

    Args:
        service_dict: Mapping keyed by service name. Only the keys are used
            here.
        orig_data_path: Path of the CSV with the original album data. It is
            read with ``dtype=str`` and its ``index`` column becomes the
            DataFrame index.
        input_csv_path: Path of the match grid CSV produced by an earlier
            round (read the same way), or a falsy value such as ``None`` on
            the first run.

    Returns:
        A dict mapping each service name to an independent copy of the rows
        of the original data whose ``matched_url_<service>`` cell in the
        match grid is empty. On the first run, or when the grid has no
        ``matched_url_<service>`` column at all, that is every row.
    """
    import warnings

    orig_df = pd.read_csv(orig_data_path, dtype=str, index_col='index')

    # A falsy input_csv_path marks the first run: there is no grid yet.
    grid_df = None
    if input_csv_path:
        grid_df = pd.read_csv(input_csv_path, dtype=str, index_col='index')

    holding_dict = {}
    for service in service_dict:
        url_column_name = 'matched_url_' + service

        if grid_df is None:
            # First run: nothing has been matched, so search for everything.
            holding_dict[service] = orig_df.copy()
            continue

        if url_column_name not in grid_df.columns:
            # A service enabled after the grid was written has no column yet.
            # Treat it like a first run for that service instead of failing
            # with a KeyError, but say so: this re-searches every album.
            warnings.warn(
                "column '" + url_column_name + "' not found in "
                + str(input_csv_path) + "; treating every album as unmatched",
                stacklevel=2,
            )
            holding_dict[service] = orig_df.copy()
            continue

        # Index labels of the grid rows with no URL for this service yet.
        unmatched_index = grid_df.index[grid_df[url_column_name].isnull()]
        # .copy() keeps each service's frame independent of orig_df.
        holding_dict[service] = orig_df[
            orig_df.index.isin(unmatched_index)
        ].copy()

    return holding_dict
# Per-service DataFrames of the albums that still need searching.
holding_dict = create_holding_dict(service_dict, orig_data_path, input_csv_path)

# For the first run, set round_num = '01'.
round_num = '10'

# Search every service, in parallel threads, for the albums that have not
# gotten a match on it yet.
with ThreadPoolExecutor(max_workers=8) as pool:
    # Remember which service each future belongs to, so that a failure can be
    # attributed to its service in the log.
    future_to_service = {}
    for service, function in service_dict.items():
        # e.g. round_10/matches_spotify_round_10_partial.csv
        output_csv = 'round_{0}/matches_{1}_round_{0}_partial.csv'.format(
            round_num, service)
        future = pool.submit(
            function,
            # below are the search function's parameters
            albums_df=holding_dict[service],
            csv_name=output_csv,
            min_matched=5,
            min_checked=25,
            min_combos=6,
        )
        future_to_service[future] = service

    # Iterating the dict yields the futures; handle each as soon as it ends.
    for future in as_completed(future_to_service):
        try:
            future.result()
        except Exception:
            # One failing service must not stop the others; record the
            # traceback together with the service it came from.
            logger.exception(
                'search on %s failed',
                future_to_service[future],
                stack_info=True,
            )
# Post-processing pipeline. Each entry is
# (script inside other_functions/, message printed after it succeeds);
# the scripts are run one after another, in this order.
pipeline_steps = [
    # insert columns with generalised jaccard scores in partial csvs
    ('insert_genjacc_scores.py', 'insertion of gen jacc scores done'),
    # find match with highest gen jacc score from five possibilities
    ('choose_genjacc_match.py', 'choosing best gen jacc match done'),
    # optional -- print text file to double check chosen match manually
    ('create_txt_false_pos_check.py',
     'creation of txt for false positive check done'),
    # merge partial csvs together
    ('integrate_partial_csvs.py', 'merging of csvs done'),
    # download google results with links from infocards to albums on
    # various services
    ('download_google_results.py', 'downloading google results done'),
    # extract links from google infocards
    ('extract_google_links.py',
     'extracting links from downloaded results done'),
    # find title and artist information for links received from google
    # infocards
    ('extract_google_titles_artists.py',
     'extracting titles and artists for album links done'),
    # creating txt to check google results for false positives
    # if the album's not a match, just remove the pre-filled 'yes'
    ('create_txt_false_pos_check_google.py',
     'checking google results for false positives done'),
    # integrate results from google infocards
    ('integrate_google_results.py', 'integrating google results done done'),
    # next two scripts need to be reworked, commenting them out for now
    # # check if albums are playable/ are inactive / have less than 80% of
    # # tracks
    # ('check_playability.py', 'checking playability of albums done'),
    # # remove albums that arent playable, targets taken from
    # # for_manual_removal.csv
    # ('manual_removal.py', 'removal of unplayable albums done'),
    # find copyright and label info for matches
    ('find_label_info.py', 'finding label info done'),
    # come up with options for rights holder based on different criteria
    ('find_rights_holders.py', 'presenting options for rights holder done'),
    # choose a rights holder from different options
    ('choose_rights_holder.py', 'choosing rights holder done'),
    # calculate ratings and other scores
    ('calculate_scores.py', 'calculation of ratings and other scores done'),
    # make graphics for each genre based on scores
    ('create_graphics.py', 'creation of graphics done'),
    # calculating what percentage of a label's albums does a service have
    ('guess_label_tieups.py', 'creation of label_coverage.csv done'),
    # put % coverage of a label's albums in terms of ratings points
    ('translate_coverage_into_ratings.py',
     'translation of label coverage into ratings points done'),
    # print out the one music group deal that'll most move needle for each
    # service
    ('print_must_do_deals.py', 'printing of critical deals done'),
]


def _run_pipeline_step(script_name, done_message):
    """Run one other_functions/ script and report how it went.

    Args:
        script_name: File name of the script inside ``other_functions/``.
        done_message: Text printed when the script finishes successfully.

    Returns:
        True when ``os.system`` reported status 0, False otherwise.
    """
    command = 'python3 other_functions/' + script_name
    status = os.system(command)
    if status != 0:
        # Do not claim the step is "done" when the script reported failure.
        failure_message = '{0} FAILED with status {1}'.format(command, status)
        print(failure_message)
        logger.error(failure_message)
        return False
    print(done_message)
    return True


for script_name, done_message in pipeline_steps:
    # A failed step is reported but the remaining steps are still attempted,
    # as they were before the exit status was being checked.
    _run_pipeline_step(script_name, done_message)
# Example command line for running this script from cron or `at` on a Linux system:
# cd /absolute/path/to/folder && stdbuf -o0 -e0 /usr/bin/python3 ./main.py >> /absolute/path/to/folder/responses_logs/cron_logs/`date +\%Y-\%m-\%d-\%H:\%M`-music-search-cron.log 2>&1