-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathimdbAnalyzer.py
148 lines (132 loc) · 4.75 KB
/
imdbAnalyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import re
import json
import pickle
# NOTE: This script is terse and not very beginner-friendly.
# Python objects are saved in binary (pickle) format.
# IMPORTANT: 'movies' can refer to series as well... That's IMDb, folks!
# - obj/acts.pkl: dict where each key is an actor and the value is that actor's movies
# - obj/directs.pkl: dict where each key is a director and the value is that director's movies
# - obj/movies_actors.pkl: all actors who participated in the movies from acts AND directs
# - obj/movies_directors.pkl: all directors who participated in the movies from acts AND directs
def save_obj(obj, name):
    """Pickle *obj* to ``obj/<name>.pkl`` using the highest pickle protocol.

    The target directory is created when it does not exist yet, so the first
    run in a fresh checkout no longer fails with ``FileNotFoundError``.

    Args:
        obj: Any picklable Python object.
        name: File stem, WITHOUT the ``.pkl`` extension.
    """
    import os  # local import: the file's import header does not provide os

    path = 'obj/' + name + '.pkl'
    # Create 'obj/' (and any sub-directory embedded in *name*) on demand.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    """Read back the object stored in ``obj/<name>.pkl``.

    Args:
        name: File stem, WITHOUT the ``.pkl`` extension.

    Returns:
        Whatever object was pickled into that file.
    """
    pickle_path = 'obj/' + name + '.pkl'
    # NOTE(review): pickle.load executes arbitrary code embedded in the file;
    # only load pickles this project produced itself.
    with open(pickle_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored
class Act(object):
    """A person to look for, identified by a first and a last name.

    Attributes are kept under their original short names:
    ``f`` is the first name and ``l`` is the last name.
    """

    @staticmethod
    def wsearch(w):
        """Return a whole-word, case-insensitive search function for *w*.

        The result is the bound ``search`` method of a compiled pattern, so it
        can be called as ``wsearch("uma")(text)`` and yields a match object or
        ``None``.

        *w* is escaped before being placed in the pattern, so characters such
        as ``.`` or ``(`` in a name are matched literally instead of being
        interpreted as regex syntax.
        """
        pattern = r'\b({0})\b'.format(re.escape(w))
        return re.compile(pattern, flags=re.IGNORECASE).search

    def __init__(self, first, last):
        self.f = first
        self.l = last

    def regexp(self):
        """Return ``(first_name_search, last_name_search)`` for this person."""
        return (Act.wsearch(self.f), Act.wsearch(self.l))

    def __repr__(self):
        return f"f: {self.f} l: {self.l}"
## Finding Actors Information!
# Alternative configuration kept for reference (actors instead of directors):
#act_n = ["Uma Thurman", "Harvey Keitel", "Bill Murray", "Frances McDormand"]
#gen_st = "acts"
# People to look for, written as "First Last".
act_n = ["Quentin Tarantino", "Wes Anderson"]
# NOTE(review): gen_st is not read anywhere in the visible code; presumably
# the pickle name matching the current configuration -- confirm.
gen_st = "directs"
# Alternative input files kept for reference:
#act_man_f = "actors.list"
#act_woman_f = "actresses.list"
# The two list files that main() opens side by side.
act_man_f = "directors.list"
# NOTE(review): "t" looks like a placeholder second file -- confirm.
act_woman_f = "t"
# One Act per configured name, built from the lower-cased first two
# whitespace-separated tokens of the name.
act_l = [
    Act(parts[0].lower(), parts[1].lower())
    for parts in map(str.split, act_n)
]
# Filled by the builder functions: person -> list of titles.
act_d = {}
# Filled by mov_actors_builder: title -> set of people.
mov_d = {}
def st_op(t):
    """Return the index where the title part of *t* ends.

    That index is the position of the first ``(`` that is immediately
    followed by a digit or ``?`` (presumably the release-year marker, e.g.
    ``(1994)`` or ``(????)``), or of a ``(`` that is the very last character.
    Any other ``(`` (e.g. ``(TV)`` or ``(I)``) is skipped.

    When no such parenthesis exists, ``len(t)`` is returned so that
    ``t[:st_op(t)]`` keeps the whole string. The previous implementation
    returned ``-1`` in that situation whenever *t* started with a digit or
    ``?``, which made callers silently drop the last character; it also gave
    up after 20 skipped parentheses.

    Args:
        t: The text to scan.

    Returns:
        An index in ``range(len(t) + 1)`` suitable as a slice end.
    """
    pos = t.find("(")
    while pos != -1:
        follower = t[pos + 1:pos + 2]  # '' when '(' is the last character
        if not follower or follower.isdigit() or follower == '?':
            return pos
        # Not a year marker: continue the search after this parenthesis.
        pos = t.find("(", pos + 1)
    return len(t)
def find_person_movie(l, f):
    """Collect the titles of a tracked person if *l* is that person's header.

    A header is a non-empty line that does not start with a tab. Its name
    field (text before the first tab, commas removed) is tested against each
    entry of ``act_l``; a person matches when both of their name searches hit
    and ``act_d`` has no non-empty entry for them yet. On a match, the title
    on the header line plus the titles on all following tab-indented lines
    are stored in ``act_d`` under the person's configured name.

    Args:
        l: The current line of the list file.
        f: The open file; further lines are consumed with ``readline()``.

    Returns:
        *l* unchanged when nothing was consumed, otherwise the first line
        read after the consumed block ('' at end of file).
    """
    if not l or l[0] == '\t':
        return l

    for pos, person in enumerate(act_l):
        # Re-derived on every pass: l may have advanced after an earlier match.
        name_field = l[:l.find('\t')].replace(",", "")
        both_names_found = True
        for search in person.regexp():
            if not search(name_field):
                both_names_found = False
                break
        if not both_names_found or act_d.get(act_n[pos], None):
            continue

        # The first title shares the header line with the name.
        titles = [l[l.find('\t'):st_op(l)].strip().replace("\t", "").replace("\n", "")]
        l = f.readline()
        while l and l[0] == '\t':
            entry = l.strip().replace("\t", "").replace("\n", "")
            titles.append(entry[:st_op(entry)].strip())
            l = f.readline()
        act_d[act_n[pos]] = titles
    return l
def act_d_builder(l, f):
    """Record every person found in the list file, not just the tracked ones.

    When *l* is a header line (non-empty, not starting with a tab), the name
    before the first tab is printed. If ``act_d`` has no non-empty entry for
    that name, the title on the header line and the titles on all following
    tab-indented lines are stored in ``act_d`` under that name.

    Args:
        l: The current line of the list file.
        f: The open file; further lines are consumed with ``readline()``.

    Returns:
        The current line after processing, or ``None`` when the header's
        name field is empty.
    """
    if not l or l[0] == '\t':
        return l

    person = l[:l.find('\t')].strip()
    print(person)
    if act_d.get(person, None):
        return l
    if not person:
        # NOTE(review): yields None rather than the line -- callers that loop
        # on the returned value will see this as end of input.
        return None

    # The first title shares the header line with the name.
    titles = [l[l.find('\t'):st_op(l)].strip().replace("\t", "").replace("\n", "")]
    l = f.readline()
    while l and l[0] == '\t':
        entry = l.strip().replace("\t", "").replace("\n", "")
        titles.append(entry[:st_op(entry)])
        l = f.readline()
    act_d[person] = titles
    return l
def mov_actors_builder(l, f, movd):
    """Register the person on header line *l* under each of their known titles.

    For every title of this person that is contained in *movd*, the person's
    name is added to the set ``mov_d[title]``.

    The title on the header line itself is now registered as well. Previously
    it was extracted but never used, so each person's first title was missing
    from ``mov_d`` (the sibling builders do include it).

    Args:
        l: The current line of the list file.
        f: The open file; further lines are consumed with ``readline()``.
        movd: Collection of titles of interest (membership-tested with ``in``).

    Returns:
        The first line read after the person's block ('' at end of file),
        *l* itself when it is not a header line, or ``None`` when the
        header's name field is empty.
    """
    if l and l[0] != '\t':
        idx = l[:l.find('\t')].strip()
        if not idx:
            # NOTE(review): kept for backward compatibility -- this yields
            # None, which a caller looping on the result treats as end of input.
            return None

        def _record(title):
            """Add the current person to mov_d[title] when the title is tracked."""
            if title in movd:
                mov_d.setdefault(title, set()).add(idx)
                # Leftover debug trace for one specific title; output unchanged.
                if title == "Plain Pleasures":
                    print(f"Adicionado em {title} = {idx}")

        # The first title shares the header line with the name.
        _record(l[l.find('\t'):st_op(l)].strip().replace("\t", "").replace("\n", ""))
        l = f.readline()
        while l and l[0] == '\t':
            mv = l.strip().replace("\t", "").replace("\n", "")
            _record(mv[:st_op(mv)].strip())
            l = f.readline()
    return l
def build_movieDict():
    """Return the set of all titles found in the 'acts' and 'directs' pickles.

    Both pickles are loaded with ``load_obj``; every element of every value
    of the two loaded dicts ends up in the returned set.
    """
    acting, directing = load_obj("acts"), load_obj("directs")
    titles = set()
    for credits in (*acting.values(), *directing.values()):
        titles.update(credits)
    return titles
def print_dict(name):
    """Print the pickled dict ``obj/<name>.pkl``, one key per paragraph.

    Each key is printed on its own line, followed by its values indented with
    two tabs and then an empty line.

    Args:
        name: File stem -- do NOT include the ``.pkl`` extension.
    """
    for key, entries in load_obj(name).items():
        print(key)
        for entry in entries:
            print(f"\t\t{entry}")
        print("")
def main():
    """Script entry point.

    As written, this prints the contents of the 'directs' pickle and exits.
    NOTE(review): everything after ``exit()`` is unreachable; it looks like
    the scanning pipeline that is toggled by editing this function -- confirm
    before removing either part.
    """
    print_dict("directs")
    exit()
    # ---- unreachable while the exit() above is in place ----
    # NOTE(review): movd is built but not used below.
    movd = build_movieDict()
    # Walk both list files in lockstep, one line from each per iteration.
    with open(act_man_f, "r", encoding='latin-1') as m, open(act_woman_f, "r", encoding='latin-1') as w:
        c = 0
        lm = m.readline()
        lw = w.readline()
        while lm or lw:
            lm, lw = find_person_movie(lm, m), find_person_movie(lw, w)
            # NOTE(review): this overwrites the lines just returned above, so
            # the line following a consumed person block is never examined.
            lm, lw = m.readline(), w.readline()
            # Progress report every 50000 iterations.
            if c%50000 == 0:
                print(f"{c}º iteration...")
                #input("continua...")
            c+=1
    #save_obj(act_d,"directs")
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()