"""
Given a input video (.mp4), outputs the probabilities of the k most
plausible words.
Also outputs a .avi file that shows the input of the neural network and
a summary bar graph
"""
import os
import sys
import argparse

import cv2
import face_recognition
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from model.inception import inception_model_fn

tf.logging.set_verbosity(tf.logging.FATAL)
plt.style.use(['dark_background', 'presentation.mplstyle'])
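# NOTE: 'presentation.mplstyle' is resolved as a file path, so the style
# sheet must be available from the working directory when the script runs.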
def videoToArray(path):
    """
    Capture every frame of a .mp4 file and save them to
    a 3D numpy array.
    Args :
        - path: path to the .mp4 file
    Returns :
        - 3D numpy array (height, width, n_frames)
        - number of frames
        - frames per second
    """
    vidObj = cv2.VideoCapture(path)
    # Some useful info about the video
    width = int(vidObj.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vidObj.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vidObj.get(cv2.CAP_PROP_FPS))
    n_frames = int(vidObj.get(cv2.CAP_PROP_FRAME_COUNT))
    print("Video info : {}x{}, {} frames".format(
        height,
        width,
        n_frames))
    # Create the numpy array that will host all the frames.
    # Could use np.append later in the loop, but preallocating
    # is more efficient.
    video = np.zeros((height, width, n_frames), dtype=np.uint8)
    # Iterate over every frame of the video
    i = 0
    while True:
        # Capture one frame
        success, frame = vidObj.read()
        if not success:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        # Save the grayscale frame to the 3D numpy array
        video[:, :, i] = frame
        i += 1
    return video, n_frames, fps
def frameAdjust(video):
    """
    Select a fixed number of frames from the input video
    Args :
        - 3D numpy array
    Returns :
        - Adjusted numpy array (29 frames)
    """
    target = 29
    n_frames = video.shape[2]
    if target == n_frames:
        print("Perfect number of frames !")
        return video
    elif n_frames > target:
        # If the number of frames is more than 29, select
        # 29 evenly distributed frames
        print("Adjusting number of frames")
        idx = np.linspace(0, n_frames - 1, target)
        idx = np.around(idx, 0).astype(np.int32)
        print("Indexes of the selected frames : \n{}".format(idx))
        return video[:, :, idx]
    else:
        # If the number of frames is less than 29, duplicate the last
        # frame at the end of the video
        output_video = np.zeros(
            (video.shape[0], video.shape[1], target)).astype(np.uint8)
        output_video[:, :, :n_frames] = video
        for i in range(n_frames, target):
            output_video[:, :, i] = video[:, :, n_frames - 1]
        return output_video
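# Example: a 57-frame clip gives np.linspace(0, 56, 29) = [0, 2, ..., 56],
# i.e. every other frame is kept; a 20-frame clip gets frame 19 repeated
# nine more times to reach 29 frames.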
def mouthCrop(video):
    """
    Crop a video around the mouth of the speaker
    Args :
        - 3D numpy array
    Returns :
        - Cropped numpy array
    """
    size = 64
    n_frames = 29
    cropped_video = np.zeros((size, size, n_frames)).astype(np.uint8)
    # For every frame of the video ...
    for i in range(n_frames):
        # Compute the face landmarks (right/left eye and nose tip)
        face_locations = face_recognition.face_landmarks(
            video[:, :, i],
            model="small"
        )
        if len(face_locations) == 0:
            sys.exit("No face detected in frame {}".format(i))
        # To make sure the crop around the mouth is just right (not too
        # zoomed in or zoomed out), the distance between the eyes is used
        # as a reference: the leftmost point of the left eye and the
        # rightmost point of the right eye give the size of the crop.
        left_point = face_locations[0]["left_eye"][0][0]
        right_point = face_locations[0]["right_eye"][1][0]
        crop_size = right_point - left_point
        # The selection is centered on the x coordinate of the nose tip
        crop_location_x = face_locations[0]["nose_tip"][0][0]
        crop_location_y = face_locations[0]["nose_tip"][0][1]
        selection = video[
            crop_location_y:crop_location_y + crop_size,
            crop_location_x - (crop_size // 2):crop_location_x + (crop_size // 2),
            i
        ]
        # Resize to the target size
        cropped_video[:, :, i] = cv2.resize(
            selection,
            dsize=(size, size),
            interpolation=cv2.INTER_LINEAR)
    return cropped_video
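# The crop heuristic assumes a roughly frontal, upright face: the mouth is
# taken to lie inside the square of side crop_size directly below the nose
# tip, horizontally centered on it.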
def reshapeAndConvert(video):
    """
    Reshape the video to a 4D array before feeding it to the model function.
    Also normalize pixel values from [0-255] to [0-1].
    Args :
        - 3D numpy array
    Returns :
        - 4D numpy array
    """
    size = video.shape[0]
    n_frames = video.shape[2]
    video = np.reshape(video, (1, size, size, n_frames)).astype(np.float32)
    return video / 255.0
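# Example: a (64, 64, 29) uint8 clip becomes a (1, 64, 64, 29) float32
# batch of one sample with values in [0, 1].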
def create_dict_word_list(path):
    '''
    Create a dict used to transform labels from int to str
    Args :
        - path : Path to the word list
    Returns :
        - Python dictionary {Label : Word}
    '''
    count = 0
    my_dict = dict()
    with open(path + 'word_list.txt', 'r') as f:
        for line in f:
            my_dict.update({count: line.rstrip()})
            count += 1
    return my_dict
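# word_list.txt is expected to contain one word per line, in label order,
# e.g. (illustrative contents):
#   ABOUT
#   ABSOLUTELY
#   ...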
# Debugging function
def _write_video(video, path, fps):
    writer = cv2.VideoWriter(
        path + ".avi",
        cv2.VideoWriter_fourcc(*"XVID"),
        fps,
        (256, 256)
    )
    # Undo the [0-1] normalization before writing
    video = video * 255
    for i in range(29):
        writer.write(
            cv2.resize(
                cv2.cvtColor(
                    video[0, :, :, i].astype('uint8'),
                    cv2.COLOR_GRAY2BGR
                ),
                dsize=(256, 256),
                interpolation=cv2.INTER_LINEAR
            )
        )
    writer.release()
parser = argparse.ArgumentParser()
parser.add_argument(
    '--file',
    default=None,
    help="Name/path of the video file"
)
parser.add_argument(
    '--checkpoint_path',
    default=None,
    help="Path to the checkpoint file"
)
parser.add_argument(
    '--output',
    default=".",
    help="Name of the output file"
)
parser.add_argument(
    '--k',
    default="10",
    help="Show top-k predictions"
)
if __name__ == '__main__':
    # Useful stuff
    args = parser.parse_args()
    assert os.path.isfile(args.file), "Video file not found"
    im_size = 64
    n_frames = 29
    params = {"num_classes": 500}
    word_dict = create_dict_word_list("data/")
    # Preprocessing
    print("Reading frames from {}".format(args.file))
    video, n_frames_original, fps = videoToArray(args.file)
    video = frameAdjust(video)
    print("Cropping video around the speaker's mouth (may take time)")
    video = mouthCrop(video)
    video = reshapeAndConvert(video)
    # Used for debugging, not important:
    # fps_output aligns the video seen by the model with the original video
    fps_output = int(fps * (n_frames / n_frames_original))
    _write_video(video, args.output, fps_output)
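    # Example: an 87-frame clip shot at 30 fps and reduced to 29 frames is
    # written at int(30 * 29 / 87) = 10 fps, preserving the clip's duration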
    # Create the classifier
    print("Creating classifier from {}".format(args.checkpoint_path))
    classifier = tf.estimator.Estimator(
        model_fn=inception_model_fn,
        params=params,
        model_dir=args.checkpoint_path
    )
    # Inference time !
    print("Computing predictions")
    predictions = classifier.predict(
        input_fn=tf.estimator.inputs.numpy_input_fn(
            {"x": video},
            batch_size=1,
            shuffle=False
        )
    )
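    # classifier.predict yields one dict per input sample; with a single
    # clip the generator holds exactly one element. The "classes" and
    # "probabilities" keys below are assumed to match what
    # inception_model_fn returns in its PREDICT spec.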
    # Print predictions
    predictions = list(predictions)[0]
    predicted_class = predictions["classes"]
    # Negating the probabilities makes argsort rank them in descending order
    top_k_classes = (-predictions["probabilities"]).argsort()[:int(args.k)]
    predicted_words = list()
    print("Predictions :")
    for label in top_k_classes:
        predicted_words.append(word_dict[label])
        print("* {} : {:.2f} %".format(
            word_dict[label],
            predictions["probabilities"][label] * 100
        ))
    # Draw plot and write a .png file
    print("Rendering prediction plot to {}.png".format(args.output))
    idx = [3 * i for i in range(int(args.k))]
    plt.figure(figsize=(int(args.k) // 2 + 5, 5))
    plt.bar(
        x=idx,
        height=predictions["probabilities"][top_k_classes],
        color="goldenrod"
    )
    plt.xlabel('Words')
    plt.ylabel('Probabilities')
    plt.title("Top-{} most plausible words".format(args.k))
    plt.xticks(idx, predicted_words)
    plt.savefig(
        args.output + ".png",
        transparent=False,
        bbox_inches="tight"
    )
    print("Done.")