# 400kgendata.py
# Forked from gcrois/transcript-ml
# Generates the data for the 400k dataset #
from definitions import *
import os
import numpy as np
import cv2
from sklearn.utils import shuffle
import time
import pickle
# Load a single image file from disk #
def read_image(file_path):
    """Return the image stored at ``file_path`` as decoded by OpenCV.

    The file is read with the ``cv2.IMREAD_COLOR`` flag, i.e. as a colour
    image rather than grayscale.

    Args:
        file_path: Path of the image file to read.

    Returns:
        Whatever ``cv2.imread`` returns for this path. NOTE(review): per the
        OpenCV documentation that is ``None`` when the file cannot be read,
        and this function does not check for it.
    """
    return cv2.imread(file_path, cv2.IMREAD_COLOR)
# Load every image of one class directory into arrays #
def prep_data(images):
    """Load all image files found directly inside one class directory.

    ``ROWS``, ``COLS``, ``CHANNELS`` and ``classification`` are not defined
    in this file; presumably they come from ``from definitions import *``.

    Args:
        images: Path of a directory of image files. Its last path component
            is used as the key into ``classification``.

    Returns:
        A ``(data, y)`` tuple. ``data`` is a ``uint8`` array of shape
        ``(n_files, ROWS, COLS, CHANNELS)`` with one image per row; ``y`` is
        a ``uint8`` array of shape ``(n_files, 1)`` in which every entry is
        the label looked up for this directory.

    Raises:
        StopIteration: If ``os.walk`` yields nothing for ``images`` (for
            example when the directory does not exist).
    """
    # Take only the regular files of the top level. The arrays are sized
    # from this list, so the loop must iterate the same list: os.listdir()
    # would also return sub-directories and overrun the arrays. #
    _, _, files = next(os.walk(images))
    length = len(files)

    # make an array for the images #
    data = np.zeros((length, ROWS, COLS, CHANNELS), dtype=np.uint8)
    # make an array for the identities #
    y = np.zeros((length, 1), dtype=np.uint8)
    if not files:
        # Nothing to load; skip the label lookup as well. #
        return data, y

    # Every file in the directory shares one label, so look it up once.
    # Stripping "/" keeps the lookup working for paths like "a/b/". #
    class_name = images.rstrip("/").split("/")[-1]
    y[:, 0] = classification[class_name]

    for i, filename in enumerate(files):
        data[i, :] = read_image(images + '/' + filename)
    return data, y
############### Run data processing ###############################
# Directory whose sub-directories are each passed to prep_data() #
myData = 'data/400k/sorted/'
xf = []
yf = []
print("starting data processing")
tic = time.time()
# Loop through directories #
for entry in os.listdir(myData):
    print(entry)
    tmp = myData + entry
    # Plain files sitting next to the class directories are skipped #
    if os.path.isfile(tmp):
        continue
    class_x, class_y = prep_data(tmp)
    xf.append(class_x)
    yf.append(class_y)
print("data processing done. Took:", round(time.time() - tic, 2), " seconds")

# Fail loudly instead of silently pickling an empty dataset #
if not xf:
    raise SystemExit("no class directories found in " + myData)

# Fix array dimensionality #
# Join the per-directory batches along the sample axis. This keeps the data
# in contiguous uint8 arrays rather than building a Python list that holds
# one array object per sample. #
X = np.concatenate(xf)
Y = np.concatenate(yf)
# Shuffle images and labels together so each image keeps its label #
X, Y = shuffle(X, Y)

# The context manager guarantees the file is flushed and closed #
with open("data/400k_training_data.pickle", "wb") as out_file:
    pickle.dump((X, Y), out_file, protocol=4)