-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathimportdenvertrip.m
148 lines (129 loc) · 6.17 KB
/
importdenvertrip.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
function denverbcycletripdatapublic = importdenvertrip(filename, startRow, endRow)
%IMPORTFILE Import numeric data from a text file as a matrix.
% DENVERBCYCLETRIPDATAPUBLIC = IMPORTFILE(FILENAME) Reads data from text
% file FILENAME for the default selection.
%
% DENVERBCYCLETRIPDATAPUBLIC = IMPORTFILE(FILENAME, STARTROW, ENDROW)
% Reads data from rows STARTROW through ENDROW of text file FILENAME.
%
% Example:
% denverbcycletripdatapublic = importfile('2016denverbcycletripdata_public.csv', 2, 419612);
%
% See also TEXTSCAN.
% Auto-generated by MATLAB on 2017/12/05 02:00:31
%% Initialize variables.
delimiter = ',';
if nargin<=2
startRow = 2;
endRow = inf;
end
%% Read columns of data as text:
% For more information, see the TEXTSCAN documentation.
formatSpec = '%q%q%q%q%q%q%q%q%q%q%q%[^\n\r]';
%% Open the text file.
fileID = fopen(filename,'r');
%% Read columns of data according to the format.
% This call is based on the structure of the file used to generate this
% code. If an error occurs for a different file, try regenerating the code
% from the Import Tool.
dataArray = textscan(fileID, formatSpec, endRow(1)-startRow(1)+1, 'Delimiter', delimiter, 'TextType', 'string', 'HeaderLines', startRow(1)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n');
for block=2:length(startRow)
frewind(fileID);
dataArrayBlock = textscan(fileID, formatSpec, endRow(block)-startRow(block)+1, 'Delimiter', delimiter, 'TextType', 'string', 'HeaderLines', startRow(block)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n');
for col=1:length(dataArray)
dataArray{col} = [dataArray{col};dataArrayBlock{col}];
end
end
%% Close the text file.
fclose(fileID);
%% Convert the contents of columns containing numeric text to numbers.
% Replace non-numeric text with NaN.
raw = repmat({''},length(dataArray{1}),length(dataArray)-1);
for col=1:length(dataArray)-1
raw(1:length(dataArray{col}),col) = mat2cell(dataArray{col}, ones(length(dataArray{col}), 1));
end
numericData = NaN(size(dataArray{1},1),size(dataArray,2));
for col=[2,4,11]
% Converts text in the input cell array to numbers. Replaced non-numeric
% text with NaN.
rawData = dataArray{col};
for row=1:size(rawData, 1)
% Create a regular expression to detect and remove non-numeric prefixes and
% suffixes.
regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)';
try
result = regexp(rawData(row), regexstr, 'names');
numbers = result.numbers;
% Detected commas in non-thousand locations.
invalidThousandsSeparator = false;
if numbers.contains(',')
thousandsRegExp = '^\d+?(\,\d{3})*\.{0,1}\d*$';
if isempty(regexp(numbers, thousandsRegExp, 'once'))
numbers = NaN;
invalidThousandsSeparator = true;
end
end
% Convert numeric text to numbers.
if ~invalidThousandsSeparator
numbers = textscan(char(strrep(numbers, ',', '')), '%f');
numericData(row, col) = numbers{1};
raw{row, col} = numbers{1};
end
catch
raw{row, col} = rawData{row};
end
end
end
dateFormatIndex = 1;
blankDates = cell(1,size(raw,2));
anyBlankDates = false(size(raw,1),1);
invalidDates = cell(1,size(raw,2));
anyInvalidDates = false(size(raw,1),1);
for col=[6,9]% Convert the contents of columns with dates to MATLAB datetimes using the specified date format.
try
dates{col} = datetime(dataArray{col}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss'); %#ok<AGROW>
catch
try
% Handle dates surrounded by quotes
dataArray{col} = cellfun(@(x) x(2:end-1), dataArray{col}, 'UniformOutput', false);
dates{col} = datetime(dataArray{col}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss'); %#ok<AGROW>
catch
dates{col} = repmat(datetime([NaN NaN NaN]), size(dataArray{col})); %#ok<AGROW>
end
end
dateFormatIndex = dateFormatIndex + 1;
blankDates{col} = dataArray{col} == '';
anyBlankDates = blankDates{col} | anyBlankDates;
invalidDates{col} = isnan(dates{col}.Hour) - blankDates{col};
anyInvalidDates = invalidDates{col} | anyInvalidDates;
end
dates = dates(:,[6,9]);
blankDates = blankDates(:,[6,9]);
invalidDates = invalidDates(:,[6,9]);
%% Split data into numeric and string columns.
rawNumericColumns = raw(:, [2,4,11]);
rawStringColumns = string(raw(:, [1,3,5,7,8,10]));
%% Replace non-numeric cells with NaN
R = cellfun(@(x) ~isnumeric(x) && ~islogical(x),rawNumericColumns); % Find non-numeric cells
rawNumericColumns(R) = {NaN}; % Replace non-numeric cells
%% Make sure any text containing <undefined> is properly converted to an <undefined> categorical
for catIdx = [1,2,3,4,5,6]
idx = (rawStringColumns(:, catIdx) == "<undefined>");
rawStringColumns(idx, catIdx) = "";
end
%% Create output variable
denverbcycletripdatapublic = table;
denverbcycletripdatapublic.UsersProgram = categorical(rawStringColumns(:, 1));
denverbcycletripdatapublic.UserID = cell2mat(rawNumericColumns(:, 1));
denverbcycletripdatapublic.MembershipType = categorical(rawStringColumns(:, 2));
denverbcycletripdatapublic.Bike = cell2mat(rawNumericColumns(:, 2));
denverbcycletripdatapublic.CheckoutDate = categorical(rawStringColumns(:, 3));
denverbcycletripdatapublic.CheckoutTime = dates{:, 1};
denverbcycletripdatapublic.CheckoutKiosk = categorical(rawStringColumns(:, 4));
denverbcycletripdatapublic.ReturnDate = categorical(rawStringColumns(:, 5));
denverbcycletripdatapublic.ReturnTime = dates{:, 2};
denverbcycletripdatapublic.ReturnKiosk = categorical(rawStringColumns(:, 6));
denverbcycletripdatapublic.DurationMinutes = cell2mat(rawNumericColumns(:, 3));
% For code requiring serial dates (datenum) instead of datetime, uncomment
% the following line(s) below to return the imported dates as datenum(s).
% denverbcycletripdatapublic.CheckoutTime=datenum(denverbcycletripdatapublic.CheckoutTime);denverbcycletripdatapublic.ReturnTime=datenum(denverbcycletripdatapublic.ReturnTime);