# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
"""
Three functions for converting data generated by E-Prime experiment to more
useable csv format.
1. etext_to_rcsv: Converts exported 'E-Prime text' file to reduced csv based
on desired column headers. Make sure, when exporting the edat file as
'E-Prime text', that Unicode is turned off.
2. text_to_csv: Converts text file produced by successful completion of
E-Prime experiment to csv. Output from text_to_csv can be used to deduce
information necessary for text_to_rcsv (e.g. columns to merge, columns to
rename, etc.). These variables would then be saved in the task-specific json
file.
3. text_to_rcsv: Converts text file produced by successful completion of
E-Prime experiment to reduced csv, using information from the variables
contained in headers.pickle. The output from this should be
indistinguishable from the output of etext_to_rcsv, only without the tedious
step of exporting the 'E-Prime text' file by hand.
command line usage: python convert_eprime.py [function_name] [inputs]
"""
from __future__ import print_function
from builtins import range
import os
import sys
import json
import inspect
from collections import OrderedDict
import numpy as np
import pandas as pd
from .utils import remove_unicode
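
# For illustration, a task-specific parameter file of the kind read below
# might look roughly like this. The key names are the ones this module
# actually reads; the values are hypothetical (real config files live in
# config_files/, e.g. nac_stopsignal.json):
#
#     {
#         "headers": ["Subject", "TrialList", "StopSignal.RT"],
#         "rem_nulls": true,
#         "null_cols": ["StopSignal.RT"],
#         "replace_dict": {".edat2": {"StopSignal.RT": "SS.RT"}},
#         "merge_cols": {"RT": ["Go.RT", "StopSignal.RT"]}
#     }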


def etext_to_rcsv(in_file, param_file, out_file=None):
    """
    Reads exported 'E-Prime text' file, reduces columns based on task-specific
    list of headers, and writes out reduced csv.

    Converts exported 'E-Prime text' file to reduced csv based on desired
    column headers. Make sure, when exporting the edat file as 'E-Prime text',
    that Unicode is turned off.

    Parameters
    ----------
    in_file : str
        Exported E-Prime text file to convert and reduce.
    param_file : str
        A json file with relevant task-specific parameters.
    out_file : str, optional
        Name of output file (csv format) to generate. If not set, a file will
        be written out with the same name as the input file, but with a csv
        suffix instead of txt.

    Examples
    --------
    >>> from convert_eprime.convert import etext_to_rcsv
    >>> etext_file = 'subj0001_stop_signal_task-0.txt'
    >>> param_file = '../config_files/nac_stopsignal.json'
    >>> etext_to_rcsv(etext_file, param_file)  # doctest: +ALLOW_UNICODE
    Output file successfully created- subj0001_stop_signal_task-0.csv
    """
    with open(param_file, 'r') as file_object:
        param_dict = json.load(file_object)

    filename, suffix = os.path.splitext(in_file)
    if suffix == '.txt':
        # Remove first three lines of exported E-Prime tab-delimited text file.
        rem_lines = list(range(3))
        delimiter_ = '\t'
    elif suffix == '.csv':
        # Remove no lines of comma-delimited csv file.
        rem_lines = []
        delimiter_ = ','
    else:
        raise ValueError('File not txt or csv: {0}'.format(in_file))

    df = pd.read_csv(in_file, skiprows=rem_lines, sep=delimiter_)
    header_list = param_dict.get('headers')
    df = df[header_list]
    if param_dict.get('rem_nulls', False):
        df = df.dropna(axis=0, how='all')

    if out_file is None:
        out_file = filename + '.csv'
    df.to_csv(out_file, index=False)
    print('Output file successfully created- {0}'.format(out_file))


def text_to_csv(text_file, out_file):
    """
    Converts text file produced by successful completion of E-Prime experiment
    to csv. Output from text_to_csv can be used to determine the information
    necessary for text_to_rcsv (e.g. columns to merge, columns to rename,
    etc.).

    Parameters
    ----------
    text_file : str
        Raw E-Prime text file to convert.
    out_file : str
        Name of output file (csv format) to generate.

    Examples
    --------
    >>> from convert_eprime.convert import text_to_csv
    >>> in_file = 'subj0001_stop_signal_task-0.txt'
    >>> out_file = 'subj0001_0.csv'
    >>> text_to_csv(in_file, out_file)  # doctest: +ALLOW_UNICODE
    Output file successfully created- subj0001_0.csv
    """
    df = _text_to_df(text_file)
    df.to_csv(out_file, index=False)
    print('Output file successfully created- {0}'.format(out_file))


def text_to_rcsv(text_file, edat_file, param_file, out_file):
    """
    Converts text file produced by successful completion of E-Prime experiment
    to reduced csv. Considerably more complex than text_to_csv.

    Parameters
    ----------
    text_file : str
        Raw E-Prime text file to convert.
    edat_file : str
        Raw E-Prime edat file paired with text_file. Only used for its file
        type, because file types sometimes differ between versions of E-Prime
        (edat vs. edat2 suffix).
    param_file : str
        A json file with relevant task-specific parameters.
    out_file : str
        Name of output file (csv format) to generate.

    Examples
    --------
    >>> from convert_eprime.convert import text_to_rcsv
    >>> in_file = 'subj0001_stop_signal_task-0.txt'
    >>> edat_file = 'subj0001_stop_signal_task-0.edat2'
    >>> out_file = 'subj0001_0.csv'
    >>> param_file = '../config_files/nac_stopsignal.json'
    >>> text_to_rcsv(in_file, edat_file, param_file, out_file)  # doctest: +ALLOW_UNICODE
    Output file successfully created- subj0001_0.csv
    """
    with open(param_file, 'r') as file_object:
        param_dict = json.load(file_object)
    df = _text_to_df(text_file)

    # Rename columns. The replace_dict is keyed by edat suffix because column
    # names can differ between versions of E-Prime.
    _, edat_suffix = os.path.splitext(edat_file)
    replace_dict = param_dict.get('replace_dict')
    if replace_dict:
        replacements = replace_dict.get(edat_suffix)
        df = df.rename(columns=replacements)

    # Merge columns: each new column is the string concatenation of its source
    # columns, with NaNs treated as empty strings.
    merge_cols = param_dict.get('merge_cols', {})
    for col in merge_cols:
        df[col] = df[merge_cols[col]].fillna('').sum(axis=1)

    # Drop NaNs based on specific columns.
    if param_dict.get('rem_nulls', False):
        df = df.dropna(subset=param_dict.get('null_cols'), how='all')

    # Reduce DataFrame to desired columns.
    header_list = param_dict.get('headers')
    df = df[header_list]

    # Write out reduced csv.
    df.to_csv(out_file, index=False)
    print('Output file successfully created- {0}'.format(out_file))


def _text_to_df(text_file):
    """
    Convert a raw E-Prime output text file into a pandas DataFrame.
    """
    # Load the text file as a list of lines.
    with open(text_file, 'rb') as fo:
        text_data = list(fo)

    # Remove unicode characters.
    filtered_data = [remove_unicode(row.decode('utf-8', 'ignore')) for row in text_data]

    # Determine where rows begin and end.
    start_index = [i for i, row in enumerate(filtered_data) if row == '*** LogFrame Start ***']
    end_index = [i for i, row in enumerate(filtered_data) if row == '*** LogFrame End ***']
    if (len(start_index) != len(end_index) or not start_index or
            start_index[0] >= end_index[0]):
        print('Warning: LogFrame Starts and Ends do not match up.',
              'Including header metadata just in case.')
        # In cases of an experiment crash, the final LogFrame is never
        # written, and the experiment metadata (Subject, VersionNumber, etc.)
        # isn't collected by the indices above. We can manually include the
        # metadata-containing Header Frame to collect these data from a
        # partial-run crash dump.
        start_index = [i for i, row in enumerate(filtered_data)
                       if row == '*** Header Start ***'] + start_index
        end_index = [i for i, row in enumerate(filtered_data)
                     if row == '*** Header End ***'] + end_index
    n_rows = min(len(start_index), len(end_index))

    # Find column headers and remove duplicates.
    headers = []
    data_by_rows = []
    for i in range(n_rows):
        one_row = filtered_data[start_index[i] + 1:end_index[i]]
        data_by_rows.append(one_row)
        for col_val in one_row:
            split_header_idx = col_val.index(':')
            headers.append(col_val[:split_header_idx])
    headers = list(OrderedDict.fromkeys(headers))

    # Preallocate a rows-by-headers object matrix of NaNs.
    data_matrix = np.empty((n_rows, len(headers)), dtype=object)
    data_matrix[:] = np.nan

    # Fill the matrix with the value from each frame, matched to its header.
    for i in range(n_rows):
        for cell_data in data_by_rows[i]:
            split_header_idx = cell_data.index(':')
            for k_header, header in enumerate(headers):
                if cell_data[:split_header_idx] == header:
                    data_matrix[i, k_header] = cell_data[split_header_idx + 1:].lstrip()

    df = pd.DataFrame(columns=headers, data=data_matrix)

    # Columns with a single value at the beginning, the end, or end - 1 are
    # treated as run-level constants and filled with that value. NaN != NaN,
    # so the self-comparison below selects the non-null cells.
    for col in df.columns:
        non_nan_idx = np.where(df[col].values == df[col].values)[0]
        if len(non_nan_idx) == 1 and non_nan_idx[0] in [0, df.shape[0] - 1,
                                                        df.shape[0] - 2]:
            df.loc[:, col] = df.loc[non_nan_idx[0], col]
    return df
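

if __name__ == '__main__':
    # A minimal command-line dispatch sketch matching the usage described in
    # the module docstring ('python convert_eprime.py [function_name]
    # [inputs]'). This is an illustrative assumption about how the sys and
    # inspect imports above are meant to be used, not necessarily the
    # package's original entry point.
    public_functions = {
        name: obj for name, obj in inspect.getmembers(sys.modules[__name__])
        if inspect.isfunction(obj) and obj.__module__ == __name__
        and not name.startswith('_')
    }
    function_name = sys.argv[1]
    if function_name not in public_functions:
        raise ValueError('Unknown function: {0}'.format(function_name))
    public_functions[function_name](*sys.argv[2:])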