Skip to content

Commit

Permalink
feat: make for chemspectra
Browse files Browse the repository at this point in the history
jcampdx.read has an option for read errors

read DATA TABLE= (XY..XY)

fix comma as decimal separators

fix parsing xy values

update parsing xy data

store xydata format before delete it

refactor: update to read data in jcamp v6
  • Loading branch information
JasonYCHuang authored and Lan Le committed Mar 19, 2024
1 parent e5007e7 commit 53cab0a
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 18 deletions.
7 changes: 6 additions & 1 deletion nmrglue/fileio/bruker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2177,7 +2177,12 @@ def read_jcamp(filename, encoding=locale.getpreferredencoding()):
with open(filename, 'r', encoding=encoding) as f:
while True: # loop until end of file is found

line = f.readline().rstrip() # read a line
try:
line = f.readline().rstrip() # read a line
except Exception as e:
warn("Unable read line, leave it as a comment")
line = "$$"

if line == '': # end of file found
break

Expand Down
97 changes: 80 additions & 17 deletions nmrglue/fileio/jcampdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ def _detect_format(dataline):
firstvalue_re = re.compile(
r"(\s)*([+-]?\d+\.?\d*|[+-]?\.\d+)([eE][+-]?\d+)?(\s)*")

xy_re = re.compile('^[0-9\.]+,[ ]?[0-9\.]+')

index = firstvalue_re.match(dataline).end()
if index is None:
return -1
Expand All @@ -248,6 +250,10 @@ def _detect_format(dataline):
return 1
if firstchar in _DUP_DIGITS:
return 1

if re.search(xy_re, dataline):
return 2

return 0


Expand Down Expand Up @@ -429,10 +435,39 @@ def _parse_pseudo(datalines):
return data


def _parse_xy_xy(datalines):
pts = []
len_group_data = 0
for dataline in datalines:
if not dataline:
continue
xy_re = re.compile('[^ ][0-9\.]+, [0-9\.]+')
group_data = re.findall(xy_re, dataline)
len_group_data = len(group_data)
if len_group_data == 0:
xy_re = re.compile('[^ ][0-9\.]+,[0-9\.]+;')
group_data = re.findall(xy_re, dataline)

for data in group_data:
clean_data = data.replace(', ', ',')
clean_data = clean_data.replace(';', '')
x, y = clean_data.split(',')
pts.append([float(x), float(y)])

if len_group_data > 1:
return [pts]
else:
return pts


def _parse_data(datastring):
'''
Creates numpy array from datalines
'''
probe_data = datastring[80:320]
if ',' in probe_data and not('.' in probe_data): # fix comma as decimal points
datastring = datastring.replace(',', '.')

datalines = datastring.split("\n")
headerline = datalines[0]

Expand All @@ -446,6 +481,8 @@ def _parse_data(datastring):
data = _parse_pseudo(datalines)
elif mode == 0:
data = _parse_affn_pac(datalines)
elif mode == 2:
data = _parse_xy_xy(datalines)
else:
return None
if data is None:
Expand Down Expand Up @@ -495,7 +532,7 @@ def find_yfactors(dic):
return (factor_r, factor_i)


def getdataarray(dic):
def getdataarray(dic, show_all_data=False):
'''
Main function for data array parsing, input is the
raw dictionary from _readrawdic
Expand Down Expand Up @@ -525,19 +562,23 @@ def getdataarray(dic):
idatalist.append(data)
else:
rdatalist.append(data)
if len(rdatalist) > 1:
warn("NTUPLES: multiple real arrays, returning first one only")
if len(idatalist) > 1:
warn("NTUPLES: multiple imaginary arrays, \
returning first one only")
if rdatalist:
if idatalist:
data = [rdatalist[0], idatalist[0]]
else:
data = rdatalist[0]

if show_all_data:
data = { 'real': rdatalist, 'imaginary': idatalist }
else:
if idatalist:
data = [None, idatalist[0]]
if len(rdatalist) > 1:
warn("NTUPLES: multiple real arrays, returning first one only")
if len(idatalist) > 1:
warn("NTUPLES: multiple imaginary arrays, \
returning first one only")
if rdatalist:
if idatalist:
data = [rdatalist[0], idatalist[0]]
else:
data = rdatalist[0]
else:
if idatalist:
data = [None, idatalist[0]]

if data is None: # XYDATA
try:
Expand All @@ -552,14 +593,27 @@ def getdataarray(dic):
except KeyError:
warn("XYDATA not found ")

if data is None:
return None
if data is None: # PEAK TABLE
try:
valuelist = dic["PEAKTABLE"]
if len(valuelist) == 1:
data, datatype = _parse_data(valuelist[0])
else:
warn("Multiple PEAKTABLE arrays in JCAMP-DX file, \
returning first one only")
except KeyError:
warn("PEAKTABLE not found ")

# apply YFACTOR to data if available
if is_ntuples:
yfactor_r, yfactor_i = find_yfactors(dic)
if yfactor_r is None or yfactor_r is None:
warn("NTUPLES: YFACTORs not applied, parsing failed")
elif show_all_data:
for i, _ in enumerate(data['real']):
data['real'][i] = data['real'][i] * yfactor_r
for i, _ in enumerate(data['imaginary']):
data['imaginary'][i] = data['imaginary'][i] * yfactor_i
else:
data[0] = data[0] * yfactor_r
data[1] = data[1] * yfactor_i
Expand All @@ -575,7 +629,7 @@ def getdataarray(dic):
return data


def read(filename):
def read(filename, show_all_data=False, read_err=None):
"""
Read JCAMP-DX file
Expand All @@ -601,13 +655,14 @@ def read(filename):
# first read everything (including data array) to "raw" dictionary,
# in which data values are read as raw strings including whitespace
# and newlines
dic = _readrawdic(filename)
dic = _readrawdic(filename, read_err)

# select the relevant data section.
# first try to parse NMRSPECTRUM sections in order,
# and go with first that has proper data:
data = None
correctdic = None

try:
subdiclist = dic["_datatype_NMRSPECTRUM"]
for subdic in subdiclist:
Expand Down Expand Up @@ -669,9 +724,17 @@ def read(filename):
for key, valuelist in correctdic.items():
dic[key] = valuelist

# remove data tables from dic
try:
dic['XYDATA_OLD'] = dic["XYDATA"]
del dic["XYDATA"]
except KeyError:
pass

# clean main dic from possible empty entries
dic = {key: value for key, value in dic.items() if value}


return dic, data


Expand Down

0 comments on commit 53cab0a

Please sign in to comment.