From 53cab0a769d5cdc5ea0c3cda45eb3cada1be6915 Mon Sep 17 00:00:00 2001 From: jasonych99 Date: Tue, 9 Oct 2018 14:28:44 +0200 Subject: [PATCH] feat: make for chemspectra jcampdx.read has an option for read errors read DATA TABLE= (XY..XY) fix comma as decimal separators fix parsing xy values update parsing xy data store xydata format before delete it refactor: update to read data in jcamp v6 --- nmrglue/fileio/bruker.py | 7 ++- nmrglue/fileio/jcampdx.py | 97 ++++++++++++++++++++++++++++++++------- 2 files changed, 86 insertions(+), 18 deletions(-) diff --git a/nmrglue/fileio/bruker.py b/nmrglue/fileio/bruker.py index 9de2f52a..0c9408a6 100644 --- a/nmrglue/fileio/bruker.py +++ b/nmrglue/fileio/bruker.py @@ -2177,7 +2177,12 @@ def read_jcamp(filename, encoding=locale.getpreferredencoding()): with open(filename, 'r', encoding=encoding) as f: while True: # loop until end of file is found - line = f.readline().rstrip() # read a line + try: + line = f.readline().rstrip() # read a line + except Exception as e: + warn("Unable read line, leave it as a comment") + line = "$$" + if line == '': # end of file found break diff --git a/nmrglue/fileio/jcampdx.py b/nmrglue/fileio/jcampdx.py index 1d41e2b5..d5d39015 100644 --- a/nmrglue/fileio/jcampdx.py +++ b/nmrglue/fileio/jcampdx.py @@ -234,6 +234,8 @@ def _detect_format(dataline): firstvalue_re = re.compile( r"(\s)*([+-]?\d+\.?\d*|[+-]?\.\d+)([eE][+-]?\d+)?(\s)*") + xy_re = re.compile('^[0-9\.]+,[ ]?[0-9\.]+') + index = firstvalue_re.match(dataline).end() if index is None: return -1 @@ -248,6 +250,10 @@ def _detect_format(dataline): return 1 if firstchar in _DUP_DIGITS: return 1 + + if re.search(xy_re, dataline): + return 2 + return 0 @@ -429,10 +435,39 @@ def _parse_pseudo(datalines): return data +def _parse_xy_xy(datalines): + pts = [] + len_group_data = 0 + for dataline in datalines: + if not dataline: + continue + xy_re = re.compile('[^ ][0-9\.]+, [0-9\.]+') + group_data = re.findall(xy_re, dataline) + len_group_data = len(group_data) + if len_group_data == 0: + xy_re = re.compile('[^ ][0-9\.]+,[0-9\.]+;') + group_data = re.findall(xy_re, dataline) + + for data in group_data: + clean_data = data.replace(', ', ',') + clean_data = clean_data.replace(';', '') + x, y = clean_data.split(',') + pts.append([float(x), float(y)]) + + if len_group_data > 1: + return [pts] + else: + return pts + + def _parse_data(datastring): ''' Creates numpy array from datalines ''' + probe_data = datastring[80:320] + if ',' in probe_data and not('.' in probe_data): # fix comma as decimal points + datastring = datastring.replace(',', '.') + datalines = datastring.split("\n") headerline = datalines[0] @@ -446,6 +481,8 @@ def _parse_data(datastring): data = _parse_pseudo(datalines) elif mode == 0: data = _parse_affn_pac(datalines) + elif mode == 2: + data = _parse_xy_xy(datalines) else: return None if data is None: @@ -495,7 +532,7 @@ def find_yfactors(dic): return (factor_r, factor_i) -def getdataarray(dic): +def getdataarray(dic, show_all_data=False): ''' Main function for data array parsing, input is the raw dictionary from _readrawdic @@ -525,19 +562,23 @@ def getdataarray(dic): idatalist.append(data) else: rdatalist.append(data) - if len(rdatalist) > 1: - warn("NTUPLES: multiple real arrays, returning first one only") - if len(idatalist) > 1: - warn("NTUPLES: multiple imaginary arrays, \ - returning first one only") - if rdatalist: - if idatalist: - data = [rdatalist[0], idatalist[0]] - else: - data = rdatalist[0] + + if show_all_data: + data = { 'real': rdatalist, 'imaginary': idatalist } else: - if idatalist: - data = [None, idatalist[0]] + if len(rdatalist) > 1: + warn("NTUPLES: multiple real arrays, returning first one only") + if len(idatalist) > 1: + warn("NTUPLES: multiple imaginary arrays, \ + returning first one only") + if rdatalist: + if idatalist: + data = [rdatalist[0], idatalist[0]] + else: + data = rdatalist[0] + else: + if idatalist: + data = [None, idatalist[0]] if data is None: # XYDATA try: @@ -552,14 +593,27 @@ def getdataarray(dic): except KeyError: warn("XYDATA not found ") - if data is None: - return None + if data is None: # PEAK TABLE + try: + valuelist = dic["PEAKTABLE"] + if len(valuelist) == 1: + data, datatype = _parse_data(valuelist[0]) + else: + warn("Multiple PEAKTABLE arrays in JCAMP-DX file, \ + returning first one only") + except KeyError: + warn("PEAKTABLE not found ") # apply YFACTOR to data if available if is_ntuples: yfactor_r, yfactor_i = find_yfactors(dic) if yfactor_r is None or yfactor_r is None: warn("NTUPLES: YFACTORs not applied, parsing failed") + elif show_all_data: + for i, _ in enumerate(data['real']): + data['real'][i] = data['real'][i] * yfactor_r + for i, _ in enumerate(data['imaginary']): + data['imaginary'][i] = data['imaginary'][i] * yfactor_i else: data[0] = data[0] * yfactor_r data[1] = data[1] * yfactor_i @@ -575,7 +629,7 @@ def getdataarray(dic): return data -def read(filename): +def read(filename, show_all_data=False, read_err=None): """ Read JCAMP-DX file @@ -601,13 +655,14 @@ def read(filename): # first read everything (including data array) to "raw" dictionary, # in which data values are read as raw strings including whitespace # and newlines - dic = _readrawdic(filename) + dic = _readrawdic(filename, read_err) # select the relevant data section. # first try to parse NMRSPECTRUM sections in order, # and go with first that has proper data: data = None correctdic = None + try: subdiclist = dic["_datatype_NMRSPECTRUM"] for subdic in subdiclist: @@ -669,9 +724,17 @@ def read(filename): for key, valuelist in correctdic.items(): dic[key] = valuelist + # remove data tables from dic + try: + dic['XYDATA_OLD'] = dic["XYDATA"] + del dic["XYDATA"] + except KeyError: + pass + # clean main dic from possible empty entries dic = {key: value for key, value in dic.items() if value} + return dic, data