From 16c17b4dba8381615d57267c01f1f2f310fc522c Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Wed, 10 Jan 2024 12:47:36 -0500 Subject: [PATCH] ENH: Speed up reading of small buffers (#12343) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- doc/changes/devel/12343.newfeature.rst | 1 + mne/_fiff/open.py | 29 +++--- mne/_fiff/tag.py | 97 +++++++----------- mne/_fiff/tree.py | 47 +-------- mne/commands/tests/test_commands.py | 13 ++- mne/epochs.py | 5 +- mne/io/fiff/raw.py | 136 ++++++++++++------------- mne/io/fiff/tests/test_raw_fiff.py | 12 +-- 8 files changed, 140 insertions(+), 200 deletions(-) create mode 100644 doc/changes/devel/12343.newfeature.rst diff --git a/doc/changes/devel/12343.newfeature.rst b/doc/changes/devel/12343.newfeature.rst new file mode 100644 index 00000000000..9825f924e48 --- /dev/null +++ b/doc/changes/devel/12343.newfeature.rst @@ -0,0 +1 @@ +Speed up raw FIF reading when using small buffer sizes by `Eric Larson`_. \ No newline at end of file diff --git a/mne/_fiff/open.py b/mne/_fiff/open.py index d1794317772..02fcda445a0 100644 --- a/mne/_fiff/open.py +++ b/mne/_fiff/open.py @@ -13,7 +13,13 @@ from ..utils import _file_like, logger, verbose, warn from .constants import FIFF -from .tag import Tag, _call_dict_names, _matrix_info, read_tag, read_tag_info +from .tag import ( + Tag, + _call_dict_names, + _matrix_info, + _read_tag_header, + read_tag, +) from .tree import dir_tree_find, make_dir_tree @@ -139,7 +145,7 @@ def _fiff_open(fname, fid, preload): with fid as fid_old: fid = BytesIO(fid_old.read()) - tag = read_tag_info(fid) + tag = _read_tag_header(fid, 0) # Check that this looks like a fif file prefix = f"file {repr(fname)} does not" @@ -152,7 +158,7 @@ def _fiff_open(fname, fid, preload): if tag.size != 20: raise ValueError(f"{prefix} start with a file id tag") - tag = read_tag(fid) + tag = read_tag(fid, tag.next_pos) if tag.kind != FIFF.FIFF_DIR_POINTER: raise ValueError(f"{prefix} have a directory pointer") @@ -176,16 +182,15 @@ def _fiff_open(fname, fid, preload): directory = dir_tag.data read_slow = False if read_slow: - fid.seek(0, 0) + pos = 0 + fid.seek(pos, 0) directory = list() - while tag.next >= 0: - pos = fid.tell() - tag = read_tag_info(fid) + while pos is not None: + tag = _read_tag_header(fid, pos) if tag is None: break # HACK : to fix file ending with empty tag... - else: - tag.pos = pos - directory.append(tag) + pos = tag.next_pos + directory.append(tag) tree, _ = make_dir_tree(fid, directory) @@ -309,7 +314,7 @@ def _show_tree( for k, kn, size, pos, type_ in zip(kinds[:-1], kinds[1:], sizes, poss, types): if not tag_found and k != tag_id: continue - tag = Tag(k, size, 0, pos) + tag = Tag(kind=k, type=type_, size=size, next=FIFF.FIFFV_NEXT_NONE, pos=pos) if read_limit is None or size <= read_limit: try: tag = read_tag(fid, pos) @@ -348,7 +353,7 @@ def _show_tree( ) else: postpend += " ... type=" + str(type(tag.data)) - postpend = ">" * 20 + "BAD" if not good else postpend + postpend = ">" * 20 + f"BAD @{pos}" if not good else postpend matrix_info = _matrix_info(tag) if matrix_info is not None: _, type_, _, _ = matrix_info diff --git a/mne/_fiff/tag.py b/mne/_fiff/tag.py index 81ed12baf6f..e1ae5ae571a 100644 --- a/mne/_fiff/tag.py +++ b/mne/_fiff/tag.py @@ -7,7 +7,9 @@ import html import re import struct +from dataclasses import dataclass from functools import partial +from typing import Any import numpy as np from scipy.sparse import csc_matrix, csr_matrix @@ -28,40 +30,16 @@ # HELPERS +@dataclass class Tag: - """Tag in FIF tree structure. + """Tag in FIF tree structure.""" - Parameters - ---------- - kind : int - Kind of Tag. - type_ : int - Type of Tag. - size : int - Size in bytes. - int : next - Position of next Tag. - pos : int - Position of Tag is the original file. - """ - - def __init__(self, kind, type_, size, next, pos=None): - self.kind = int(kind) - self.type = int(type_) - self.size = int(size) - self.next = int(next) - self.pos = pos if pos is not None else next - self.pos = int(self.pos) - self.data = None - - def __repr__(self): # noqa: D105 - attrs = list() - for attr in ("kind", "type", "size", "next", "pos", "data"): - try: - attrs.append(f"{attr} {getattr(self, attr)}") - except AttributeError: - pass - return "" + kind: int + type: int + size: int + next: int + pos: int + data: Any = None def __eq__(self, tag): # noqa: D105 return int( @@ -73,17 +51,15 @@ def __eq__(self, tag): # noqa: D105 and self.data == tag.data ) - -def read_tag_info(fid): - """Read Tag info (or header).""" - tag = _read_tag_header(fid) - if tag is None: - return None - if tag.next == 0: - fid.seek(tag.size, 1) - elif tag.next > 0: - fid.seek(tag.next, 0) - return tag + @property + def next_pos(self): + """The next tag position.""" + if self.next == FIFF.FIFFV_NEXT_SEQ: # 0 + return self.pos + 16 + self.size + elif self.next > 0: + return self.next + else: # self.next should be -1 if we get here + return None # safest to return None so that things like fid.seek die def _frombuffer_rows(fid, tag_size, dtype=None, shape=None, rlims=None): @@ -157,16 +133,18 @@ def _loc_to_eeg_loc(loc): # by the function names. -def _read_tag_header(fid): +def _read_tag_header(fid, pos): """Read only the header of a Tag.""" - s = fid.read(4 * 4) + fid.seek(pos, 0) + s = fid.read(16) if len(s) != 16: where = fid.tell() - len(s) extra = f" in file {fid.name}" if hasattr(fid, "name") else "" warn(f"Invalid tag with only {len(s)}/16 bytes at position {where}{extra}") return None # struct.unpack faster than np.frombuffer, saves ~10% of time some places - return Tag(*struct.unpack(">iIii", s)) + kind, type_, size, next_ = struct.unpack(">iIii", s) + return Tag(kind, type_, size, next_, pos) def _read_matrix(fid, tag, shape, rlims): @@ -178,10 +156,10 @@ def _read_matrix(fid, tag, shape, rlims): matrix_coding, matrix_type, bit, dtype = _matrix_info(tag) + pos = tag.pos + 16 + fid.seek(pos + tag.size - 4, 0) if matrix_coding == "dense": # Find dimensions and return to the beginning of tag data - pos = fid.tell() - fid.seek(tag.size - 4, 1) ndim = int(np.frombuffer(fid.read(4), dtype=">i4").item()) fid.seek(-(ndim + 1) * 4, 1) dims = np.frombuffer(fid.read(4 * ndim), dtype=">i4")[::-1] @@ -205,8 +183,6 @@ def _read_matrix(fid, tag, shape, rlims): data.shape = dims else: # Find dimensions and return to the beginning of tag data - pos = fid.tell() - fid.seek(tag.size - 4, 1) ndim = int(np.frombuffer(fid.read(4), dtype=">i4").item()) fid.seek(-(ndim + 2) * 4, 1) dims = np.frombuffer(fid.read(4 * (ndim + 1)), dtype=">i4") @@ -388,7 +364,16 @@ def _read_old_pack(fid, tag, shape, rlims): def _read_dir_entry_struct(fid, tag, shape, rlims): """Read dir entry struct tag.""" - return [_read_tag_header(fid) for _ in range(tag.size // 16 - 1)] + pos = tag.pos + 16 + entries = list() + for offset in range(1, tag.size // 16): + ent = _read_tag_header(fid, pos + offset * 16) + # The position of the real tag on disk is stored in the "next" entry within the + # directory, so we need to overwrite ent.pos. For safety let's also overwrite + # ent.next to point nowhere + ent.pos, ent.next = ent.next, FIFF.FIFFV_NEXT_NONE + entries.append(ent) + return entries def _read_julian(fid, tag, shape, rlims): @@ -439,7 +424,7 @@ def _read_julian(fid, tag, shape, rlims): _call_dict_names[key] = dtype -def read_tag(fid, pos=None, shape=None, rlims=None): +def read_tag(fid, pos, shape=None, rlims=None): """Read a Tag from a file at a given position. Parameters @@ -462,9 +447,7 @@ def read_tag(fid, pos=None, shape=None, rlims=None): tag : Tag The Tag read. """ - if pos is not None: - fid.seek(pos, 0) - tag = _read_tag_header(fid) + tag = _read_tag_header(fid, pos) if tag is None: return tag if tag.size > 0: @@ -477,10 +460,6 @@ def read_tag(fid, pos=None, shape=None, rlims=None): except KeyError: raise Exception(f"Unimplemented tag data type {tag.type}") from None tag.data = fun(fid, tag, shape, rlims) - if tag.next != FIFF.FIFFV_NEXT_SEQ: - # f.seek(tag.next,0) - fid.seek(tag.next, 1) # XXX : fix? pb when tag.next < 0 - return tag diff --git a/mne/_fiff/tree.py b/mne/_fiff/tree.py index 6aa7b5f4539..556dab1a537 100644 --- a/mne/_fiff/tree.py +++ b/mne/_fiff/tree.py @@ -4,12 +4,10 @@ # License: BSD-3-Clause # Copyright the MNE-Python contributors. -import numpy as np from ..utils import logger, verbose from .constants import FIFF -from .tag import Tag, read_tag -from .write import _write, end_block, start_block, write_id +from .tag import read_tag def dir_tree_find(tree, kind): @@ -108,46 +106,3 @@ def make_dir_tree(fid, directory, start=0, indent=0, verbose=None): logger.debug(" " * indent + "end } %d" % block) last = this return tree, last - - -############################################################################### -# Writing - - -def copy_tree(fidin, in_id, nodes, fidout): - """Copy directory subtrees from fidin to fidout.""" - if len(nodes) <= 0: - return - - if not isinstance(nodes, list): - nodes = [nodes] - - for node in nodes: - start_block(fidout, node["block"]) - if node["id"] is not None: - if in_id is not None: - write_id(fidout, FIFF.FIFF_PARENT_FILE_ID, in_id) - - write_id(fidout, FIFF.FIFF_BLOCK_ID, in_id) - write_id(fidout, FIFF.FIFF_PARENT_BLOCK_ID, node["id"]) - - if node["directory"] is not None: - for d in node["directory"]: - # Do not copy these tags - if ( - d.kind == FIFF.FIFF_BLOCK_ID - or d.kind == FIFF.FIFF_PARENT_BLOCK_ID - or d.kind == FIFF.FIFF_PARENT_FILE_ID - ): - continue - - # Read and write tags, pass data through transparently - fidin.seek(d.pos, 0) - tag = Tag(*np.fromfile(fidin, (">i4,>I4,>i4,>i4"), 1)[0]) - tag.data = np.fromfile(fidin, ">B", tag.size) - _write(fidout, tag.data, tag.kind, 1, tag.type, ">B") - - for child in node["children"]: - copy_tree(fidin, in_id, child, fidout) - - end_block(fidout, node["block"]) diff --git a/mne/commands/tests/test_commands.py b/mne/commands/tests/test_commands.py index ea87c717db0..26e1f7fa540 100644 --- a/mne/commands/tests/test_commands.py +++ b/mne/commands/tests/test_commands.py @@ -43,7 +43,7 @@ mne_what, ) from mne.datasets import testing -from mne.io import read_info, read_raw_fif +from mne.io import read_info, read_raw_fif, show_fiff from mne.utils import ( ArgvSetter, _record_warnings, @@ -100,13 +100,22 @@ def test_compare_fiff(): check_usage(mne_compare_fiff) -def test_show_fiff(): +def test_show_fiff(tmp_path): """Test mne compare_fiff.""" check_usage(mne_show_fiff) with ArgvSetter((raw_fname,)): mne_show_fiff.run() with ArgvSetter((raw_fname, "--tag=102")): mne_show_fiff.run() + bad_fname = tmp_path / "test_bad_raw.fif" + with open(bad_fname, "wb") as fout: + with open(raw_fname, "rb") as fin: + fout.write(fin.read(100000)) + with pytest.warns(RuntimeWarning, match="Invalid tag"): + lines = show_fiff(bad_fname, output=list) + last_line = lines[-1] + assert last_line.endswith(">>>>BAD @9015") + assert "302 = FIFF_EPOCH (734412b >f4)" in last_line @requires_mne diff --git a/mne/epochs.py b/mne/epochs.py index c042c207905..d11ba5f59aa 100644 --- a/mne/epochs.py +++ b/mne/epochs.py @@ -39,7 +39,7 @@ pick_info, ) from ._fiff.proj import ProjMixin, setup_proj -from ._fiff.tag import read_tag, read_tag_info +from ._fiff.tag import _read_tag_header, read_tag from ._fiff.tree import dir_tree_find from ._fiff.utils import _make_split_fnames from ._fiff.write import ( @@ -3779,8 +3779,7 @@ def _read_one_epoch_file(f, tree, preload): elif kind == FIFF.FIFF_EPOCH: # delay reading until later fid.seek(pos, 0) - data_tag = read_tag_info(fid) - data_tag.pos = pos + data_tag = _read_tag_header(fid, pos) data_tag.type = data_tag.type ^ (1 << 30) elif kind in [FIFF.FIFF_MNE_BASELINE_MIN, 304]: # Constant 304 was used before v0.11 diff --git a/mne/io/fiff/raw.py b/mne/io/fiff/raw.py index 1c13189f723..39f0466e1eb 100644 --- a/mne/io/fiff/raw.py +++ b/mne/io/fiff/raw.py @@ -16,7 +16,7 @@ from ..._fiff.constants import FIFF from ..._fiff.meas_info import read_meas_info from ..._fiff.open import _fiff_get_fid, _get_next_fname, fiff_open -from ..._fiff.tag import read_tag, read_tag_info +from ..._fiff.tag import _call_dict, read_tag from ..._fiff.tree import dir_tree_find from ..._fiff.utils import _mult_cal_one from ...annotations import Annotations, _read_annotations_fif @@ -255,48 +255,40 @@ def _read_raw_file( nskip = 0 orig_format = None + _byte_dict = { + FIFF.FIFFT_DAU_PACK16: 2, + FIFF.FIFFT_SHORT: 2, + FIFF.FIFFT_FLOAT: 4, + FIFF.FIFFT_DOUBLE: 8, + FIFF.FIFFT_INT: 4, + FIFF.FIFFT_COMPLEX_FLOAT: 8, + FIFF.FIFFT_COMPLEX_DOUBLE: 16, + } + _orig_format_dict = { + FIFF.FIFFT_DAU_PACK16: "short", + FIFF.FIFFT_SHORT: "short", + FIFF.FIFFT_FLOAT: "single", + FIFF.FIFFT_DOUBLE: "double", + FIFF.FIFFT_INT: "int", + FIFF.FIFFT_COMPLEX_FLOAT: "single", + FIFF.FIFFT_COMPLEX_DOUBLE: "double", + } + for k in range(first, nent): ent = directory[k] # There can be skips in the data (e.g., if the user unclicked) # an re-clicked the button - if ent.kind == FIFF.FIFF_DATA_SKIP: - tag = read_tag(fid, ent.pos) - nskip = int(tag.data.item()) - elif ent.kind == FIFF.FIFF_DATA_BUFFER: + if ent.kind == FIFF.FIFF_DATA_BUFFER: # Figure out the number of samples in this buffer - if ent.type == FIFF.FIFFT_DAU_PACK16: - nsamp = ent.size // (2 * nchan) - elif ent.type == FIFF.FIFFT_SHORT: - nsamp = ent.size // (2 * nchan) - elif ent.type == FIFF.FIFFT_FLOAT: - nsamp = ent.size // (4 * nchan) - elif ent.type == FIFF.FIFFT_DOUBLE: - nsamp = ent.size // (8 * nchan) - elif ent.type == FIFF.FIFFT_INT: - nsamp = ent.size // (4 * nchan) - elif ent.type == FIFF.FIFFT_COMPLEX_FLOAT: - nsamp = ent.size // (8 * nchan) - elif ent.type == FIFF.FIFFT_COMPLEX_DOUBLE: - nsamp = ent.size // (16 * nchan) - else: - raise ValueError( - "Cannot handle data buffers of type " "%d" % ent.type - ) + try: + div = _byte_dict[ent.type] + except KeyError: + raise RuntimeError( + f"Cannot handle data buffers of type {ent.type}" + ) from None + nsamp = ent.size // (div * nchan) if orig_format is None: - if ent.type == FIFF.FIFFT_DAU_PACK16: - orig_format = "short" - elif ent.type == FIFF.FIFFT_SHORT: - orig_format = "short" - elif ent.type == FIFF.FIFFT_FLOAT: - orig_format = "single" - elif ent.type == FIFF.FIFFT_DOUBLE: - orig_format = "double" - elif ent.type == FIFF.FIFFT_INT: - orig_format = "int" - elif ent.type == FIFF.FIFFT_COMPLEX_FLOAT: - orig_format = "single" - elif ent.type == FIFF.FIFFT_COMPLEX_DOUBLE: - orig_format = "double" + orig_format = _orig_format_dict[ent.type] # Do we have an initial skip pending? if first_skip > 0: @@ -327,6 +319,9 @@ def _read_raw_file( ) ) first_samp += nsamp + elif ent.kind == FIFF.FIFF_DATA_SKIP: + tag = read_tag(fid, ent.pos) + nskip = int(tag.data.item()) next_fname = _get_next_fname(fid, fname_rep, tree) @@ -381,22 +376,17 @@ def _dtype(self): if self._dtype_ is not None: return self._dtype_ dtype = None - for raw_extra, filename in zip(self._raw_extras, self._filenames): + for raw_extra in self._raw_extras: for ent in raw_extra["ent"]: if ent is not None: - with _fiff_get_fid(filename) as fid: - fid.seek(ent.pos, 0) - tag = read_tag_info(fid) - if tag is not None: - if tag.type in ( - FIFF.FIFFT_COMPLEX_FLOAT, - FIFF.FIFFT_COMPLEX_DOUBLE, - ): - dtype = np.complex128 - else: - dtype = np.float64 - if dtype is not None: - break + if ent.type in ( + FIFF.FIFFT_COMPLEX_FLOAT, + FIFF.FIFFT_COMPLEX_DOUBLE, + ): + dtype = np.complex128 + else: + dtype = np.float64 + break if dtype is not None: break if dtype is None: @@ -421,27 +411,31 @@ def _read_segment_file(self, data, idx, fi, start, stop, cals, mult): first_pick = max(start - first, 0) last_pick = min(nsamp, stop - first) picksamp = last_pick - first_pick - # only read data if it exists - if ent is not None: - one = read_tag( - fid, - ent.pos, - shape=(nsamp, nchan), - rlims=(first_pick, last_pick), - ).data - try: - one.shape = (picksamp, nchan) - except AttributeError: # one is None - n_bad += picksamp - else: - _mult_cal_one( - data[:, offset : (offset + picksamp)], - one.T, - idx, - cals, - mult, - ) + this_start = offset offset += picksamp + this_stop = offset + # only read data if it exists + if ent is None: + continue # just use zeros for gaps + # faster to always read full tag, taking advantage of knowing the header + # already (cutting out some of read_tag) ... + fid.seek(ent.pos + 16, 0) + one = _call_dict[ent.type](fid, ent, shape=None, rlims=None) + try: + one.shape = (nsamp, nchan) + except AttributeError: # one is None + n_bad += picksamp + else: + # ... then pick samples we want + if first_pick != 0 or last_pick != nsamp: + one = one[first_pick:last_pick] + _mult_cal_one( + data[:, this_start:this_stop], + one.T, + idx, + cals, + mult, + ) if n_bad: warn( f"FIF raw buffer could not be read, acquisition error " diff --git a/mne/io/fiff/tests/test_raw_fiff.py b/mne/io/fiff/tests/test_raw_fiff.py index 329d205e8d3..154d70b0dee 100644 --- a/mne/io/fiff/tests/test_raw_fiff.py +++ b/mne/io/fiff/tests/test_raw_fiff.py @@ -30,8 +30,7 @@ pick_types, ) from mne._fiff.constants import FIFF -from mne._fiff.open import read_tag, read_tag_info -from mne._fiff.tag import _read_tag_header +from mne._fiff.tag import _read_tag_header, read_tag from mne.annotations import Annotations from mne.datasets import testing from mne.filter import filter_data @@ -2044,8 +2043,7 @@ def test_bad_acq(fname): raw = read_raw_fif(fname, allow_maxshield="yes").load_data() with open(fname, "rb") as fid: for ent in raw._raw_extras[0]["ent"]: - fid.seek(ent.pos, 0) - tag = _read_tag_header(fid) + tag = _read_tag_header(fid, ent.pos) # hack these, others (kind, type) should be correct tag.pos, tag.next = ent.pos, ent.next assert tag == ent @@ -2085,9 +2083,9 @@ def test_corrupted(tmp_path, offset): # at the end, so use the skip one (straight from acq). raw = read_raw_fif(skip_fname) with open(skip_fname, "rb") as fid: - tag = read_tag_info(fid) - tag = read_tag(fid) - dirpos = int(tag.data.item()) + file_id_tag = read_tag(fid, 0) + dir_pos_tag = read_tag(fid, file_id_tag.next_pos) + dirpos = int(dir_pos_tag.data.item()) assert dirpos == 12641532 fid.seek(0) data = fid.read(dirpos + offset)