diff --git a/heudiconv/convert.py b/heudiconv/convert.py index 8e149bb3..92f1c9b9 100644 --- a/heudiconv/convert.py +++ b/heudiconv/convert.py @@ -46,6 +46,7 @@ set_readonly, treat_infofile, write_config, + has_deprecated_seriesid, ) if TYPE_CHECKING: @@ -172,7 +173,8 @@ def prep_conversion( # if conversion table(s) do not exist -- we need to prepare them # (the *prepare* stage in https://github.com/nipy/heudiconv/issues/134) # if overwrite - recalculate this anyways - reuse_conversion_table = op.exists(edit_file) + # MD: a check for the deprecated series_id is added here to avoid reusing the conversion table + reuse_conversion_table = op.exists(edit_file) and not has_deprecated_seriesid(edit_file) # We also might need to redo it if changes in the heuristic file # detected # ref: https://github.com/nipy/heudiconv/issues/84#issuecomment-330048609 diff --git a/heudiconv/dicoms.py b/heudiconv/dicoms.py index ef51086a..a4212380 100644 --- a/heudiconv/dicoms.py +++ b/heudiconv/dicoms.py @@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Union, overload from unittest.mock import patch import warnings +import hashlib import pydicom as dcm @@ -114,7 +115,7 @@ def create_seqinfo(mw: dw.Wrapper, series_files: list[str], series_id: str) -> S def validate_dicom( fl: str, dcmfilter: Optional[Callable[[dcm.dataset.Dataset], Any]] -) -> Optional[tuple[dw.Wrapper, tuple[int, str], Optional[str]]]: +) -> Optional[tuple[dw.Wrapper, tuple[int, str, str], Optional[str]]]: """ Parse DICOM attributes. Returns None if not valid. """ @@ -135,7 +136,7 @@ def validate_dicom( try: protocol_name = mw.dcm_data.ProtocolName assert isinstance(protocol_name, str) - series_id = (int(mw.dcm_data.SeriesNumber), protocol_name) + series_id = (int(mw.dcm_data.SeriesNumber), protocol_name, mw.dcm_data.SeriesInstanceUID) except AttributeError as e: lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s', fl, e) return None @@ -159,10 +160,13 @@ def validate_dicom( class SeriesID(NamedTuple): series_number: int protocol_name: str + series_uid: str file_studyUID: Optional[str] = None def __str__(self) -> str: - s = f"{self.series_number}-{self.protocol_name}" + len_hash_hex = 8 + suid_hex = hashlib.md5(self.series_uid.encode('utf-8')).hexdigest()[:len_hash_hex] + s = f"{self.series_number}-{self.protocol_name}-{suid_hex}" if self.file_studyUID is not None: s += f"-{self.file_studyUID}" return s @@ -285,7 +289,7 @@ def group_dicoms_into_seqinfos( removeidx.append(idx) continue mw, series_id_, file_studyUID = mwinfo - series_id = SeriesID(series_id_[0], series_id_[1]) + series_id = SeriesID(series_id_[0], series_id_[1], series_id_[2]) if per_studyUID: series_id = series_id._replace(file_studyUID=file_studyUID) @@ -320,6 +324,7 @@ def group_dicoms_into_seqinfos( series_id = SeriesID( mwgroup[idx].dcm_data.SeriesNumber, mwgroup[idx].dcm_data.ProtocolName, + mwgroup[idx].dcm_data.SeriesInstanceUID, ) if per_studyUID: series_id = series_id._replace(file_studyUID=file_studyUID) diff --git a/heudiconv/parser.py b/heudiconv/parser.py index f8a11977..bb8b73e1 100644 --- a/heudiconv/parser.py +++ b/heudiconv/parser.py @@ -278,15 +278,9 @@ def infotoids( ) lgr.info("Study session for %r", study_session_info) - if study_session_info in study_sessions: - if grouping != "all": - # MG - should this blow up to mimic -d invocation? - lgr.warning( - "Existing study session with the same values (%r)." - " Skipping DICOMS %s", - study_session_info, - seqinfo.values(), - ) - continue + if grouping != "all": + assert (study_session_info not in study_sessions), ( + f"Existing study session {study_session_info} " + f"already in analyzed sessions {study_sessions.keys()}") study_sessions[study_session_info] = seqinfo return study_sessions diff --git a/heudiconv/utils.py b/heudiconv/utils.py index 9f988267..17948703 100644 --- a/heudiconv/utils.py +++ b/heudiconv/utils.py @@ -18,6 +18,7 @@ import stat from subprocess import check_output import sys +import string import tempfile from time import sleep from types import ModuleType @@ -222,6 +223,41 @@ def load_json(filename: str | Path, retry: int = 0) -> Any: return data +def is_deprecated_seriesid(series_id: str, len_hash_hex: int = 8) -> bool: + """Return True if series_id is in deprecated format. + A deprecated series ID is in the format "series_number-protocol_name". + A non-deprecated series ID is in the format "series_number-protocol_name-seriesUID_hash_hex" + """ + # Check at least two '-' in the series_id + if series_id.count('-') <= 1: + return True + # Check the first part of the series_id is a number + series_number = series_id.split('-')[0] + if not series_number.isdigit(): + return True + # Check the last part of the series_id is a hash in hexadecimal format of length len_hash_hex + hash_hex = series_id.split('-')[-1] + hex_digits = set(string.hexdigits) + if len(hash_hex) != len_hash_hex or not all(c in hex_digits for c in hash_hex): + return True + # If all previous tests passed, then the series_id is not deprecated + return False + + +def has_deprecated_seriesid(info_file: str) -> bool: + """Return True if any series_id in the info_file is in deprecated format.""" + info = read_config(info_file) + series_ids = [series_id for sublist in info.values() for series_id in sublist] + if any(is_deprecated_seriesid(series_id) for series_id in series_ids): + lgr.warning( + "Found deprecated series identifier in info file in the format" + "- instead of --" + "The existing conversion table will be ignored." + ) + return True + return False + + def assure_no_file_exists(path: str | Path) -> None: """Check if file or symlink (git-annex?) exists, and if so -- remove""" if os.path.lexists(path):