From ac5a1b09d2ec25ac9376b78f2a9bc9be69924e40 Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Tue, 10 Oct 2023 14:05:54 -0400
Subject: [PATCH] Concatenate across directions as well as runs (#965)

---
 xcp_d/tests/test_utils_bids.py   | 49 ++++++++++++++++++++++++++++++++
 xcp_d/utils/bids.py              | 27 ++++++++++++++----
 xcp_d/workflows/base.py          |  2 +-
 xcp_d/workflows/concatenation.py |  4 +--
 4 files changed, 73 insertions(+), 9 deletions(-)

diff --git a/xcp_d/tests/test_utils_bids.py b/xcp_d/tests/test_utils_bids.py
index ff99b55ca..7eddb24fc 100644
--- a/xcp_d/tests/test_utils_bids.py
+++ b/xcp_d/tests/test_utils_bids.py
@@ -245,3 +245,52 @@ def test_get_entity(datasets):
     )
     with pytest.raises(ValueError, match="Unknown space"):
         xbids.get_entity(fname, "space")
+
+
+def test_group_across_runs():
+    """Test group_across_runs."""
+    in_files = [
+        "/path/sub-01_task-axcpt_run-03_bold.nii.gz",
+        "/path/sub-01_task-rest_run-03_bold.nii.gz",
+        "/path/sub-01_task-rest_run-01_bold.nii.gz",
+        "/path/sub-01_task-axcpt_run-02_bold.nii.gz",
+        "/path/sub-01_task-rest_run-02_bold.nii.gz",
+        "/path/sub-01_task-axcpt_run-01_bold.nii.gz",
+    ]
+    grouped_files = xbids.group_across_runs(in_files)
+    assert isinstance(grouped_files, list)
+    assert len(grouped_files[0]) == 3
+    assert grouped_files[0] == [
+        "/path/sub-01_task-axcpt_run-01_bold.nii.gz",
+        "/path/sub-01_task-axcpt_run-02_bold.nii.gz",
+        "/path/sub-01_task-axcpt_run-03_bold.nii.gz",
+    ]
+    assert len(grouped_files[1]) == 3
+    assert grouped_files[1] == [
+        "/path/sub-01_task-rest_run-01_bold.nii.gz",
+        "/path/sub-01_task-rest_run-02_bold.nii.gz",
+        "/path/sub-01_task-rest_run-03_bold.nii.gz",
+    ]
+
+    in_files = [
+        "/path/sub-01_task-rest_dir-LR_run-2_bold.nii.gz",
+        "/path/sub-01_task-rest_dir-RL_run-1_bold.nii.gz",
+        "/path/sub-01_task-axcpt_dir-LR_bold.nii.gz",
+        "/path/sub-01_task-rest_dir-RL_run-2_bold.nii.gz",
+        "/path/sub-01_task-rest_dir-LR_run-1_bold.nii.gz",
+        "/path/sub-01_task-axcpt_dir-RL_bold.nii.gz",
+    ]
+    grouped_files = xbids.group_across_runs(in_files)
+    assert isinstance(grouped_files, list)
+    assert len(grouped_files[0]) == 2
+    assert grouped_files[0] == [
+        "/path/sub-01_task-axcpt_dir-LR_bold.nii.gz",
+        "/path/sub-01_task-axcpt_dir-RL_bold.nii.gz",
+    ]
+    assert len(grouped_files[1]) == 4
+    assert grouped_files[1] == [
+        "/path/sub-01_task-rest_dir-LR_run-1_bold.nii.gz",
+        "/path/sub-01_task-rest_dir-RL_run-1_bold.nii.gz",
+        "/path/sub-01_task-rest_dir-LR_run-2_bold.nii.gz",
+        "/path/sub-01_task-rest_dir-RL_run-2_bold.nii.gz",
+    ]
diff --git a/xcp_d/utils/bids.py b/xcp_d/utils/bids.py
index 1321ab536..efa7c9aaf 100644
--- a/xcp_d/utils/bids.py
+++ b/xcp_d/utils/bids.py
@@ -896,7 +896,11 @@ def get_entity(filename, entity):
 
 
 def group_across_runs(in_files):
-    """Group preprocessed BOLD files by unique sets of entities, ignoring run.
+    """Group preprocessed BOLD files by unique sets of entities, ignoring run and direction.
+
+    We only ignore direction for the sake of HCP.
+    This may lead to small problems for non-HCP datasets that differentiate scans based on
+    both run and direction.
 
     Parameters
     ----------
@@ -913,20 +917,31 @@ def group_across_runs(in_files):
 
     # First, extract run information and sort the input files by the runs,
     # so that any cases where files are not already in ascending run order get fixed.
-    run_numbers = []
+    run_numbers, directions = [], []
     for in_file in in_files:
         run = get_entity(in_file, "run")
         if run is None:
             run = 0
 
+        direction = get_entity(in_file, "dir")
+        if direction is None:
+            direction = "none"
+
         run_numbers.append(int(run))
+        directions.append(direction)
+
+    # Combine the three lists into a list of tuples
+    combined_data = list(zip(run_numbers, directions, in_files))
+
+    # Sort the tuples by run, then direction, then filename
+    sorted_data = sorted(combined_data, key=lambda x: (x[0], x[1], x[2]))
 
-    # Sort the files by the run numbers.
-    zipped_pairs = zip(run_numbers, in_files)
-    sorted_in_files = [x for _, x in sorted(zipped_pairs)]
+    # Extract the now-sorted file list
+    sorted_in_files = [item[2] for item in sorted_data]
 
-    # Extract the unique sets of entities (i.e., the filename, minus the run entity).
+    # Extract the unique sets of entities (i.e., the filename, minus the run and dir entities).
     unique_filenames = [re.sub("_run-[0-9]+_", "_", os.path.basename(f)) for f in sorted_in_files]
+    unique_filenames = [re.sub("_dir-[0-9a-zA-Z]+_", "_", f) for f in unique_filenames]
 
     # Assign each in_file to a group of files with the same entities, except run.
     out_files, grouped_unique_filenames = [], []
diff --git a/xcp_d/workflows/base.py b/xcp_d/workflows/base.py
index 7620c3c94..f2f8a6c7d 100644
--- a/xcp_d/workflows/base.py
+++ b/xcp_d/workflows/base.py
@@ -635,7 +635,7 @@ def init_subject_wf(
     )
 
     n_runs = len(preproc_files)
-    preproc_files = group_across_runs(preproc_files)
+    preproc_files = group_across_runs(preproc_files)  # group files across runs and directions
     run_counter = 0
     for ent_set, task_files in enumerate(preproc_files):
         # Assuming TR is constant across runs for a given combination of entities.
diff --git a/xcp_d/workflows/concatenation.py b/xcp_d/workflows/concatenation.py
index 29b4755c2..1e4a95b2a 100644
--- a/xcp_d/workflows/concatenation.py
+++ b/xcp_d/workflows/concatenation.py
@@ -28,7 +28,7 @@ def init_concatenate_data_wf(
     dcan_qc,
     name="concatenate_data_wf",
 ):
-    """Concatenate postprocessed data.
+    """Concatenate postprocessed data across runs and directions.
 
     Workflow Graph
         .. workflow::
@@ -99,7 +99,7 @@
     workflow = Workflow(name=name)
 
     workflow.__desc__ = """
-Postprocessing derivatives from multi-run tasks were then concatenated across runs.
+Postprocessing derivatives from multi-run tasks were then concatenated across runs and directions.
 """
 
     inputnode = pe.Node(
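
A note on the grouping logic above: group_across_runs sorts the preprocessed files by (run, direction, filename) and then buckets them by the filename with the run and dir entities stripped, so HCP-style dir-LR/dir-RL acquisitions of the same task land in a single concatenation group. The sketch below is a minimal, self-contained rendering of that idea for experimentation outside of xcp_d; the _entity helper and the group_across_runs_sketch name are hypothetical stand-ins invented for this illustration, not part of the xcp_d API.

import os
import re


def _entity(fname, key):
    """Hypothetical stand-in for xcp_d's get_entity helper.

    Pulls one BIDS entity value (e.g., "01" from "_run-01_") out of a
    filename, or returns None if the entity is absent.
    """
    match = re.search(f"_{key}-([0-9a-zA-Z]+)_", os.path.basename(fname))
    return match.group(1) if match else None


def group_across_runs_sketch(in_files):
    """Group BOLD-like filenames across runs and directions (illustration only)."""

    def sort_key(fname):
        # Mirror the patch: a missing run sorts as 0 and a missing direction
        # sorts as "none", so every file gets a comparable (run, dir) key.
        run = _entity(fname, "run")
        direction = _entity(fname, "dir")
        return (int(run) if run is not None else 0, direction or "none", fname)

    def group_key(fname):
        # Strip the run and dir entities with the same regexes as the patch,
        # so dir-LR/dir-RL variants of one task collapse to one key.
        name = re.sub("_run-[0-9]+_", "_", os.path.basename(fname))
        return re.sub("_dir-[0-9a-zA-Z]+_", "_", name)

    # Plain dicts preserve insertion order (Python 3.7+), so groups come out
    # in order of first appearance, as in the patched function.
    groups = {}
    for fname in sorted(in_files, key=sort_key):
        groups.setdefault(group_key(fname), []).append(fname)

    return list(groups.values())


if __name__ == "__main__":
    files = [
        "/path/sub-01_task-rest_dir-RL_run-1_bold.nii.gz",
        "/path/sub-01_task-axcpt_dir-LR_bold.nii.gz",
        "/path/sub-01_task-rest_dir-LR_run-1_bold.nii.gz",
    ]
    # Prints the axcpt file in its own group, then the two rest files with
    # dir-LR ordered before dir-RL.
    print(group_across_runs_sketch(files))

As the new docstring warns, stripping dir means that a non-HCP dataset which genuinely distinguishes acquisitions by both run and direction will have its directions merged into one concatenation group.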