From a8c7aacb2c47feb986f96a292ac43055ea9394bb Mon Sep 17 00:00:00 2001
From: Yan Wong <yan.wong@bdi.ox.ac.uk>
Date: Thu, 30 May 2024 01:06:09 +0100
Subject: [PATCH 1/2] Save flags and metadata when splitting disjoint nodes

And add split_disjoint_nodes to preprocess_ts. Fixes #373
---
 CHANGELOG.md        |  16 ++-
 tests/test_util.py  | 243 ++++++++++++++++++++++++++++++++++++++++++++
 tsdate/__init__.py  |   1 +
 tsdate/constants.py |  28 +++++
 tsdate/util.py      | 196 +++++++++++++++++++++++------------
 5 files changed, 419 insertions(+), 65 deletions(-)
 create mode 100644 tests/test_util.py
 create mode 100644 tsdate/constants.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 885d6e7e..3b3ec636 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,9 +4,23 @@
 
 **Bugfixes**
 
-- In variational gamma, Rescale messages at end of each iteration to avoid numerical
+- Variational gamma uses a rescaling approach which helps considerably if e.g.
+  population sizes vary over time
+
+- Variational gamma does not use mutational area of branches, but average path
+  length, which reduces bias in tree sequences containing polytomies
+
+- In variational gamma, rescale messages at end of each iteration to avoid numerical
   instability.
 
+**Breaking changes**
+
+- Variational gamma uses an improper (flat) prior, and therefore
+  no longer needs `population_size` specifying.
+
+- The standalone `preprocess_ts` function now also applies `split_disjoint_nodes` by
+  default, creating extra nodes but improving dating accuracy (see `split_disjoint`).
+
 ## [0.1.6] - 2024-01-07
 
 **Breaking changes**
diff --git a/tests/test_util.py b/tests/test_util.py
new file mode 100644
index 00000000..b480eb66
--- /dev/null
+++ b/tests/test_util.py
@@ -0,0 +1,243 @@
+# MIT License
+#
+# Copyright (c) 2024 Tskit Developers
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Test cases for tsdate utility functions
+"""
+import json
+import logging
+
+import msprime
+import numpy as np
+import pytest
+import tskit
+
+import tsdate
+
+
+class TestSplitDisjointNodes:
+    def test_nosplit(self):
+        ts = tskit.Tree.generate_comb(5).tree_sequence
+        split_ts = tsdate.util.split_disjoint_nodes(ts)
+        assert split_ts.num_nodes == ts.num_nodes
+        assert split_ts.num_edges == ts.num_edges
+        assert split_ts.num_trees == ts.num_trees
+        for node in split_ts.nodes():
+            assert node.flags & tsdate.NODE_SPLIT_BY_PREPROCESS == 0
+        prov = json.loads(split_ts.provenance(-1).record)
+        assert prov["software"]["name"] == "tsdate"
+        assert prov["parameters"]["command"] == "split_disjoint_nodes"
+
+    def test_simple(self):
+        tables = tskit.Tree.generate_comb(5).tree_sequence.dump_tables()
+        tables.delete_intervals([[0.2, 0.8]])
+        tables.nodes.metadata_schema = tskit.MetadataSchema.permissive_json()
+        ts = tables.tree_sequence()
+        num_internal_nodes = ts.num_nodes - ts.num_samples
+        split_ts = tsdate.util.split_disjoint_nodes(ts)
+        num_new_internal_nodes = split_ts.num_nodes - split_ts.num_samples
+        assert split_ts.num_nodes > ts.num_nodes
+        # all internal nodes should be split
+
+        assert num_new_internal_nodes == num_internal_nodes * 2
+        for node in split_ts.nodes():
+            if node.is_sample():
+                assert node.flags & tsdate.NODE_SPLIT_BY_PREPROCESS == 0
+            else:
+                assert node.flags & tsdate.NODE_SPLIT_BY_PREPROCESS != 0
+
+    def test_metadata_warning(self, caplog):
+        # Only sets extra metadata if schema is compatible
+        ts = tskit.Tree.generate_comb(5).tree_sequence
+        tables = ts.dump_tables()
+        tables.delete_intervals([[0.2, 0.8]])
+        tables.nodes.metadata_schema = tskit.MetadataSchema(
+            {
+                "codec": "struct",
+                "type": "object",
+                "properties": {},
+                "additionalProperties": False,
+            }
+        )
+        ts = tables.tree_sequence()
+        with caplog.at_level(logging.WARNING):
+            tsdate.util.split_disjoint_nodes(ts)
+            assert "Could not set 'unsplit_node_id'" in caplog.text
+
+        tables.nodes.metadata_schema = tskit.MetadataSchema(None)
+        tables.nodes.packset_metadata([b"xxx"] * ts.num_nodes)
+        ts = tables.tree_sequence()
+        tsdate.util.split_disjoint_nodes(ts)
+        assert "Could not set 'unsplit_node_id'" in caplog.text
+        for node in ts.nodes():
+            assert node.metadata == b"xxx"
+
+    def test_metadata(self):
+        ts = tskit.Tree.generate_comb(5).tree_sequence
+        tables = ts.dump_tables()
+        tables.delete_intervals([[0.2, 0.8]])
+        tables.nodes.metadata_schema = tskit.MetadataSchema.permissive_json()
+        tables.nodes.packset_metadata(
+            [
+                tables.nodes.metadata_schema.validate_and_encode_row(
+                    {"xxx": f"test{x}"}
+                )
+                for x in range(ts.num_nodes)
+            ]
+        )
+        tables.nodes.flags = tables.nodes.flags | 1 << 16
+        ts = tables.tree_sequence()
+        split_ts = tsdate.util.split_disjoint_nodes(ts)
+        is_nonsample = np.ones(split_ts.num_nodes, dtype=bool)
+        is_nonsample[split_ts.samples()] = False
+        _, counts = np.unique(split_ts.nodes_time[is_nonsample], return_counts=True)
+        assert np.all(counts == 2)
+        ids = {node.id: 0 for node in ts.nodes() if not node.is_sample()}
+        for node in split_ts.nodes():
+            if not node.is_sample():
+                assert "unsplit_node_id" in node.metadata
+                orig_node = ts.node(node.metadata["unsplit_node_id"])
+                assert "unsplit_node_id" not in orig_node.metadata
+                assert "xxx" in node.metadata
+                assert "xxx" in orig_node.metadata
+                assert node.metadata["xxx"] == orig_node.metadata["xxx"]
+                assert node.time == orig_node.time
+                assert node.flags == orig_node.flags | tsdate.NODE_SPLIT_BY_PREPROCESS
+                ids[orig_node.id] += 1
+        assert all([v == 2 for v in ids.values()])
+
+    def test_no_provenance(self):
+        ts = tskit.Tree.generate_comb(5).tree_sequence
+        split_ts = tsdate.util.split_disjoint_nodes(ts, record_provenance=False)
+        assert split_ts.num_provenances == ts.num_provenances
+        split_ts = tsdate.util.split_disjoint_nodes(ts, record_provenance=True)
+        assert split_ts.num_provenances == ts.num_provenances + 1
+
+
+class TestPreprocessTs:
+    def test_no_sites(self):
+        ts = tskit.Tree.generate_comb(3).tree_sequence
+        with pytest.raises(ValueError, match="no sites"):
+            ts = tsdate.preprocess_ts(ts)
+
+    def test_split_disjoint(self):
+        tables = tskit.Tree.generate_comb(5).tree_sequence.dump_tables()
+        tables.delete_intervals([[0.2, 0.8]])
+        tables.nodes.metadata_schema = tskit.MetadataSchema.permissive_json()
+        tables.sites.add_row(0.1, "A")
+        ts = tables.tree_sequence()
+        num_nonsample_nodes = ts.num_nodes - ts.num_samples
+        ts = tsdate.preprocess_ts(ts)
+        num_split_nonsample_nodes = ts.num_nodes - ts.num_samples
+        assert num_split_nonsample_nodes == 2 * num_nonsample_nodes
+
+    def test_no_split_disjoint(self):
+        tables = tskit.Tree.generate_comb(5).tree_sequence.dump_tables()
+        tables.delete_intervals([[0.2, 0.8]])
+        tables.sites.add_row(0.1, "A")
+        ts = tables.tree_sequence()
+        num_nodes = ts.num_nodes
+        ts = tsdate.preprocess_ts(ts, split_disjoint=False)
+        assert ts.num_nodes == num_nodes
+
+    def test_is_simplified(self):
+        tables = tskit.Tree.generate_comb(5).tree_sequence.dump_tables()
+        tables.simplify(np.arange(4), keep_unary=True)  # leaves a unary node
+        tables.sites.add_row(0.5, "A")
+        tables.populations.add_row()
+        tables.individuals.add_row()
+        ts = tables.tree_sequence()
+        tree = ts.first()
+        # Check there is a single unary node
+        assert sum(tree.num_children(u) == 1 for u in tree.nodes()) == 1
+        num_nodes = ts.num_nodes
+        num_populations = ts.num_populations
+        num_sites = ts.num_sites
+        num_individuals = ts.num_individuals
+        ts = tsdate.preprocess_ts(ts)
+        assert ts.num_nodes == num_nodes - 1  # Unary node removed
+        assert ts.num_populations == num_populations
+        assert ts.num_sites == num_sites
+        assert ts.num_individuals == num_individuals
+
+    def test_simplified_params_passed(self):
+        tables = tskit.Tree.generate_comb(3).tree_sequence.dump_tables()
+        tables.sites.add_row(0.5, "A")
+        tables.populations.add_row()
+        tables.individuals.add_row()
+        ts = tables.tree_sequence()
+        num_populations = ts.num_populations
+        num_individuals = ts.num_individuals
+        ts = tsdate.preprocess_ts(ts, filter_individuals=True)
+        assert ts.num_populations == num_populations
+        assert ts.num_individuals == num_individuals - 1
+
+    def test_record_provenance(self):
+        tables = tskit.Tree.generate_comb(3).tree_sequence.dump_tables()
+        tables.sites.add_row(0.5, "A")
+        ts = tables.tree_sequence()
+        num_provenances = ts.num_provenances
+        ts = tsdate.preprocess_ts(ts)
+        assert ts.num_provenances == num_provenances + 1
+        prov = json.loads(ts.provenance(-1).record)
+        assert prov["software"]["name"] == "tsdate"
+        assert prov["parameters"]["command"] == "preprocess_ts"
+        ts = tsdate.preprocess_ts(ts, record_provenance=False)
+        assert ts.num_provenances == num_provenances + 1
+
+    def test_trim_flanks(self):
+        tables = tskit.Tree.generate_comb(3, span=100).tree_sequence.dump_tables()
+        tables.sites.add_row(10, "A")
+        tables.sites.add_row(90, "A")
+        ts = tables.tree_sequence()
+        assert ts.sequence_length == 100
+        assert ts.num_trees == 1
+        ts = tsdate.preprocess_ts(ts)
+        assert ts.num_trees == 3
+        assert ts.first().num_edges == 0
+        assert ts.first().interval.right == 10 - 1
+        assert ts.last().num_edges == 0
+        assert ts.last().interval.left == 90 + 1
+
+    def test_sim_example(self):
+        # Test a larger example
+        ts = msprime.sim_ancestry(
+            20,
+            sequence_length=1e4,
+            recombination_rate=0.0005,
+            record_full_arg=True,
+            random_seed=1,
+        )
+        tables = msprime.sim_mutations(ts, rate=0.01, random_seed=1).dump_tables()
+        tables.nodes.metadata_schema = tskit.MetadataSchema.permissive_json()
+        ts = tables.tree_sequence()
+        num_nodes = ts.simplify().num_nodes
+        num_trees = ts.simplify().num_trees
+        assert num_trees > 50
+        ts = tsdate.preprocess_ts(ts)
+        assert ts.num_nodes > num_nodes  # Nodes added by split_disjoint
+        assert np.sum((ts.nodes_flags & tsdate.NODE_SPLIT_BY_PREPROCESS) != 0) > 0
+        first_empty = int(ts.first().num_edges == 0)
+        last_empty = int(ts.last().num_edges == 0)
+        # Next assumes no breakpoints before first site or after last
+        assert ts.num_trees == num_trees + first_empty + last_empty
+
+    # TODO - test minimum_gap param
diff --git a/tsdate/__init__.py b/tsdate/__init__.py
index ba79af6f..ef5b9f8c 100644
--- a/tsdate/__init__.py
+++ b/tsdate/__init__.py
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 from .cache import *  # NOQA: F401,F403
+from .constants import *  # NOQA
 from .core import date  # NOQA: F401
 from .core import inside_outside  # NOQA: F401
 from .core import maximization  # NOQA: F401
diff --git a/tsdate/constants.py b/tsdate/constants.py
new file mode 100644
index 00000000..a5946578
--- /dev/null
+++ b/tsdate/constants.py
@@ -0,0 +1,28 @@
+# MIT License
+#
+# Copyright (c) 2024 Tskit Developers
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Collection of constants used in tsdate. We also make use of constants defined in tskit.
+"""
+# Since tsdate is often used together with tsinfer, we avoid reusing the bits taken
+# by the tsinfer node flag constants here, and start from 1 << 30 rather than 1 << 16
+
+NODE_SPLIT_BY_PREPROCESS = 1 << 30
diff --git a/tsdate/util.py b/tsdate/util.py
index 7f8139a2..fdb10f00 100644
--- a/tsdate/util.py
+++ b/tsdate/util.py
@@ -31,6 +31,7 @@
 import tskit
 from numba.types import UniTuple as _unituple
 
+from . import constants
 from . import provenance
 from .approx import _b1r
 from .approx import _f
@@ -66,42 +67,53 @@ def preprocess_ts(
     *,
     minimum_gap=None,
     remove_telomeres=None,
+    delete_intervals=None,
+    split_disjoint=None,
     filter_populations=False,
     filter_individuals=False,
     filter_sites=False,
-    delete_intervals=None,
+    record_provenance=None,
     **kwargs,
 ):
     """
-    Function to prepare tree sequences for dating by removing gaps without sites and
-    simplifying the tree sequence. Large regions without data can cause
-    overflow/underflow errors in the inside-outside algorithm and poor performance more
-    generally. Removed regions are recorded in the provenance of the resulting tree
-    sequence.
+    Function to prepare tree sequences for dating by modifying the tree sequence
+    to increase the accuracy of dating. This can involve removing data-poor regions,
+    removing locally-unary segments of nodes via simplification, and splitting
+    discontinuous nodes.
 
     :param tskit.TreeSequence tree_sequence: The input tree sequence
         to be preprocessed.
     :param float minimum_gap: The minimum gap between sites to remove from the tree
-        sequence. Default: ``None`` treated as ``1000000``
+        sequence. Default: ``None`` treated as ``1000000``. Removed regions are recorded
+        in the provenance of the resulting tree sequence.
     :param bool remove_telomeres: Should all material before the first site and after the
         last site be removed, regardless of the length. Default: ``None`` treated as
         ``True``
-    :param bool filter_populations: parameter passed to the ``tskit.simplify``
-        command. Unlike calling that command directly, this defaults to ``False``, such
-        that all populations in the tree sequence are kept.
-    :param bool filter_individuals: parameter passed to the ``tskit.simplify``
-        command. Unlike calling that command directly, this defaults to ``False``, such
-        that all individuals in the tree sequence are kept
-    :param bool filter_sites: parameter passed to the ``tskit.simplify``
-        command. Unlike calling that command directly, this defaults to ``False``, such
-        that all sites in the tree sequence are kept
     :param array_like delete_intervals: A list (start, end) pairs describing the
         genomic intervals (gaps) to delete. This is usually left as ``None``
         (the default) in which case ``minimum_gap`` and ``remove_telomeres`` are used
         to determine the gaps to remove, and the calculated intervals are recorded in
         the provenance of the resulting tree sequence.
-    :param \\**kwargs: All further keyword arguments are passed to the ``tskit.simplify``
-        command.
+    :param bool split_disjoint: Run the :func:`split_disjoint_nodes` function
+        on the returned tree sequence, breaking any disjoint node into nodes that can
+        be dated separately (Default: ``None`` treated as ``True``).
+    :param bool filter_populations: parameter passed to the
+        :meth:`tskit.TreeSequence.simplify` command. Unlike calling that command
+        directly, this defaults to ``False``, such that all populations in the tree
+        sequence are kept.
+    :param bool filter_individuals: parameter passed to the
+        :meth:`tskit.TreeSequence.simplify` command. Unlike calling that command
+        directly, this defaults to ``False``, such
+        that all individuals in the tree sequence are kept.
+    :param bool filter_sites: parameter passed to the
+        :meth:`tskit.TreeSequence.simplify` command. Unlike calling that command
+        directly, this defaults to ``False``, such
+        that all sites in the tree sequence are kept.
+    :param bool record_provenance: If ``True``, record details of this call to
+        ``preprocess_ts`` in the returned tree sequence's provenance information
+        (Default: ``None`` treated as ``True``).
+    :param \\**kwargs: All further keyword arguments are passed to the
+        :meth:`tskit.TreeSequence.simplify` command.
 
     :return: A tree sequence with gaps removed.
     :rtype: tskit.TreeSequence
@@ -109,6 +121,10 @@ def preprocess_ts(
 
     logger.info("Beginning preprocessing")
     logger.info(f"Minimum_gap: {minimum_gap} and remove_telomeres: {remove_telomeres}")
+    if split_disjoint is None:
+        split_disjoint = True
+    if record_provenance is None:
+        record_provenance = True
     if delete_intervals is not None and (
         minimum_gap is not None or remove_telomeres is not None
     ):
@@ -177,17 +193,22 @@ def preprocess_ts(
             record_provenance=False,
             **kwargs,
         )
-    provenance.record_provenance(
-        tables,
-        "preprocess_ts",
-        minimum_gap=minimum_gap,
-        remove_telomeres=remove_telomeres,
-        filter_populations=filter_populations,
-        filter_individuals=filter_individuals,
-        filter_sites=filter_sites,
-        delete_intervals=delete_intervals,
-    )
-    return tables.tree_sequence()
+    if record_provenance:
+        provenance.record_provenance(
+            tables,
+            "preprocess_ts",
+            minimum_gap=minimum_gap,
+            remove_telomeres=remove_telomeres,
+            split_disjoint=split_disjoint,
+            filter_populations=filter_populations,
+            filter_individuals=filter_individuals,
+            filter_sites=filter_sites,
+            delete_intervals=delete_intervals,
+        )
+    ts = tables.tree_sequence()
+    if split_disjoint:
+        ts = split_disjoint_nodes(ts, record_provenance=False)
+    return ts
 
 
 def nodes_time_unconstrained(tree_sequence):
@@ -344,9 +365,43 @@ def mutation_span_array(tree_sequence):
     return mutation_spans, mutation_edges
 
 
-@numba.njit(_unituple(_i1w, 3)(_i1r, _i1r, _f1r, _f1r, _b1r))
+# Some functions for changing tskit metadata
+# See https://github.com/tskit-dev/tskit/discussions/2954
+# TODO - potentially possible to speed up using numba?
+def _reorder_nodes(node_table, order, extra_md_dict):
+    # extra_md_dict ({rowid: new_byte_metadata}) can be used to pass metadata to replace
+    # the existing metadata in a row. This works by creating new rows for the metadata,
+    # based on the algorithm in https://github.com/tskit-dev/tskit/discussions/2954
+    data = [node_table.metadata]
+    # add a list of new byte arrays, then concat
+    md_dtype, md_off_dtype = node_table.metadata.dtype, node_table.metadata_offset.dtype
+    data += [np.array(bytearray(v), dtype=md_dtype) for v in extra_md_dict.values()]
+    md = np.concatenate(data)
+    if len(md) == 0:  # Common edge case: no metadata
+        md_off = np.zeros(len(order) + 1, dtype=md_off_dtype)
+    else:
+        extra_offsets = np.cumsum([len(d) for d in data], dtype=md_off_dtype)[1:]
+        md_off = np.concatenate((node_table.metadata_offset, extra_offsets))
+        arr = tskit.unpack_arrays(md, md_off)
+        if len(extra_md_dict) > 0:
+            # map the keys in extra_md_dict to the new row ids
+            d = {k: i + node_table.num_rows for i, k in enumerate(extra_md_dict.keys())}
+            md, md_off = tskit.pack_arrays([arr[d.get(i, i)] for i in order], md_dtype)
+        else:
+            md, md_off = tskit.pack_arrays([arr[i] for i in order], md_dtype)
+    node_table.set_columns(
+        flags=node_table.flags[order],
+        time=node_table.time[order],
+        population=node_table.population[order],
+        individual=node_table.individual[order],
+        metadata=md,
+        metadata_offset=md_off,
+    )
+
+
+@numba.njit(_unituple(_i1w, 4)(_i1r, _i1r, _f1r, _f1r, _b1r))
 def _split_disjoint_nodes(
-    edges_parent, edges_child, edges_left, edges_right, nodes_exclude
+    edges_parent, edges_child, edges_left, edges_right, node_excluded
 ):
     """
     Split disconnected regions of nodes into separate nodes.
@@ -356,7 +411,7 @@ def _split_disjoint_nodes(
     """
     assert edges_parent.size == edges_child.size == edges_left.size == edges_right.size
     num_edges = edges_parent.size
-    num_nodes = nodes_exclude.size
+    num_nodes = node_excluded.size
 
     # For each edge, check whether parent/child is separated by a gap from the
     # previous edge involving either parent/child. Label disconnected segments
@@ -365,11 +420,11 @@ def _split_disjoint_nodes(
     # TODO: is a sort really needed here?
     edges_segments = np.full((2, num_edges), -1, dtype=np.int32)
     nodes_segments = np.full(num_nodes, -1, dtype=np.int32)
-    nodes_right = np.full(nodes_exclude.size, -np.inf, dtype=np.float64)
+    nodes_right = np.full(node_excluded.size, -np.inf, dtype=np.float64)
     for e in edges_order:
         nodes = edges_parent[e], edges_child[e]
         for i, n in enumerate(nodes):
-            if nodes_exclude[n]:
+            if node_excluded[n]:
                 continue
             nodes_segments[n] += edges_left[e] > nodes_right[n]
             edges_segments[i, e] = nodes_segments[n]
@@ -377,15 +432,18 @@ def _split_disjoint_nodes(
 
     # Create "nodes_segments[i]" supplementary nodes by copying node "i".
     # Store the id of the first supplement for each node in "nodes_map".
-    nodes_order = [i for i in range(num_nodes)]
+    split_nodes = []  # the nodes in the original that were split
     nodes_map = np.full(num_nodes, -1, dtype=np.int32)
     for i, s in enumerate(nodes_segments):
         for j in range(s):
             if j == 0:
                 nodes_map[i] = num_nodes
-            nodes_order.append(i)
+            split_nodes.append(i)
             num_nodes += 1
-    nodes_order = np.array(nodes_order, dtype=np.int32)
+    split_nodes = np.array(split_nodes, dtype=np.int32)
+    nodes_order = np.arange(num_nodes, dtype=np.int32)
+    if len(split_nodes) > 0:
+        nodes_order[-len(split_nodes) :] = split_nodes
 
     # Relabel the nodes on each edge given "nodes_map"
     for e in edges_order:
@@ -397,7 +455,7 @@ def _split_disjoint_nodes(
                 edges_segments[i, e] = n
     edges_parent, edges_child = edges_segments[0, ...], edges_segments[1, ...]
 
-    return edges_parent, edges_child, nodes_order
+    return edges_parent, edges_child, nodes_order, split_nodes
 
 
 @numba.njit(_i1w(_i1r, _f1r, _i1r, _i1r, _i1r, _f1r, _f1r, _i1r, _i1r))
@@ -458,30 +516,29 @@ def _relabel_mutations_node(
     return output
 
 
-# def _naive_relabel_mutations_node(ts, nodes_order, mutations_node):
-#     num_nodes = nodes_order.size
-#     new_node_id = np.full(num_nodes, tskit.NULL)
-#     for t in ts.trees():
-#         for n in t.nodes(): # mapping from original to new node ids
-#             new_node_id[nodes_order[n]] = n
-#         for m in t.mutations():
-#             mutations_node[m.id] = new_node_id[mutations_node[m.id]]
-#     return mutations_node
-
-
-def split_disjoint_nodes(ts):
+def split_disjoint_nodes(ts, *, record_provenance=None):
     """
     For each non-sample node, split regions separated by gaps into distinct
-    nodes.
+    nodes, returning a tree sequence with potentially duplicated nodes.
 
     Where there are multiple disconnected regions, the leftmost one is assigned
     the ID of the original node, and the remainder are assigned new node IDs.
     Population, flags, individual, time, and metadata are all copied into the
-    new nodes.
+    new nodes. Nodes that have been split will be flagged with
+    ``tsdate.NODE_SPLIT_BY_PREPROCESS``. The metadata of these nodes will also be
+    updated with an ``unsplit_node_id`` field giving the node ID in the input tree
+    sequence to which they correspond. If this metadata cannot be set, a warning
+    is emitted.
+
+    :param bool record_provenance: If ``True``, record details of this call in the
+        returned tree sequence's provenance information (Default: ``None`` treated
+        as ``True``).
     """
-
+    metadata_key = "unsplit_node_id"
+    if record_provenance is None:
+        record_provenance = True
     node_is_sample = np.bitwise_and(ts.nodes_flags, tskit.NODE_IS_SAMPLE).astype(bool)
-    edges_parent, edges_child, nodes_order = _split_disjoint_nodes(
+    edges_parent, edges_child, nodes_order, split_nodes = _split_disjoint_nodes(
         ts.edges_parent,
         ts.edges_child,
         ts.edges_left,
@@ -500,19 +557,25 @@ def split_disjoint_nodes(ts):
         ts.indexes_edge_insertion_order,
         ts.indexes_edge_removal_order,
     )
-
     tables = ts.dump_tables()
-    tables.nodes.set_columns(
-        flags=tables.nodes.flags[nodes_order],
-        time=tables.nodes.time[nodes_order],
-        population=tables.nodes.population[nodes_order],
-        individual=tables.nodes.individual[nodes_order],
-    )
-    # TODO: copy existing metadata for original nodes
-    # TODO: add new metadata indicating origin for split nodes
-    # TODO: add flag for split nodes
+
+    # Update the nodes table (complex because we have made new nodes)
+    flags = tables.nodes.flags
+    flags[split_nodes] |= constants.NODE_SPLIT_BY_PREPROCESS
+    tables.nodes.flags = flags
+    extra_md = {}
+    try:
+        for u in split_nodes:
+            md = ts.node(u).metadata
+            md[metadata_key] = int(u)
+            extra_md[u] = tables.nodes.metadata_schema.validate_and_encode_row(md)
+    except (TypeError, tskit.MetadataValidationError):
+        logger.warning(f"Could not set '{metadata_key}' on node metadata")
+    _reorder_nodes(tables.nodes, nodes_order, extra_md)
+    # Update the edges table
     tables.edges.parent = edges_parent
     tables.edges.child = edges_child
+    # Update the mutations table
     tables.mutations.node = mutations_node
     tables.sort()
 
@@ -520,6 +583,11 @@ def split_disjoint_nodes(ts):
         tables.nodes.time[tables.mutations.node], ts.nodes_time[ts.mutations_node]
     )
 
+    if record_provenance:
+        provenance.record_provenance(
+            tables,
+            "split_disjoint_nodes",
+        )
     return tables.tree_sequence()
 
 

From 13d0269f65a178882c44f933a4bb830e496ee338 Mon Sep 17 00:00:00 2001
From: Yan Wong <yan.wong@bdi.ox.ac.uk>
Date: Mon, 3 Jun 2024 18:36:27 +0100
Subject: [PATCH 2/2] Move old disjoint tests

And address other comments
---
 tests/test_functions.py | 79 --------------------------------------
 tests/test_util.py      | 85 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 78 insertions(+), 86 deletions(-)

diff --git a/tests/test_functions.py b/tests/test_functions.py
index 35ec918e..a6a3d7c7 100644
--- a/tests/test_functions.py
+++ b/tests/test_functions.py
@@ -55,7 +55,6 @@
 from tsdate.prior import SpansBySamples
 from tsdate.util import constrain_ages
 from tsdate.util import nodes_time_unconstrained
-from tsdate.util import split_disjoint_nodes
 
 
 class TestBasicFunctions:
@@ -2244,81 +2243,3 @@ def test_bad_arguments(self):
                 demography.to_natural_timescale(time)
             with pytest.raises(ValueError, match="a numpy array"):
                 demography.to_coalescent_timescale(time)
-
-
-class TestNodeSplitting:
-    """
-    Test that node splitting routines have the desired outcome
-    """
-
-    @staticmethod
-    def has_disjoint_nodes(ts):
-        """
-        Brute force check for disjoint nodes, by pulling out edge intervals for
-        each node; taking the union of intervals; checking that a single
-        interval remains
-        """
-
-        def merge_intervals(intervals):
-            intervals = sorted(intervals, key=lambda x: x[0])
-            result = []
-            (start_candidate, stop_candidate) = intervals[0]
-            for start, stop in intervals[1:]:
-                if start <= stop_candidate:
-                    stop_candidate = max(stop, stop_candidate)
-                else:
-                    result.append((start_candidate, stop_candidate))
-                    (start_candidate, stop_candidate) = (start, stop)
-            result.append((start_candidate, stop_candidate))
-            return result
-
-        intervals_by_node = {i: [] for i in range(ts.num_nodes)}
-        for e in ts.edges():
-            intervals_by_node[e.parent].append([e.left, e.right])
-            intervals_by_node[e.child].append([e.left, e.right])
-
-        for n in range(ts.num_nodes):
-            intr = merge_intervals(intervals_by_node[n])
-            if len(intr) != 1:
-                return True
-
-        return False
-
-    @staticmethod
-    def childset_changes_with_root(ts):
-        """
-        If root nodes are split whenever their children change, the next root
-        should have the same child set if it has the same ID
-        """
-        last_childset = frozenset()
-        last_root = tskit.NULL
-        for t in ts.trees():
-            if t.num_edges == 0:
-                last_childset = frozenset()
-                last_root = tskit.NULL
-            else:
-                if t.num_roots > 1:
-                    return False
-                childset = frozenset(list(t.children(t.root)))
-                if t.root == last_root and childset != last_childset:
-                    return False
-                last_childset = childset
-                last_root = t.root
-        return True
-
-    def test_split_disjoint_nodes(self):
-        ts = msprime.sim_ancestry(
-            10,
-            population_size=1e4,
-            recombination_rate=1e-8,
-            sequence_length=1e6,
-            random_seed=1,
-        )
-        ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=1)
-        sample_data = tsinfer.SampleData.from_tree_sequence(ts)
-        inferred_ts = tsinfer.infer(sample_data).simplify()
-        split_ts = split_disjoint_nodes(inferred_ts)
-        assert self.has_disjoint_nodes(inferred_ts)
-        assert not self.has_disjoint_nodes(split_ts)
-        assert split_ts.num_edges == inferred_ts.num_edges
-        assert split_ts.num_nodes > inferred_ts.num_nodes
diff --git a/tests/test_util.py b/tests/test_util.py
index b480eb66..6eb626fe 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -28,31 +28,85 @@
 import msprime
 import numpy as np
 import pytest
+import tsinfer
 import tskit
 
 import tsdate
 
 
 class TestSplitDisjointNodes:
+    @staticmethod
+    def has_disjoint_nodes(ts):
+        """
+        Brute force check for disjoint nodes: collect the edge intervals of each
+        node (as parent or child), merge overlapping or abutting ones, and
+        return True if any node with edges is left with more than one interval
+        """
+
+        def merge_intervals(intervals):
+            intervals = sorted(intervals, key=lambda x: x[0])
+            result = []
+            (start_candidate, stop_candidate) = intervals[0]
+            for start, stop in intervals[1:]:
+                if start <= stop_candidate:
+                    stop_candidate = max(stop, stop_candidate)
+                else:
+                    result.append((start_candidate, stop_candidate))
+                    (start_candidate, stop_candidate) = (start, stop)
+            result.append((start_candidate, stop_candidate))
+            return result
+
+        intervals_by_node = {i: [] for i in range(ts.num_nodes)}
+        for e in ts.edges():
+            intervals_by_node[e.parent].append([e.left, e.right])
+            intervals_by_node[e.child].append([e.left, e.right])
+
+        for node_intervals in intervals_by_node.values():
+            # Skip edgeless nodes: merge_intervals() cannot take an empty list
+            if node_intervals and len(merge_intervals(node_intervals)) != 1:
+                return True
+
+        return False
+
+    @staticmethod
+    def childset_changes_with_root(ts):
+        """
+        Return False if any tree with edges has several roots, or keeps the
+        previous tree's root ID but with a different child set; else True
+        """
+        prev_root, prev_children = tskit.NULL, frozenset()
+        for tree in ts.trees():
+            if tree.num_edges == 0:
+                # an empty tree resets the comparison state
+                prev_root, prev_children = tskit.NULL, frozenset()
+                continue
+            if tree.num_roots > 1:
+                return False
+            root = tree.root
+            # compare children as sets: their order is irrelevant
+            children = frozenset(tree.children(root))
+            if root == prev_root and children != prev_children:
+                return False
+            prev_root, prev_children = root, children
+        return True
+
     def test_nosplit(self):
         ts = tskit.Tree.generate_comb(5).tree_sequence
         split_ts = tsdate.util.split_disjoint_nodes(ts)
-        assert split_ts.num_nodes == ts.num_nodes
-        assert split_ts.num_edges == ts.num_edges
-        assert split_ts.num_trees == ts.num_trees
-        for node in split_ts.nodes():
-            assert node.flags & tsdate.NODE_SPLIT_BY_PREPROCESS == 0
+        assert ts.equals(split_ts, ignore_provenance=True)
         prov = json.loads(split_ts.provenance(-1).record)
         assert prov["software"]["name"] == "tsdate"
         assert prov["parameters"]["command"] == "split_disjoint_nodes"
 
-    def test_simple(self):
+    def test_simple(self, caplog):
         tables = tskit.Tree.generate_comb(5).tree_sequence.dump_tables()
         tables.delete_intervals([[0.2, 0.8]])
         tables.nodes.metadata_schema = tskit.MetadataSchema.permissive_json()
         ts = tables.tree_sequence()
         num_internal_nodes = ts.num_nodes - ts.num_samples
-        split_ts = tsdate.util.split_disjoint_nodes(ts)
+        with caplog.at_level(logging.WARNING):
+            split_ts = tsdate.util.split_disjoint_nodes(ts)
+            assert caplog.text == ""
         num_new_internal_nodes = split_ts.num_nodes - split_ts.num_samples
         assert split_ts.num_nodes > ts.num_nodes
         # all internal nodes should be split
@@ -131,6 +185,23 @@ def test_no_provenance(self):
         split_ts = tsdate.util.split_disjoint_nodes(ts, record_provenance=True)
         assert split_ts.num_provenances == ts.num_provenances + 1
 
+    def test_inferred(self):  # split_disjoint_nodes on tsinfer output
+        ts = msprime.sim_ancestry(
+            10,
+            population_size=1e4,
+            recombination_rate=1e-8,
+            sequence_length=1e6,
+            random_seed=1,
+        )
+        ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=1)
+        sample_data = tsinfer.SampleData.from_tree_sequence(ts)
+        inferred_ts = tsinfer.infer(sample_data).simplify()
+        split_ts = tsdate.util.split_disjoint_nodes(inferred_ts)
+        assert self.has_disjoint_nodes(inferred_ts)  # precondition for the test
+        assert not self.has_disjoint_nodes(split_ts)
+        assert split_ts.num_edges == inferred_ts.num_edges
+        assert split_ts.num_nodes > inferred_ts.num_nodes
+
 
 class TestPreprocessTs:
     def test_no_sites(self):