Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft bed2zarr code #281

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bio2zarr/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def bio2zarr():
# is handy for development and for those whose PATHs aren't set
# up in the right way.
bio2zarr.add_command(cli.vcf2zarr_main)
bio2zarr.add_command(cli.bed2zarr_main)
bio2zarr.add_command(cli.plink2zarr)
bio2zarr.add_command(cli.vcfpartition)

Expand Down
229 changes: 229 additions & 0 deletions bio2zarr/bed2zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
import dataclasses
import logging
import math
from enum import Enum
from typing import Any

import numcodecs
import numpy as np
import pandas as pd
import zarr

from . import core, provenance

logger = logging.getLogger(__name__)

DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
BED_ZARR_VERSION = 0.1


class BedType(Enum):
    """Supported BED file variants, keyed by column count.

    BED10 and BED11 have no members because they are prohibited by the
    BED specification.  BED9 and BED12 are defined here but are rejected
    by guess_bed_file_type as currently unsupported.
    """

    BED3 = 3
    BED4 = 4
    BED5 = 5
    BED6 = 6
    BED7 = 7
    BED8 = 8
    BED9 = 9
    BED12 = 12


@dataclasses.dataclass
class BedFieldSummary(core.JsonDataclass):
    """Running summary statistics for a single BED column."""

    num_chunks: int = 0
    compressed_size: int = 0
    uncompressed_size: int = 0
    max_value: Any = -math.inf
    min_value: Any = math.inf

    def update(self, other):
        """Fold *other* into this summary.

        Chunk count and sizes are taken from *other* (not accumulated),
        while the value bounds are merged.
        """
        for attr in ("num_chunks", "compressed_size", "uncompressed_size"):
            setattr(self, attr, getattr(other, attr))
        self.min_value = min(self.min_value, other.min_value)
        self.max_value = max(self.max_value, other.max_value)

    @staticmethod
    def fromdict(d):
        """Reconstruct a summary from a plain dict (e.g. parsed JSON)."""
        return BedFieldSummary(**d)


@dataclasses.dataclass
class BedField:
    """Metadata for one BED column, including its running value summary."""

    category: str
    name: str
    alias: str
    description: str
    bed_dtype: str
    summary: BedFieldSummary

    def smallest_dtype(self):
        """Return the most compact dtype for this field, chosen from its
        declared BED type and the observed value bounds in the summary."""
        bounds = self.summary
        if self.bed_dtype == "Integer":
            # No finite maximum recorded means no values were seen;
            # fall back to a single signed byte.
            if not math.isfinite(bounds.max_value):
                return "i1"
            return core.min_int_dtype(bounds.min_value, bounds.max_value)
        if self.bed_dtype == "Character":
            return "U1"
        if self.bed_dtype == "Category":
            return core.min_int_dtype(bounds.min_value, bounds.max_value)
        assert self.bed_dtype == "String"
        return "O"


def guess_bed_file_type(path):
    """Infer the BED variant of *path* from its number of columns.

    Only the first data row is read.  Raises ValueError for column
    counts outside 3..12, for the prohibited BED10/BED11 layouts, and
    for the currently unsupported BED9/BED12 formats.
    """
    num_cols = pd.read_table(path, header=None, nrows=1).shape[1]
    if num_cols < 3:
        raise ValueError(f"Expected at least 3 columns in BED file, got {num_cols}")
    elif num_cols > 12:
        raise ValueError(f"Expected at most 12 columns in BED file, got {num_cols}")
    elif num_cols in (10, 11):
        raise ValueError(f"BED10 and BED11 are prohibited, got {num_cols} columns")
    elif num_cols in (9, 12):
        raise ValueError("BED9 and BED12 are valid but currently unsupported formats")
    return BedType(num_cols)


# See https://samtools.github.io/hts-specs/BEDv1.pdf
def mandatory_bed_field_definitions():
    """Return BedField definitions for the three columns that every BED
    file must contain (chrom, chromStart, chromEnd)."""
    specs = [
        ("contig", "chrom", "Category", "Chromosome name"),
        ("start", "chromStart", "Integer", "Name start position"),
        ("end", "chromEnd", "Integer", "Name end position"),
    ]
    return [
        BedField(
            category="mandatory",
            name=name,
            alias=alias,
            description=description,
            bed_dtype=bed_dtype,
            summary=BedFieldSummary(),
        )
        for name, alias, bed_dtype, description in specs
    ]


def optional_bed_field_definitions(num_fields=0):
    """Return BedField definitions for the first *num_fields* optional
    BED columns, in specification order."""
    specs = [
        ("name", "name", "Category", "Name description"),
        ("score", "score", "Integer", "A numerical value"),
        ("strand", "strand", "Character", "Name strand"),
        ("thickStart", "thickStart", "Integer", "Thick start position"),
        ("thickEnd", "thickEnd", "Integer", "Thick end position"),
        ("itemRgb", "itemRgb", "Integer", "Display"),
        ("blockCount", "blockCount", "Integer", "Number of blocks"),
        ("blockSizes", "blockSizes", "Integer", "Block sizes"),
        ("blockStarts", "blockStarts", "Integer", "Block start positions"),
    ]
    return [
        BedField(
            category="optional",
            name=name,
            alias=alias,
            description=description,
            bed_dtype=bed_dtype,
            summary=BedFieldSummary(),
        )
        for name, alias, bed_dtype, description in specs[:num_fields]
    ]


def mkfields(bed_type):
    """Build the full list of field definitions for *bed_type*: the three
    mandatory fields plus however many optional columns the type adds."""
    num_optional = bed_type.value - BedType.BED3.value
    return mandatory_bed_field_definitions() + optional_bed_field_definitions(
        num_optional
    )


def encode_categoricals(data, bed_type):
    """Replace the contig (and, from BED4 on, name) columns in *data* with
    integer category codes.

    Returns the modified data frame together with the fixed-width unicode
    label arrays for contigs and names (name_id is None below BED4).
    """

    def categorize(column):
        # Categories are kept in first-appearance order, not sorted.
        cat = pd.Categorical(column, categories=column.unique(), ordered=True)
        labels = cat.categories.values
        width = len(max(labels, key=len))
        return cat.codes, labels.astype(f"<U{width}")

    data["contig"], contig_id = categorize(data["contig"])
    if bed_type.value >= BedType.BED4.value:
        data["name"], name_id = categorize(data["name"])
    else:
        name_id = None
    return data, contig_id, name_id


def update_field_bounds(data, bed_type):
    """Return field definitions for *bed_type* with each summary's
    min_value/max_value set from the observed values in *data*."""
    updated = []
    for field in mkfields(bed_type):
        summary = field.summary
        if field.bed_dtype == "Integer":
            if field.name in ("itemRgb", "blockSizes", "blockStarts"):
                column = data[field.name]
                # List-valued columns arrive as object dtype; flatten
                # them before taking bounds.
                values = np.concatenate(column) if column.dtype == "O" else column
                summary.min_value = min(values)
                summary.max_value = max(values)
            else:
                summary.min_value = data[field.name].min()
                summary.max_value = data[field.name].max()
        elif field.bed_dtype == "Category":
            # Category codes start at 0 and run to the number of
            # distinct values minus one.
            summary.min_value = 0
            summary.max_value = max(data[field.name].unique())
        updated.append(field)
    return updated


def bed2zarr(
    bed_path,
    zarr_path,
    records_chunk_size=None,
):
    """Convert the BED file at *bed_path* into a Zarr store at *zarr_path*.

    One Zarr array is written per BED column, chunked along the records
    dimension, plus lookup arrays mapping the categorical contig (and,
    from BED4 on, name) codes back to their string labels.
    """
    bed_type = guess_bed_file_type(bed_path)
    fields = mkfields(bed_type)
    data = pd.read_table(bed_path, header=None, names=[f.name for f in fields])
    data, contig_id, name_id = encode_categoricals(data, bed_type)
    # Recompute field summaries so each column can be stored with the
    # smallest dtype that holds its values.
    fields = update_field_bounds(data, bed_type)
    data.index.name = "records"
    data = data.astype({f.name: f.smallest_dtype() for f in fields})
    root = zarr.group(store=zarr.DirectoryStore(zarr_path))
    root.attrs.update(
        {
            "bed_zarr_version": f"{BED_ZARR_VERSION}",
            "source": f"bio2zarr-{provenance.__version__}",
        }
    )
    for field in fields[: bed_type.value]:
        # Strand is forced to a fixed single-character unicode dtype;
        # everything else keeps the dtype chosen above.
        extra = {"dtype": "<U1"} if field.name == "strand" else {}
        root.array(
            field.name,
            data[field.name].values,
            chunks=(records_chunk_size,),
            **extra,
        )
    root.array("contig_id", contig_id, chunks=(len(contig_id),))
    if bed_type.value >= BedType.BED4.value:
        root.array("name_id", name_id, chunks=(len(name_id),))
43 changes: 42 additions & 1 deletion bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numcodecs
import tabulate

from . import plink, provenance, vcf2zarr, vcf_utils
from . import bed2zarr, plink, provenance, vcf2zarr, vcf_utils
from .vcf2zarr import icf as icf_mod

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -128,6 +128,14 @@ def list_commands(self, ctx):
help="Chunk size in the samples dimension",
)

# Shared CLI option controlling chunking along the records dimension.
records_chunk_size = click.option(
    "-r",
    "--records-chunk-size",
    type=int,
    default=None,
    help="Chunk size in the records dimension",
)

schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True))

max_variant_chunks = click.option(
Expand Down Expand Up @@ -574,6 +582,39 @@ def plink2zarr():
plink2zarr.add_command(convert_plink)


@click.command
@version
@click.argument(
    "bed_path",
    type=click.Path(exists=True, dir_okay=False),
)
@new_zarr_path
@records_chunk_size
@verbose
@force
def bed2zarr_main(
    bed_path,
    zarr_path,
    records_chunk_size,
    verbose,
    force,
):
    """
    Convert BED file to the Zarr format. Each BED column will be
    converted to a Zarr array with appropriate encoding.

    The BED file must be compressed and tabix-indexed. BED9 and BED12
    formats are currently not supported.
    """
    # NOTE(review): the help text above says the input must be compressed
    # and tabix-indexed, but bed2zarr reads the file with pandas and never
    # consults an index -- confirm whether that requirement is real.
    setup_logging(verbose)
    # Refuse to clobber an existing Zarr directory unless --force is given.
    check_overwrite_dir(zarr_path, force)
    bed2zarr.bed2zarr(
        bed_path,
        zarr_path,
        records_chunk_size=records_chunk_size,
    )


@click.command
@version
@vcfs
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dependencies = [
# colouredlogs pulls in humanfriendly",
"cyvcf2",
"bed_reader",
"pandas",
]
requires-python = ">=3.9"
classifiers = [
Expand All @@ -45,6 +46,7 @@ repository = "https://github.com/sgkit-dev/bio2zarr"
documentation = "https://sgkit-dev.github.io/bio2zarr/"

[project.scripts]
bed2zarr = "bio2zarr.cli:bed2zarr_main"
vcf2zarr = "bio2zarr.cli:vcf2zarr_main"
vcfpartition = "bio2zarr.cli:vcfpartition"

Expand Down
Binary file not shown.
Binary file not shown.
Binary file added tests/data/bed/sample_mask.bed.gz
Binary file not shown.
Binary file added tests/data/bed/sample_mask.bed.gz.csi
Binary file not shown.
Loading
Loading