Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft bed2zarr code #281

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bio2zarr/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def bio2zarr():
# is handy for development and for those whose PATHs aren't set
# up in the right way.
bio2zarr.add_command(cli.vcf2zarr_main)
bio2zarr.add_command(cli.bed2zarr_main)
bio2zarr.add_command(cli.plink2zarr)
bio2zarr.add_command(cli.vcfpartition)

Expand Down
229 changes: 229 additions & 0 deletions bio2zarr/bed2zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
import dataclasses
import logging
import math
from enum import Enum
from typing import Any

import numcodecs
import numpy as np
import pandas as pd
import zarr

from . import core, provenance

logger = logging.getLogger(__name__)

DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
BED_ZARR_VERSION = 0.1


class BedType(Enum):
    """Supported BED file variants, keyed by column count.

    BED10 and BED11 have no members because they are prohibited by the
    BED specification.  BED9 and BED12 are defined here but are rejected
    by guess_bed_file_type as currently unsupported.
    """

    BED3 = 3
    BED4 = 4
    BED5 = 5
    BED6 = 6
    BED7 = 7
    BED8 = 8
    BED9 = 9
    BED12 = 12


@dataclasses.dataclass
class BedFieldSummary(core.JsonDataclass):
    """Running summary statistics for a single BED column."""

    num_chunks: int = 0
    compressed_size: int = 0
    uncompressed_size: int = 0
    max_value: Any = -math.inf
    min_value: Any = math.inf

    def update(self, other):
        """Fold *other* into this summary.

        Chunk count and sizes are taken from *other* (not accumulated),
        while the value bounds are merged.
        """
        for attr in ("num_chunks", "compressed_size", "uncompressed_size"):
            setattr(self, attr, getattr(other, attr))
        self.min_value = min(self.min_value, other.min_value)
        self.max_value = max(self.max_value, other.max_value)

    @staticmethod
    def fromdict(d):
        """Reconstruct a summary from a plain dict (e.g. parsed JSON)."""
        return BedFieldSummary(**d)


@dataclasses.dataclass
class BedField:
    """Metadata for one BED column, including its running value summary."""

    category: str
    name: str
    alias: str
    description: str
    bed_dtype: str
    summary: BedFieldSummary

    def smallest_dtype(self):
        """Return the most compact dtype for this field, chosen from its
        declared BED type and the observed value bounds in the summary."""
        bounds = self.summary
        if self.bed_dtype == "Integer":
            # No finite maximum recorded means no values were seen;
            # fall back to a single signed byte.
            if not math.isfinite(bounds.max_value):
                return "i1"
            return core.min_int_dtype(bounds.min_value, bounds.max_value)
        if self.bed_dtype == "Character":
            return "U1"
        if self.bed_dtype == "Category":
            return core.min_int_dtype(bounds.min_value, bounds.max_value)
        assert self.bed_dtype == "String"
        return "O"


def guess_bed_file_type(path):
    """Infer the BED variant of *path* from its number of columns.

    Only the first data row is read.  Raises ValueError for column
    counts outside 3..12, for the prohibited BED10/BED11 layouts, and
    for the currently unsupported BED9/BED12 formats.
    """
    num_cols = pd.read_table(path, header=None, nrows=1).shape[1]
    if num_cols < 3:
        raise ValueError(f"Expected at least 3 columns in BED file, got {num_cols}")
    elif num_cols > 12:
        raise ValueError(f"Expected at most 12 columns in BED file, got {num_cols}")
    elif num_cols in (10, 11):
        raise ValueError(f"BED10 and BED11 are prohibited, got {num_cols} columns")
    elif num_cols in (9, 12):
        raise ValueError("BED9 and BED12 are valid but currently unsupported formats")
    return BedType(num_cols)


# See https://samtools.github.io/hts-specs/BEDv1.pdf
def mandatory_bed_field_definitions():
    """Return BedField definitions for the three columns that every BED
    file must contain (chrom, chromStart, chromEnd)."""
    specs = [
        ("contig", "chrom", "Category", "Chromosome name"),
        ("start", "chromStart", "Integer", "Name start position"),
        ("end", "chromEnd", "Integer", "Name end position"),
    ]
    return [
        BedField(
            category="mandatory",
            name=name,
            alias=alias,
            description=description,
            bed_dtype=bed_dtype,
            summary=BedFieldSummary(),
        )
        for name, alias, bed_dtype, description in specs
    ]


def optional_bed_field_definitions(num_fields=0):
    """Return BedField definitions for the first *num_fields* optional
    BED columns, in specification order."""
    specs = [
        ("name", "name", "Category", "Name description"),
        ("score", "score", "Integer", "A numerical value"),
        ("strand", "strand", "Character", "Name strand"),
        ("thickStart", "thickStart", "Integer", "Thick start position"),
        ("thickEnd", "thickEnd", "Integer", "Thick end position"),
        ("itemRgb", "itemRgb", "Integer", "Display"),
        ("blockCount", "blockCount", "Integer", "Number of blocks"),
        ("blockSizes", "blockSizes", "Integer", "Block sizes"),
        ("blockStarts", "blockStarts", "Integer", "Block start positions"),
    ]
    return [
        BedField(
            category="optional",
            name=name,
            alias=alias,
            description=description,
            bed_dtype=bed_dtype,
            summary=BedFieldSummary(),
        )
        for name, alias, bed_dtype, description in specs[:num_fields]
    ]


def mkfields(bed_type):
    """Build the full list of field definitions for *bed_type*: the three
    mandatory fields plus however many optional columns the type adds."""
    num_optional = bed_type.value - BedType.BED3.value
    return mandatory_bed_field_definitions() + optional_bed_field_definitions(
        num_optional
    )


def encode_categoricals(data, bed_type):
    """Replace the contig (and, from BED4 on, name) columns in *data* with
    integer category codes.

    Returns the modified data frame together with the fixed-width unicode
    label arrays for contigs and names (name_id is None below BED4).
    """

    def categorize(column):
        # Categories are kept in first-appearance order, not sorted.
        cat = pd.Categorical(column, categories=column.unique(), ordered=True)
        labels = cat.categories.values
        width = len(max(labels, key=len))
        return cat.codes, labels.astype(f"<U{width}")

    data["contig"], contig_id = categorize(data["contig"])
    if bed_type.value >= BedType.BED4.value:
        data["name"], name_id = categorize(data["name"])
    else:
        name_id = None
    return data, contig_id, name_id


def update_field_bounds(data, bed_type):
    """Return field definitions for *bed_type* with each summary's
    min_value/max_value set from the observed values in *data*."""
    updated = []
    for field in mkfields(bed_type):
        summary = field.summary
        if field.bed_dtype == "Integer":
            if field.name in ("itemRgb", "blockSizes", "blockStarts"):
                column = data[field.name]
                # List-valued columns arrive as object dtype; flatten
                # them before taking bounds.
                values = np.concatenate(column) if column.dtype == "O" else column
                summary.min_value = min(values)
                summary.max_value = max(values)
            else:
                summary.min_value = data[field.name].min()
                summary.max_value = data[field.name].max()
        elif field.bed_dtype == "Category":
            # Category codes start at 0 and run to the number of
            # distinct values minus one.
            summary.min_value = 0
            summary.max_value = max(data[field.name].unique())
        updated.append(field)
    return updated


def bed2zarr(
    bed_path,
    zarr_path,
    records_chunk_size=None,
):
    """Convert the BED file at *bed_path* into a Zarr store at *zarr_path*.

    One Zarr array is written per BED column, chunked along the records
    dimension, plus lookup arrays mapping the categorical contig (and,
    from BED4 on, name) codes back to their string labels.
    """
    bed_type = guess_bed_file_type(bed_path)
    fields = mkfields(bed_type)
    data = pd.read_table(bed_path, header=None, names=[f.name for f in fields])
    data, contig_id, name_id = encode_categoricals(data, bed_type)
    # Recompute field summaries so each column can be stored with the
    # smallest dtype that holds its values.
    fields = update_field_bounds(data, bed_type)
    data.index.name = "records"
    data = data.astype({f.name: f.smallest_dtype() for f in fields})
    root = zarr.group(store=zarr.DirectoryStore(zarr_path))
    root.attrs.update(
        {
            "bed_zarr_version": f"{BED_ZARR_VERSION}",
            "source": f"bio2zarr-{provenance.__version__}",
        }
    )
    for field in fields[: bed_type.value]:
        # Strand is forced to a fixed single-character unicode dtype;
        # everything else keeps the dtype chosen above.
        extra = {"dtype": "<U1"} if field.name == "strand" else {}
        root.array(
            field.name,
            data[field.name].values,
            chunks=(records_chunk_size,),
            **extra,
        )
    root.array("contig_id", contig_id, chunks=(len(contig_id),))
    if bed_type.value >= BedType.BED4.value:
        root.array("name_id", name_id, chunks=(len(name_id),))
43 changes: 42 additions & 1 deletion bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numcodecs
import tabulate

from . import plink, provenance, vcf2zarr, vcf_utils
from . import bed2zarr, plink, provenance, vcf2zarr, vcf_utils
from .vcf2zarr import icf as icf_mod

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -128,6 +128,14 @@ def list_commands(self, ctx):
help="Chunk size in the samples dimension",
)

# Shared CLI option controlling chunking along the records dimension.
records_chunk_size = click.option(
    "-r",
    "--records-chunk-size",
    type=int,
    default=None,
    help="Chunk size in the records dimension",
)

schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True))

max_variant_chunks = click.option(
Expand Down Expand Up @@ -574,6 +582,39 @@ def plink2zarr():
plink2zarr.add_command(convert_plink)


@click.command
@version
@click.argument(
    "bed_path",
    type=click.Path(exists=True, dir_okay=False),
)
@new_zarr_path
@records_chunk_size
@verbose
@force
def bed2zarr_main(
    bed_path,
    zarr_path,
    records_chunk_size,
    verbose,
    force,
):
    """
    Convert BED file to the Zarr format. Each BED column will be
    converted to a Zarr array with appropriate encoding.

    The BED file must be compressed and tabix-indexed. BED9 and BED12
    formats are currently not supported.
    """
    # NOTE(review): the help text above says the input must be compressed
    # and tabix-indexed, but bed2zarr reads the file with pandas and never
    # consults an index -- confirm whether that requirement is real.
    setup_logging(verbose)
    # Refuse to clobber an existing Zarr directory unless --force is given.
    check_overwrite_dir(zarr_path, force)
    bed2zarr.bed2zarr(
        bed_path,
        zarr_path,
        records_chunk_size=records_chunk_size,
    )


@click.command
@version
@vcfs
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dependencies = [
# colouredlogs pulls in humanfriendly",
"cyvcf2",
"bed_reader",
"pandas",
]
requires-python = ">=3.9"
classifiers = [
Expand All @@ -45,6 +46,7 @@ repository = "https://github.com/sgkit-dev/bio2zarr"
documentation = "https://sgkit-dev.github.io/bio2zarr/"

[project.scripts]
bed2zarr = "bio2zarr.cli:bed2zarr_main"
vcf2zarr = "bio2zarr.cli:vcf2zarr_main"
vcfpartition = "bio2zarr.cli:vcfpartition"

Expand Down
Binary file not shown.
Binary file not shown.
Binary file added tests/data/bed/sample_mask.bed.gz
Binary file not shown.
Binary file added tests/data/bed/sample_mask.bed.gz.csi
Binary file not shown.
Loading
Loading