From 04dba55bc2df374cb8d0b63a144a3151fc2646fc Mon Sep 17 00:00:00 2001 From: George Powley Date: Tue, 1 Aug 2023 10:17:15 -0400 Subject: [PATCH] Enable ingestion QACheck when resume is enabled (#552) --- docker/Dockerfile-py | 3 ++- libtiledbvcf/src/write/writer.cc | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docker/Dockerfile-py b/docker/Dockerfile-py index 0fa7008bf..09c00eee8 100644 --- a/docker/Dockerfile-py +++ b/docker/Dockerfile-py @@ -50,7 +50,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins && rm -rf /var/lib/apt/lists* # avoid --home to prevent issues with singularity -RUN pip install --no-cache-dir cython 'pandas>=1.5.0,<2.0.0' tiledb==0.21.2 +# TODO: remove cython pin after updating the arrow version +RUN pip install --no-cache-dir 'cython<3' 'pandas>=1.5.0,<2.0.0' tiledb==0.21.2 # Build arrow ENV ARROW_HOME=/usr/local diff --git a/libtiledbvcf/src/write/writer.cc b/libtiledbvcf/src/write/writer.cc index f52bc2eba..922e5e5b9 100644 --- a/libtiledbvcf/src/write/writer.cc +++ b/libtiledbvcf/src/write/writer.cc @@ -499,11 +499,8 @@ void Writer::ingest_samples() { time_sec, records_ingested / time_sec)); - // Check records ingested matches total records in VCF files, unless resume - // is enabled because resume may not ingest all records in the VCF files - // (check not implemented for V2/V3) - if (dataset_->metadata().version >= TileDBVCFDataset::Version::V4 && - !ingestion_params_.resume_sample_partial_ingestion) { + // Check if records ingested matches the expected total record count. + if (dataset_->metadata().version >= TileDBVCFDataset::Version::V4) { if (records_ingested != total_records_expected_) { std::string message = fmt::format( "QACheck: [FAIL] Total records ingested ({}) != total records in VCF " @@ -828,6 +825,9 @@ std::pair Writer::ingest_samples_v4( // Remove the region if marked to skip if (skip) { LOG_DEBUG("Resume: skipping contig {}", contig); + // Remove records from the total expected record count. + total_records_expected_ -= total_contig_records[contig]; + it = regions_v4.erase(it); } else { it++;