From e0d7df9ac4d88ebfec9f34ad7d2ca01737948d65 Mon Sep 17 00:00:00 2001 From: wdecoster Date: Tue, 1 Nov 2022 00:12:17 +0100 Subject: [PATCH] use f64 for identities should solve weird >100 mean, see also https://github.com/wdecoster/cramino/issues/7 --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/extract_from_bam.rs | 13 ++++++------- src/feather.rs | 8 ++++---- src/histograms.rs | 4 ++-- src/main.rs | 4 ++-- 6 files changed, 16 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b41607..447fa2b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -383,7 +383,7 @@ checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" [[package]] name = "cramino" -version = "0.9.2" +version = "0.9.4" dependencies = [ "arrow", "checksums", diff --git a/Cargo.toml b/Cargo.toml index 63edbe7..8f1a398 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cramino" -version = "0.9.2" +version = "0.9.4" edition = "2021" authors = ["Wouter De Coster decosterwouter@gmail.com"] license = "MIT" diff --git a/src/extract_from_bam.rs b/src/extract_from_bam.rs index d94a949..54e5626 100644 --- a/src/extract_from_bam.rs +++ b/src/extract_from_bam.rs @@ -1,11 +1,10 @@ use bam::ext::BamRecordExtensions; use rust_htslib::bam::record::{Aux, Cigar}; -use rust_htslib::htslib; -use rust_htslib::{bam, bam::Read}; // for BAM_F* +use rust_htslib::{bam, bam::Read, htslib}; pub struct Data { pub lengths: Option>, - pub identities: Option>, + pub identities: Option>, pub tids: Option>, pub starts: Option>, pub ends: Option>, @@ -16,7 +15,7 @@ pub fn extract(bam_path: &String, threads: usize) -> Data { let mut bam = bam::Reader::from_path(&bam_path).expect("Error opening BAM.\n"); bam.set_threads(threads) .expect("Failure setting decompression threads"); - let (mut lengths, mut identities): (Vec, Vec) = bam + let (mut lengths, mut identities): (Vec, Vec) = bam .rc_records() .map(|r| r.expect("Failure parsing Bam file")) .filter(|read| read.flags() & (htslib::BAM_FUNMAP | htslib::BAM_FSECONDARY) as u16 == 0) @@ -101,9 +100,9 @@ pub fn extract_with_phase(bam_path: &String, threads: usize) -> Data { /// based on https://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity /// recent minimap2 version have that as the de tag /// if that is not present it is calculated from CIGAR and NM -fn gap_compressed_identity(record: std::rc::Rc) -> f32 { +fn gap_compressed_identity(record: std::rc::Rc) -> f64 { match get_de_tag(&record) { - Some(v) => v, + Some(v) => v as f64, None => { let mut matches = 0; let mut gap_size = 0; @@ -120,7 +119,7 @@ fn gap_compressed_identity(record: std::rc::Rc) -> f32 _ => (), } } - (get_nm_tag(&record) - gap_size + gap_count) as f32 / (matches + gap_count) as f32 + (get_nm_tag(&record) - gap_size + gap_count) as f64 / (matches + gap_count) as f64 } } } diff --git a/src/feather.rs b/src/feather.rs index 200f891..e9eec2d 100644 --- a/src/feather.rs +++ b/src/feather.rs @@ -4,20 +4,20 @@ use std::sync::Arc; use arrow::{ self, - array::{Float32Array, UInt64Array}, + array::{Float64Array, UInt64Array}, ipc::writer::FileWriter, record_batch::RecordBatch, }; -pub fn save_as_arrow(filename: String, lengths: Vec, identities: Vec) { - let identities_array = Arc::new(Float32Array::from(identities)) as _; +pub fn save_as_arrow(filename: String, lengths: Vec, identities: Vec) { + let identities_array = Arc::new(Float64Array::from(identities)) as _; let lengths_array = Arc::new(UInt64Array::from(lengths)) as _; let batch = RecordBatch::try_from_iter([("identities", identities_array), ("lengths", lengths_array)]) .unwrap(); let schema = Schema::new(vec![ - Field::new("identities", DataType::Float32, false), + Field::new("identities", DataType::Float64, false), Field::new("lengths", DataType::UInt64, false), ]); let buffer = File::create(filename).expect("create file error"); diff --git a/src/histograms.rs b/src/histograms.rs index de20f12..a682db2 100644 --- a/src/histograms.rs +++ b/src/histograms.rs @@ -9,7 +9,7 @@ pub fn make_histogram_lengths(array: &[u64]) { println!("\n\n# Histogram for lengths\n{}", histogram); } -pub fn make_histogram_identities(array: &[f32]) { +pub fn make_histogram_identities(array: &[f64]) { let bins = 100.0 - array .iter() @@ -17,7 +17,7 @@ pub fn make_histogram_identities(array: &[f32]) { .unwrap(); let mut histogram = Histogram::with_buckets(bins as u64, None); for value in array.iter() { - histogram.add(f64::from(*value)); + histogram.add(*value); } println!("\n\n# Histogram for identities\n{}", histogram); } diff --git a/src/main.rs b/src/main.rs index 15106be..d1dfa0f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -130,7 +130,7 @@ fn metrics_from_bam( } } -fn generate_main_output(lengths: &Vec, identities: &[f32], genome_size: u64) { +fn generate_main_output(lengths: &Vec, identities: &[f64], genome_size: u64) { let num_reads = lengths.len(); if num_reads < 2 { error!("Not enough reads to calculate metrics!"); @@ -149,7 +149,7 @@ fn generate_main_output(lengths: &Vec, identities: &[f32], genome_size: u64 println!("Median identity\t{:.2}", calculations::median(identities)); println!( "Mean identity\t{:.2}", - identities.iter().sum::() / num_reads as f32 + identities.iter().sum::() / (num_reads as f64) ); }