Skip to content

Commit

Permalink
use f64 for identities
Browse files Browse the repository at this point in the history
should solve weird >100 mean, see also #7
  • Loading branch information
wdecoster committed Oct 31, 2022
1 parent 467a142 commit e0d7df9
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "cramino"
version = "0.9.2"
version = "0.9.4"
edition = "2021"
authors = ["Wouter De Coster [email protected]"]
license = "MIT"
Expand Down
13 changes: 6 additions & 7 deletions src/extract_from_bam.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
use bam::ext::BamRecordExtensions;
use rust_htslib::bam::record::{Aux, Cigar};
use rust_htslib::htslib;
use rust_htslib::{bam, bam::Read}; // for BAM_F*
use rust_htslib::{bam, bam::Read, htslib};

pub struct Data {
pub lengths: Option<Vec<u64>>,
pub identities: Option<Vec<f32>>,
pub identities: Option<Vec<f64>>,
pub tids: Option<Vec<i32>>,
pub starts: Option<Vec<i64>>,
pub ends: Option<Vec<i64>>,
Expand All @@ -16,7 +15,7 @@ pub fn extract(bam_path: &String, threads: usize) -> Data {
let mut bam = bam::Reader::from_path(&bam_path).expect("Error opening BAM.\n");
bam.set_threads(threads)
.expect("Failure setting decompression threads");
let (mut lengths, mut identities): (Vec<u64>, Vec<f32>) = bam
let (mut lengths, mut identities): (Vec<u64>, Vec<f64>) = bam
.rc_records()
.map(|r| r.expect("Failure parsing Bam file"))
.filter(|read| read.flags() & (htslib::BAM_FUNMAP | htslib::BAM_FSECONDARY) as u16 == 0)
Expand Down Expand Up @@ -101,9 +100,9 @@ pub fn extract_with_phase(bam_path: &String, threads: usize) -> Data {
/// based on https://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity
/// recent minimap2 version have that as the de tag
/// if that is not present it is calculated from CIGAR and NM
fn gap_compressed_identity(record: std::rc::Rc<rust_htslib::bam::Record>) -> f32 {
fn gap_compressed_identity(record: std::rc::Rc<rust_htslib::bam::Record>) -> f64 {
match get_de_tag(&record) {
Some(v) => v,
Some(v) => v as f64,
None => {
let mut matches = 0;
let mut gap_size = 0;
Expand All @@ -120,7 +119,7 @@ fn gap_compressed_identity(record: std::rc::Rc<rust_htslib::bam::Record>) -> f32
_ => (),
}
}
(get_nm_tag(&record) - gap_size + gap_count) as f32 / (matches + gap_count) as f32
(get_nm_tag(&record) - gap_size + gap_count) as f64 / (matches + gap_count) as f64
}
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/feather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,20 @@ use std::sync::Arc;

use arrow::{
self,
array::{Float32Array, UInt64Array},
array::{Float64Array, UInt64Array},
ipc::writer::FileWriter,
record_batch::RecordBatch,
};

pub fn save_as_arrow(filename: String, lengths: Vec<u64>, identities: Vec<f32>) {
let identities_array = Arc::new(Float32Array::from(identities)) as _;
pub fn save_as_arrow(filename: String, lengths: Vec<u64>, identities: Vec<f64>) {
let identities_array = Arc::new(Float64Array::from(identities)) as _;
let lengths_array = Arc::new(UInt64Array::from(lengths)) as _;
let batch =
RecordBatch::try_from_iter([("identities", identities_array), ("lengths", lengths_array)])
.unwrap();

let schema = Schema::new(vec![
Field::new("identities", DataType::Float32, false),
Field::new("identities", DataType::Float64, false),
Field::new("lengths", DataType::UInt64, false),
]);
let buffer = File::create(filename).expect("create file error");
Expand Down
4 changes: 2 additions & 2 deletions src/histograms.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@ pub fn make_histogram_lengths(array: &[u64]) {
println!("\n\n# Histogram for lengths\n{}", histogram);
}

pub fn make_histogram_identities(array: &[f32]) {
pub fn make_histogram_identities(array: &[f64]) {
let bins = 100.0
- array
.iter()
.min_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Less))
.unwrap();
let mut histogram = Histogram::with_buckets(bins as u64, None);
for value in array.iter() {
histogram.add(f64::from(*value));
histogram.add(*value);
}
println!("\n\n# Histogram for identities\n{}", histogram);
}
Expand Down
4 changes: 2 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ fn metrics_from_bam(
}
}

fn generate_main_output(lengths: &Vec<u64>, identities: &[f32], genome_size: u64) {
fn generate_main_output(lengths: &Vec<u64>, identities: &[f64], genome_size: u64) {
let num_reads = lengths.len();
if num_reads < 2 {
error!("Not enough reads to calculate metrics!");
Expand All @@ -149,7 +149,7 @@ fn generate_main_output(lengths: &Vec<u64>, identities: &[f32], genome_size: u64
println!("Median identity\t{:.2}", calculations::median(identities));
println!(
"Mean identity\t{:.2}",
identities.iter().sum::<f32>() / num_reads as f32
identities.iter().sum::<f64>() / (num_reads as f64)
);
}

Expand Down

0 comments on commit e0d7df9

Please sign in to comment.