Skip to content

Commit

Permalink
fix: replace HashMap with FxHashMap for improved performance in multiple modules
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreaGuarracino committed Dec 28, 2024
1 parent 469694b commit 5956725
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 31 deletions.
23 changes: 15 additions & 8 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ regex = "1.11.1"
log = "0.4.22"
env_logger = "0.11.5"
natord = "1.0.9"
rustc-hash = "2.1.0"
18 changes: 9 additions & 9 deletions src/impg.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::collections::HashMap;
use rustc_hash::FxHashMap;
use coitrees::{BasicCOITree, Interval, IntervalTree};
use crate::paf::{PafRecord, ParseErr, Strand};
use crate::seqidx::SequenceIndex;
Expand Down Expand Up @@ -102,8 +102,8 @@ impl QueryMetadata {
}

pub type AdjustedInterval = (Interval<u32>, Vec<CigarOp>, Interval<u32>);
type TreeMap = HashMap<u32, BasicCOITree<QueryMetadata, u32>>;
pub type SerializableImpg = (HashMap<u32, Vec<SerializableInterval>>, SequenceIndex);
type TreeMap = FxHashMap<u32, BasicCOITree<QueryMetadata, u32>>;
pub type SerializableImpg = (FxHashMap<u32, Vec<SerializableInterval>>, SequenceIndex);

#[derive(Clone, Serialize, Deserialize)]
pub struct SerializableInterval {
Expand Down Expand Up @@ -233,7 +233,7 @@ impl Impg {
seq_index.get_or_insert_id(&record.target_name, Some(record.target_length));
}

let intervals: HashMap<u32, Vec<Interval<QueryMetadata>>> = records.par_iter()
let intervals: FxHashMap<u32, Vec<Interval<QueryMetadata>>> = records.par_iter()
.filter_map(|record| {
let query_id = seq_index.get_id(&record.query_name).expect("Query name not found in index");
let target_id = seq_index.get_id(&record.target_name).expect("Target name not found in index");
Expand All @@ -255,11 +255,11 @@ impl Impg {
metadata: query_metadata,
}))
}) // Use fold and reduce to achieve grouping
.fold(HashMap::new, |mut acc: HashMap<u32, Vec<Interval<QueryMetadata>>>, (target_id, interval)| {
.fold(FxHashMap::default, |mut acc: FxHashMap<u32, Vec<Interval<QueryMetadata>>>, (target_id, interval)| {
acc.entry(target_id).or_default().push(interval);
acc
})
.reduce(HashMap::new, |mut acc, part| {
.reduce(FxHashMap::default, |mut acc, part| {
for (key, value) in part {
acc.entry(key).or_default().extend(value);
}
Expand Down Expand Up @@ -361,7 +361,7 @@ impl Impg {
target_id: u32,
range_start: i32,
range_end: i32,
masked_regions: Option<&HashMap<u32, SortedRanges>>
masked_regions: Option<&FxHashMap<u32, SortedRanges>>
) -> Vec<AdjustedInterval> {
let mut results = Vec::new();
// Add the input range to the results
Expand All @@ -381,12 +381,12 @@ impl Impg {
// Initialize stack with first query
let mut stack = vec![(target_id, range_start, range_end)];
// Initialize visited ranges from masked regions if provided
let mut visited_ranges: HashMap<u32, SortedRanges> = if let Some(m) = masked_regions {
let mut visited_ranges: FxHashMap<u32, SortedRanges> = if let Some(m) = masked_regions {
m.iter()
.map(|(&k, v)| (k, (*v).clone()))
.collect()
} else {
HashMap::new()
FxHashMap::default()
};
// Initialize first visited range for target_id if not already present
visited_ranges.entry(target_id)
Expand Down
14 changes: 7 additions & 7 deletions src/partition.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use rustc_hash::FxHashMap;
use std::fs::File;
use std::io::{self, BufWriter, Write};
use crate::impg::Impg;
use coitrees::Interval;
use crate::impg::CigarOp;
use std::collections::HashMap;
use crate::impg::SortedRanges;
use log::{debug, info};
//use std::time::Instant;
Expand Down Expand Up @@ -65,10 +65,10 @@ pub fn partition_alignments(
}

// Initialize masked regions
let mut masked_regions: HashMap<u32, SortedRanges> = HashMap::new();
let mut masked_regions: FxHashMap<u32, SortedRanges> = FxHashMap::default();

// Initialize missing regions from sequence index
let mut missing_regions: HashMap<u32, SortedRanges> = (0..impg.seq_index.len() as u32)
let mut missing_regions: FxHashMap<u32, SortedRanges> = (0..impg.seq_index.len() as u32)
.map(|id| {
let len = impg.seq_index.get_len_from_id(id).unwrap();
let mut ranges = SortedRanges::new();
Expand Down Expand Up @@ -232,7 +232,7 @@ fn merge_overlaps(

fn subtract_masked_regions(
overlaps: &mut Vec<(Interval<u32>, Vec<CigarOp>, Interval<u32>)>,
masked_regions: &HashMap<u32, SortedRanges>
masked_regions: &FxHashMap<u32, SortedRanges>
) -> Vec<(Interval<u32>, Vec<CigarOp>, Interval<u32>)> {
let mut result = Vec::new();

Expand Down Expand Up @@ -306,12 +306,12 @@ fn subtract_masked_regions(
}

fn update_masked_and_missing_regions(
masked_regions: &mut HashMap<u32, SortedRanges>,
missing_regions: &mut HashMap<u32, SortedRanges>,
masked_regions: &mut FxHashMap<u32, SortedRanges>,
missing_regions: &mut FxHashMap<u32, SortedRanges>,
overlaps: &Vec<(Interval<u32>, Vec<CigarOp>, Interval<u32>)>
) {
// First, collect all new regions to be masked by sequence
let mut new_masks: HashMap<u32, Vec<(i32, i32)>> = HashMap::new();
let mut new_masks: FxHashMap<u32, Vec<(i32, i32)>> = FxHashMap::default();
for (query_interval, _, _) in overlaps {
let (start, end) = if query_interval.first <= query_interval.last {
(query_interval.first, query_interval.last)
Expand Down
14 changes: 7 additions & 7 deletions src/seqidx.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
use std::collections::HashMap;
use rustc_hash::FxHashMap;
use serde::{Serialize, Deserialize};

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SequenceIndex {
name_to_id: HashMap<String, u32>,
id_to_name: HashMap<u32, String>,
id_to_len: HashMap<u32, usize>,
name_to_id: FxHashMap<String, u32>,
id_to_name: FxHashMap<u32, String>,
id_to_len: FxHashMap<u32, usize>,
next_id: u32,
}

impl SequenceIndex {
pub fn new() -> Self {
SequenceIndex {
name_to_id: HashMap::new(),
id_to_name: HashMap::new(),
id_to_len: HashMap::new(),
name_to_id: FxHashMap::default(),
id_to_name: FxHashMap::default(),
id_to_len: FxHashMap::default(),
next_id: 0,
}
}
Expand Down

0 comments on commit 5956725

Please sign in to comment.