From 9e9fc5c2bd88605ac07b87a78a8b39f6775caefe Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Sun, 26 May 2024 13:44:59 -0400 Subject: [PATCH 1/4] work on performance issues still --- bindings/src/models/region.rs | 45 ++++++++++++++++++--- bindings/src/models/region_set.rs | 65 ++++++++++++++++++------------- bindings/src/models/universe.rs | 11 +++--- 3 files changed, 83 insertions(+), 38 deletions(-) diff --git a/bindings/src/models/region.rs b/bindings/src/models/region.rs index 246a79d4..35b605f7 100644 --- a/bindings/src/models/region.rs +++ b/bindings/src/models/region.rs @@ -79,12 +79,18 @@ impl PyRegion { #[derive(Clone, Debug)] pub struct PyTokenizedRegion { pub id: u32, - pub universe: PyUniverse, + pub universe: Py, } impl From for PyRegion { fn from(value: PyTokenizedRegion) -> Self { - value.universe.convert_id_to_region(value.id).unwrap() + Python::with_gil(|py| { + value + .universe + .borrow(py) + .convert_id_to_region(value.id) + .unwrap() + }) } } @@ -92,21 +98,48 @@ impl From for PyRegion { impl PyTokenizedRegion { #[getter] pub fn chr(&self) -> Result { - Ok(self.universe.convert_id_to_region(self.id).unwrap().chr) + Python::with_gil(|py| { + Ok(self + .universe + .borrow(py) + .convert_id_to_region(self.id) + .unwrap() + .chr) + }) } #[getter] pub fn start(&self) -> Result { - Ok(self.universe.convert_id_to_region(self.id).unwrap().start) + Python::with_gil(|py| { + Ok(self + .universe + .borrow(py) + .convert_id_to_region(self.id) + .unwrap() + .start) + }) } #[getter] pub fn end(&self) -> Result { - Ok(self.universe.convert_id_to_region(self.id).unwrap().end) + Python::with_gil(|py| { + Ok(self + .universe + .borrow(py) + .convert_id_to_region(self.id) + .unwrap() + .end) + }) } pub fn to_region(&self) -> Result { - Ok(self.universe.convert_id_to_region(self.id).unwrap()) + Python::with_gil(|py| { + Ok(self + .universe + .borrow(py) + .convert_id_to_region(self.id) + .unwrap()) + }) } #[getter] pub fn id(&self) -> Result { diff --git a/bindings/src/models/region_set.rs b/bindings/src/models/region_set.rs index 49562ab1..a83fce7a 100644 --- a/bindings/src/models/region_set.rs +++ b/bindings/src/models/region_set.rs @@ -89,17 +89,22 @@ impl PyRegionSet { #[derive(Clone, Debug)] pub struct PyTokenizedRegionSet { pub ids: Vec, - pub universe: PyUniverse, + pub universe: Py, curr: usize, } impl From> for PyTokenizedRegionSet { fn from(value: TokenizedRegionSet) -> Self { - PyTokenizedRegionSet { - ids: value.ids, - universe: value.universe.to_owned().into(), - curr: 0, - } + Python::with_gil(|py| { + let universe = value.universe; + let py_universe: PyUniverse = universe.to_owned().into(); + let py_universe_bound = Py::new(py, py_universe).unwrap(); + PyTokenizedRegionSet { + ids: value.ids, + universe: py_universe_bound, + curr: 0, + } + }) } } @@ -117,21 +122,25 @@ impl PyTokenizedRegionSet { } pub fn to_bit_vector(&self) -> Result> { - let mut bit_vector = vec![0; self.universe.id_to_region.len()]; + Python::with_gil(|py| { + let mut bit_vector = vec![0; self.universe.borrow(py).id_to_region.len()]; - for id in &self.ids { - bit_vector[*id as usize] = 1; - } + for id in &self.ids { + bit_vector[*id as usize] = 1; + } - Ok(bit_vector) + Ok(bit_vector) + }) } pub fn to_regions(&self) -> Result> { - Ok(self - .ids - .iter() - .map(|id| self.universe.id_to_region[&id].clone()) - .collect()) + Python::with_gil(|py| { + Ok(self + .ids + .iter() + .map(|id| self.universe.borrow(py).id_to_region[&id].clone()) + .collect()) + }) } pub fn to_ids(&self) -> Result> { @@ -164,17 +173,19 @@ impl PyTokenizedRegionSet { } pub fn __next__(&mut self) -> Option { - if self.curr < self.ids.len() { - let id = self.ids[self.curr]; - self.curr += 1; - - Some(PyTokenizedRegion { - universe: self.universe.to_owned(), - id, - }) - } else { - None - } + Python::with_gil(|py| { + if self.curr < self.ids.len() { + let id = self.ids[self.curr]; + self.curr += 1; + + Some(PyTokenizedRegion { + universe: self.universe.clone_ref(py), + id, + }) + } else { + None + } + }) } pub fn __getitem__(&self, indx: isize) -> Result { diff --git a/bindings/src/models/universe.rs b/bindings/src/models/universe.rs index 136894fd..362d8845 100644 --- a/bindings/src/models/universe.rs +++ b/bindings/src/models/universe.rs @@ -17,17 +17,18 @@ pub struct PyUniverse { impl From for PyUniverse { fn from(value: Universe) -> Self { - let regions = value.regions.iter().map(|r| r.to_owned().into()).collect(); + let regions = value.regions.into_iter().map(|r| r.into()).collect(); let region_to_id = value .region_to_id - .iter() - .map(|(k, v)| (k.to_owned().into(), *v)) + .into_iter() + .map(|(k, v)| (k.into(), v)) .collect(); + let id_to_region = value .id_to_region - .iter() - .map(|(k, v)| (*k, v.to_owned().into())) + .into_iter() + .map(|(k, v)| (k, v.into())) .collect(); PyUniverse { From 9b00aedb5cb83596a63ef3cdb599b288628271c7 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Tue, 28 May 2024 15:37:43 -0400 Subject: [PATCH 2/4] resolve memory issues (#21) --- bindings/src/models/region_set.rs | 17 +-------- bindings/src/tokenizers/tree_tokenizer.rs | 25 ++++++++++-- genimtools/src/common/models/universe.rs | 2 +- optimization.py | 46 +++++++++++++++++++++++ 4 files changed, 69 insertions(+), 21 deletions(-) create mode 100644 optimization.py diff --git a/bindings/src/models/region_set.rs b/bindings/src/models/region_set.rs index a83fce7a..cfc11178 100644 --- a/bindings/src/models/region_set.rs +++ b/bindings/src/models/region_set.rs @@ -90,22 +90,7 @@ impl PyRegionSet { pub struct PyTokenizedRegionSet { pub ids: Vec, pub universe: Py, - curr: usize, -} - -impl From> for PyTokenizedRegionSet { - fn from(value: TokenizedRegionSet) -> Self { - Python::with_gil(|py| { - let universe = value.universe; - let py_universe: PyUniverse = universe.to_owned().into(); - let py_universe_bound = Py::new(py, py_universe).unwrap(); - PyTokenizedRegionSet { - ids: value.ids, - universe: py_universe_bound, - curr: 0, - } - }) - } + pub curr: usize, } #[pymethods] diff --git a/bindings/src/tokenizers/tree_tokenizer.rs b/bindings/src/tokenizers/tree_tokenizer.rs index e5a714a4..f4bb7eda 100644 --- a/bindings/src/tokenizers/tree_tokenizer.rs +++ b/bindings/src/tokenizers/tree_tokenizer.rs @@ -15,16 +15,24 @@ use crate::utils::extract_regions_from_py_any; #[pyclass(name = "TreeTokenizer")] pub struct PyTreeTokenizer { pub tokenizer: TreeTokenizer, + pub universe: Py, // this is a Py-wrapped version self.tokenizer.universe for performance reasons } #[pymethods] impl PyTreeTokenizer { #[new] pub fn new(path: String) -> Result { - let path = Path::new(&path); - let tokenizer = TreeTokenizer::try_from(path)?; + Python::with_gil(|py| { + let path = Path::new(&path); + let tokenizer = TreeTokenizer::try_from(path)?; + let py_universe: PyUniverse = tokenizer.universe.to_owned().into(); + let py_universe_bound = Py::new(py, py_universe)?; - Ok(PyTreeTokenizer { tokenizer }) + Ok(PyTreeTokenizer { + tokenizer, + universe: py_universe_bound, + }) + }) } #[getter] @@ -138,7 +146,16 @@ impl PyTreeTokenizer { // tokenize the RegionSet let tokenized = self.tokenizer.tokenize_region_set(&rs); - Ok(tokenized.into()) + Python::with_gil(|py| { + let py_tokenized_region_set = PyTokenizedRegionSet { + ids: tokenized.ids, + curr: 0, + universe: self.universe.clone_ref(py), + }; + + Ok(py_tokenized_region_set) + }) + } // encode returns a list of ids diff --git a/genimtools/src/common/models/universe.rs b/genimtools/src/common/models/universe.rs index 309a88ae..0987f8d6 100644 --- a/genimtools/src/common/models/universe.rs +++ b/genimtools/src/common/models/universe.rs @@ -82,4 +82,4 @@ impl TryFrom<&Path> for Universe { id_to_region, }) } -} +} \ No newline at end of file diff --git a/optimization.py b/optimization.py new file mode 100644 index 00000000..b1d7d366 --- /dev/null +++ b/optimization.py @@ -0,0 +1,46 @@ +import scanpy as sc +import numpy as np + +from rich.progress import track +from genimtools.tokenizers import TreeTokenizer +# from genimtools.utils import write_tokens_to_gtok +from geniml.io import Region + +adata = sc.read_h5ad("luecken2021_parsed.h5ad") +# adata = sc.pp.subsample(adata, n_obs=10_000, copy=True) +t = TreeTokenizer("screen.bed") + +adata_features = [ + Region(chr, int(start), int(end)) + for chr, start, end in track( + zip(adata.var["chr"], adata.var["start"], adata.var["end"]), + total=adata.var.shape[0], + description="Extracting regions from AnnData", + ) +] + +features = np.ndarray(len(adata_features), dtype=object) + +for i, region in enumerate(adata_features): + features[i] = region + +del adata_features + +tokenized = [] +for row in track( + range(adata.shape[0]), + total=adata.shape[0], + description="Tokenizing", +): + _, non_zeros = adata.X[row].nonzero() + regions = features[non_zeros] + tokenized.append(t(regions)) + + + + +# the issue is that +t(regions) +# is, for some reason much slower and less efficient than +t.encode(regions) +# why? \ No newline at end of file From 42cb31253123f077803d4cdbea07b4e648ee6fcd Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Tue, 28 May 2024 16:44:22 -0400 Subject: [PATCH 3/4] changelog + bump version --- bindings/Cargo.toml | 2 +- bindings/src/models/region_set.rs | 2 +- genimtools/Cargo.toml | 2 +- genimtools/docs/changelog.md | 3 +++ 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bindings/Cargo.toml b/bindings/Cargo.toml index cac9c4a3..b5f5b3f6 100644 --- a/bindings/Cargo.toml +++ b/bindings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "genimtools-py" -version = "0.0.11" +version = "0.0.12" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/bindings/src/models/region_set.rs b/bindings/src/models/region_set.rs index cfc11178..c9741d2d 100644 --- a/bindings/src/models/region_set.rs +++ b/bindings/src/models/region_set.rs @@ -6,7 +6,7 @@ use numpy::ndarray::Array; use numpy::{IntoPyArray, PyArray1}; use anyhow::Result; -use genimtools::common::{models::TokenizedRegionSet, utils::extract_regions_from_bed_file}; +use genimtools::common::utils::extract_regions_from_bed_file; use crate::models::{PyRegion, PyTokenizedRegion, PyUniverse}; diff --git a/genimtools/Cargo.toml b/genimtools/Cargo.toml index 86e5bbcb..059957bb 100644 --- a/genimtools/Cargo.toml +++ b/genimtools/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "genimtools" -version = "0.0.11" +version = "0.0.12" edition = "2021" description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package." license = "MIT" diff --git a/genimtools/docs/changelog.md b/genimtools/docs/changelog.md index a7fd016f..cb66da01 100644 --- a/genimtools/docs/changelog.md +++ b/genimtools/docs/changelog.md @@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.0.12] +- optimize creation of `PyRegionSet` to reduce expensive cloning of `Universe` structs. + ## [0.0.11] - redesigned API for the tokenizers to better emulate the huggingface tokenizers API. - implemented new traits for tokenizers to allow for more flexibility when creating new tokenizers. From 6cead90a084e26c2dbf0248f41deb8550a44a9ab Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Tue, 28 May 2024 16:47:32 -0400 Subject: [PATCH 4/4] remove optimization.py testing code --- optimization.py | 46 ---------------------------------------------- 1 file changed, 46 deletions(-) delete mode 100644 optimization.py diff --git a/optimization.py b/optimization.py deleted file mode 100644 index b1d7d366..00000000 --- a/optimization.py +++ /dev/null @@ -1,46 +0,0 @@ -import scanpy as sc -import numpy as np - -from rich.progress import track -from genimtools.tokenizers import TreeTokenizer -# from genimtools.utils import write_tokens_to_gtok -from geniml.io import Region - -adata = sc.read_h5ad("luecken2021_parsed.h5ad") -# adata = sc.pp.subsample(adata, n_obs=10_000, copy=True) -t = TreeTokenizer("screen.bed") - -adata_features = [ - Region(chr, int(start), int(end)) - for chr, start, end in track( - zip(adata.var["chr"], adata.var["start"], adata.var["end"]), - total=adata.var.shape[0], - description="Extracting regions from AnnData", - ) -] - -features = np.ndarray(len(adata_features), dtype=object) - -for i, region in enumerate(adata_features): - features[i] = region - -del adata_features - -tokenized = [] -for row in track( - range(adata.shape[0]), - total=adata.shape[0], - description="Tokenizing", -): - _, non_zeros = adata.X[row].nonzero() - regions = features[non_zeros] - tokenized.append(t(regions)) - - - - -# the issue is that -t(regions) -# is, for some reason much slower and less efficient than -t.encode(regions) -# why? \ No newline at end of file