From bd9556b36e737de61caef13236cc03be2258f979 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 17 Feb 2024 20:10:47 -0800 Subject: [PATCH] replace piz with rc-zip --- Cargo.lock | 303 ++++++++++++++++++++++++++++++++++++---- src/core/Cargo.toml | 3 +- src/core/src/errors.rs | 5 + src/core/src/storage.rs | 117 ++++++---------- 4 files changed, 327 insertions(+), 101 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2a0612bb36..160df72842 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -252,6 +252,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + [[package]] name = "chrono" version = "0.4.34" @@ -330,15 +341,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "codepage-437" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e40c1169585d8d08e5675a39f2fc056cd19a258fc4cba5e3bbf4a9c1026de535" -dependencies = [ - "csv", -] - [[package]] name = "codespan-reporting" version = "0.11.1" @@ -524,6 +526,15 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + [[package]] name = "enum_dispatch" version = "0.3.12" @@ -536,6 +547,12 @@ dependencies = [ "syn 2.0.46", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.8" @@ -584,7 +601,7 @@ dependencies = [ "cfg-if", "crc32fast", "libc", - "miniz_oxide", + "miniz_oxide 0.4.4", ] [[package]] @@ -639,6 +656,12 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" + [[package]] name = "heck" version = "0.4.1" @@ -684,6 +707,16 @@ dependencies = [ "cxx-build", ] +[[package]] +name = "indexmap" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233cf39063f058ea2caae4091bf4a3ef70a653afbc026f5c4a4135d114e3c177" +dependencies = [ + "equivalent", + "hashbrown 0.14.3", +] + [[package]] name = "inplace-vec-builder" version = "0.1.1" @@ -727,9 +760,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" dependencies = [ "either", ] @@ -875,9 +908,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.4.1" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "memmap" @@ -923,6 +956,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "miniz_oxide" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +dependencies = [ + "adler", +] + [[package]] name = "murmurhash3" version = "0.0.5" @@ -1001,6 +1043,39 @@ dependencies = [ "libm", ] +[[package]] +name = "num_enum" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" +dependencies = [ + "num_enum_derive", +] + +[[package]] +name = "num_enum_derive" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "oem_cp" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "330138902ab4dab09a86e6b7ab7ddeffb5f8435d52fe0df1bce8b06a17b10ee4" +dependencies = [ + "phf", + "phf_codegen", + "serde", + "serde_json", +] + [[package]] name = "once_cell" version = "1.19.0" @@ -1031,13 +1106,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b645dcde5f119c2c454a92d0dfa271a2a3b205da92e4292a68ead4bdbfde1f33" dependencies = [ "heck", - "itertools 0.12.0", + "itertools 0.12.1", "proc-macro2", "proc-macro2-diagnostics", "quote", "syn 2.0.46", ] +[[package]] +name = "oval" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135cef32720c6746450d910890b0b69bcba2bbf6f85c9f4583df13fe415de828" + +[[package]] +name = "ownable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dcba94d1536fcc470287d96fd26356c38da8215fdb9a74285b09621f35d9350" +dependencies = [ + "ownable-macro", +] + +[[package]] +name = "ownable-macro" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2c91d2781624dec1234581a1a01e63638f36546ad72ee82873ac1b84f41117b" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.46", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1045,21 +1147,49 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] -name = "piz" -version = "0.5.1" +name = "phf" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "898b071c1938a2c92b95c18708cbf38f2566a01f0ab9dd7bdf4329987e5c2e17" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" dependencies = [ - "camino", - "chrono", - "codepage-437", - "crc32fast", - "flate2", - "log", - "memchr", - "thiserror", + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", ] +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + [[package]] name = "pkg-config" version = "0.3.24" @@ -1094,6 +1224,17 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "positioned-io" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccabfeeb89c73adf4081f0dca7f8e28dbda90981a222ceea37f619e93ea6afe9" +dependencies = [ + "byteorder", + "libc", + "winapi", +] + [[package]] name = "ppv-lite86" version = "0.2.16" @@ -1119,6 +1260,15 @@ dependencies = [ "num-integer", ] +[[package]] +name = "proc-macro-crate" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -1275,6 +1425,39 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rc-zip" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17a2e4a592e2501742c853fbed65514ccdd11959d3d69ad6688bd1b695471c82" +dependencies = [ + "cfg-if", + "chardetng", + "chrono", + "crc32fast", + "encoding_rs", + "miniz_oxide 0.7.2", + "num_enum", + "oem_cp", + "oval", + "ownable", + "thiserror", + "tracing", + "winnow", +] + +[[package]] +name = "rc-zip-sync" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40689e7a79f092e45fc3ee24a1ccc38242b626f9234564e561480b768f9e3fdf" +dependencies = [ + "oval", + "positioned-io", + "rc-zip", + "tracing", +] + [[package]] name = "regex" version = "1.5.6" @@ -1320,7 +1503,7 @@ dependencies = [ "bitvec", "bytecheck", "bytes", - "hashbrown", + "hashbrown 0.12.1", "ptr_meta", "rend", "rkyv_derive", @@ -1482,6 +1665,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "smallvec" version = "1.8.0" @@ -1522,11 +1711,12 @@ dependencies = [ "num-iter", "once_cell", "ouroboros", - "piz", "primal-check", "proptest", "rand", "rayon", + "rc-zip", + "rc-zip-sync", "rkyv", "roaring", "rocksdb", @@ -1642,6 +1832,54 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "toml_datetime" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" + +[[package]] +name = "toml_edit" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" +dependencies = [ + "indexmap", + "toml_datetime", + "winnow", +] + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -2007,6 +2245,15 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +[[package]] +name = "winnow" +version = "0.5.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +dependencies = [ + "memchr", +] + [[package]] name = "wyz" version = "0.5.1" diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 40984ffaa1..06f818bb1c 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -47,8 +47,9 @@ nohash-hasher = "0.2.0" num-iter = "0.1.44" once_cell = "1.18.0" ouroboros = "0.18.3" -piz = "0.5.0" primal-check = "0.3.1" +rc-zip = { version = "4.0.0", default-features = false } +rc-zip-sync = "4.0.0" rkyv = { version = "0.7.44", optional = true } roaring = "0.10.2" rayon = { version = "1.8.1", optional = true } diff --git a/src/core/src/errors.rs b/src/core/src/errors.rs index c43b104bee..b239bc6ba1 100644 --- a/src/core/src/errors.rs +++ b/src/core/src/errors.rs @@ -66,6 +66,9 @@ pub enum SourmashError { #[error(transparent)] CsvError(#[from] csv::Error), + #[error(transparent)] + ZipError(#[from] rc_zip::error::Error), + #[cfg(not(all(target_arch = "wasm32", target_os = "unknown")))] #[error(transparent)] Panic(#[from] crate::ffi::utils::Panic), @@ -118,6 +121,7 @@ pub enum SourmashErrorCode { NifflerError = 100_005, CsvError = 100_006, RocksDBError = 100_007, + ZipError = 100_008, } #[cfg(not(all(target_arch = "wasm32", target_os = "unknown")))] @@ -148,6 +152,7 @@ impl SourmashErrorCode { SourmashError::NifflerError { .. } => SourmashErrorCode::NifflerError, SourmashError::Utf8Error { .. } => SourmashErrorCode::Utf8Error, SourmashError::CsvError { .. } => SourmashErrorCode::CsvError, + SourmashError::ZipError { .. } => SourmashErrorCode::ZipError, #[cfg(not(target_arch = "wasm32"))] #[cfg(feature = "branchwater")] diff --git a/src/core/src/storage.rs b/src/core/src/storage.rs index 17cbb7701c..5ee0c279e0 100644 --- a/src/core/src/storage.rs +++ b/src/core/src/storage.rs @@ -1,5 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; -use std::ffi::OsStr; +use std::collections::HashMap; use std::fs::{DirBuilder, File}; use std::io::{BufReader, BufWriter, Read, Write}; use std::ops::Deref; @@ -8,6 +7,7 @@ use std::sync::{Arc, RwLock}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; use once_cell::sync::OnceCell; +use rc_zip_sync::{ArchiveHandle, ReadZip}; use serde::{Deserialize, Serialize}; use thiserror::Error; use typed_builder::TypedBuilder; @@ -99,18 +99,14 @@ pub struct FSStorage { #[ouroboros::self_referencing] pub struct ZipStorage { - mapping: Option, + file: std::fs::File, - #[borrows(mapping)] + #[borrows(file)] #[covariant] - archive: piz::ZipArchive<'this>, + archive: ArchiveHandle<'this, std::fs::File>, subdir: Option, path: Option, - - #[borrows(archive)] - #[covariant] - metadata: Metadata<'this>, } /// Store data in memory (no permanent storage) @@ -120,8 +116,6 @@ pub struct MemStorage { sigs: Arc>>, } -pub type Metadata<'a> = BTreeMap<&'a OsStr, &'a piz::read::FileMetadata<'a>>; - // ========================================= impl InnerStorage { @@ -277,56 +271,26 @@ impl Storage for FSStorage { } } -fn lookup<'a, P: AsRef>( - metadata: &'a Metadata, - path: P, -) -> Result<&'a piz::read::FileMetadata<'a>> { - let path = path.as_ref(); - metadata - .get(&path.as_os_str()) - .ok_or_else(|| StorageError::PathNotFoundError(path.to_string()).into()) - .copied() -} - -fn find_subdirs<'a>(archive: &'a piz::ZipArchive<'a>) -> Result> { - let subdirs: Vec<_> = archive - .entries() - .iter() - .filter(|entry| entry.is_dir()) - .collect(); - if subdirs.len() == 1 { - Ok(Some(subdirs[0].path.as_str().into())) - } else { - Ok(None) - } -} - impl Storage for ZipStorage { fn save(&self, _path: &str, _content: &[u8]) -> Result { unimplemented!(); } fn load(&self, path: &str) -> Result> { - let metadata = self.borrow_metadata(); + let archive = self.borrow_archive(); + if let Some(entry) = archive.by_name(path) { + return Ok(entry.bytes()?); + } - let entry = lookup(metadata, path).or_else(|_| { - if let Some(subdir) = self.borrow_subdir() { - lookup(metadata, subdir.to_owned() + path) - .map_err(|_| StorageError::PathNotFoundError(path.into())) - } else { - Err(StorageError::PathNotFoundError(path.into())) + if let Some(subdir) = &self.borrow_subdir() { + if let Some(entry) = archive.by_name(subdir.to_owned() + path) { + return Ok(entry.bytes()?); } - })?; - - let mut reader = BufReader::new( - self.borrow_archive() - .read(entry) - .map_err(|_| StorageError::DataReadError(path.into()))?, - ); - let mut contents = Vec::new(); - reader.read_to_end(&mut contents)?; + } - Ok(contents) + Err(crate::errors::SourmashError::StorageError( + StorageError::PathNotFoundError(path.into()), + )) } fn args(&self) -> StorageArgs { @@ -343,35 +307,41 @@ impl Storage for ZipStorage { } fn spec(&self) -> String { - format!("zip://{}", self.path().unwrap_or("".into())) + format!("zip://{}", self.borrow_path().clone().unwrap_or("".into())) } } impl ZipStorage { pub fn from_file>(location: P) -> Result { - let zip_file = File::open(location.as_ref())?; - let mapping = unsafe { memmap2::Mmap::map(&zip_file)? }; + let file = File::open(location.as_ref())?; let mut storage = ZipStorageBuilder { - mapping: Some(mapping), - archive_builder: |mapping: &Option| { - piz::ZipArchive::new(mapping.as_ref().unwrap()).unwrap() - }, - metadata_builder: |archive: &piz::ZipArchive| { - archive - .entries() - .iter() - .map(|entry| (entry.path.as_os_str(), entry)) - .collect() - }, + file, + archive_builder: |file: &std::fs::File| file.read_zip().expect("Error loading zipfile"), subdir: None, path: Some(location.as_ref().into()), } .build(); - let subdir = find_subdirs(storage.borrow_archive())?; - storage.with_mut(|fields| *fields.subdir = subdir); + let subdir = { + let subdirs: Vec<_> = storage + .borrow_archive() + .entries() + .filter(|entry| matches!(entry.kind(), rc_zip::parse::EntryKind::Directory)) + .collect(); + if subdirs.len() == 1 { + Some( + subdirs[0] + .sanitized_name() + .expect("TODO throw right error") + .into(), + ) + } else { + None + } + }; + storage.with_mut(|fields| *fields.subdir = subdir); Ok(storage) } @@ -391,9 +361,8 @@ impl ZipStorage { Ok(self .borrow_archive() .entries() - .iter() .filter_map(|entry| { - let path = entry.path.as_str(); + let path = entry.sanitized_name().expect("TODO throw right error"); if path.ends_with(".sbt.json") { Some(path.into()) } else { @@ -407,8 +376,12 @@ impl ZipStorage { Ok(self .borrow_archive() .entries() - .iter() - .map(|entry| entry.path.as_str().into()) + .map(|entry| { + entry + .sanitized_name() + .expect("TODO throw right error") + .into() + }) .collect()) } }