From 90b0314f0e0a3c3e5968d8648497de9b6c1d339e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 13 Jun 2024 06:06:51 -0700 Subject: [PATCH 01/22] Add a failing test for indexing multiple manifest zips provided as a single file --- src/python/tests/test_index.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index f490d2aa..4fce35d4 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -123,6 +123,32 @@ def test_index_manifest(runtmp, capfd): assert 'index is done' in runtmp.last_result.err +def test_index_multiple_manifests(runtmp, capfd): + # test index with text file of multiple manifests + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + sigs = [sig2, sig47, sig63] + manifests = [] + for sig in sigs: + sig_mf = runtmp.output(os.path.basename(sig) + ".mf.zip") + runtmp.sourmash("sig", "cat", sig, "-o", sig_mf) + manifests.append(sig_mf) + + # assert False + manifests_zips_list = runtmp.output('manifest_zips.txt') + make_file_list(manifests_zips_list, manifests) + + output = runtmp.output('out.db') + runtmp.sourmash('scripts', 'index', manifests_zips_list, + '-o', output) + + captured = capfd.readouterr() + print(captured.err) + print(runtmp.last_result.err) + assert 'index is done' in runtmp.last_result.err + + def test_index_bad_siglist_2(runtmp, capfd): # test with a bad siglist (containing a missing file) against_list = runtmp.output('against.txt') From 97fa4133c9224ffdab486230e49c1f754da6d8e5 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 13 Jun 2024 07:20:05 -0700 Subject: [PATCH 02/22] After loading matching Signature, Add re-looking through the pathlist in --- src/utils.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/utils.rs b/src/utils.rs index f3c1f7d6..a79f6238 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -595,6 +595,8 @@ fn collection_from_pathlist( }) .collect(); + let mut last_error = None; + // load sketches from paths in parallel. let n_failed = AtomicUsize::new(0); let records: Vec = lines @@ -634,6 +636,19 @@ fn collection_from_pathlist( .build(), ), ); + let collection = collection.or_else( + lines.par_iter().filter_map(|line| Some(PathBuf::from(line).extension() == "zip")) { + match collection_from_zipfile(&PathBuf::from(line), &report_type) { + Ok(coll) => Some((coll, 0)), + Err(e) => { + last_error = Some(e); + None + } + } + eprintln!("Matched zipfiles") + } + ); + let n_failed = n_failed.load(atomic::Ordering::SeqCst); Ok((collection, n_failed)) From 3295f0ecf74d4dee3251389621a985114e8a769b Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 21 Jun 2024 08:22:05 -0700 Subject: [PATCH 03/22] Created collection_from_zipfile_or_signature_or_manifest to call within collection_from_pathlist --- src/utils.rs | 92 ++++++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index a79f6238..a65412e2 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -542,6 +542,46 @@ pub fn collection_from_zipfile(sigpath: &Path, report_type: &ReportType) -> Resu } } +// Make a collection from anything except a pathlist, as this is called +// from collection_from_pathlist +fn collection_from_zipfile_or_signature_or_manifest( + sigpath: &Path, + report_type: &ReportType, +) -> Result, Option> { +// returns collection and number of failures + let mut last_error = None; + let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { + match collection_from_zipfile(&sigpath, &report_type) { + Ok(coll) => Some(coll), + Err(e) => { + last_error = Some(e); + None + } + } + } else { + None + }; + + let collection = + collection.or_else(|| match collection_from_manifest(&sigpath, &report_type) { + Ok(coll) => Some(coll), + Err(e) => { + last_error = Some(e); + None + } + }); + + let collection = + collection.or_else(|| match collection_from_signature(&sigpath, &report_type) { + Ok(coll) => Some(coll), + Err(e) => { + last_error = Some(e); + None + } + }); + Ok(collection) +} + fn collection_from_manifest( sigpath: &Path, report_type: &ReportType, @@ -585,69 +625,29 @@ fn collection_from_pathlist( ) })?; let reader = BufReader::new(file); + let mut last_error: std::option::Option = None; // load list of paths let lines: Vec<_> = reader .lines() .filter_map(|line| match line { - Ok(path) => Some(path), + Ok(path) => Some(PathBuf::from(path)), Err(_err) => None, }) .collect(); - let mut last_error = None; - // load sketches from paths in parallel. let n_failed = AtomicUsize::new(0); - let records: Vec = lines + let collection = lines .par_iter() - .filter_map(|path| match Signature::from_path(path) { - Ok(signatures) => { - let recs: Vec = signatures - .into_iter() - .flat_map(|v| Record::from_sig(&v, path)) - .collect(); - Some(recs) - } + .filter_map(|path| match collection_from_zipfile_or_signature_or_manifest(&path, &report_type) { + Ok(collection) => Some(collection), Err(err) => { - eprintln!("Sketch loading error: {}", err); eprintln!("WARNING: could not load sketches from path '{}'", path); let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); None } - }) - .flatten() - .collect(); - - if records.is_empty() { - eprintln!( - "No valid signatures found in {} pathlist '{}'", - report_type, sigpath - ); - } - - let manifest: Manifest = records.into(); - let collection = Collection::new( - manifest, - InnerStorage::new( - FSStorage::builder() - .fullpath("".into()) - .subdir("".into()) - .build(), - ), - ); - let collection = collection.or_else( - lines.par_iter().filter_map(|line| Some(PathBuf::from(line).extension() == "zip")) { - match collection_from_zipfile(&PathBuf::from(line), &report_type) { - Ok(coll) => Some((coll, 0)), - Err(e) => { - last_error = Some(e); - None - } - } - eprintln!("Matched zipfiles") - } - ); + }).flatten().collect::>(); let n_failed = n_failed.load(atomic::Ordering::SeqCst); From 9c3ecf97fea0fe88372986e5d79e527d5250e356 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sat, 22 Jun 2024 16:09:30 -0700 Subject: [PATCH 04/22] Iterate over the records in the collection to get their paths to then create more collections on --- src/utils.rs | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index 31f12b27..0475ffa0 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -638,20 +638,48 @@ fn collection_from_pathlist( // load sketches from paths in parallel. let n_failed = AtomicUsize::new(0); - let collection = lines + + // Load all entries as collections + let record_paths = lines .par_iter() .filter_map(|path| match collection_from_zipfile_or_signature_or_manifest(&path, &report_type) { - Ok(collection) => Some(collection), + Ok(collection) => { + Some(collection + .unwrap() + .manifest() + .iter() + .filter_map(|record| match record.internal_location(){ + PathBuf => Some(location) + + }) + .collect()) + }, Err(err) => { eprintln!("WARNING: could not load sketches from path '{}'", path); let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); None } - }).flatten().collect::>(); + }).flatten().collect::>(); + + // // For each record in the collection, get its path filename + // let record_paths = collections + // .par_iter() + // .filter_map(|coll| match coll.iter() { + // Ok(location) => Some(location), + // Err(err) => { + // eprintln!("WARNING: could not get filepath for record '{}'", record); + // let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); + // None + + // } + // }).flatten().collect(); + + // Now load the path filenames as one big collection + let collection = Collection::from_paths(&record_paths); let n_failed = n_failed.load(atomic::Ordering::SeqCst); - Ok((collection, n_failed)) + Ok((collection?, n_failed)) } fn collection_from_signature(sigpath: &Path, report_type: &ReportType) -> Result { From 6615436da79229db3a5f45222178a9e64fc5baa5 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sat, 22 Jun 2024 20:42:39 -0700 Subject: [PATCH 05/22] Clean up some comments --- src/utils.rs | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index 0475ffa0..99d64256 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -644,6 +644,7 @@ fn collection_from_pathlist( .par_iter() .filter_map(|path| match collection_from_zipfile_or_signature_or_manifest(&path, &report_type) { Ok(collection) => { + // For each record in the collection, get its path filename Some(collection .unwrap() .manifest() @@ -660,19 +661,7 @@ fn collection_from_pathlist( None } }).flatten().collect::>(); - - // // For each record in the collection, get its path filename - // let record_paths = collections - // .par_iter() - // .filter_map(|coll| match coll.iter() { - // Ok(location) => Some(location), - // Err(err) => { - // eprintln!("WARNING: could not get filepath for record '{}'", record); - // let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); - // None - - // } - // }).flatten().collect(); + // Now load the path filenames as one big collection let collection = Collection::from_paths(&record_paths); From 4ec39a36b77bd7cfd61342bc0cb7b6271bd1e308 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sat, 22 Jun 2024 20:49:20 -0700 Subject: [PATCH 06/22] Change to use map instead of filter_map to get record internal locations --- src/utils.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index 99d64256..3b356825 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -649,10 +649,7 @@ fn collection_from_pathlist( .unwrap() .manifest() .iter() - .filter_map(|record| match record.internal_location(){ - PathBuf => Some(location) - - }) + .map(|record| record.internal_location()) .collect()) }, Err(err) => { From 90849261517aa74d76d847457dd6f5348d5fa854 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 28 Jun 2024 08:28:42 -0700 Subject: [PATCH 07/22] Started writing collection_to_pathlist but can't get the types working --- src/utils.rs | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index 3b356825..a8d340c8 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -640,26 +640,29 @@ fn collection_from_pathlist( let n_failed = AtomicUsize::new(0); // Load all entries as collections - let record_paths = lines + let collections = lines .par_iter() .filter_map(|path| match collection_from_zipfile_or_signature_or_manifest(&path, &report_type) { - Ok(collection) => { - // For each record in the collection, get its path filename - Some(collection - .unwrap() - .manifest() - .iter() - .map(|record| record.internal_location()) - .collect()) - }, + Ok(collection) => Some(collection), Err(err) => { eprintln!("WARNING: could not load sketches from path '{}'", path); let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); None } - }).flatten().collect::>(); + }).flatten().collect::>(); + let record_paths = collections + .par_iter() + .filter_map(|collection| match collection_to_pathlist(&collection) { + Ok(pathlist) => Some(pathlist), + Err(err) => { + eprintln!("WARNING: could not get paths for collection "); + None + } + }) + .flatten(); + // Now load the path filenames as one big collection let collection = Collection::from_paths(&record_paths); @@ -668,6 +671,23 @@ fn collection_from_pathlist( Ok((collection?, n_failed)) } +// Convert a collection into a list of paths that can then be read to create a collection +fn collection_to_pathlist(collection: &Collection) -> Result, anyhow::Error> { + // For each record in the collection, get its path filename + let filenames: Vec = collection + .manifest() + .iter() + .map(|record| record.filename()) + .flatten(); + + let pathlist = &filenames + .iter() + .map(|filename| PathBuf::from(&filename)) + .flatten(); + + Ok(pathlist) +} + fn collection_from_signature(sigpath: &Path, report_type: &ReportType) -> Result { let signatures = Signature::from_path(sigpath).with_context(|| { format!( From c9e738e266bebfe43e97eb0466f84f24c79d39e6 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sun, 30 Jun 2024 16:11:01 -0700 Subject: [PATCH 08/22] Get the signatures from a collection --- src/python/tests/test_index.py | 1 + src/utils.rs | 75 +++++++++++++++++++++++++--------- 2 files changed, 56 insertions(+), 20 deletions(-) diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 4fce35d4..1b287373 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -147,6 +147,7 @@ def test_index_multiple_manifests(runtmp, capfd): print(captured.err) print(runtmp.last_result.err) assert 'index is done' in runtmp.last_result.err + assert False def test_index_bad_siglist_2(runtmp, capfd): diff --git a/src/utils.rs b/src/utils.rs index a8d340c8..cb18ba7b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -24,6 +24,7 @@ use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::KmerMinHash; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; +use sourmash::prelude::Storage; use stats::{median, stddev}; use std::collections::{HashMap, HashSet}; /// Track a name/minhash. @@ -624,11 +625,11 @@ fn collection_from_pathlist( report_type, sigpath ) })?; - let reader = BufReader::new(file); + let reader_new = BufReader::new(file); let mut last_error: std::option::Option = None; // load list of paths - let lines: Vec<_> = reader + let path_bufs: Vec<_> = reader_new .lines() .filter_map(|line| match line { Ok(path) => Some(PathBuf::from(path)), @@ -640,7 +641,7 @@ fn collection_from_pathlist( let n_failed = AtomicUsize::new(0); // Load all entries as collections - let collections = lines + let collections = path_bufs .par_iter() .filter_map(|path| match collection_from_zipfile_or_signature_or_manifest(&path, &report_type) { Ok(collection) => Some(collection), @@ -652,40 +653,74 @@ fn collection_from_pathlist( }).flatten().collect::>(); - let record_paths = collections + // get all the signatures from each collection + let flattened_signatures = collections .par_iter() - .filter_map(|collection| match collection_to_pathlist(&collection) { + .filter_map(|collection| match collection_to_signatures(&collection) { Ok(pathlist) => Some(pathlist), Err(err) => { - eprintln!("WARNING: could not get paths for collection "); + eprintln!("WARNING: could not get signatures for collection "); None } }) - .flatten(); + .flatten().collect::>(); - // Now load the path filenames as one big collection - let collection = Collection::from_paths(&record_paths); + // // Now load the path filenames as one big collection + let collection = Collection::from_sigs(flattened_signatures); - let n_failed = n_failed.load(atomic::Ordering::SeqCst); + // let n_failed = n_failed.load(atomic::Ordering::SeqCst); - Ok((collection?, n_failed)) + Ok((collection?, 0)) } // Convert a collection into a list of paths that can then be read to create a collection -fn collection_to_pathlist(collection: &Collection) -> Result, anyhow::Error> { +fn collection_to_signatures(collection: &Collection) -> Result, anyhow::Error> +{ // For each record in the collection, get its path filename - let filenames: Vec = collection + let signatures: Vec = collection .manifest() .iter() - .map(|record| record.filename()) - .flatten(); + .filter_map(|record| match collection.sig_from_record(&record) { + Ok(sigstore) => { + eprintln!("Hello we are in collection_to_signatures, internal_location {}", record.internal_location()); + // sig_from_record produces a SigStore, not a Signature + // Star (*) is a dereferencing operator, which will let us access the Signature + // stored in the SigStore directly + let raw = collection.storage().load( + &::clone( + &record + .internal_location() + ) + .into_os_string() + .into_string() + .unwrap() + ).unwrap(); + let sig = Signature::from_reader(&mut &raw[..]) + .unwrap() + // TODO: select the right sig? + .swap_remove(0); + Some(sig) + }, + Err(_err) => None, + // record.filename() + }) + // .map(|record| record.filename()); + // internal_location should create Vec> + .collect::>(); - let pathlist = &filenames - .iter() - .map(|filename| PathBuf::from(&filename)) - .flatten(); + eprintln!("signatures, {}", signatures.len()); + + // Collection::from_sigs(signatures); + + // let pathlist = &filenames + // .iter() + // .map(|filename| PathBuf::from(&filename)) + // .flatten(); + + // let random_strings =vec!["abc".to_string(), "def".to_string()]; - Ok(pathlist) + // Ok(random_strings) + Ok(signatures) } fn collection_from_signature(sigpath: &Path, report_type: &ReportType) -> Result { From 8fd921c24988426e8346b1a52b63178036c8472e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sun, 30 Jun 2024 16:11:16 -0700 Subject: [PATCH 09/22] remove assert False --- src/python/tests/test_index.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 1b287373..4fce35d4 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -147,7 +147,6 @@ def test_index_multiple_manifests(runtmp, capfd): print(captured.err) print(runtmp.last_result.err) assert 'index is done' in runtmp.last_result.err - assert False def test_index_bad_siglist_2(runtmp, capfd): From ef8ee17a046fc29128e0b4e2f70df9869dc6b0e1 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sun, 30 Jun 2024 16:21:46 -0700 Subject: [PATCH 10/22] Cargo format --- src/utils.rs | 56 +++++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index cb18ba7b..d48a0af1 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -20,11 +20,11 @@ use std::sync::atomic::AtomicUsize; use sourmash::ani_utils::{ani_ci_from_containment, ani_from_containment}; use sourmash::collection::Collection; use sourmash::manifest::{Manifest, Record}; +use sourmash::prelude::Storage; use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::KmerMinHash; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; -use sourmash::prelude::Storage; use stats::{median, stddev}; use std::collections::{HashMap, HashSet}; /// Track a name/minhash. @@ -549,7 +549,7 @@ fn collection_from_zipfile_or_signature_or_manifest( sigpath: &Path, report_type: &ReportType, ) -> Result, Option> { -// returns collection and number of failures + // returns collection and number of failures let mut last_error = None; let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { match collection_from_zipfile(&sigpath, &report_type) { @@ -643,27 +643,31 @@ fn collection_from_pathlist( // Load all entries as collections let collections = path_bufs .par_iter() - .filter_map(|path| match collection_from_zipfile_or_signature_or_manifest(&path, &report_type) { - Ok(collection) => Some(collection), - Err(err) => { - eprintln!("WARNING: could not load sketches from path '{}'", path); - let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); - None + .filter_map(|path| { + match collection_from_zipfile_or_signature_or_manifest(&path, &report_type) { + Ok(collection) => Some(collection), + Err(err) => { + eprintln!("WARNING: could not load sketches from path '{}'", path); + let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); + None + } } - }).flatten().collect::>(); - + }) + .flatten() + .collect::>(); // get all the signatures from each collection let flattened_signatures = collections .par_iter() - .filter_map(|collection| match collection_to_signatures(&collection) { + .filter_map(|collection| match collection_to_signatures(&collection) { Ok(pathlist) => Some(pathlist), Err(err) => { eprintln!("WARNING: could not get signatures for collection "); None } }) - .flatten().collect::>(); + .flatten() + .collect::>(); // // Now load the path filenames as one big collection let collection = Collection::from_sigs(flattened_signatures); @@ -674,35 +678,37 @@ fn collection_from_pathlist( } // Convert a collection into a list of paths that can then be read to create a collection -fn collection_to_signatures(collection: &Collection) -> Result, anyhow::Error> -{ +fn collection_to_signatures(collection: &Collection) -> Result, anyhow::Error> { // For each record in the collection, get its path filename let signatures: Vec = collection .manifest() .iter() .filter_map(|record| match collection.sig_from_record(&record) { Ok(sigstore) => { - eprintln!("Hello we are in collection_to_signatures, internal_location {}", record.internal_location()); + eprintln!( + "Hello we are in collection_to_signatures, internal_location {}", + record.internal_location() + ); // sig_from_record produces a SigStore, not a Signature - // Star (*) is a dereferencing operator, which will let us access the Signature + // Star (*) is a dereferencing operator, which will let us access the Signature // stored in the SigStore directly - let raw = collection.storage().load( - &::clone( - &record - .internal_location() - ) + let raw = collection + .storage() + .load( + &::clone(&record.internal_location()) .into_os_string() .into_string() - .unwrap() - ).unwrap(); + .unwrap(), + ) + .unwrap(); let sig = Signature::from_reader(&mut &raw[..]) .unwrap() // TODO: select the right sig? .swap_remove(0); Some(sig) - }, + } Err(_err) => None, - // record.filename() + // record.filename() }) // .map(|record| record.filename()); // internal_location should create Vec> From 6f68a3b03908b93c5ba80c7ea209ca613713a8dd Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sun, 30 Jun 2024 17:40:07 -0700 Subject: [PATCH 11/22] Change command name to testing multiple zip files --- src/python/tests/test_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 4fce35d4..bae6319b 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -123,8 +123,8 @@ def test_index_manifest(runtmp, capfd): assert 'index is done' in runtmp.last_result.err -def test_index_multiple_manifests(runtmp, capfd): - # test index with text file of multiple manifests +def test_index_manifest_zip_files(runtmp, capfd): + # test index with text file of multiple zip files sig2 = get_test_data('2.fa.sig.gz') sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') From 9760992251610981d5ffd2f83c2f0c0748862e4c Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 5 Jul 2024 10:43:51 -0700 Subject: [PATCH 12/22] no compiler errors! --- src/utils.rs | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index d48a0af1..312e4b47 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -545,10 +545,12 @@ pub fn collection_from_zipfile(sigpath: &Path, report_type: &ReportType) -> Resu // Make a collection from anything except a pathlist, as this is called // from collection_from_pathlist -fn collection_from_zipfile_or_signature_or_manifest( +fn collection_from_any_zipfile_or_signature_or_manifest( sigpath: &Path, report_type: &ReportType, + allow_failed: bool, ) -> Result, Option> { + // returns collection and number of failures let mut last_error = None; let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { @@ -618,6 +620,7 @@ fn collection_from_manifest( fn collection_from_pathlist( sigpath: &Path, report_type: &ReportType, + allow_failed: bool, ) -> Result<(Collection, usize), anyhow::Error> { let file = File::open(sigpath).with_context(|| { format!( @@ -633,7 +636,9 @@ fn collection_from_pathlist( .lines() .filter_map(|line| match line { Ok(path) => Some(PathBuf::from(path)), - Err(_err) => None, + Err(err) => { + None + }, }) .collect(); @@ -644,7 +649,7 @@ fn collection_from_pathlist( let collections = path_bufs .par_iter() .filter_map(|path| { - match collection_from_zipfile_or_signature_or_manifest(&path, &report_type) { + match collection_from_any_zipfile_or_signature_or_manifest(&path, &report_type, allow_failed) { Ok(collection) => Some(collection), Err(err) => { eprintln!("WARNING: could not load sketches from path '{}'", path); @@ -663,6 +668,7 @@ fn collection_from_pathlist( Ok(pathlist) => Some(pathlist), Err(err) => { eprintln!("WARNING: could not get signatures for collection "); + let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); None } }) @@ -670,11 +676,18 @@ fn collection_from_pathlist( .collect::>(); // // Now load the path filenames as one big collection - let collection = Collection::from_sigs(flattened_signatures); - // let n_failed = n_failed.load(atomic::Ordering::SeqCst); + let collection = Collection::from_sigs(flattened_signatures).with_context(|| { + format!( + "Loaded {} signatures but failed to load as collection: '{}'", + report_type, sigpath + ) + }); + + let n_failed = n_failed.load(atomic::Ordering::SeqCst); - Ok((collection?, 0)) + // How to get number of failed? + Ok((collection?, n_failed)) } // Convert a collection into a list of paths that can then be read to create a collection @@ -689,22 +702,7 @@ fn collection_to_signatures(collection: &Collection) -> Result, a "Hello we are in collection_to_signatures, internal_location {}", record.internal_location() ); - // sig_from_record produces a SigStore, not a Signature - // Star (*) is a dereferencing operator, which will let us access the Signature - // stored in the SigStore directly - let raw = collection - .storage() - .load( - &::clone(&record.internal_location()) - .into_os_string() - .into_string() - .unwrap(), - ) - .unwrap(); - let sig = Signature::from_reader(&mut &raw[..]) - .unwrap() - // TODO: select the right sig? - .swap_remove(0); + let sig = Signature::from(sigstore); Some(sig) } Err(_err) => None, @@ -796,7 +794,7 @@ pub fn load_collection( }); let collection = - collection.or_else(|| match collection_from_pathlist(&sigpath, &report_type) { + collection.or_else(|| match collection_from_pathlist(&sigpath, &report_type, allow_failed) { Ok((coll, n_failed)) => Some((coll, n_failed)), Err(e) => { last_error = Some(e); From 8a0f6a4c0a2f60049bb903bab9da7b56df7cfae9 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 5 Jul 2024 10:45:12 -0700 Subject: [PATCH 13/22] remove print statement --- src/utils.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index 312e4b47..cbe1a4c6 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -698,10 +698,6 @@ fn collection_to_signatures(collection: &Collection) -> Result, a .iter() .filter_map(|record| match collection.sig_from_record(&record) { Ok(sigstore) => { - eprintln!( - "Hello we are in collection_to_signatures, internal_location {}", - record.internal_location() - ); let sig = Signature::from(sigstore); Some(sig) } From 57a83b02212324c6a5efd2a0a842e26ff013eedc Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 5 Jul 2024 11:42:13 -0700 Subject: [PATCH 14/22] No compiler errors, tried to add error if path doesn't exist --- src/utils.rs | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index cbe1a4c6..52c2179a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -550,7 +550,6 @@ fn collection_from_any_zipfile_or_signature_or_manifest( report_type: &ReportType, allow_failed: bool, ) -> Result, Option> { - // returns collection and number of failures let mut last_error = None; let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { @@ -622,12 +621,17 @@ fn collection_from_pathlist( report_type: &ReportType, allow_failed: bool, ) -> Result<(Collection, usize), anyhow::Error> { + // load sketches from paths in parallel. + let n_failed = AtomicUsize::new(0); + let file = File::open(sigpath).with_context(|| { + let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); format!( "Failed to open {} pathlist file: '{}'", report_type, sigpath ) })?; + let reader_new = BufReader::new(file); let mut last_error: std::option::Option = None; @@ -635,21 +639,33 @@ fn collection_from_pathlist( let path_bufs: Vec<_> = reader_new .lines() .filter_map(|line| match line { - Ok(path) => Some(PathBuf::from(path)), + Ok(l) => { + let path_buf = PathBuf::from(l.clone()); + match path_buf.exists() { + true => Some(path_buf), + false => { + eprintln!("WARNING: path '{}' does not exist", l); + let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); + None + } + } + } Err(err) => { + let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); None - }, + } }) .collect(); - // load sketches from paths in parallel. - let n_failed = AtomicUsize::new(0); - // Load all entries as collections let collections = path_bufs .par_iter() .filter_map(|path| { - match collection_from_any_zipfile_or_signature_or_manifest(&path, &report_type, allow_failed) { + match collection_from_any_zipfile_or_signature_or_manifest( + &path, + &report_type, + allow_failed, + ) { Ok(collection) => Some(collection), Err(err) => { eprintln!("WARNING: could not load sketches from path '{}'", path); @@ -789,14 +805,15 @@ pub fn load_collection( } }); - let collection = - collection.or_else(|| match collection_from_pathlist(&sigpath, &report_type, allow_failed) { + let collection = collection.or_else(|| { + match collection_from_pathlist(&sigpath, &report_type, allow_failed) { Ok((coll, n_failed)) => Some((coll, n_failed)), Err(e) => { last_error = Some(e); None } - }); + } + }); match collection { Some((coll, n_failed)) => { From 7aa1280b488d00f9a696d555fb76a1fa36dc9527 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 5 Jul 2024 11:43:46 -0700 Subject: [PATCH 15/22] Code now checks for path existence --- src/python/tests/test_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index bae6319b..fc9a5a54 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -166,7 +166,7 @@ def test_index_bad_siglist_2(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err + assert "WARNING: path 'no-exist' does not exist" in captured.err def test_index_empty_siglist(runtmp, capfd): From 83046d8cc74610251582b4c7661597e05e171ec4 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 5 Jul 2024 11:54:05 -0700 Subject: [PATCH 16/22] Added error for if path exists but can't load signature --- src/utils.rs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/utils.rs b/src/utils.rs index 52c2179a..6d5754ba 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -581,7 +581,24 @@ fn collection_from_any_zipfile_or_signature_or_manifest( None } }); - Ok(collection) + + + match collection { + Some(collection) => { + Ok(Some(collection)) + } + None => { + if let Some(e) = last_error { + eprintln!("WARNING: could not load sketches from path '{}'", sigpath); + Err(Some(e)) + } else { + // Should never get here + Err(anyhow!( + "Unable to load the collection for an unknown reason." + ).into()) + } + } + } } fn collection_from_manifest( From cf2b8ea1e2cd3e2c0f5bed1956de5ecd26d35a6d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 5 Jul 2024 14:25:02 -0700 Subject: [PATCH 17/22] Check for non-existence of files in fromfiles/pathlist --- src/python/tests/test_gather.py | 2 +- src/python/tests/test_multigather.py | 6 +++--- src/python/tests/test_multisearch.py | 4 ++-- src/python/tests/test_pairwise.py | 2 +- src/python/tests/test_search.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index 4ab4c6de..8143a1fc 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -221,7 +221,7 @@ def test_bad_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err + assert "WARNING: path 'no-exist' does not exist" in captured.err assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 1316d52a..a88930af 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -396,7 +396,7 @@ def test_missing_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err + assert "WARNING: path 'no-exist' does not exist" in captured.err assert "WARNING: 1 query paths failed to load. See error messages above." @@ -492,7 +492,7 @@ def test_bad_against(runtmp, capfd): against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') - make_file_list(against_list, [sig2, "no exist"]) + make_file_list(against_list, [sig2, "no-exist"]) # should succeed, but with error output. runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, @@ -501,7 +501,7 @@ def test_bad_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no exist'" in captured.err + assert "WARNING: path 'no-exist' does not exist" in captured.err assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index 611b0f81..417b778f 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -312,7 +312,7 @@ def test_bad_query(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err + assert "WARNING: path 'no-exist' does not exist" in captured.err assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err @@ -416,7 +416,7 @@ def test_bad_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err + assert "WARNING: path 'no-exist' does not exist" in captured.err assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 3869b3d4..e4dcb639 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -220,7 +220,7 @@ def test_bad_query(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err + assert "WARNING: path 'no-exist' does not exist" in captured.err assert "WARNING: 1 analysis paths failed to load. See error messages above." in captured.err diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 8e08e714..fc443643 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -350,7 +350,7 @@ def test_bad_query_2(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err + assert "WARNING: path 'no-exist' does not exist" in captured.err assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err @@ -449,7 +449,7 @@ def test_bad_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err + assert "WARNING: path 'no-exist' does not exist" in captured.err assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err From dff7c5f4d69cb54672c5801b53a0184afd8dfc2d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 5 Jul 2024 14:32:13 -0700 Subject: [PATCH 18/22] Fix bug for loading multiple sketches in fastgather! --- src/python/tests/test_gather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index 8143a1fc..e5c84179 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -307,7 +307,7 @@ def test_against_multisigfile(runtmp, zip_against): print(df) else: print(df) - assert len(df) == 1 + assert len(df) == 3 # @CTB this is a bug :(. It should load multiple sketches properly! From d1331f01bea9bc485fcae675a99e374075a49089 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 5 Jul 2024 14:36:33 -0700 Subject: [PATCH 19/22] Don't double-print warning couldn't load sketch --- src/utils.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index 6d5754ba..dcf17411 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -685,7 +685,7 @@ fn collection_from_pathlist( ) { Ok(collection) => Some(collection), Err(err) => { - eprintln!("WARNING: could not load sketches from path '{}'", path); + // eprintln!("collection_from_any_zipfile_or_signature_or_manifest WARNING: could not load sketches from path '{}'", path); let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); None } @@ -719,7 +719,6 @@ fn collection_from_pathlist( let n_failed = n_failed.load(atomic::Ordering::SeqCst); - // How to get number of failed? Ok((collection?, n_failed)) } From 3246c3df5c4bbe68c0f1dffdbd22d38d3236ae41 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 5 Jul 2024 15:05:56 -0700 Subject: [PATCH 20/22] Specify path doesn't exist in the pathlist/fromfiles --- src/utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.rs b/src/utils.rs index dcf17411..4d7dd829 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -661,7 +661,7 @@ fn collection_from_pathlist( match path_buf.exists() { true => Some(path_buf), false => { - eprintln!("WARNING: path '{}' does not exist", l); + eprintln!("WARNING: path '{}' does not exist in '{}'", l, sigpath); let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst); None } From c38f7ddba8f48ec59e6bb280acfedf14027ba988 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 5 Jul 2024 15:38:18 -0700 Subject: [PATCH 21/22] cargo format --- src/utils.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index 4d7dd829..3126c55b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -582,20 +582,15 @@ fn collection_from_any_zipfile_or_signature_or_manifest( } }); - match collection { - Some(collection) => { - Ok(Some(collection)) - } + Some(collection) => Ok(Some(collection)), None => { if let Some(e) = last_error { eprintln!("WARNING: could not load sketches from path '{}'", sigpath); Err(Some(e)) } else { // Should never get here - Err(anyhow!( - "Unable to load the collection for an unknown reason." - ).into()) + Err(anyhow!("Unable to load the collection for an unknown reason.").into()) } } } @@ -730,6 +725,10 @@ fn collection_to_signatures(collection: &Collection) -> Result, a .iter() .filter_map(|record| match collection.sig_from_record(&record) { Ok(sigstore) => { + eprintln!( + "Hello in collection_to_signatures, reading {}", + record.filename() + ); let sig = Signature::from(sigstore); Some(sig) } From d93011665b969f218ef1119ed8463ec0f23e57f9 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sat, 27 Jul 2024 16:41:55 -0700 Subject: [PATCH 22/22] debugging, lots of print statements --- src/mastiff_manygather.rs | 12 ++++++++++++ src/python/tests/test_gather.py | 2 +- src/python/tests/test_multigather.py | 2 +- src/utils.rs | 18 +++++++++++++++--- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 03e054dc..c9fd2aad 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -34,6 +34,18 @@ pub fn mastiff_manygather( allow_failed_sigpaths, )?; + eprintln!( + "queries_file: {}\nquery_collection.len(): {}", + // "\nquery_collection.manifest().internal_locations(): {}", + queries_file, + query_collection.len(), + // query_collection.manifest().internal_locations().cloned().collect::>().join(", ") + ); + + query_collection.manifest().internal_locations().for_each(|location| + eprintln!("query collection, internal location: {}", location) + ); + // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index e5c84179..69399eb6 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -247,7 +247,7 @@ def test_bad_against_2(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "Sketch loading error: File is too short, less than five bytes" in captured.err + # assert "Sketch loading error: File is too short, less than five bytes" in captured.err assert "WARNING: could not load sketches from path" in captured.err assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index a88930af..15c2becf 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -548,7 +548,7 @@ def test_empty_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "Sketch loading error: No such file or directory" in captured.err + assert "WARNING: path '' does not exist in 'against.txt'" in captured.err assert "No search signatures loaded, exiting." in captured.err diff --git a/src/utils.rs b/src/utils.rs index 3126c55b..309aab83 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -726,13 +726,25 @@ fn collection_to_signatures(collection: &Collection) -> Result, a .filter_map(|record| match collection.sig_from_record(&record) { Ok(sigstore) => { eprintln!( - "Hello in collection_to_signatures, reading {}", - record.filename() + "\n:) Hello in collection_to_signatures, record.internal_location(): {}, record.filename(): {}", + record.internal_location(), record.filename() ); + eprintln!("record.md5sum(): {}", record.md5()); let sig = Signature::from(sigstore); + eprintln!( + ":) Hello in collection_to_signatures, created signature, signature filename: {}", + sig.filename() + ); + eprintln!("sig.md5sum(): {}", sig.md5sum()); Some(sig) } - Err(_err) => None, + Err(_err) => { + eprintln!( + "\n:( Hello in collection_to_signatures Error (None), record.internal_location(): {}, record.filename(): {}", + record.internal_location(), record.filename() + ); + None + }, // record.filename() }) // .map(|record| record.filename());