Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable listing of zip files in fromfiles for index, gather, search #354

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
90b0314
Add a failing test for indexing multiple manifest zips provided as a …
olgabot Jun 13, 2024
97fa413
After loading matching Signature, Add re-looking through the pathlist in
olgabot Jun 13, 2024
3295f0e
Created collection_from_zipfile_or_signature_or_manifest to call with…
olgabot Jun 21, 2024
d92c873
Merge branch 'main' into olgabot/index-multiple-manifests
olgabot Jun 21, 2024
9c3ecf9
Iterate over the records in the collection to get their paths to then…
olgabot Jun 22, 2024
6615436
Clean up some comments
olgabot Jun 23, 2024
4ec39a3
Change to use map instead of filter_map to get record internal locations
olgabot Jun 23, 2024
9084926
Started writing collection_to_pathlist but can't get the types working
olgabot Jun 28, 2024
c9e738e
Get the signatures from a collection
olgabot Jun 30, 2024
8fd921c
remove assert False
olgabot Jun 30, 2024
ef8ee17
Cargo format
olgabot Jun 30, 2024
12cde5b
Merge branch 'main' into olgabot/index-multiple-manifests
olgabot Jul 1, 2024
6f68a3b
Change command name to testing multiple zip files
olgabot Jul 1, 2024
9760992
no compiler errors!
olgabot Jul 5, 2024
8a0f6a4
remove print statement
olgabot Jul 5, 2024
57a83b0
No compiler errors, tried to add error if path doesn't exist
olgabot Jul 5, 2024
7aa1280
Code now checks for path existence
olgabot Jul 5, 2024
83046d8
Added error for if path exists but can't load signature
olgabot Jul 5, 2024
cf2b8ea
Check for non-existence of files in fromfiles/pathlist
olgabot Jul 5, 2024
dff7c5f
Fix bug for loading multiple sketches in fastgather!
olgabot Jul 5, 2024
d1331f0
Don't double-print warning couldn't load sketch
olgabot Jul 5, 2024
3246c3d
Specify path doesn't exist in the pathlist/fromfiles
olgabot Jul 5, 2024
c38f7dd
cargo format
olgabot Jul 5, 2024
d930116
debugging, lots of print statements
olgabot Jul 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions src/python/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,32 @@ def test_index_manifest(runtmp, capfd):
assert 'index is done' in runtmp.last_result.err


def test_index_multiple_manifests(runtmp, capfd):
    """Index a text file that lists several zip files, one per sketch.

    Each test sketch is first written to its own ``.mf.zip`` file with
    ``sourmash sig cat``; the paths of those zip files are then written to a
    list file, which is handed to ``sourmash scripts index``.
    """
    sketch_paths = [
        get_test_data(name)
        for name in ('2.fa.sig.gz', '47.fa.sig.gz', '63.fa.sig.gz')
    ]

    # one zip file per input sketch
    zip_paths = []
    for sketch_path in sketch_paths:
        zip_path = runtmp.output(os.path.basename(sketch_path) + ".mf.zip")
        runtmp.sourmash("sig", "cat", sketch_path, "-o", zip_path)
        zip_paths.append(zip_path)

    # text file listing the zip files, used as the index input
    zip_list_file = runtmp.output('manifest_zips.txt')
    make_file_list(zip_list_file, zip_paths)

    db_path = runtmp.output('out.db')
    runtmp.sourmash('scripts', 'index', zip_list_file, '-o', db_path)

    # echo stderr so it shows up in the pytest report if the assert fails
    captured = capfd.readouterr()
    print(captured.err)
    index_stderr = runtmp.last_result.err
    print(index_stderr)
    assert 'index is done' in index_stderr


def test_index_bad_siglist_2(runtmp, capfd):
# test with a bad siglist (containing a missing file)
against_list = runtmp.output('against.txt')
Expand Down
94 changes: 63 additions & 31 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,46 @@ pub fn collection_from_zipfile(sigpath: &Path, report_type: &ReportType) -> Resu
}
}

/// Make a collection from anything except a pathlist.
///
/// This is called from `collection_from_pathlist` for each entry of the list.
/// Loaders are tried in order and the first one that succeeds wins:
///
/// 1. `collection_from_zipfile`, only when `sigpath` has a `zip` extension;
/// 2. `collection_from_manifest`;
/// 3. `collection_from_signature`.
///
/// # Returns
///
/// `Ok(Some(collection))` from the first loader that succeeds. `Ok(None)` is
/// never produced; the `Option` is kept so the signature stays unchanged.
///
/// # Errors
///
/// When no loader can read `sigpath`, returns `Err(Some(e))` where `e` is the
/// error of the last loader tried (`collection_from_signature`). Errors from
/// the earlier loaders are discarded, since a failure there only means the
/// path is of a different kind.
fn collection_from_zipfile_or_signature_or_manifest(
    sigpath: &Path,
    report_type: &ReportType,
) -> Result<Option<Collection>, Option<anyhow::Error>> {
    // Only paths that look like zip files are offered to the zip loader.
    if sigpath.extension().map_or(false, |ext| ext == "zip") {
        if let Ok(coll) = collection_from_zipfile(sigpath, report_type) {
            return Ok(Some(coll));
        }
    }

    if let Ok(coll) = collection_from_manifest(sigpath, report_type) {
        return Ok(Some(coll));
    }

    // Last resort: surface this loader's error instead of swallowing it, so
    // the caller can report the failure rather than unwrap a `None`.
    collection_from_signature(sigpath, report_type)
        .map(Some)
        .map_err(Some)
}

fn collection_from_manifest(
sigpath: &Path,
report_type: &ReportType,
Expand Down Expand Up @@ -585,58 +625,50 @@ fn collection_from_pathlist(
)
})?;
let reader = BufReader::new(file);
let mut last_error: std::option::Option<anyhow::Error> = None;

// load list of paths
let lines: Vec<_> = reader
.lines()
.filter_map(|line| match line {
Ok(path) => Some(path),
Ok(path) => Some(PathBuf::from(path)),
Err(_err) => None,
})
.collect();

// load sketches from paths in parallel.
let n_failed = AtomicUsize::new(0);
let records: Vec<Record> = lines

// Load all entries as collections
let record_paths = lines
.par_iter()
.filter_map(|path| match Signature::from_path(path) {
Ok(signatures) => {
let recs: Vec<Record> = signatures
.into_iter()
.flat_map(|v| Record::from_sig(&v, path))
.collect();
Some(recs)
}
.filter_map(|path| match collection_from_zipfile_or_signature_or_manifest(&path, &report_type) {
Ok(collection) => {
// For each record in the collection, get its path filename
Some(collection
.unwrap()
.manifest()
.iter()
.filter_map(|record| match record.internal_location(){
PathBuf => Some(location)

})
.collect())
},
Copy link
Contributor Author

@olgabot olgabot Jun 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Getting a compiler error that `location` can't be found in this scope? Not sure what to make of this...

error[E0425]: cannot find value `location` in this scope
   --> src/utils.rs:653:41
    |
653 |                         PathBuf => Some(location)
    |                                         ^^^^^^^^ not found in this scope

Err(err) => {
eprintln!("Sketch loading error: {}", err);
eprintln!("WARNING: could not load sketches from path '{}'", path);
let _ = n_failed.fetch_add(1, atomic::Ordering::SeqCst);
None
}
})
.flatten()
.collect();
}).flatten().collect::<Vec<PathBuf>>();

if records.is_empty() {
eprintln!(
"No valid signatures found in {} pathlist '{}'",
report_type, sigpath
);
}

let manifest: Manifest = records.into();
let collection = Collection::new(
manifest,
InnerStorage::new(
FSStorage::builder()
.fullpath("".into())
.subdir("".into())
.build(),
),
);
// Now load the path filenames as one big collection
let collection = Collection::from_paths(&record_paths);

let n_failed = n_failed.load(atomic::Ordering::SeqCst);

Ok((collection, n_failed))
Ok((collection?, n_failed))
}

fn collection_from_signature(sigpath: &Path, report_type: &ReportType) -> Result<Collection> {
Expand Down
Loading