Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: clean up index to use MultiCollection #451

Merged
merged 8 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 40 additions & 55 deletions src/index.rs
Original file line number Diff line number Diff line change
@@ -1,81 +1,66 @@
use anyhow::Context;
use camino::Utf8PathBuf as PathBuf;
use sourmash::index::revindex::RevIndex;
use sourmash::index::revindex::RevIndexOps;
use sourmash::prelude::*;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

use crate::utils::{load_collection, ReportType};
use sourmash::collection::{Collection, CollectionSet};

pub fn index<P: AsRef<Path>>(
siglist: String,
selection: &Selection,
output: P,
colors: bool,
_allow_failed_sigpaths: bool,
allow_failed_sigpaths: bool,
use_internal_storage: bool,
) -> Result<(), Box<dyn std::error::Error>> {
println!("Loading siglist");
eprintln!("Loading sketches from {}", siglist);

let collection = match siglist {
x if x.ends_with(".zip") => Collection::from_zipfile(x)?,
x if x.ends_with(".sig") || x.ends_with(".sig.gz") => {
let signatures = Signature::from_path(&x)
.with_context(|| format!("Failed to load signatures from: '{}'", x))?;
let multi = match load_collection(
&siglist,
selection,
ReportType::General,
allow_failed_sigpaths,
) {
Ok(multi) => multi,
Err(err) => return Err(err.into()),
};
eprintln!("Found {} sketches total.", multi.len());

Collection::from_sigs(signatures).with_context(|| {
format!(
"Loaded signatures but failed to load as collection: '{}'",
x
)
})?
// Try to convert it into a Collection and then CollectionSet.
let collection = match Collection::try_from(multi.clone()) {
// conversion worked!
Ok(c) => {
let cs: CollectionSet = c.select(selection)?.try_into()?;
Ok(cs)
}
_ => {
let file = File::open(siglist.clone())
.with_context(|| format!("Failed to open pathlist file: '{}'", siglist))?;

let reader = BufReader::new(file);

// load list of paths
let lines: Vec<_> = reader
.lines()
.filter_map(|line| match line {
Ok(path) => {
let mut filename = PathBuf::new();
filename.push(path);
Some(filename)
}
Err(_err) => None,
})
.collect();

if lines.is_empty() {
return Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into());
// conversion failed; can we/should we load it into memory?
Err(_) => {
if use_internal_storage {
eprintln!("WARNING: loading all sketches into memory in order to index.");
eprintln!("See 'index' documentation for details.");
let c: Collection = multi.load_all_sigs(selection)?;
let cs: CollectionSet = c.try_into()?;
Ok(cs)
} else {
match Collection::from_paths(&lines) {
Ok(collection) => collection,
Err(err) => {
eprintln!("Error in loading from '{}': {}", siglist, err);
return Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into());
}
}
Err(
anyhow::anyhow!("cannot index this type of collection with external storage")
.into(),
)
}
}
};

let collection: CollectionSet = collection.select(selection)?.try_into()?;
match collection {
Ok(collection) => {
eprintln!("Indexing {} sketches.", collection.len());
let mut index = RevIndex::create(output.as_ref(), collection, colors)?;

if collection.is_empty() {
Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into())
} else {
eprintln!("Indexing {} sketches.", collection.len());
let mut index = RevIndex::create(output.as_ref(), collection, colors)?;

if use_internal_storage {
index.internalize_storage()?;
if use_internal_storage {
index.internalize_storage()?;
}
Ok(())
}
Ok(())
Err(e) => Err(e),
}
}
4 changes: 4 additions & 0 deletions src/python/tests/test_fastgather.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def test_installed(runtmp):


def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, toggle_internal_storage):
if toggle_internal_storage == '--no-internal-storage':
raise pytest.xfail("not implemented")
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand Down Expand Up @@ -60,6 +62,8 @@ def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, togg


def test_simple_with_prefetch(runtmp, zip_against, indexed, toggle_internal_storage):
if toggle_internal_storage == '--no-internal-storage':
raise pytest.xfail("not implemented")
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand Down
11 changes: 10 additions & 1 deletion src/python/tests/test_fastmultigather.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,9 @@ def test_simple_read_manifests(runtmp):


def test_simple_indexed(runtmp, zip_query, toggle_internal_storage):
if toggle_internal_storage == '--no-internal-storage':
raise pytest.xfail("not implemented")

# test basic execution!
query = get_test_data('SRR606249.sig.gz')
sig2 = get_test_data('2.fa.sig.gz')
Expand Down Expand Up @@ -239,6 +242,8 @@ def test_simple_indexed(runtmp, zip_query, toggle_internal_storage):


def test_simple_indexed_query_manifest(runtmp, toggle_internal_storage):
if toggle_internal_storage == '--no-internal-storage':
raise pytest.xfail("not implemented")
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
sig2 = get_test_data('2.fa.sig.gz')
Expand Down Expand Up @@ -273,6 +278,8 @@ def test_simple_indexed_query_manifest(runtmp, toggle_internal_storage):


def test_missing_querylist(runtmp, capfd, indexed, zip_query, toggle_internal_storage):
if toggle_internal_storage == '--no-internal-storage':
raise pytest.xfail("not implemented")
# test missing querylist
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand Down Expand Up @@ -1174,7 +1181,9 @@ def test_rocksdb_no_internal_storage_gather_fails(runtmp, capfd):
"47.fa.sig.gz",
"63.fa.sig.gz"])

# index!
# index! CTB, note this will fail currently.
raise pytest.xfail("not implemented")

runtmp.sourmash('scripts', 'index', against_list, '--no-internal-storage',
'-o', 'subdir/against.rocksdb')

Expand Down
59 changes: 54 additions & 5 deletions src/python/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ def test_installed(runtmp):


def test_index(runtmp, toggle_internal_storage):
if toggle_internal_storage == "--no-internal-storage":
raise pytest.xfail("not implemented currently")

# test basic index!
siglist = runtmp.output('db-sigs.txt')

Expand All @@ -35,6 +38,49 @@ def test_index(runtmp, toggle_internal_storage):
assert 'index is done' in runtmp.last_result.err


def test_index_warning_message(runtmp, capfd):
# test basic index when it has to load things into memory - see #451.
siglist = runtmp.output('db-sigs.txt')

sig2 = get_test_data('2.fa.sig.gz')
sig47 = get_test_data('47.fa.sig.gz')
sig63 = get_test_data('63.fa.sig.gz')

make_file_list(siglist, [sig2, sig47, sig63])

output = runtmp.output('db.rocksdb')

runtmp.sourmash('scripts', 'index', siglist, '-o', output)
assert os.path.exists(output)
print(runtmp.last_result.err)

assert 'index is done' in runtmp.last_result.err
captured = capfd.readouterr()
print(captured.err)
assert "WARNING: loading all sketches into memory in order to index." in captured.err


def test_index_error_message(runtmp, capfd):
# test basic index when it errors out b/c can't load
siglist = runtmp.output('db-sigs.txt')

sig2 = get_test_data('2.fa.sig.gz')
sig47 = get_test_data('47.fa.sig.gz')
sig63 = get_test_data('63.fa.sig.gz')

make_file_list(siglist, [sig2, sig47, sig63])

output = runtmp.output('db.rocksdb')

with pytest.raises(utils.SourmashCommandFailed):
runtmp.sourmash('scripts', 'index', siglist, '-o', output,
'--no-internal-storage')

captured = capfd.readouterr()
print(captured.err)
assert "cannot index this type of collection with external storage" in captured.err


def test_index_protein(runtmp, toggle_internal_storage):
sigs = get_test_data('protein.zip')
output = runtmp.output('db.rocksdb')
Expand Down Expand Up @@ -82,10 +128,9 @@ def test_index_missing_siglist(runtmp, capfd, toggle_internal_storage):

captured = capfd.readouterr()
print(captured.err)
assert 'Failed to open pathlist file:' in captured.err
assert 'Error: No such file or directory: ' in captured.err


@pytest.mark.xfail(reason="not implemented yet")
def test_index_sig(runtmp, capfd, toggle_internal_storage):
# test index with a .sig.gz file instead of pathlist
# (should work now)
Expand All @@ -101,7 +146,6 @@ def test_index_sig(runtmp, capfd, toggle_internal_storage):
assert 'index is done' in runtmp.last_result.err


@pytest.mark.xfail(reason="not implemented yet")
def test_index_manifest(runtmp, capfd, toggle_internal_storage):
# test index with a manifest file
sig2 = get_test_data('2.fa.sig.gz')
Expand All @@ -118,7 +162,6 @@ def test_index_manifest(runtmp, capfd, toggle_internal_storage):
assert 'index is done' in runtmp.last_result.err


@pytest.mark.xfail(reason="needs more work")
def test_index_bad_siglist_2(runtmp, capfd):
# test with a bad siglist (containing a missing file)
against_list = runtmp.output('against.txt')
Expand All @@ -139,7 +182,6 @@ def test_index_bad_siglist_2(runtmp, capfd):
assert "WARNING: could not load sketches from path 'no-exist'" in captured.err


@pytest.mark.xfail(reason="needs more work")
def test_index_empty_siglist(runtmp, capfd):
# test empty siglist file
siglist = runtmp.output('db-sigs.txt')
Expand Down Expand Up @@ -347,6 +389,8 @@ def test_index_zipfile_bad(runtmp, capfd):


def test_index_check(runtmp, toggle_internal_storage):
if toggle_internal_storage == "--no-internal-storage":
raise pytest.xfail("not implemented currently")
# test check index
siglist = runtmp.output('db-sigs.txt')

Expand All @@ -367,6 +411,8 @@ def test_index_check(runtmp, toggle_internal_storage):


def test_index_check_quick(runtmp, toggle_internal_storage):
if toggle_internal_storage == "--no-internal-storage":
raise pytest.xfail("not implemented currently")
# test check index
siglist = runtmp.output('db-sigs.txt')

Expand All @@ -387,6 +433,9 @@ def test_index_check_quick(runtmp, toggle_internal_storage):


def test_index_subdir(runtmp, toggle_internal_storage):
if toggle_internal_storage == "--no-internal-storage":
raise pytest.xfail("not implemented currently")

# test basic index & output to subdir
siglist = runtmp.output('db-sigs.txt')

Expand Down
33 changes: 33 additions & 0 deletions src/utils/multicollection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,26 @@ impl MultiCollection {
.collect();
MultiCollection::new(colls, self.contains_revindex)
}

pub fn load_all_sigs(self, selection: &Selection) -> Result<Collection> {
let all_sigs: Vec<Signature> = self
.par_iter()
.filter_map(|(coll, _idx, record)| match coll.sig_from_record(record) {
Ok(sig) => {
let sig = sig.clone().select(selection).ok()?;
Some(Signature::from(sig))
}
Err(_) => {
eprintln!(
"FAILED to load sketch from '{}'",
record.internal_location()
);
None
}
})
.collect();
Ok(Collection::from_sigs(all_sigs)?)
}
}

impl Select for MultiCollection {
Expand Down Expand Up @@ -318,6 +338,19 @@ impl From<Vec<MultiCollection>> for MultiCollection {
}
}

// Extract a single Collection from a MultiCollection, if possible
impl TryFrom<MultiCollection> for Collection {
type Error = &'static str;

fn try_from(multi: MultiCollection) -> Result<Self, Self::Error> {
if multi.collections.len() == 1 {
Ok(multi.collections.into_iter().next().unwrap())
} else {
Err("More than one Collection in this MultiCollection; cannot convert")
}
}
}

/// Track a name/minhash.
pub struct SmallSignature {
pub location: String,
Expand Down
Loading