Skip to content

Commit

Permalink
sync paused development
Browse files Browse the repository at this point in the history
  • Loading branch information
mr-eyes committed Sep 7, 2023
1 parent 609118e commit 2760c31
Show file tree
Hide file tree
Showing 11 changed files with 228 additions and 76 deletions.
5 changes: 5 additions & 0 deletions apps/repr_sketches.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ int main(int argc, char** argv) {
uint64_t from_node = stoi(parts[0]);
uint64_t to_node = stoi(parts[1]);
count[from_node]++;
-
count[to_node]++;
}

Expand All @@ -41,4 +42,8 @@ int main(int argc, char** argv) {
for (auto& [k, v] : elems) {
cout << k << ": " << v << endl;
}




}
2 changes: 1 addition & 1 deletion build_wrapper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ cleanup

# Build the project if not already built
BUILD_DIR="build"
[[ -d ${BUILD_DIR} ]] || cmake -Bbuild && cmake --build build -j4
[[ -d ${BUILD_DIR} ]] || cmake -Bbuild && cmake --build build -j32


echo "BDIST WHEEL"
Expand Down
14 changes: 14 additions & 0 deletions export_sig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,20 @@ int main(int argc, char** argv) {
auto begin_time = Time::now();
zstr::ifstream sig_stream(sig_path);
json::value json = json::parse(sig_stream);

int selected_sig = -1;
for (int sig_no = 0; sig_no < json.size(); sig_no++) {
json::array& sig_array = as_array(json[sig_no]["signatures"]);
for (auto it = sig_array.begin(); it != sig_array.end(); ++it) {
const json::value& v = *it;
if (v["ksize"] == kSize) {
selected_sig = sig_no;
break;
}
}
}


auto sourmash_sig = json[0]["signatures"];
const json::array& sig_array = as_array(sourmash_sig);
for (auto it = sig_array.begin(); it != sig_array.end(); ++it) {
Expand Down
2 changes: 1 addition & 1 deletion include/kSpider.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

namespace kSpider {

void pairwise(string index_prefix, int user_threads);
void pairwise(string index_prefix, int user_threads, int scale = 0);
void index_kmers(int kSize, string fasta_file, string names_file, int chunk_size, string index_prefix);
void index_kmers_nonCanonical(int kSize, string fasta_file, string names_file, int chunk_size, string index_prefix);
void index_skipmers(int m, int n, int k, string fasta_file, string names_file, int chunk_size, string index_prefix);
Expand Down
2 changes: 1 addition & 1 deletion pairwise.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "kSpider.hpp"

int main(int argc, char** argv) {
kSpider::pairwise(argv[1], stoi(argv[2]));
kSpider::pairwise(argv[1], stoi(argv[2]), stod(argv[3]));
}
7 changes: 5 additions & 2 deletions pykSpider/kSpider2/ks_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ class Clusters:
"min_cont": 3,
"avg_cont": 4,
"max_cont": 5,
"ani": 6,
"ochiai": 6,
"jaccard": 7,
"beta_ani": 8,
"ani": 9,
}

seq_to_kmers = dict()
Expand Down Expand Up @@ -150,7 +153,7 @@ def cluster_graph(self):
@cli.command(name="cluster", help_priority=4)
@click.option('-c', '--cutoff', required=False, type=click.FloatRange(0, 1, clamp=False), default=0.0, show_default=True, help="cluster sequences with (containment > cutoff)")
@click.option('-i', '--index-prefix', "index_prefix", required=True, type=click.STRING, help="Index file prefix")
@click.option('-d', '--dist-type', "distance_type", required=False, default="max_cont", show_default=True, type=click.STRING, help="select from ['min_containment', 'avg_containment', 'max_containment', 'ani']")
@click.option('-d', '--dist-type', "distance_type", required=False, default="max_cont", show_default=True, type=click.STRING, help="select from ['min_containment', 'avg_containment', 'max_containment', 'ani', 'ochiai', 'jaccard', 'beta_ani]")
@click.pass_context
def main(ctx, index_prefix, cutoff, distance_type):
"""Sequence clustering."""
Expand Down
41 changes: 22 additions & 19 deletions pykSpider/kSpider2/ks_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas as pd
from kSpider2.click_context import cli
from scipy.cluster.hierarchy import linkage, to_tree, ClusterWarning
import numpy as np
from warnings import simplefilter
simplefilter("ignore", ClusterWarning)

Expand Down Expand Up @@ -40,9 +41,8 @@ def get_newick(node, parent_dist, leaf_names, newick='') -> str:

@cli.command(name="export", help_priority=5)
@click.option('-i', '--index-prefix', required=True, type=click.STRING, help="Index file prefix")
# @click.option('--dist-mat', "distance_matrix", is_flag=True, help="Convert pairwise matrix to NxN distance matrix", default=False)
@click.option('--newick', "newick", is_flag=True, help="Convert pairwise (containment) matrix to newick format", default=False)
@click.option('-d', '--dist-type', "distance_type", required=False, default="max_cont", show_default=True, type=click.STRING, help="select from ['min_cont', 'avg_cont', 'max_cont', 'ani']")
@click.option('-d', '--dist-type', "distance_type", required=False, default="max_cont", show_default=True, type=click.STRING, help="select from ['min_cont', 'avg_cont', 'max_cont', 'ani', 'ochiai', 'jaccard', 'beta_ani]")
@click.option('-o', "overwritten_output", default="na", required=False, type=click.STRING, help="custom output file name prefix")
@click.pass_context
def main(ctx, index_prefix, newick, distance_type, overwritten_output):
Expand All @@ -54,27 +54,30 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
kSpider_pairwise_tsv = f"{index_prefix}_kSpider_pairwise.tsv"
namesMap_file = f"{index_prefix}.namesMap"
seqToKmers_tsv = f"{index_prefix}_kSpider_seqToKmersNo.tsv"

LOGGER = ctx.obj

distance_to_col = {
"min_cont": 3,
"avg_cont": 4,
"max_cont": 5,
"ochiai": 6,
"jaccard": 7,
"beta_ani": 8,
"ani": 99
}

if distance_type not in distance_to_col:
LOGGER.ERROR("unknown distance!")

dist_col = distance_to_col[distance_type]
if dist_col == "ani":
with open(kSpider_pairwise_tsv, 'r') as pairwise_tsv:
if "ani" not in next(pairwise_tsv).lower():
LOGGER.ERROR("ANI was selected but was not found in the pairwise file.\nPlease, run kSpider pairwise --extend_with_ani -i <index_prefix> script")



# Check for existing pairwise file
for _file in [kSpider_pairwise_tsv, namesMap_file, seqToKmers_tsv]:
if not os.path.exists(_file):
Expand All @@ -83,7 +86,7 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
"""
# Load kmer count per record
"""
seq_to_kmers = dict()
seq_to_kmers = {}
with open(seqToKmers_tsv) as KMER_COUNT:
next(KMER_COUNT)
for line in KMER_COUNT:
Expand All @@ -94,7 +97,7 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
# Parse namesmap
"""

namesMap_dict = dict()
namesMap_dict = {}
with open(namesMap_file) as NAMES:
next(NAMES)
for line in NAMES:
Expand All @@ -105,17 +108,17 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):

"""Parse kSpider's pairwise
"""
distances = dict()

distances = {}
labeled_out = f"kSpider_{index_basename}_pairwise.tsv"
distmatrix_out = f"kSpider_{index_basename}_distmat.tsv"
newick_out = f"kSpider_{index_basename}.newick"

if overwritten_output != "na":
labeled_out = f"{overwritten_output}_pairwise.tsv"
distmatrix_out = f"{overwritten_output}_distmat.tsv"
newick_out = f"{overwritten_output}.newick"

if distance_type == "ani":
with open(kSpider_pairwise_tsv) as PAIRWISE, open(labeled_out, 'w') as NEW, open(index_prefix + "_kSpider_pairwise.ani_col.tsv") as ANI:
ctx.obj.INFO(f"Writing pairwise matrix to {labeled_out}")
Expand All @@ -132,7 +135,7 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
dist_metric = float(next(ANI).strip())
distances[(grp1, grp2)] = dist_metric
NEW.write(f"{grp1}\t{grp2}\t{dist_metric}\n")

else:
with open(kSpider_pairwise_tsv) as PAIRWISE, open(labeled_out, 'w') as NEW:
ctx.obj.INFO(f"Writing pairwise matrix to {labeled_out}")
Expand All @@ -149,7 +152,7 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
distances[(grp1, grp2)] = dist_metric
NEW.write(f"{grp1}\t{grp2}\t{dist_metric}\n")

unique_ids = sorted(set([x for y in distances.keys() for x in y]))
unique_ids = sorted({x for y in distances for x in y})
df = pd.DataFrame(index=unique_ids, columns=unique_ids)
for k, v in distances.items():
df.loc[k[0], k[1]] = 1-v
Expand All @@ -158,13 +161,13 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
df = df.fillna(0)
LOGGER.INFO(f"Writing distance matrix to {distmatrix_out}")
df.to_csv(distmatrix_out, sep='\t')

if newick:
loaded_df = pd.read_csv(distmatrix_out, sep='\t')
LOGGER.INFO(f"Writing newick to {newick_out}.")
names = list(loaded_df.columns[1:])
dist = loaded_df[loaded_df.columns[1:]].to_numpy()
Z = linkage(dist, 'single')
Z = linkage(dist, 'ward')
tree = to_tree(Z, False)

newick = get_newick(tree, tree.dist, names)
Expand Down
2 changes: 1 addition & 1 deletion pykSpider/kSpider2/ks_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def main(ctx, index_prefix, user_threads, ani, sourmash_scale):
f"Constructing the containment pairwise matrix using {user_threads} cores.")
if sourmash_scale:
ctx.obj.WARNING("No need to provide -s/--scale when running this command.")
kSpider_internal.pairwise(index_prefix, user_threads)
kSpider_internal.pairwise(index_prefix, user_threads, sourmash_scale)
ctx.obj.SUCCESS("Done.")
else:
pairwise_file = index_prefix + "_kSpider_pairwise.tsv"
Expand Down
68 changes: 45 additions & 23 deletions src/pairwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "parallel_hashmap/phmap.h"
#include "parallel_hashmap/phmap_dump.h"
#include <cassert>
#include <math.h>

using boost::adaptors::transformed;
using boost::algorithm::join;
Expand Down Expand Up @@ -45,6 +46,7 @@ class Combo {

void combinations(int n) {
this->combs.clear();
this->combs.reserve((n * (n - 1)) / 2);
this->comb(n, this->r, this->arr);
}

Expand Down Expand Up @@ -92,7 +94,7 @@ namespace kSpider {
}
}

void load_colors_to_sources(const std::string& filename, int_vec_map * map)
void load_colors_to_sources(const std::string& filename, int_vec_map* map)
{
phmap::BinaryInputArchive ar_in(filename.c_str());
size_t size;
Expand Down Expand Up @@ -120,7 +122,7 @@ namespace kSpider {
}
}

void pairwise(string index_prefix, int user_threads) {
void pairwise(string index_prefix, int user_threads, int scale) {

// Read colors

Expand All @@ -135,7 +137,7 @@ namespace kSpider {
begin_time = Time::now();
int_int_map colorsCount;
load_colors_count(index_prefix + "_color_count.bin", colorsCount);


// TODO: should be csv, rename later.
// std::ifstream data(index_prefix + "_kSpider_colorCount.tsv");
Expand All @@ -154,7 +156,7 @@ namespace kSpider {

cout << "parsing index colors: " << std::chrono::duration<double, std::milli>(Time::now() - begin_time).count() / 1000 << " secs" << endl;
begin_time = Time::now();

// for (const auto& record : color_to_ids) {
// uint32_t colorCount = colorsCount[record.first];
// for (auto group_id : record.second) {
Expand Down Expand Up @@ -211,7 +213,6 @@ namespace kSpider {
Combo combo = Combo();
combo.combinations(item.second.size());
for (uint32_t i = 0; i < combo.combs.size(); i++) {
// for (auto const& seq_pair : combo.combs) {
auto const& seq_pair = combo.combs[i];
uint32_t _seq1 = item.second[seq_pair.first];
uint32_t _seq2 = item.second[seq_pair.second];
Expand All @@ -223,20 +224,14 @@ namespace kSpider {
[ccount](PAIRS_COUNTER::value_type& v) { v.second += ccount; }, // called only when key was already present
ccount
);

// ** BUG FIX ** was creating wrong shared_kmers
// auto _p = make_pair(_seq1, _seq2);
// uint32_t ccount = colorsCount[item.first];
// edges.lazy_emplace_l(_p,
// [ccount](PAIRS_COUNTER::value_type& v) { v.second++; }, // called only when key was already present
// [_p, ccount](const PAIRS_COUNTER::constructor& ctor) {
// ctor(_p, ccount); }
// ); // construct value_type in place when key not present
}
}
}



cout << "pairwise hashmap construction: " << std::chrono::duration<double, std::milli>(Time::now() - begin_time).count() / 1000 << " secs" << endl;
cout << "Number of pairwise comparisons: " << edges.size() << endl;
cout << "writing pairwise matrix to " << index_prefix << "_kSpider_pairwise.tsv" << endl;

std::ofstream myfile;
Expand All @@ -248,29 +243,56 @@ namespace kSpider {
<< "\tmin_containment"
<< "\tavg_containment"
<< "\tmax_containment"
<< "\tochiai"
<< "\tjaccard"
<< "\tbeta_ani"
<< '\n';
uint64_t line_count = 0;
for (const auto& edge : edges) {
uint64_t shared_kmers = edge.second;
float shared_kmers = static_cast<float>(edge.second);
uint32_t source_1 = edge.first.first;
uint32_t source_2 = edge.first.second;
uint32_t source_1_kmers = groupID_to_kmerCount[source_1];
uint32_t source_2_kmers = groupID_to_kmerCount[source_2];

float cont_1_in_2 = (float)shared_kmers / source_2_kmers;
float cont_2_in_1 = (float)shared_kmers / source_1_kmers;
// containments
float cont_1_in_2 = shared_kmers / source_2_kmers;
float cont_2_in_1 = shared_kmers / source_1_kmers;
float min_containment = min(cont_1_in_2, cont_2_in_1);
float avg_containment = (cont_1_in_2 + cont_2_in_1) / 2.0;
float avg_containment = (cont_1_in_2 + cont_2_in_1) / 2.0f;
float max_containment = max(cont_1_in_2, cont_2_in_1);

myfile
<< source_1
<< '\t' << source_2
<< '\t' << shared_kmers
// Ochiai distance
float ochiai = (shared_kmers / sqrt((float)source_1_kmers * (float)source_2_kmers));

// Jaccard distance (if size of samples is roughly similar)
// J(A, B) = 1 - |A ∩ B| / (|A| + |B| - |A ∩ B|)
float jaccard = shared_kmers / (source_1_kmers + source_2_kmers - shared_kmers);

// Kulczynski distance needs abundance of each sample
// float kulczynski = (float)shared_kmers / (source_1_kmers + source_2_kmers) * 2;

// my ANI
float ani = 0.0;
if (scale) {
double avg_length = ((source_1_kmers + source_2_kmers) / 2.0) * scale;
ani = (2.0 * shared_kmers) / (source_1_kmers + source_2_kmers - (2.0 * shared_kmers / avg_length));
}

// Prepare the output string
std::stringstream line;
line << edge.first.first
<< '\t' << edge.first.second
<< '\t' << edge.second
<< '\t' << min_containment
<< '\t' << avg_containment
<< '\t' << max_containment
<< '\n';
<< '\t' << ochiai
<< '\t' << jaccard;
if (scale) line << '\t' << ani;
line << '\n';
myfile << line.str();

}
myfile.close();
}
Expand Down
Loading

0 comments on commit 2760c31

Please sign in to comment.