sync paused development

dib-lab · Sep 7, 2023 · 2760c31 · 2760c31
1 parent 609118e
commit 2760c31
Show file tree

Hide file tree

Showing 11 changed files with 228 additions and 76 deletions.
diff --git a/apps/repr_sketches.cpp b/apps/repr_sketches.cpp
@@ -29,6 +29,7 @@ int main(int argc, char** argv) {
             uint64_t from_node = stoi(parts[0]);
             uint64_t to_node = stoi(parts[1]);
             count[from_node]++;
+            -
             count[to_node]++;
         }
 
@@ -41,4 +42,8 @@ int main(int argc, char** argv) {
     for (auto& [k, v] : elems) {
         cout << k << ": " << v << endl;
     }
+
+
+
+
 }
diff --git a/build_wrapper.sh b/build_wrapper.sh
@@ -28,7 +28,7 @@ cleanup
 
 # Build the project if not already built
 BUILD_DIR="build"
-[[ -d ${BUILD_DIR} ]] || cmake -Bbuild && cmake --build build -j4
+[[ -d ${BUILD_DIR} ]] || cmake -Bbuild && cmake --build build -j32
 
 
 echo "BDIST WHEEL"

diff --git a/export_sig.cpp b/export_sig.cpp
@@ -33,6 +33,20 @@ int main(int argc, char** argv) {
     auto begin_time = Time::now();
     zstr::ifstream sig_stream(sig_path);
     json::value json = json::parse(sig_stream);
+
+    int selected_sig = -1;
+    for (int sig_no = 0; sig_no < json.size(); sig_no++) {
+        json::array& sig_array = as_array(json[sig_no]["signatures"]);
+        for (auto it = sig_array.begin(); it != sig_array.end(); ++it) {
+            const json::value& v = *it;
+            if (v["ksize"] == kSize) {
+                selected_sig = sig_no;
+                break;
+            }
+        }
+    }
+
+
     auto sourmash_sig = json[0]["signatures"];
     const json::array& sig_array = as_array(sourmash_sig);
     for (auto it = sig_array.begin(); it != sig_array.end(); ++it) {

diff --git a/include/kSpider.hpp b/include/kSpider.hpp
@@ -8,7 +8,7 @@
 
 namespace kSpider {
 
-    void pairwise(string index_prefix, int user_threads);
+    void pairwise(string index_prefix, int user_threads, int scale = 0);
     void index_kmers(int kSize, string fasta_file, string names_file, int chunk_size, string index_prefix);
     void index_kmers_nonCanonical(int kSize, string fasta_file, string names_file, int chunk_size, string index_prefix);
     void index_skipmers(int m, int n, int k, string fasta_file, string names_file, int chunk_size, string index_prefix);

diff --git a/pairwise.cpp b/pairwise.cpp
@@ -1,5 +1,5 @@
 #include "kSpider.hpp"
 
 int main(int argc, char** argv) {
-    kSpider::pairwise(argv[1], stoi(argv[2]));
+    kSpider::pairwise(argv[1], stoi(argv[2]), stod(argv[3]));
 }
diff --git a/pykSpider/kSpider2/ks_clustering.py b/pykSpider/kSpider2/ks_clustering.py
@@ -13,7 +13,10 @@ class Clusters:
         "min_cont": 3,
         "avg_cont": 4,
         "max_cont": 5,
-        "ani": 6,
+        "ochiai": 6,
+        "jaccard": 7,
+        "beta_ani": 8,
+        "ani": 9,
     }
 
     seq_to_kmers = dict()
@@ -150,7 +153,7 @@ def cluster_graph(self):
 @cli.command(name="cluster", help_priority=4)
 @click.option('-c', '--cutoff', required=False, type=click.FloatRange(0, 1, clamp=False), default=0.0, show_default=True, help="cluster sequences with (containment > cutoff)")
 @click.option('-i', '--index-prefix', "index_prefix", required=True, type=click.STRING, help="Index file prefix")
-@click.option('-d', '--dist-type', "distance_type", required=False, default="max_cont", show_default=True, type=click.STRING, help="select from ['min_containment', 'avg_containment', 'max_containment', 'ani']")
+@click.option('-d', '--dist-type', "distance_type", required=False, default="max_cont", show_default=True, type=click.STRING, help="select from ['min_containment', 'avg_containment', 'max_containment', 'ani', 'ochiai', 'jaccard', 'beta_ani]")
 @click.pass_context
 def main(ctx, index_prefix, cutoff, distance_type):
     """Sequence clustering."""

diff --git a/pykSpider/kSpider2/ks_export.py b/pykSpider/kSpider2/ks_export.py
@@ -8,6 +8,7 @@
 import pandas as pd
 from kSpider2.click_context import cli
 from scipy.cluster.hierarchy import linkage, to_tree, ClusterWarning
+import numpy as np
 from warnings import simplefilter
 simplefilter("ignore", ClusterWarning)
 
@@ -40,9 +41,8 @@ def get_newick(node, parent_dist, leaf_names, newick='') -> str:
 
 @cli.command(name="export", help_priority=5)
 @click.option('-i', '--index-prefix', required=True, type=click.STRING, help="Index file prefix")
-# @click.option('--dist-mat', "distance_matrix", is_flag=True, help="Convert pairwise matrix to NxN distance matrix", default=False)
 @click.option('--newick', "newick", is_flag=True, help="Convert pairwise (containment) matrix to newick format", default=False)
-@click.option('-d', '--dist-type', "distance_type", required=False, default="max_cont", show_default=True, type=click.STRING, help="select from ['min_cont', 'avg_cont', 'max_cont', 'ani']")
+@click.option('-d', '--dist-type', "distance_type", required=False, default="max_cont", show_default=True, type=click.STRING, help="select from ['min_cont', 'avg_cont', 'max_cont', 'ani', 'ochiai', 'jaccard', 'beta_ani]")
 @click.option('-o', "overwritten_output", default="na", required=False, type=click.STRING, help="custom output file name prefix")
 @click.pass_context
 def main(ctx, index_prefix, newick, distance_type, overwritten_output):
@@ -54,27 +54,30 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
     kSpider_pairwise_tsv = f"{index_prefix}_kSpider_pairwise.tsv"
     namesMap_file = f"{index_prefix}.namesMap"
     seqToKmers_tsv = f"{index_prefix}_kSpider_seqToKmersNo.tsv"
-    
+
     LOGGER = ctx.obj
-    
+
     distance_to_col = {
         "min_cont": 3,
         "avg_cont": 4,
         "max_cont": 5,
+        "ochiai": 6,
+        "jaccard": 7,
+        "beta_ani": 8,
         "ani": 99
     }
-    
+
     if distance_type not in distance_to_col:
         LOGGER.ERROR("unknown distance!")
-    
+
     dist_col = distance_to_col[distance_type]
     if dist_col == "ani":
         with open(kSpider_pairwise_tsv, 'r') as pairwise_tsv:
             if "ani" not in next(pairwise_tsv).lower():
                 LOGGER.ERROR("ANI was selected but was not found in the pairwise file.\nPlease, run kSpider pairwise --extend_with_ani -i <index_prefix> script")
-    
-    
-    
+
+
+
     # Check for existing pairwise file
     for _file in [kSpider_pairwise_tsv, namesMap_file, seqToKmers_tsv]:
         if not os.path.exists(_file):
@@ -83,7 +86,7 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
     """
     # Load kmer count per record
     """
-    seq_to_kmers = dict()
+    seq_to_kmers = {}
     with open(seqToKmers_tsv) as KMER_COUNT:
         next(KMER_COUNT)
         for line in KMER_COUNT:
@@ -94,7 +97,7 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
     # Parse namesmap
     """
 
-    namesMap_dict = dict()
+    namesMap_dict = {}
     with open(namesMap_file) as NAMES:
         next(NAMES)
         for line in NAMES:
@@ -105,17 +108,17 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
 
     """Parse kSpider's pairwise
     """
-    
-    distances = dict()
+
+    distances = {}
     labeled_out = f"kSpider_{index_basename}_pairwise.tsv"
     distmatrix_out = f"kSpider_{index_basename}_distmat.tsv"
     newick_out = f"kSpider_{index_basename}.newick"
-    
+
     if overwritten_output != "na":
         labeled_out = f"{overwritten_output}_pairwise.tsv"
         distmatrix_out = f"{overwritten_output}_distmat.tsv"
         newick_out = f"{overwritten_output}.newick"
-    
+
     if distance_type == "ani":
         with open(kSpider_pairwise_tsv) as PAIRWISE, open(labeled_out, 'w') as NEW, open(index_prefix + "_kSpider_pairwise.ani_col.tsv") as ANI:
             ctx.obj.INFO(f"Writing pairwise matrix to {labeled_out}")
@@ -132,7 +135,7 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
                 dist_metric = float(next(ANI).strip())
                 distances[(grp1, grp2)] = dist_metric
                 NEW.write(f"{grp1}\t{grp2}\t{dist_metric}\n")
-                
+
     else:
         with open(kSpider_pairwise_tsv) as PAIRWISE, open(labeled_out, 'w') as NEW:
             ctx.obj.INFO(f"Writing pairwise matrix to {labeled_out}")
@@ -149,7 +152,7 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
                 distances[(grp1, grp2)] = dist_metric
                 NEW.write(f"{grp1}\t{grp2}\t{dist_metric}\n")
 
-    unique_ids = sorted(set([x for y in distances.keys() for x in y]))
+    unique_ids = sorted({x for y in distances for x in y})
     df = pd.DataFrame(index=unique_ids, columns=unique_ids)
     for k, v in distances.items():
         df.loc[k[0], k[1]] = 1-v
@@ -158,13 +161,13 @@ def main(ctx, index_prefix, newick, distance_type, overwritten_output):
     df = df.fillna(0)
     LOGGER.INFO(f"Writing distance matrix to {distmatrix_out}")
     df.to_csv(distmatrix_out, sep='\t')
-        
+
     if newick:
         loaded_df = pd.read_csv(distmatrix_out, sep='\t')
         LOGGER.INFO(f"Writing newick to {newick_out}.")
         names = list(loaded_df.columns[1:])
         dist = loaded_df[loaded_df.columns[1:]].to_numpy()
-        Z = linkage(dist, 'single')
+        Z = linkage(dist, 'ward')
         tree = to_tree(Z, False)
 
         newick = get_newick(tree, tree.dist, names)

diff --git a/pykSpider/kSpider2/ks_pairwise.py b/pykSpider/kSpider2/ks_pairwise.py
@@ -24,7 +24,7 @@ def main(ctx, index_prefix, user_threads, ani, sourmash_scale):
             f"Constructing the containment pairwise matrix using {user_threads} cores.")
         if sourmash_scale:
             ctx.obj.WARNING("No need to provide -s/--scale when running this command.")
-        kSpider_internal.pairwise(index_prefix, user_threads)
+        kSpider_internal.pairwise(index_prefix, user_threads, sourmash_scale) 
         ctx.obj.SUCCESS("Done.")
     else:        
         pairwise_file = index_prefix + "_kSpider_pairwise.tsv"

diff --git a/src/pairwise.cpp b/src/pairwise.cpp
@@ -9,6 +9,7 @@
 #include "parallel_hashmap/phmap.h"
 #include "parallel_hashmap/phmap_dump.h"
 #include <cassert>
+#include <math.h>
 
 using boost::adaptors::transformed;
 using boost::algorithm::join;
@@ -45,6 +46,7 @@ class Combo {
 
     void combinations(int n) {
         this->combs.clear();
+        this->combs.reserve((n * (n - 1)) / 2);
         this->comb(n, this->r, this->arr);
     }
 
@@ -92,7 +94,7 @@ namespace kSpider {
         }
     }
 
-    void load_colors_to_sources(const std::string& filename, int_vec_map * map)
+    void load_colors_to_sources(const std::string& filename, int_vec_map* map)
     {
         phmap::BinaryInputArchive ar_in(filename.c_str());
         size_t size;
@@ -120,7 +122,7 @@ namespace kSpider {
         }
     }
 
-    void pairwise(string index_prefix, int user_threads) {
+    void pairwise(string index_prefix, int user_threads, int scale) {
 
         // Read colors
 
@@ -135,7 +137,7 @@ namespace kSpider {
         begin_time = Time::now();
         int_int_map colorsCount;
         load_colors_count(index_prefix + "_color_count.bin", colorsCount);
-        
+
 
         // TODO: should be csv, rename later.
         // std::ifstream data(index_prefix + "_kSpider_colorCount.tsv");
@@ -154,7 +156,7 @@ namespace kSpider {
 
         cout << "parsing index colors: " << std::chrono::duration<double, std::milli>(Time::now() - begin_time).count() / 1000 << " secs" << endl;
         begin_time = Time::now();
-        
+
         // for (const auto& record : color_to_ids) {
         //     uint32_t colorCount = colorsCount[record.first];
         //     for (auto group_id : record.second) {
@@ -211,7 +213,6 @@ namespace kSpider {
                 Combo combo = Combo();
                 combo.combinations(item.second.size());
                 for (uint32_t i = 0; i < combo.combs.size(); i++) {
-                    // for (auto const& seq_pair : combo.combs) {
                     auto const& seq_pair = combo.combs[i];
                     uint32_t _seq1 = item.second[seq_pair.first];
                     uint32_t _seq2 = item.second[seq_pair.second];
@@ -223,20 +224,14 @@ namespace kSpider {
                         [ccount](PAIRS_COUNTER::value_type& v) { v.second += ccount; },           // called only when key was already present
                         ccount
                     );
-
-                    // ** BUG FIX ** was creating wrong shared_kmers
-                    // auto _p = make_pair(_seq1, _seq2);
-                    // uint32_t ccount = colorsCount[item.first];
-                    // edges.lazy_emplace_l(_p,
-                    //     [ccount](PAIRS_COUNTER::value_type& v) { v.second++; },           // called only when key was already present
-                    //     [_p, ccount](const PAIRS_COUNTER::constructor& ctor) {
-                    //         ctor(_p, ccount); }
-                    // ); // construct value_type in place when key not present 
                 }
             }
         }
 
+
+
         cout << "pairwise hashmap construction: " << std::chrono::duration<double, std::milli>(Time::now() - begin_time).count() / 1000 << " secs" << endl;
+        cout << "Number of pairwise comparisons: " << edges.size() << endl;
         cout << "writing pairwise matrix to " << index_prefix << "_kSpider_pairwise.tsv" << endl;
 
         std::ofstream myfile;
@@ -248,29 +243,56 @@ namespace kSpider {
             << "\tmin_containment"
             << "\tavg_containment"
             << "\tmax_containment"
+            << "\tochiai"
+            << "\tjaccard"
+            << "\tbeta_ani"
             << '\n';
         uint64_t line_count = 0;
         for (const auto& edge : edges) {
-            uint64_t shared_kmers = edge.second;
+            float shared_kmers = static_cast<float>(edge.second);
             uint32_t source_1 = edge.first.first;
             uint32_t source_2 = edge.first.second;
             uint32_t source_1_kmers = groupID_to_kmerCount[source_1];
             uint32_t source_2_kmers = groupID_to_kmerCount[source_2];
 
-            float cont_1_in_2 = (float)shared_kmers / source_2_kmers;
-            float cont_2_in_1 = (float)shared_kmers / source_1_kmers;
+            // containments
+            float cont_1_in_2 = shared_kmers / source_2_kmers;
+            float cont_2_in_1 = shared_kmers / source_1_kmers;
             float min_containment = min(cont_1_in_2, cont_2_in_1);
-            float avg_containment = (cont_1_in_2 + cont_2_in_1) / 2.0;
+            float avg_containment = (cont_1_in_2 + cont_2_in_1) / 2.0f;
             float max_containment = max(cont_1_in_2, cont_2_in_1);
 
-            myfile
-                << source_1
-                << '\t' << source_2
-                << '\t' << shared_kmers
+            // Ochiai distance
+            float ochiai = (shared_kmers / sqrt((float)source_1_kmers * (float)source_2_kmers));
+
+            // Jaccard distance (if size of samples is roughly similar)
+            // J(A, B) = 1 - |A ∩ B| / (|A| + |B| - |A ∩ B|)
+            float jaccard = shared_kmers / (source_1_kmers + source_2_kmers - shared_kmers);
+
+            // Kulczynski distance needs abundance of each sample
+            // float kulczynski = (float)shared_kmers / (source_1_kmers + source_2_kmers) * 2;
+
+            // my ANI
+            float ani = 0.0;
+            if (scale) {
+                double avg_length = ((source_1_kmers + source_2_kmers) / 2.0) * scale;
+                ani = (2.0 * shared_kmers) / (source_1_kmers + source_2_kmers - (2.0 * shared_kmers / avg_length));
+            }
+
+            // Prepare the output string
+            std::stringstream line;
+            line << edge.first.first
+                << '\t' << edge.first.second
+                << '\t' << edge.second
                 << '\t' << min_containment
                 << '\t' << avg_containment
                 << '\t' << max_containment
-                << '\n';
+                << '\t' << ochiai
+                << '\t' << jaccard;
+            if (scale) line << '\t' << ani;
+            line << '\n';
+            myfile << line.str();
+
         }
         myfile.close();
     }