Skip to content

Commit

Permalink
Merge branch 'v2'
Browse files Browse the repository at this point in the history
  • Loading branch information
agudys committed Aug 19, 2020
2 parents 82d9530 + 05a1ffb commit a3cdd6e
Show file tree
Hide file tree
Showing 20 changed files with 83,770 additions and 41 deletions.
52 changes: 52 additions & 0 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Continuous-integration workflow.
# - linux-test: builds with make, then runs kmer-db and compares its outputs
#   byte-for-byte (cmp) against the reference files stored in INPUT_DIR.
# - macos-build: only verifies that the project builds with make on macOS.
name: C/C++ CI

on:
  push:
    branches: [ master, v2, feature/tests ]
  pull_request:
    branches: [ master, v2, feature/tests ]

jobs:

  linux-test:
    name: Linux test
    runs-on: ubuntu-latest
    env:
      # directory holding the input list and the reference outputs
      INPUT_DIR: ./test/virus

    steps:
    - uses: actions/checkout@v2
    - name: make
      run: make

    - name: build + all2all (default k)
      run: |
        ./kmer-db build -multisample-fasta ${INPUT_DIR}/sequences.list k18.db
        ./kmer-db all2all k18.db k18.csv
        cmp k18.csv ${INPUT_DIR}/k18.csv
    - name: build + all2all (k=24)
      run: |
        ./kmer-db build -k 24 -multisample-fasta ${INPUT_DIR}/sequences.list k24.db
        ./kmer-db all2all k24.db k24.csv
        cmp k24.csv ${INPUT_DIR}/k24.csv
    # relies on k18.csv produced by the "build + all2all (default k)" step
    - name: distance (default k)
      run: |
        ./kmer-db distance jaccard min max cosine mash k18.csv
        cmp k18.csv.jaccard ${INPUT_DIR}/k18.csv.jaccard
        cmp k18.csv.min ${INPUT_DIR}/k18.csv.min
        cmp k18.csv.max ${INPUT_DIR}/k18.csv.max
        cmp k18.csv.cosine ${INPUT_DIR}/k18.csv.cosine
        cmp k18.csv.mash ${INPUT_DIR}/k18.csv.mash

  macos-build:
    name: macOS test
    runs-on: macOS-latest

    steps:
    - uses: actions/checkout@v2
    - name: make
      run: make

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@
*.sqlite
*.json
*.suo
/src/kmer_db.vcxproj.user
18 changes: 14 additions & 4 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,20 @@ else
EXTRA_LIBS_DIR = ""
endif

OS_NAME := $(shell uname -s)
ifeq ($(OS_NAME), Darwin)
OMP_FLAGS = -Xpreprocessor -fopenmp
ABI_FLAGS =
else
OMP_FLAGS = -fopenmp
ABI_FLAGS = -fabi-version=6
endif


CC = g++
CFLAGS = -Wall -O3 -m64 -std=c++11 -fopenmp -pthread -mavx -I $(KMER_DB_LIBS_DIR) -I $(EXTRA_LIBS_DIR)
CFLAGS_AVX2 = -Wall -O3 -m64 -std=c++11 -fopenmp -pthread -mavx2 -I $(KMER_DB_LIBS_DIR) -I $(EXTRA_LIBS_DIR)
CLINK = -lm -O3 -std=c++11 -lpthread -fopenmp -mavx -fabi-version=6
CFLAGS = -Wall -O3 -m64 -std=c++11 $(OMP_FLAGS) -pthread -mavx -I $(KMER_DB_LIBS_DIR) -I $(EXTRA_LIBS_DIR)
CFLAGS_AVX2 = -Wall -O3 -m64 -std=c++11 $(OMP_FLAGS) -pthread -mavx2 -I $(KMER_DB_LIBS_DIR) -I $(EXTRA_LIBS_DIR)
CLINK = -lm -O3 -std=c++11 -lpthread $(OMP_FLAGS) -mavx $(ABI_FLAGS)

OBJS := $(KMER_DB_MAIN_DIR)/analyzer.o \
$(KMER_DB_MAIN_DIR)/console.o \
Expand All @@ -36,7 +46,7 @@ OBJS := $(KMER_DB_MAIN_DIR)/analyzer.o \
$(KMER_DB_LIBS_DIR)/mmer.o

$(KMER_DB_MAIN_DIR)/parallel_sorter.o: $(KMER_DB_MAIN_DIR)/parallel_sorter.cpp
$(CC) -O3 -mavx -m64 -std=c++11 -pthread -fopenmp -c $< -o $@
$(CC) -O3 -mavx -m64 -std=c++11 -pthread $(OMP_FLAGS) -c $< -o $@

ifeq ($(NO_AVX2),true)
## no avx2 support
Expand Down
14 changes: 9 additions & 5 deletions src/console.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ int Console::runAllVsAll(const std::string& dbFilename, const std::string& simil
cout << "Calculating matrix of common k-mers...";
start = std::chrono::high_resolution_clock::now();
LowerTriangularMatrix<uint32_t> matrix;
calculator(*db, matrix);
calculator.all2all(*db, matrix);
dt = std::chrono::high_resolution_clock::now() - start;
cout << "OK (" << dt.count() << " seconds)" << endl;

Expand Down Expand Up @@ -446,7 +446,7 @@ int Console::runOneVsAll(const std::string& dbFilename, const std::string& singl
cout << "Calculating similarity vector...";
start = std::chrono::high_resolution_clock::now();
std::vector<uint32_t> sims;
calculator(db, queryKmers, queryKmersCount, sims);
calculator.one2all(db, queryKmers, queryKmersCount, sims);
dt = std::chrono::high_resolution_clock::now() - start;
cout << "OK (" << dt.count() << " seconds)" << endl;

Expand Down Expand Up @@ -528,7 +528,7 @@ int Console::runNewVsAll(const std::string& dbFilename, const std::string& multi
auto start = std::chrono::high_resolution_clock::now();

sims.clear();
calculator(db, task->kmers, task->kmersCount, sims);
calculator.one2all<true>(db, task->kmers, task->kmersCount, sims);
ofs << endl << task->sampleName << "," << task->kmersCount << ",";
std::copy(sims.begin(), sims.end(), ostream_iterator<uint32_t>(ofs, ","));

Expand Down Expand Up @@ -614,12 +614,16 @@ int Console::runDistanceCalculation(const std::string& similarityFilename, const
if ((i + 1) % 10 == 0) {
cout << "\r" << i + 1 << "/" << kmersCount.size() << "...";
}
// find first comma
auto pos = in.find(',');
string queryName(in.begin(), in.begin() + pos);

std::replace(in.begin(), in.end(), ',', ' ');
istringstream iss(in);
iss.seekg(pos + 1); // move right after first comma separator
uint64_t queryKmersCount = 0;
string queryName;
iss >> queryName >> queryKmersCount;

iss >> queryKmersCount;

auto newEnd = std::copy(std::istream_iterator<size_t>(iss), std::istream_iterator<size_t>(), intersections.begin());
size_t numVals = newEnd - intersections.begin();
Expand Down
2 changes: 1 addition & 1 deletion src/kmc_api/kmer_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@


#include <stdio.h>
#include <ext/algorithm>
#include <algorithm>
#include <iostream>
using namespace std;

Expand Down
19 changes: 0 additions & 19 deletions src/kmer_db.vcxproj.user

This file was deleted.

8 changes: 8 additions & 0 deletions src/parallel_sorter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka

#ifdef WIN32
#include <ppl.h>
#elif defined __APPLE__
#include <algorithm>
#else
#include <parallel/algorithm>
#endif
Expand All @@ -24,6 +26,8 @@ void ParallelSort(kmer_t *arr, size_t arr_size)
#ifdef WIN32
concurrency::parallel_sort(arr, arr + arr_size);
//std::stable_sort(samplePatterns.begin(), samplePatterns.end(), pid_comparer);
#elif defined __APPLE__
std:: stable_sort(arr, arr + arr_size);
#else
__gnu_parallel::sort(arr, arr + arr_size);
#endif
Expand All @@ -39,6 +43,8 @@ void ParallelSort(pair<pattern_id_t, pattern_id_t*> *arr, size_t arr_size, pair<
#ifdef WIN32
concurrency::parallel_sort(arr, arr + arr_size, pid_comparer);
//std::stable_sort(samplePatterns.begin(), samplePatterns.end(), pid_comparer);
#elif defined __APPLE__
std:: stable_sort(arr, arr + arr_size, pid_comparer);
#else
__gnu_parallel::sort(arr, arr + arr_size, pid_comparer);
#endif
Expand All @@ -55,6 +61,8 @@ void ParallelSort(pair<kmer_or_pattern_t, pattern_id_t*> *arr, size_t arr_size,
#ifdef WIN32
concurrency::parallel_sort(arr, arr + arr_size, pid_comparer);
//std::stable_sort(samplePatterns.begin(), samplePatterns.end(), pid_comparer);
#elif defined __APPLE__
std:: stable_sort(arr, arr + arr_size, pid_comparer);
#else
__gnu_parallel::sort(arr, arr + arr_size, pid_comparer);
#endif
Expand Down
7 changes: 1 addition & 6 deletions src/prefix_kmer_db.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,7 @@

#include "log.h"

#ifdef WIN32
#include <ppl.h>
#else
#include <parallel/algorithm>
#endif
#include <omp.h>
#include <algorithm>

#include <numeric>
#include <cassert>
Expand Down
120 changes: 118 additions & 2 deletions src/similarity_calculator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ SimilarityCalculator::SimilarityCalculator(int _num_threads, size_t cacheBufferM

// *****************************************************************************************
//
void SimilarityCalculator::operator()(PrefixKmerDb& db, LowerTriangularMatrix<uint32_t>& matrix) const
void SimilarityCalculator::all2all(PrefixKmerDb& db, LowerTriangularMatrix<uint32_t>& matrix) const
{
// get stuff from database
auto& patterns = db.getPatterns();
Expand Down Expand Up @@ -394,7 +394,8 @@ void SimilarityCalculator::operator()(PrefixKmerDb& db, LowerTriangularMatrix<ui

// *****************************************************************************************
//
void SimilarityCalculator::operator()(const PrefixKmerDb& db, const kmer_t* kmers, size_t kmersCount, std::vector<uint32_t>& similarities) const {
template <>
void SimilarityCalculator::one2all<true>(const PrefixKmerDb& db, const kmer_t* kmers, size_t kmersCount, std::vector<uint32_t>& similarities) const {

// get stuff from database
const auto& patterns = db.getPatterns();
Expand Down Expand Up @@ -528,6 +529,121 @@ void SimilarityCalculator::operator()(const PrefixKmerDb& db, const kmer_t* kme
});
}

dt = std::chrono::high_resolution_clock::now() - start;
LOG_VERBOSE << "Pattern unpacking time: " << dt.count() << endl;
}

// *****************************************************************************************
//
/// Specialization of one2all for `false`: for a single query sample, adds to
/// `similarities[s]` the number of query k-mers that are found in the database
/// and whose pattern contains sample `s`. This variant runs in the calling
/// thread only.
///
/// The work is done in two phases:
///  1. every query k-mer is looked up in the hashtable selected by its prefix
///     and the hits are counted per pattern id,
///  2. every hit pattern is decoded into a list of sample ids and its hit
///     count is added to the entries of those samples.
///
/// @param db            database supplying patterns, hashtables and the number of samples
/// @param kmers         array of query k-mers
/// @param kmersCount    number of elements in `kmers`
/// @param similarities  output vector; resized to the number of samples and
///                      incremented in place. NOTE: resize() value-initializes
///                      only newly added elements, so a non-empty vector keeps
///                      its previous contents and the new counts are added on
///                      top of them - pass an empty (or zeroed) vector.
template <>
void SimilarityCalculator::one2all<false>(const PrefixKmerDb& db, const kmer_t* kmers, size_t kmersCount, std::vector<uint32_t>& similarities) const
{
    // get stuff from database
    const auto& patterns = db.getPatterns();
    const int samples_count = db.getSamplesCount();
    const auto& hashtables = db.getHashtables();

    similarities.resize(samples_count, 0);

    // pattern id -> number of query k-mers that map to this pattern
    std::unordered_map<pattern_id_t, int32_t> patterns2count;

    std::chrono::duration<double> dt;
    auto start = std::chrono::high_resolution_clock::now();

    // phase 1: iterate over k-mers in the analyzed sample
    for (size_t i = 0; i < kmersCount; ++i) {

        // request the hashtable slot of a k-mer PREFETCH_DIST positions ahead
        if (i + PREFETCH_DIST < kmersCount) {
            kmer_t prefetch_kmer = kmers[i + PREFETCH_DIST];

            kmer_t prefetch_prefix = GET_PREFIX_SHIFTED(prefetch_kmer);
            suffix_t prefetch_suffix = GET_SUFFIX(prefetch_kmer);

            hashtables[prefetch_prefix].prefetch(prefetch_suffix);
        }

        // check if k-mer exists in the database
        kmer_t kmer = kmers[i];
        kmer_t prefix = GET_PREFIX_SHIFTED(kmer);
        suffix_t suffix = GET_SUFFIX(kmer);

        auto entry = hashtables[prefix].find(suffix);
        if (entry == nullptr) {
            continue;
        }

        auto pid = *entry;

        // patterns reporting zero k-mers are not counted
        if (patterns[pid].get_num_kmers() == 0) {
            continue;
        }

        ++patterns2count[pid];
    }

    // flatten the map into a vector (same iteration order) for indexed access
    std::vector<std::pair<pattern_id_t, int32_t>> patterns2countVector(patterns2count.begin(), patterns2count.end());
    patterns2count.clear();

    dt = std::chrono::high_resolution_clock::now() - start;
    LOG_VERBOSE << "Pattern listing time: " << dt.count() << endl;

    start = std::chrono::high_resolution_clock::now();

    // phase 2: scratch buffer receiving the decoded sample ids of one pattern
    std::vector<uint32_t> samples(samples_count);

    const size_t num_patterns = patterns2countVector.size();

    for (size_t id = 0; id < num_patterns; ++id) {
        // prefetch the descriptor of the pattern processed in the next iteration
        if (id + 1 < num_patterns)
            _mm_prefetch(reinterpret_cast<const char*>(patterns.data() + patterns2countVector[id + 1].first), _MM_HINT_T0);

        auto pid = patterns2countVector[id].first;
        const auto& pattern = patterns[pid];
        const int num_samples = pattern.get_num_samples();
        const int to_add = patterns2countVector[id].second;

        // Decode back to front: walk the parent chain starting at this pattern;
        // each visited pattern writes its local samples directly in front of
        // the part that has already been written.
        uint32_t* out = samples.data() + num_samples; // start from the end

        int64_t current_id = pid;
        while (current_id >= 0) {
            const auto& cur = patterns[current_id];

            out -= cur.get_num_local_samples();
            cur.decodeSamples(out);

            current_id = cur.get_parent_id();
        }

        // add the hit count to every decoded sample; manually unrolled by 4
        const uint32_t* p = samples.data();

        int done = 0;
        for (; done + 4 <= num_samples; done += 4)
        {
            similarities[*p++] += to_add;
            similarities[*p++] += to_add;
            similarities[*p++] += to_add;
            similarities[*p++] += to_add;
        }

        // remaining 0-3 samples; the cases intentionally cascade
        switch (num_samples - done)
        {
        case 3: similarities[*p++] += to_add; // fall through
        case 2: similarities[*p++] += to_add; // fall through
        case 1: similarities[*p++] += to_add; break;
        default: break;
        }
    }

    dt = std::chrono::high_resolution_clock::now() - start;
    LOG_VERBOSE << "Pattern unpacking time: " << dt.count() << endl;
}
5 changes: 3 additions & 2 deletions src/similarity_calculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ class SimilarityCalculator {
public:
SimilarityCalculator(int _num_threads, size_t cacheBufferMb);

virtual void operator()(PrefixKmerDb& db, LowerTriangularMatrix<uint32_t>& matrix) const;
void all2all(PrefixKmerDb& db, LowerTriangularMatrix<uint32_t>& matrix) const;

virtual void operator()(const PrefixKmerDb& db, const kmer_t* kmers, size_t kmersCount, std::vector<uint32_t>& vector) const;
template <bool parallel = true>
void one2all(const PrefixKmerDb& db, const kmer_t* kmers, size_t kmersCount, std::vector<uint32_t>& vector) const;


protected:
Expand Down
7 changes: 5 additions & 2 deletions src/version.h
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#pragma once

#define VERSION "1.7.5"
#define DATE "13.02.2020"
#define VERSION "1.7.6"
#define DATE "30.03.2020"


/*
Version history
1.7.6 (31.03.2020)
- Fixed bug in distance mode when sequence id contained spaces.
1.7.5 (13.02.2020)
- Some compilation warnings removed.
- Fixed crash on samples with small k-mers count or very small filter values.
Expand Down
167 changes: 167 additions & 0 deletions test/virus/k18.csv

Large diffs are not rendered by default.

166 changes: 166 additions & 0 deletions test/virus/k18.csv.cosine

Large diffs are not rendered by default.

166 changes: 166 additions & 0 deletions test/virus/k18.csv.jaccard

Large diffs are not rendered by default.

166 changes: 166 additions & 0 deletions test/virus/k18.csv.mash

Large diffs are not rendered by default.

166 changes: 166 additions & 0 deletions test/virus/k18.csv.max

Large diffs are not rendered by default.

166 changes: 166 additions & 0 deletions test/virus/k18.csv.min

Large diffs are not rendered by default.

167 changes: 167 additions & 0 deletions test/virus/k24.csv

Large diffs are not rendered by default.

Loading

0 comments on commit a3cdd6e

Please sign in to comment.