From 15f715aea6890e84a68a159ccdfc9e5cfad91188 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torbj=C3=B8rn=20Rognes?= Date: Tue, 21 Aug 2018 13:54:48 +0200 Subject: [PATCH] VSEARCH 2.8.2: Reduced memory for dereplication and more --- README.md | 28 +-- configure.ac | 2 +- man/vsearch.1 | 10 +- src/abundance.cc | 4 +- src/derep.cc | 463 ++++++++++++++++++++++++++++++++++------------ src/mergepairs.cc | 2 +- src/util.cc | 4 +- src/vsearch.h | 3 + 8 files changed, 373 insertions(+), 143 deletions(-) diff --git a/README.md b/README.md index ce018669..4b83263e 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Most of the nucleotide based commands and options in USEARCH version 7 are suppo ## Getting Help -If you can't find an answer in the [VSEARCH documentation](https://github.com/torognes/vsearch/releases/download/v2.8.1/vsearch_manual.pdf), please visit the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) to post a question or start a discussion. +If you can't find an answer in the [VSEARCH documentation](https://github.com/torognes/vsearch/releases/download/v2.8.2/vsearch_manual.pdf), please visit the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) to post a question or start a discussion. ## Example @@ -37,9 +37,9 @@ In the example below, VSEARCH will identify sequences in the file database.fsa t **Source distribution** To download the source distribution from a [release](https://github.com/torognes/vsearch/releases) and build the executable and the documentation, use the following commands: ``` -wget https://github.com/torognes/vsearch/archive/v2.8.1.tar.gz -tar xzf v2.8.1.tar.gz -cd vsearch-2.8.1 +wget https://github.com/torognes/vsearch/archive/v2.8.2.tar.gz +tar xzf v2.8.2.tar.gz +cd vsearch-2.8.2 ./autogen.sh ./configure make @@ -70,36 +70,36 @@ Binary distributions are provided for x86-64 systems running GNU/Linux, macOS (v Download the appropriate executable for your system using the following commands if you are using a Linux x86_64 system: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.8.1/vsearch-2.8.1-linux-x86_64.tar.gz -tar xzf vsearch-2.8.1-linux-x86_64.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.8.2/vsearch-2.8.2-linux-x86_64.tar.gz +tar xzf vsearch-2.8.2-linux-x86_64.tar.gz ``` Or these commands if you are using a Linux ppc64le system: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.8.1/vsearch-2.8.1-linux-ppc64le.tar.gz -tar xzf vsearch-2.8.1-linux-ppc64le.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.8.2/vsearch-2.8.2-linux-ppc64le.tar.gz +tar xzf vsearch-2.8.2-linux-ppc64le.tar.gz ``` Or these commands if you are using a Mac: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.8.1/vsearch-2.8.1-macos-x86_64.tar.gz -tar xzf vsearch-2.8.1-macos-x86_64.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.8.2/vsearch-2.8.2-macos-x86_64.tar.gz +tar xzf vsearch-2.8.2-macos-x86_64.tar.gz ``` Or if you are using Windows, download and extract (unzip) the contents of this file: ``` -https://github.com/torognes/vsearch/releases/download/v2.8.1/vsearch-2.8.1-win-x86_64.zip +https://github.com/torognes/vsearch/releases/download/v2.8.2/vsearch-2.8.2-win-x86_64.zip ``` -Linux and Mac: You will now have the binary distribution in a folder called `vsearch-2.8.1-linux-x86_64` or `vsearch-2.8.1-macos-x86_64` in which you will find three subfolders `bin`, `man` and `doc`. We recommend making a copy or a symbolic link to the vsearch binary `bin/vsearch` in a folder included in your `$PATH`, and a copy or a symbolic link to the vsearch man page `man/vsearch.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/vsearch_manual.pdf`. +Linux and Mac: You will now have the binary distribution in a folder called `vsearch-2.8.2-linux-x86_64` or `vsearch-2.8.2-macos-x86_64` in which you will find three subfolders `bin`, `man` and `doc`. We recommend making a copy or a symbolic link to the vsearch binary `bin/vsearch` in a folder included in your `$PATH`, and a copy or a symbolic link to the vsearch man page `man/vsearch.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/vsearch_manual.pdf`. -Windows: You will now have the binary distribution in a folder called `vsearch-2.8.1-win-x86_64`. The vsearch executable is called `vsearch.exe`. The manual in PDF format is called `vsearch_manual.pdf`. +Windows: You will now have the binary distribution in a folder called `vsearch-2.8.2-win-x86_64`. The vsearch executable is called `vsearch.exe`. The manual in PDF format is called `vsearch_manual.pdf`. -**Documentation** The VSEARCH user's manual is available in the `man` folder in the form of a [man page](https://github.com/torognes/vsearch/blob/master/man/vsearch.1). A pdf version ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.8.1/vsearch_manual.pdf)) will be generated by `make`. To install the manpage manually, copy the `vsearch.1` file or a create a symbolic link to `vsearch.1` in a folder included in your `$MANPATH`. The manual in both formats is also available with the binary distribution. The manual in PDF form ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.8.1/vsearch_manual.pdf)) is also attached to the latest [release](https://github.com/torognes/vsearch/releases). +**Documentation** The VSEARCH user's manual is available in the `man` folder in the form of a [man page](https://github.com/torognes/vsearch/blob/master/man/vsearch.1). A pdf version ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.8.2/vsearch_manual.pdf)) will be generated by `make`. To install the manpage manually, copy the `vsearch.1` file or a create a symbolic link to `vsearch.1` in a folder included in your `$MANPATH`. The manual in both formats is also available with the binary distribution. The manual in PDF form ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.8.2/vsearch_manual.pdf)) is also attached to the latest [release](https://github.com/torognes/vsearch/releases). ## Plugins, packages, and wrappers diff --git a/configure.ac b/configure.ac index ae55a4c2..ec0eda51 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.63]) -AC_INIT([vsearch], [2.8.1], [torognes@ifi.uio.no]) +AC_INIT([vsearch], [2.8.2], [torognes@ifi.uio.no]) AC_CANONICAL_TARGET AM_INIT_AUTOMAKE([subdir-objects]) AC_LANG([C++]) diff --git a/man/vsearch.1 b/man/vsearch.1 index 8b34c139..4d741ded 100644 --- a/man/vsearch.1 +++ b/man/vsearch.1 @@ -1,5 +1,5 @@ .\" ============================================================================ -.TH vsearch 1 "June 22, 2018" "version 2.8.1" "USER COMMANDS" +.TH vsearch 1 "August 21, 2018" "version 2.8.2" "USER COMMANDS" .\" ============================================================================ .SH NAME vsearch \(em chimera detection, clustering, dereplication and @@ -3398,6 +3398,14 @@ Added the fastq_maxdiffpct option to the fastq_mergepairs command. .TP .BR v2.8.1\~ "released June 22nd, 2018" Fixes for compilation warnings with GCC 8. +.TP +.BR v2.8.2\~ "released August 21st, 2018" +Fix for wrong placement of semicolons in header lines in some cases +when using the sizeout or xsize options. Reduced memory requirements +for full-length dereplication in cases with many duplicate sequences. +Improved wording of fastq_mergepairs report. Updated manual regarding +use of sizein and sizeout with dereplication. Changed a compiler +option. .RE .LP .\" ============================================================================ diff --git a/src/abundance.cc b/src/abundance.cc index a93abc6a..18f53b73 100644 --- a/src/abundance.cc +++ b/src/abundance.cc @@ -149,12 +149,12 @@ void abundance_fprint_header_strip_size(FILE * fp, { if (start <= 1) { - if (end < header_length) + if (end < header_length - 1) fprintf(fp, "%s", header + end + 1); } else { - if (end == header_length) + if (end >= header_length - 1) fprintf(fp, "%.*s", start - 1, header); else fprintf(fp, "%.*s;%.*s", diff --git a/src/derep.cc b/src/derep.cc index b0a12242..88acd213 100644 --- a/src/derep.cc +++ b/src/derep.cc @@ -60,8 +60,6 @@ #include "vsearch.h" -//#define BITMAP - #define HASH hash_cityhash64 struct bucket @@ -71,23 +69,47 @@ struct bucket unsigned int seqno_last; unsigned int size; bool deleted; + char * header; + char * seq; }; -#ifdef BITMAP -unsigned char * hash_occupied = 0; - -void hash_set_occupied(uint64_t hashindex) +int derep_compare_prefix(const void * a, const void * b) { - hash_occupied[(hashindex) >> 3] |= 1 << (hashindex & 7); -} + struct bucket * x = (struct bucket *) a; + struct bucket * y = (struct bucket *) b; -int hash_is_occupied(uint64_t hashindex) -{ - return hash_occupied[(hashindex) >> 3] & (1 << (hashindex & 7)); + /* highest abundance first, then by label, otherwise keep order */ + + if (x->deleted > y->deleted) + return +1; + else if (x->deleted < y->deleted) + return -1; + else + { + if (x->size < y->size) + return +1; + else if (x->size > y->size) + return -1; + else + { + int r = strcmp(db_getheader(x->seqno_first), + db_getheader(y->seqno_first)); + if (r != 0) + return r; + else + { + if (x->seqno_first < y->seqno_first) + return -1; + else if (x->seqno_first > y->seqno_first) + return +1; + else + return 0; + } + } + } } -#endif -int derep_compare(const void * a, const void * b) +int derep_compare_full(const void * a, const void * b) { struct bucket * x = (struct bucket *) a; struct bucket * y = (struct bucket *) b; @@ -106,8 +128,9 @@ int derep_compare(const void * a, const void * b) return -1; else { - int r = strcmp(db_getheader(x->seqno_first), - db_getheader(y->seqno_first)); + if (x->size == 0) + return 0; + int r = strcmp(x->header, y->header); if (r != 0) return r; else @@ -142,8 +165,50 @@ int seqcmp(char * a, char * b, int n) return chrmap_4bit[(int)(*p)] - chrmap_4bit[(int)(*q)]; } +void rehash(struct bucket * * hashtableref, int64_t alloc_clusters) +{ + /* + double the size of the hash table: + + - allocate the new hash table + - rehash all entries from the old to the new table + - free the old table + - update variables + */ + + struct bucket * old_hashtable = * hashtableref; + uint64_t old_hashtablesize = 2 * alloc_clusters; + uint64_t new_hashtablesize = 2 * old_hashtablesize; + uint64_t new_hash_mask = new_hashtablesize - 1; + + struct bucket * new_hashtable = + (struct bucket *) xmalloc(sizeof(bucket) * new_hashtablesize); + memset(new_hashtable, 0, sizeof(bucket) * new_hashtablesize); + + /* rehash all */ + for(uint64_t i = 0; i < old_hashtablesize; i++) + { + struct bucket * old_bp = old_hashtable + i; + if (old_bp->size) + { + uint64_t k = old_bp->hash & new_hash_mask; + while (new_hashtable[k].size) + k = (k + 1) & new_hash_mask; + struct bucket * new_bp = new_hashtable + k; + + * new_bp = * old_bp; + } + } + + xfree(old_hashtable); + * hashtableref = new_hashtable; +} + + void derep_fulllength() { + show_rusage(); + FILE * fp_output = 0; FILE * fp_uc = 0; @@ -161,57 +226,131 @@ void derep_fulllength() fatal("Unable to open output (uc) file for writing"); } - db_read(opt_derep_fulllength, 0); + fastx_handle h = fastx_open(opt_derep_fulllength); show_rusage(); - int64_t dbsequencecount = db_getsequencecount(); + if (!h) + fatal("Unrecognized input file type (not proper FASTA or FASTQ format)"); - /* adjust size of hash table for 2/3 fill rate */ + uint64_t filesize = fastx_get_size(h); - int64_t hashtablesize = 1; - int hash_shift = 0; - while (3 * dbsequencecount > 2 * hashtablesize) - { - hashtablesize <<= 1; - hash_shift++; - } - int hash_mask = hashtablesize - 1; + /* allocate initial mem for 1M sequences of length 1023 */ + + uint64_t alloc_clusters = 1024 * 1024; + uint64_t alloc_seqs = 1024 * 1024; + int64_t alloc_seqlen = 1023; + + uint64_t hashtablesize = 2 * alloc_clusters; + uint64_t hash_mask = hashtablesize - 1; struct bucket * hashtable = (struct bucket *) xmalloc(sizeof(bucket) * hashtablesize); - memset(hashtable, 0, sizeof(bucket) * hashtablesize); -#ifdef BITMAP - hash_occupied = - (unsigned char *) xmalloc(hashtablesize / 8); - memset(hash_occupied, 0, hashtablesize / 8); -#endif + show_rusage(); - int64_t clusters = 0; + /* alloc and init table of links to other sequences in cluster */ + + unsigned int * nextseqtab = + (unsigned int*) xmalloc(sizeof(unsigned int) * alloc_seqs); + const unsigned int terminal = (unsigned int)(-1); + memset(nextseqtab, terminal, sizeof(unsigned int) * alloc_seqs); + + show_rusage(); + + char * match_strand = (char *) xmalloc(alloc_seqs); + memset(match_strand, 0, alloc_seqs); + + show_rusage(); + + char * seq_up = (char*) xmalloc(alloc_seqlen + 1); + char * rc_seq_up = (char*) xmalloc(alloc_seqlen + 1); + + char * prompt = 0; + if (xsprintf(& prompt, "Dereplicating file %s", opt_derep_fulllength) == -1) + fatal("Out of memory"); + + progress_init(prompt, filesize); + + uint64_t sequencecount = 0; + uint64_t nucleotidecount = 0; + int64_t shortest = INT64_MAX; + int64_t longest = 0; + uint64_t discarded_short = 0; + uint64_t discarded_long = 0; + uint64_t clusters = 0; int64_t sumsize = 0; uint64_t maxsize = 0; double median = 0.0; double average = 0.0; - /* alloc and init table of links to other sequences in cluster */ + while(fastx_next(h, 0, chrmap_no_change)) + { + int64_t seqlen = fastx_get_sequence_length(h); - unsigned int * nextseqtab = (unsigned int*) xmalloc(sizeof(unsigned int) * dbsequencecount); - const unsigned int terminal = (unsigned int)(-1); - memset(nextseqtab, -1, sizeof(unsigned int) * dbsequencecount); + if (seqlen < opt_minseqlength) + { + discarded_short++; + continue; + } - char * match_strand = (char *) xmalloc(dbsequencecount); - memset(match_strand, 0, dbsequencecount); + if (seqlen > opt_maxseqlength) + { + discarded_long++; + continue; + } - char * seq_up = (char*) xmalloc(db_getlongestsequence() + 1); - char * rc_seq_up = (char*) xmalloc(db_getlongestsequence() + 1); + nucleotidecount += seqlen; + if (seqlen > longest) + longest = seqlen; + if (seqlen < shortest) + shortest = seqlen; - progress_init("Dereplicating", dbsequencecount); - for(int64_t i=0; i alloc_seqlen) + { + alloc_seqlen = seqlen; + seq_up = (char*) xrealloc(seq_up, alloc_seqlen + 1); + rc_seq_up = (char*) xrealloc(rc_seq_up, alloc_seqlen + 1); + + show_rusage(); + } + + if (sequencecount + 1 > alloc_seqs) + { + uint64_t new_alloc_seqs = 2 * alloc_seqs; + + nextseqtab = + (unsigned int*) xrealloc(nextseqtab, + sizeof(unsigned int) * new_alloc_seqs); + memset(nextseqtab + alloc_seqs, + terminal, + sizeof(unsigned int) * alloc_seqs); + + match_strand = (char *) xrealloc(match_strand, new_alloc_seqs); + memset(match_strand + alloc_seqs, 0, alloc_seqs); + + alloc_seqs = new_alloc_seqs; + + show_rusage(); + } + + if (clusters + 1 > alloc_clusters) + { + uint64_t new_alloc_clusters = 2 * alloc_clusters; + + rehash(& hashtable, alloc_clusters); + + alloc_clusters = new_alloc_clusters; + hashtablesize = 2 * alloc_clusters; + hash_mask = hashtablesize - 1; + + show_rusage(); + } + + char * seq = fastx_get_sequence(h); /* normalize sequence: uppercase and replace U by T */ string_normalize(seq_up, seq, seqlen); @@ -232,24 +371,13 @@ void derep_fulllength() uint64_t j = hash & hash_mask; struct bucket * bp = hashtable + j; - while ( -#ifdef BITMAP - (hash_is_occupied(j)) -#else - (bp->size) -#endif + while ((bp->size) && - ((bp->hash != hash) || - (seqlen != db_getsequencelen(bp->seqno_first)) || - (seqcmp(seq_up, db_getsequence(bp->seqno_first), seqlen)))) + ((hash != bp->hash) || + (seqcmp(seq_up, bp->seq, seqlen)))) { - bp++; - j++; - if (bp >= hashtable + hashtablesize) - { - bp = hashtable; - j = 0; - } + j = (j+1) & hash_mask; + bp = hashtable + j; } if ((opt_strand > 1) && !bp->size) @@ -258,40 +386,28 @@ void derep_fulllength() /* check minus strand as well */ uint64_t rc_hash = HASH(rc_seq_up, seqlen); - struct bucket * rc_bp = hashtable + rc_hash % hashtablesize; uint64_t k = rc_hash & hash_mask; + struct bucket * rc_bp = hashtable + k; - while ( -#ifdef BITMAP - (hash_is_occupied(j)) -#else - (rc_bp->size) -#endif + while ((rc_bp->size) && - ((rc_bp->hash != rc_hash) || - (seqlen != db_getsequencelen(rc_bp->seqno_first)) || - (seqcmp(rc_seq_up, - db_getsequence(rc_bp->seqno_first), - seqlen)))) + ((rc_hash != rc_bp->hash) || + (seqcmp(rc_seq_up, rc_bp->seq, seqlen)))) { - rc_bp++; - k++; - if (rc_bp >= hashtable + hashtablesize) - { - rc_bp = hashtable; - k++; - } + k = (k+1) & hash_mask; + rc_bp = hashtable + k; } if (rc_bp->size) { bp = rc_bp; j = k; - match_strand[i] = 1; + match_strand[sequencecount] = 1; } } - int64_t ab = opt_sizein ? db_getabundance(i) : 1; + int abundance = fastx_get_abundance(h); + int64_t ab = opt_sizein ? abundance : 1; sumsize += ab; if (bp->size) @@ -299,40 +415,114 @@ void derep_fulllength() /* at least one identical sequence already */ bp->size += ab; unsigned int last = bp->seqno_last; - nextseqtab[last] = i; - bp->seqno_last = i; + nextseqtab[last] = sequencecount; + bp->seqno_last = sequencecount; } else { /* no identical sequences yet */ + char * header = fastx_get_header(h); + bp->size = ab; bp->hash = hash; - bp->seqno_first = i; - bp->seqno_last = i; + bp->seqno_first = sequencecount; + bp->seqno_last = sequencecount; + bp->seq = xstrdup(seq); + bp->header = xstrdup(header); clusters++; } if (bp->size > maxsize) maxsize = bp->size; -#ifdef BITMAP - hash_set_occupied(j); -#endif + sequencecount++; - progress_update(i); + progress_update(fastx_get_position(h)); } progress_done(); + xfree(prompt); + fastx_close(h); + + show_rusage(); + + if (!opt_quiet) + { + if (sequencecount > 0) + fprintf(stderr, + "%'" PRIu64 " nt in %'" PRIu64 " seqs, min %'" PRIu64 + ", max %'" PRIu64 ", avg %'.0f\n", + nucleotidecount, + sequencecount, + shortest, + longest, + nucleotidecount * 1.0 / sequencecount); + else + fprintf(stderr, + "%'" PRIu64 " nt in %'" PRIu64 " seqs\n", + nucleotidecount, + sequencecount); + } + + if (opt_log) + { + if (sequencecount > 0) + fprintf(fp_log, + "%'" PRIu64 " nt in %'" PRIu64 " seqs, min %'" PRIu64 + ", max %'" PRIu64 ", avg %'.0f\n", + nucleotidecount, + sequencecount, + shortest, + longest, + nucleotidecount * 1.0 / sequencecount); + else + fprintf(fp_log, + "%'" PRIu64 " nt in %'" PRIu64 " seqs\n", + nucleotidecount, + sequencecount); + } + + if (discarded_short) + { + fprintf(stderr, + "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", + opt_minseqlength, + discarded_short, + (discarded_short == 1 ? "sequence" : "sequences")); + + if (opt_log) + fprintf(fp_log, + "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", + opt_minseqlength, + discarded_short, + (discarded_short == 1 ? "sequence" : "sequences")); + } + + if (discarded_long) + { + fprintf(stderr, + "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", + opt_maxseqlength, + discarded_long, + (discarded_long == 1 ? "sequence" : "sequences")); + + if (opt_log) + fprintf(fp_log, + "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", + opt_maxseqlength, + discarded_long, + (discarded_long == 1 ? "sequence" : "sequences")); + } xfree(seq_up); xfree(rc_seq_up); show_rusage(); - progress_init("Sorting", 1); - qsort(hashtable, hashtablesize, sizeof(bucket), derep_compare); + qsort(hashtable, hashtablesize, sizeof(struct bucket), derep_compare_full); progress_done(); + show_rusage(); if (clusters > 0) { @@ -358,32 +548,34 @@ void derep_fulllength() { if (!opt_quiet) fprintf(stderr, - "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n", + "%" PRId64 + " unique sequences, avg cluster %.1lf, median %.0f, max %" + PRIu64 "\n", clusters, average, median, maxsize); if (opt_log) fprintf(fp_log, - "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n\n", + "%" PRId64 + " unique sequences, avg cluster %.1lf, median %.0f, max %" + PRIu64 "\n\n", clusters, average, median, maxsize); } - show_rusage(); - - /* count selected */ - int64_t selected = 0; - for (int64_t i=0; isize; if ((size >= opt_minuniquesize) && (size <= opt_maxuniquesize)) { selected++; - if (selected == opt_topn) + if (selected == (uint64_t) opt_topn) break; } } + show_rusage(); /* write output */ @@ -392,7 +584,7 @@ void derep_fulllength() progress_init("Writing output file", clusters); int64_t relabel_count = 0; - for (int64_t i=0; isize; @@ -401,10 +593,10 @@ void derep_fulllength() relabel_count++; fasta_print_general(fp_output, 0, - db_getsequence(bp->seqno_first), - db_getsequencelen(bp->seqno_first), - db_getheader(bp->seqno_first), - db_getheaderlen(bp->seqno_first), + bp->seq, + strlen(bp->seq), + bp->header, + strlen(bp->header), size, relabel_count, -1, -1, 0, 0.0); @@ -423,11 +615,11 @@ void derep_fulllength() if (opt_uc) { progress_init("Writing uc file, first part", clusters); - for (int64_t i=0; iseqno_first); - int64_t len = db_getsequencelen(bp->seqno_first); + char * h = bp->header; + int64_t len = strlen(bp->seq); fprintf(fp_uc, "S\t%" PRId64 "\t%" PRId64 "\t*\t*\t*\t*\t*\t%s\t*\n", i, len, h); @@ -444,42 +636,60 @@ void derep_fulllength() progress_update(i); } progress_done(); - show_rusage(); progress_init("Writing uc file, second part", clusters); - for (int64_t i=0; isize, db_getheader(bp->seqno_first)); + i, bp->size, bp->header); progress_update(i); } fclose(fp_uc); progress_done(); - show_rusage(); } + show_rusage(); + if (selected < clusters) { if (!opt_quiet) fprintf(stderr, - "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n", + "%" PRId64 " uniques written, %" + PRId64 " clusters discarded (%.1f%%)\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); if (opt_log) fprintf(fp_log, - "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n\n", + "%" PRId64 " uniques written, %" + PRId64 " clusters discarded (%.1f%%)\n\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } + show_rusage(); + + /* Free all seqs and headers */ + + for (uint64_t i=0; isize) + { + xfree(bp->seq); + xfree(bp->header); + } + } + + show_rusage(); + xfree(match_strand); xfree(nextseqtab); xfree(hashtable); - db_free(); -} + show_rusage(); +} void derep_prefix() { @@ -532,7 +742,8 @@ void derep_prefix() /* alloc and init table of links to other sequences in cluster */ - unsigned int * nextseqtab = (unsigned int*) xmalloc(sizeof(unsigned int) * dbsequencecount); + unsigned int * nextseqtab + = (unsigned int*) xmalloc(sizeof(unsigned int) * dbsequencecount); const unsigned int terminal = (unsigned int)(-1); memset(nextseqtab, -1, sizeof(unsigned int) * dbsequencecount); @@ -693,7 +904,7 @@ void derep_prefix() show_rusage(); progress_init("Sorting", 1); - qsort(hashtable, hashtablesize, sizeof(bucket), derep_compare); + qsort(hashtable, hashtablesize, sizeof(bucket), derep_compare_prefix); progress_done(); if (clusters > 0) @@ -720,11 +931,15 @@ void derep_prefix() { if (!opt_quiet) fprintf(stderr, - "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n", + "%" PRId64 + " unique sequences, avg cluster %.1lf, median %.0f, max %" + PRIu64 "\n", clusters, average, median, maxsize); if (opt_log) fprintf(fp_log, - "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n\n", + "%" PRId64 + " unique sequences, avg cluster %.1lf, median %.0f, max %" + PRIu64 "\n\n", clusters, average, median, maxsize); } @@ -822,13 +1037,15 @@ void derep_prefix() { if (!opt_quiet) fprintf(stderr, - "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n", + "%" PRId64 " uniques written, %" PRId64 + " clusters discarded (%.1f%%)\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); if (opt_log) fprintf(fp_log, - "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n\n", + "%" PRId64 " uniques written, %" PRId64 + " clusters discarded (%.1f%%)\n\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } diff --git a/src/mergepairs.cc b/src/mergepairs.cc index dc6b89a4..3cbc6795 100644 --- a/src/mergepairs.cc +++ b/src/mergepairs.cc @@ -1366,7 +1366,7 @@ void fastq_mergepairs() if (failed_repeat) fprintf(stderr, - "%10" PRIu64 " potential tandem repeat\n", + "%10" PRIu64 " multiple potential alignments\n", failed_repeat); if (failed_maxdiffs) diff --git a/src/util.cc b/src/util.cc index 595e6ddd..d929f797 100644 --- a/src/util.cc +++ b/src/util.cc @@ -60,6 +60,8 @@ #include "vsearch.h" +//#define SHOW_RUSAGE + static const char * progress_prompt; static uint64_t progress_next; static uint64_t progress_size; @@ -186,7 +188,7 @@ int64_t getusec(void) void show_rusage() { -#if 0 +#ifdef SHOW_RUSAGE double user_time = 0.0; double system_time = 0.0; diff --git a/src/vsearch.h b/src/vsearch.h index c96be8c9..85d9f0d5 100644 --- a/src/vsearch.h +++ b/src/vsearch.h @@ -58,7 +58,9 @@ */ +#define __STDC_CONSTANT_MACROS 1 #define __STDC_FORMAT_MACROS 1 +#define __STDC_LIMIT_MACROS 1 #define __restrict #ifdef HAVE_CONFIG_H @@ -87,6 +89,7 @@ #include #include #include +#include #include #include