From 2ea68c826d3123aad6af4d1fc6806684bcf35478 Mon Sep 17 00:00:00 2001 From: Wei Shen Date: Thu, 22 Feb 2024 16:12:39 +0000 Subject: [PATCH] stats: add N50_num (L50). #15 --- CHANGELOG.md | 6 ++++-- doc/docs/usage.md | 23 ++++++++++++----------- go.mod | 2 +- go.sum | 4 ++-- seqkit/cmd/stat.go | 22 ++++++++++++++-------- 5 files changed, 33 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 835703d..73c7c33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ -- [SeqKit v2.7.1](https://github.com/shenwei356/seqkit/releases/tag/v2.7.1) - 2024-01-31 -[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.7.1/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.7.1) +- [SeqKit v2.8.0](https://github.com/shenwei356/seqkit/releases/tag/v2.8.0) - 2024-01-31 +[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.8.0/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.8.0) + - `seqkit stats`: + - Add column `N50_num`, an alias of L50, [#15](https://github.com/shenwei356/seqkit/issues/15). - `seqkit seq/locate/fish/watch`: - Removing the flag `-V/--validate-seq-length`. Now the whole sequence will be checked if `-v/--validate-seq` is given. - `seqkit amplicon`: diff --git a/doc/docs/usage.md b/doc/docs/usage.md index 0cbb8ed..f29fc8b 100644 --- a/doc/docs/usage.md +++ b/doc/docs/usage.md @@ -690,10 +690,11 @@ Columns: 11. Q3 third quartile of sequence length , with gaps or spaces counted 12. sum_gap number of gaps 13. N50 N50. https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50 - 14. Q20(%) percentage of bases with the quality score greater than 20 - 15. Q30(%) percentage of bases with the quality score greater than 30 - 16. AvgQual average quality - 17. GC(%) percentage of GC content + 14. N50_num N50_num or L50. https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#L50 + 15. Q20(%) percentage of bases with the quality score greater than 20 + 16. Q30(%) percentage of bases with the quality score greater than 30 + 17. AvgQual average quality + 18. GC(%) percentage of GC content Attentions: 1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3) @@ -784,13 +785,13 @@ Eexamples 1. Extra information $ seqkit stats *.f{a,q}.gz -a - file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) AvgQual GC(%) - hairpin.fa.gz FASTA RNA 28,645 2,949,871 39 103 2,354 76 91 111 0 101 0 0 0 45.77 - mature.fa.gz FASTA RNA 35,828 781,222 15 21.8 34 21 22 22 0 22 0 0 0 47.6 - Illimina1.8.fq.gz FASTQ DNA 10,000 1,500,000 150 150 150 150 150 150 0 150 96.16 89.71 24.82 49.91 - nanopore.fq.gz FASTQ DNA 4,000 1,798,723 153 449.7 6,006 271 318 391 0 395 40.79 12.63 9.48 46.66 - reads_1.fq.gz FASTQ DNA 2,500 567,516 226 227 229 227 227 227 0 227 91.24 86.62 15.45 53.63 - reads_2.fq.gz FASTQ DNA 2,500 560,002 223 224 225 224 224 224 0 224 91.06 87.66 14.62 54.77 + file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 N50_num Q20(%) Q30(%) AvgQual GC(%) + hairpin.fa.gz FASTA RNA 28,645 2,949,871 39 103 2,354 76 91 111 0 101 380 0 0 0 45.77 + mature.fa.gz FASTA RNA 35,828 781,222 15 21.8 34 21 22 22 0 22 12 0 0 0 47.6 + Illimina1.8.fq.gz FASTQ DNA 10,000 1,500,000 150 150 150 150 150 150 0 150 1 96.16 89.71 24.82 49.91 + nanopore.fq.gz FASTQ DNA 4,000 1,798,723 153 449.7 6,006 271 318 391 0 395 585 40.79 12.63 9.48 46.66 + reads_1.fq.gz FASTQ DNA 2,500 567,516 226 227 229 227 227 227 0 227 3 91.24 86.62 15.45 53.63 + reads_2.fq.gz FASTQ DNA 2,500 560,002 223 224 225 224 224 224 0 224 2 91.06 87.66 14.62 54.77 1. **Parallelize counting files, it's much faster for lots of small files, especially for files on SSD** diff --git a/go.mod b/go.mod index 368092f..114a509 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,7 @@ require ( github.com/mattn/go-isatty v0.0.16 github.com/mitchellh/go-homedir v1.1.0 github.com/pkg/errors v0.9.1 - github.com/shenwei356/bio v0.13.0 + github.com/shenwei356/bio v0.13.1 github.com/shenwei356/breader v0.3.2 github.com/shenwei356/bwt v0.6.1 github.com/shenwei356/go-logging v0.0.0-20171012171522-c6b9702d88ba diff --git a/go.sum b/go.sum index 255bb0b..5d8512c 100644 --- a/go.sum +++ b/go.sum @@ -123,8 +123,8 @@ github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/f github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w= github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk= -github.com/shenwei356/bio v0.13.0 h1:PtgC6pJyBcSR7S16qjn6dW1mISAmsFakaBgh0uI+aFY= -github.com/shenwei356/bio v0.13.0/go.mod h1:JjWavkWha/UzK0K7dqTVuzkHEBgGNad9yC1hv0YJrG0= +github.com/shenwei356/bio v0.13.1 h1:lj1YJMaRxGfyosoihZSyqtW4fW5219v188AAFmcUWVc= +github.com/shenwei356/bio v0.13.1/go.mod h1:JjWavkWha/UzK0K7dqTVuzkHEBgGNad9yC1hv0YJrG0= github.com/shenwei356/breader v0.3.2 h1:GLy2clIMck6FdTwj8WLnmhv0PW/7Pp+Wcx7TVEHG0ks= github.com/shenwei356/breader v0.3.2/go.mod h1:BimwolkMTIr/O4iX7xXtjEB1z5y39G+8I5Tsm9guC3E= github.com/shenwei356/bwt v0.6.1 h1:BDS1nhhIxiC284OKtLBUgy+U5L/7WrvT/ehOExB/DSA= diff --git a/seqkit/cmd/stat.go b/seqkit/cmd/stat.go index 4a3c8f3..80800ed 100644 --- a/seqkit/cmd/stat.go +++ b/seqkit/cmd/stat.go @@ -71,10 +71,11 @@ Columns: 11. Q3 third quartile of sequence length , with gaps or spaces counted 12. sum_gap number of gaps 13. N50 N50. https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50 - 14. Q20(%) percentage of bases with the quality score greater than 20 - 15. Q30(%) percentage of bases with the quality score greater than 30 - 16. AvgQual average quality - 17. GC(%) percentage of GC content + 14. N50_num N50_num or L50. https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#L50 + 15. Q20(%) percentage of bases with the quality score greater than 20 + 16. Q30(%) percentage of bases with the quality score greater than 30 + 17. AvgQual average quality + 18. GC(%) percentage of GC content Attentions: 1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3) @@ -193,7 +194,7 @@ Tips: "max_len", } if all { - colnames = append(colnames, []string{"Q1", "Q2", "Q3", "sum_gap", "N50", "Q20(%)", "Q30(%)", "AvgQual", "GC(%)"}...) + colnames = append(colnames, []string{"Q1", "Q2", "Q3", "sum_gap", "N50", "N50_num", "Q20(%)", "Q30(%)", "AvgQual", "GC(%)"}...) } if hasNX { @@ -241,12 +242,13 @@ Tips: info.lenAvg, info.lenMax) if all { - fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f", + fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f", info.Q1, info.Q2, info.Q3, info.gapSum, info.N50, + info.L50, info.q20, info.q30, info.avgQual, @@ -281,12 +283,13 @@ Tips: info.lenAvg, info.lenMax) if all { - fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f", + fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f", info.Q1, info.Q2, info.Q3, info.gapSum, info.N50, + info.L50, info.q20, info.q30, info.avgQual, @@ -329,12 +332,13 @@ Tips: info.lenAvg, info.lenMax) if all { - fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f", + fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f", info.Q1, info.Q2, info.Q3, info.gapSum, info.N50, + info.L50, info.q20, info.q30, info.avgQual, @@ -592,6 +596,7 @@ Tips: {Header: "Q3", Align: stable.AlignRight, HumanizeNumbers: true}, {Header: "sum_gap", Align: stable.AlignRight, HumanizeNumbers: true}, {Header: "N50", Align: stable.AlignRight, HumanizeNumbers: true}, + {Header: "N50_num", Align: stable.AlignRight, HumanizeNumbers: true}, {Header: "Q20(%)", Align: stable.AlignRight, HumanizeNumbers: true}, {Header: "Q30(%)", Align: stable.AlignRight, HumanizeNumbers: true}, {Header: "AvgQual", Align: stable.AlignRight, HumanizeNumbers: true}, @@ -624,6 +629,7 @@ Tips: row = append(row, info.Q3) row = append(row, info.gapSum) row = append(row, info.N50) + row = append(row, info.L50) row = append(row, info.q20) row = append(row, info.q30) row = append(row, info.avgQual)