From 2ea68c826d3123aad6af4d1fc6806684bcf35478 Mon Sep 17 00:00:00 2001
From: Wei Shen <shenwei356@gmail.com>
Date: Thu, 22 Feb 2024 16:12:39 +0000
Subject: [PATCH] stats: add N50_num (L50). #15

---
 CHANGELOG.md       |  6 ++++--
 doc/docs/usage.md  | 23 ++++++++++++-----------
 go.mod             |  2 +-
 go.sum             |  4 ++--
 seqkit/cmd/stat.go | 22 ++++++++++++++--------
 5 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 835703d..73c7c33 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
-- [SeqKit v2.7.1](https://github.com/shenwei356/seqkit/releases/tag/v2.7.1) - 2024-01-31
-[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.7.1/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.7.1)
+- [SeqKit v2.8.0](https://github.com/shenwei356/seqkit/releases/tag/v2.8.0) - 2024-01-31
+[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.8.0/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.8.0)
+    - `seqkit stats`:
+        - Add column `N50_num`, an alias of L50, [#15](https://github.com/shenwei356/seqkit/issues/15).
     - `seqkit seq/locate/fish/watch`:
         - Removing the flag `-V/--validate-seq-length`. Now the whole sequence will be checked if `-v/--validate-seq` is given.
     - `seqkit amplicon`:
diff --git a/doc/docs/usage.md b/doc/docs/usage.md
index 0cbb8ed..f29fc8b 100644
--- a/doc/docs/usage.md
+++ b/doc/docs/usage.md
@@ -690,10 +690,11 @@ Columns:
   11. Q3        third quartile of sequence length , with gaps or spaces counted
   12. sum_gap   number of gaps
   13. N50       N50. https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50
-  14. Q20(%)    percentage of bases with the quality score greater than 20
-  15. Q30(%)    percentage of bases with the quality score greater than 30
-  16. AvgQual   average quality
-  17. GC(%)     percentage of GC content
+  14. N50_num   N50_num or L50. https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#L50
+  15. Q20(%)    percentage of bases with the quality score greater than 20
+  16. Q30(%)    percentage of bases with the quality score greater than 30
+  17. AvgQual   average quality
+  18. GC(%)     percentage of GC content
   
 Attentions:
   1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3)
@@ -784,13 +785,13 @@ Eexamples
 1. Extra information
 
         $ seqkit stats *.f{a,q}.gz -a
-        file               format  type  num_seqs    sum_len  min_len  avg_len  max_len   Q1   Q2   Q3  sum_gap  N50  Q20(%)  Q30(%)  AvgQual  GC(%)
-        hairpin.fa.gz      FASTA   RNA     28,645  2,949,871       39      103    2,354   76   91  111        0  101       0       0        0  45.77
-        mature.fa.gz       FASTA   RNA     35,828    781,222       15     21.8       34   21   22   22        0   22       0       0        0   47.6
-        Illimina1.8.fq.gz  FASTQ   DNA     10,000  1,500,000      150      150      150  150  150  150        0  150   96.16   89.71    24.82  49.91
-        nanopore.fq.gz     FASTQ   DNA      4,000  1,798,723      153    449.7    6,006  271  318  391        0  395   40.79   12.63     9.48  46.66
-        reads_1.fq.gz      FASTQ   DNA      2,500    567,516      226      227      229  227  227  227        0  227   91.24   86.62    15.45  53.63
-        reads_2.fq.gz      FASTQ   DNA      2,500    560,002      223      224      225  224  224  224        0  224   91.06   87.66    14.62  54.77
+        file               format  type  num_seqs    sum_len  min_len  avg_len  max_len   Q1   Q2   Q3  sum_gap  N50  N50_num  Q20(%)  Q30(%)  AvgQual  GC(%)
+        hairpin.fa.gz      FASTA   RNA     28,645  2,949,871       39      103    2,354   76   91  111        0  101      380       0       0        0  45.77
+        mature.fa.gz       FASTA   RNA     35,828    781,222       15     21.8       34   21   22   22        0   22       12       0       0        0   47.6
+        Illimina1.8.fq.gz  FASTQ   DNA     10,000  1,500,000      150      150      150  150  150  150        0  150        1   96.16   89.71    24.82  49.91
+        nanopore.fq.gz     FASTQ   DNA      4,000  1,798,723      153    449.7    6,006  271  318  391        0  395      585   40.79   12.63     9.48  46.66
+        reads_1.fq.gz      FASTQ   DNA      2,500    567,516      226      227      229  227  227  227        0  227        3   91.24   86.62    15.45  53.63
+        reads_2.fq.gz      FASTQ   DNA      2,500    560,002      223      224      225  224  224  224        0  224        2   91.06   87.66    14.62  54.77
 
 1. **Parallelize counting files, it's much faster for lots of small files, especially for files on SSD**
 
diff --git a/go.mod b/go.mod
index 368092f..114a509 100644
--- a/go.mod
+++ b/go.mod
@@ -20,7 +20,7 @@ require (
 	github.com/mattn/go-isatty v0.0.16
 	github.com/mitchellh/go-homedir v1.1.0
 	github.com/pkg/errors v0.9.1
-	github.com/shenwei356/bio v0.13.0
+	github.com/shenwei356/bio v0.13.1
 	github.com/shenwei356/breader v0.3.2
 	github.com/shenwei356/bwt v0.6.1
 	github.com/shenwei356/go-logging v0.0.0-20171012171522-c6b9702d88ba
diff --git a/go.sum b/go.sum
index 255bb0b..5d8512c 100644
--- a/go.sum
+++ b/go.sum
@@ -123,8 +123,8 @@ github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/f
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
 github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w=
 github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk=
-github.com/shenwei356/bio v0.13.0 h1:PtgC6pJyBcSR7S16qjn6dW1mISAmsFakaBgh0uI+aFY=
-github.com/shenwei356/bio v0.13.0/go.mod h1:JjWavkWha/UzK0K7dqTVuzkHEBgGNad9yC1hv0YJrG0=
+github.com/shenwei356/bio v0.13.1 h1:lj1YJMaRxGfyosoihZSyqtW4fW5219v188AAFmcUWVc=
+github.com/shenwei356/bio v0.13.1/go.mod h1:JjWavkWha/UzK0K7dqTVuzkHEBgGNad9yC1hv0YJrG0=
 github.com/shenwei356/breader v0.3.2 h1:GLy2clIMck6FdTwj8WLnmhv0PW/7Pp+Wcx7TVEHG0ks=
 github.com/shenwei356/breader v0.3.2/go.mod h1:BimwolkMTIr/O4iX7xXtjEB1z5y39G+8I5Tsm9guC3E=
 github.com/shenwei356/bwt v0.6.1 h1:BDS1nhhIxiC284OKtLBUgy+U5L/7WrvT/ehOExB/DSA=
diff --git a/seqkit/cmd/stat.go b/seqkit/cmd/stat.go
index 4a3c8f3..80800ed 100644
--- a/seqkit/cmd/stat.go
+++ b/seqkit/cmd/stat.go
@@ -71,10 +71,11 @@ Columns:
   11. Q3        third quartile of sequence length , with gaps or spaces counted
   12. sum_gap   number of gaps
   13. N50       N50. https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50
-  14. Q20(%)    percentage of bases with the quality score greater than 20
-  15. Q30(%)    percentage of bases with the quality score greater than 30
-  16. AvgQual   average quality
-  17. GC(%)     percentage of GC content
+  14. N50_num   N50_num or L50. https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#L50
+  15. Q20(%)    percentage of bases with the quality score greater than 20
+  16. Q30(%)    percentage of bases with the quality score greater than 30
+  17. AvgQual   average quality
+  18. GC(%)     percentage of GC content
   
 Attentions:
   1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3)
@@ -193,7 +194,7 @@ Tips:
 				"max_len",
 			}
 			if all {
-				colnames = append(colnames, []string{"Q1", "Q2", "Q3", "sum_gap", "N50", "Q20(%)", "Q30(%)", "AvgQual", "GC(%)"}...)
+				colnames = append(colnames, []string{"Q1", "Q2", "Q3", "sum_gap", "N50", "N50_num", "Q20(%)", "Q30(%)", "AvgQual", "GC(%)"}...)
 			}
 
 			if hasNX {
@@ -241,12 +242,13 @@ Tips:
 							info.lenAvg,
 							info.lenMax)
 						if all {
-							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
+							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
 								info.Q1,
 								info.Q2,
 								info.Q3,
 								info.gapSum,
 								info.N50,
+								info.L50,
 								info.q20,
 								info.q30,
 								info.avgQual,
@@ -281,12 +283,13 @@ Tips:
 							info.lenAvg,
 							info.lenMax)
 						if all {
-							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
+							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
 								info.Q1,
 								info.Q2,
 								info.Q3,
 								info.gapSum,
 								info.N50,
+								info.L50,
 								info.q20,
 								info.q30,
 								info.avgQual,
@@ -329,12 +332,13 @@ Tips:
 							info.lenAvg,
 							info.lenMax)
 						if all {
-							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
+							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
 								info.Q1,
 								info.Q2,
 								info.Q3,
 								info.gapSum,
 								info.N50,
+								info.L50,
 								info.q20,
 								info.q30,
 								info.avgQual,
@@ -592,6 +596,7 @@ Tips:
 				{Header: "Q3", Align: stable.AlignRight, HumanizeNumbers: true},
 				{Header: "sum_gap", Align: stable.AlignRight, HumanizeNumbers: true},
 				{Header: "N50", Align: stable.AlignRight, HumanizeNumbers: true},
+				{Header: "N50_num", Align: stable.AlignRight, HumanizeNumbers: true},
 				{Header: "Q20(%)", Align: stable.AlignRight, HumanizeNumbers: true},
 				{Header: "Q30(%)", Align: stable.AlignRight, HumanizeNumbers: true},
 				{Header: "AvgQual", Align: stable.AlignRight, HumanizeNumbers: true},
@@ -624,6 +629,7 @@ Tips:
 				row = append(row, info.Q3)
 				row = append(row, info.gapSum)
 				row = append(row, info.N50)
+				row = append(row, info.L50)
 				row = append(row, info.q20)
 				row = append(row, info.q30)
 				row = append(row, info.avgQual)