From 6fffe62ee98d76a177b511556f01f41c76f8c071 Mon Sep 17 00:00:00 2001
From: ZJaume
Date: Fri, 2 Feb 2024 14:34:21 +0000
Subject: [PATCH] Update CLI parameters in help and README

---
 README.md         | 6 ++++--
 warc2text_main.cc | 7 +++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 073b588..4b04aaf 100644
--- a/README.md
+++ b/README.md
@@ -42,8 +42,9 @@
 warc2text -o <output_folder> [ -f <output_files> ] [ --pdfpass <output_warc> ] [ --paragraph-identification ] [ --tag-filters <filters_file> ] <warc_file>...
 ```
 * `--output`/`-o` output folder
-* `--files`/`-f` list of output files separated by commas (and without `.gz`); Options are `text`,`html`,`url`,`mime`,`file` and `date`. Defaults to `text,url`. See [output](#output).
-* `--jsonl` Produce JSON Lines on stdout instead of writing to files per language.
+* `--files`/`-f` list of output files separated by commas (and without `.gz`); Options are `text`,`html`,`metadata`,`url`,`mime`,`file` and `date`. Defaults to `text,url`. See [output](#output).
+* `--jsonl` Produce JSON Lines for the `html` and `text` files instead of base64 encoding.
+* `--stdout` Write all output as JSON Lines to stdout. Requires the `--jsonl` option.
 * `--pdfpass` WARC file where PDF records will be stored
 * `--robotstxtpass` WARC file where robots.txt related records will be stored
 * `--encode-urls` Escape non-ascii characters that appear in the record URL with `%dd` encoding.
@@ -51,6 +52,7 @@
 * `--paragraph-identification` print the paragraph identifier for each sentence extracted from the HTML
 * `--classifier` classifier to use: `cld2`, `fasttext`, or `skip`. When `fasttext` is used, one also has to specify a model using `--fasttext-model`. Use `skip` to skip language identification entirely.
 * `--fasttext-model` path to FastText model for fasttext classifier. Models can be any [FastText language identification model](https://fasttext.cc/docs/en/language-identification.html) such as [OpenLID lid201-model.ftz](https://github.com/laurieburchell/open-lid-dataset#quantised-model)
+* `--skip-text-extraction` Skip text extraction and output only `html`. This option is not compatible with the `text` value of the `-f` option and also requires skipping language identification.
 * `--tag-filters` file containing filters that are used to eliminate matching documents
 * `--invert-tag-filters` output only documents that match the filter
 * `--url-filters` file containing regular expressions that match urls of documents to eliminate
diff --git a/warc2text_main.cc b/warc2text_main.cc
index 9c01058..75be471 100644
--- a/warc2text_main.cc
+++ b/warc2text_main.cc
@@ -82,10 +82,13 @@ void parseArgs(int argc, char *argv[], Options& out) {
         "  --skip-text-extraction    Skip text extraction and output only html\n"
         "                            This option is not compatible with \"text\" value in -f option \n"
         "                            and also requires to skip language identification\n"
-        "  --jsonl                   Write JSONLines to stdout\n"
+        "  --jsonl                   Produce \"html\" and \"text\" files in JSONLines format\n"
+        "                            instead of base64-encoded lines\n"
+        "  --stdout                  Write all output as JSONLines to stdout\n"
+        "                            Requires the --jsonl option\n"
         "  --compress                Compression algorithm for the output files\n"
         "                            Default: gzip. Values: gzip or zstd\n"
-        "  --compress-level          Compression level to use.\n"
+        "  --compress-level          Compression level to use\n"
         "  -s                        Only output errors\n"
         "  -v                        Verbose output (print trace)\n\n";
     exit(1);
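
As a quick reference for reviewers, here is an illustrative pair of invocations using the flags documented in this patch. The file names (`crawl.warc.gz`, `out/`, `records.jsonl`) are made up, and it is an assumption that `-o` is still accepted alongside `--stdout`; the flag semantics follow the help and README text above.

```
# html and text output files are written as JSON Lines instead of base64-encoded lines
warc2text -o out -f text,html,url --jsonl crawl.warc.gz

# stream everything as JSON Lines to stdout (requires --jsonl); paths here are hypothetical
warc2text -o out --jsonl --stdout crawl.warc.gz > records.jsonl
```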