Update CLI parameters in help and README
ZJaume committed Feb 2, 2024
1 parent 549b564 commit 6fffe62
Showing 2 changed files with 9 additions and 4 deletions.
README.md (6 changes: 4 additions & 2 deletions)
@@ -42,15 +42,17 @@ warc2text -o <output_folder> [ -f <output_files> ] [ --pdfpass <output_warc> ]
[ --paragraph-identification ] [ --tag-filters <filters_file> ] <warc_file>...
```
* `--output`/`-o` output folder
-* `--files`/`-f` list of output files separated by commas (and without `.gz`); Options are `text`,`html`,`url`,`mime`,`file` and `date`. Defaults to `text,url`. See [output](#output).
-* `--jsonl` Produce JSON Lines on stdout instead of writing to files per language.
+* `--files`/`-f` list of output files separated by commas (and without `.gz`); options are `text`, `html`, `metadata`, `url`, `mime`, `file` and `date`. Defaults to `text,url`. See [output](#output).
+* `--jsonl` Produce JSON Lines for the `html` and `text` files instead of base64 encoding.
+* `--stdout` Write all the information as JSON Lines to stdout. Requires the `--jsonl` option (see the usage example after this list).
* `--pdfpass` WARC file where PDF records will be stored
* `--robotstxtpass` WARC file where robots.txt related records will be stored
* `--encode-urls` Escape non-ASCII characters that appear in the record URL with `%dd` encoding.
* `--multilang` Detect multiple languages in the document, and split the document accordingly. Only supported with CLD2 classifier.
* `--paragraph-identification` print the paragraph identifier for each sentence extracted from the HTML
* `--classifier` classifier to use: `cld2`, `fasttext`, or `skip`. When `fasttext` is used, one also has to specify a model using `--fasttext-model`. Use `skip` to skip language identification entirely.
* `--fasttext-model` path to FastText model for fasttext classifier. Models can be any [FastText language identification model](https://fasttext.cc/docs/en/language-identification.html) such as [OpenLID lid201-model.ftz](https://github.com/laurieburchell/open-lid-dataset#quantised-model)
* `--skip-text-extraction` Skip text extraction and output only HTML. This option is not compatible with the `text` value of the `-f` option and also requires skipping language identification.
* `--tag-filters` file containing filters that are used to eliminate matching documents
* `--invert-tag-filters` output only documents that match the filter
* `--url-filters` file containing regular expressions that match URLs of documents to eliminate
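
A hedged usage sketch of the options above; the WARC file names, output folder and model path are placeholders, and exact behaviour should be confirmed against `warc2text --help`:

```
# Default output (-f text,url): per-language text and url files under ./out
warc2text -o out crawl-0001.warc.gz

# Also keep html and metadata, produced as JSON Lines instead of base64-encoded lines
warc2text -o out -f text,html,metadata,url --jsonl crawl-0001.warc.gz

# Stream everything as JSON Lines to stdout (requires --jsonl)
warc2text -o out --jsonl --stdout crawl-0001.warc.gz > records.jsonl

# FastText language identification with an OpenLID model
warc2text -o out --classifier fasttext --fasttext-model lid201-model.ftz crawl-0001.warc.gz
```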
warc2text_main.cc (7 changes: 5 additions & 2 deletions)
@@ -82,10 +82,13 @@ void parseArgs(int argc, char *argv[], Options& out) {
" --skip-text-extraction Skip text extraction and output only html\n"
" This option is not compatible with \"text\" value in -f option \n"
" and also requires to skip language identification\n"
-" --jsonl Write JSONLines to stdout\n"
+" --jsonl Produce \"html\" and \"text\" files in JSONLines format,\n"
+" instead of base64 encoded lines\n"
+" --stdout Write all the information in JSONLines to stdout\n"
+" Needs --jsonl option\n"
" --compress <compression> Compression algorithm for the output files\n"
" Default: gzip. Values: gzip or zstd\n"
-" --compress-level <level> Compression level to use.\n"
+" --compress-level <level> Compression level to use\n"
" -s Only output errors\n"
" -v Verbose output (print trace)\n\n";
exit(1);
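
A hedged invocation sketch for the compression options shown in this help text; the level value is illustrative, and the accepted range depends on the chosen algorithm:

```
# Write zstd-compressed output files instead of the default gzip
warc2text -o out --compress zstd --compress-level 9 input.warc.gz
```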