From 6fffe62ee98d76a177b511556f01f41c76f8c071 Mon Sep 17 00:00:00 2001
From: ZJaume
Date: Fri, 2 Feb 2024 14:34:21 +0000
Subject: [PATCH] Update CLI parameters in help and README

---
 README.md         | 6 ++++--
 warc2text_main.cc | 7 +++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 073b588..4b04aaf 100644
--- a/README.md
+++ b/README.md
@@ -42,8 +42,9 @@
 warc2text -o <output_folder> [ -f <output_files> ] [ --pdfpass <output_warc> ] [ --paragraph-identification ] [ --tag-filters <filters_file> ] <warc_file>...
 ```
 * `--output`/`-o` output folder
-* `--files`/`-f` list of output files separated by commas (and without `.gz`); Options are `text`,`html`,`url`,`mime`,`file` and `date`. Defaults to `text,url`. See [output](#output).
-* `--jsonl` Produce JSON Lines on stdout instead of writing to files per language.
+* `--files`/`-f` list of output files separated by commas (and without `.gz`); Options are `text`,`html`,`metadata`,`url`,`mime`,`file` and `date`. Defaults to `text,url`. See [output](#output).
+* `--jsonl` Produce JSON Lines for the `html` and `text` files instead of base64 encoding.
+* `--stdout` Write all output as JSON Lines to stdout. Requires the `--jsonl` option.
 * `--pdfpass` WARC file where PDF records will be stored
 * `--robotstxtpass` WARC file where robots.txt related records will be stored
 * `--encode-urls` Escape non-ascii characters that appear in the record URL with `%dd` encoding.
@@ -51,6 +52,7 @@
 * `--paragraph-identification` print the paragraph identifier for each sentence extracted from the HTML
 * `--classifier` classifier to use: `cld2`, `fasttext`, or `skip`. When `fasttext` is used, one also has to specify a model using `--fasttext-model`. Use `skip` to skip language identification entirely.
 * `--fasttext-model` path to FastText model for fasttext classifier. Models can be any [FastText language identification model](https://fasttext.cc/docs/en/language-identification.html) such as [OpenLID lid201-model.ftz](https://github.com/laurieburchell/open-lid-dataset#quantised-model)
+* `--skip-text-extraction` Skip text extraction and output only `html`. This option is not compatible with the `text` value of the `-f` option and also requires skipping language identification.
 * `--tag-filters` file containing filters that are used to eliminate matching documents
 * `--invert-tag-filters` output only documents that match the filter
 * `--url-filters` file containing regular expressions that match urls of documents to eliminate
diff --git a/warc2text_main.cc b/warc2text_main.cc
index 9c01058..75be471 100644
--- a/warc2text_main.cc
+++ b/warc2text_main.cc
@@ -82,10 +82,13 @@ void parseArgs(int argc, char *argv[], Options& out) {
         "  --skip-text-extraction    Skip text extraction and output only html\n"
         "                            This option is not compatible with \"text\" value in -f option \n"
         "                            and also requires to skip language identification\n"
-        "  --jsonl                   Write JSONLines to stdout\n"
+        "  --jsonl                   Produce \"html\" and \"text\" files in JSONLines format\n"
+        "                            instead of base64-encoded lines\n"
+        "  --stdout                  Write all output as JSONLines to stdout\n"
+        "                            Requires the --jsonl option\n"
         "  --compress                Compression algorithm for the output files\n"
         "                            Default: gzip. Values: gzip or zstd\n"
-        "  --compress-level          Compression level to use.\n"
+        "  --compress-level          Compression level to use\n"
         "  -s                        Only output errors\n"
         "  -v                        Verbose output (print trace)\n\n";
     exit(1);
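
As a quick reference for reviewers, here is an illustrative pair of invocations using the flags documented in this patch. The file names (`crawl.warc.gz`, `out/`, `records.jsonl`) are made up, and it is an assumption that `-o` is still accepted alongside `--stdout`; the flag semantics follow the help and README text above.

```
# html and text output files are written as JSON Lines instead of base64-encoded lines
warc2text -o out -f text,html,url --jsonl crawl.warc.gz

# stream everything as JSON Lines to stdout (requires --jsonl); paths here are hypothetical
warc2text -o out --jsonl --stdout crawl.warc.gz > records.jsonl
```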