From 1ef782e80db870f7927686bb5a8daae79e927246 Mon Sep 17 00:00:00 2001 From: Duncan Garmonsway Date: Fri, 1 Jul 2022 18:18:02 +0100 Subject: [PATCH 1/3] Add flag to force html mode This is an attempt to address #671 by adding a flag `--html` to parse the input as HTML. Otherwise, STDIN and local files without the `.html` suffix are parsed as plain text. --- fixtures/configs/smoketest.toml | 3 +++ lychee-bin/src/options.rs | 17 ++++++++++++++--- lychee-lib/src/types/input.rs | 23 +++++++++++++++++------ lychee.example.toml | 3 +++ 4 files changed, 37 insertions(+), 9 deletions(-) diff --git a/fixtures/configs/smoketest.toml b/fixtures/configs/smoketest.toml index 597674bb85..a67631a038 100644 --- a/fixtures/configs/smoketest.toml +++ b/fixtures/configs/smoketest.toml @@ -87,6 +87,9 @@ include_verbatim = false # Ignore case of paths when matching glob patterns. glob_ignore_case = false +# Treat input as HTML +html = false + # Exclude URLs from checking (supports regex). exclude = [ '.*\.github.com\.*' ] diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 661e202212..4374756c88 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -2,8 +2,8 @@ use crate::parse::{parse_base, parse_statuscodes}; use anyhow::{anyhow, Context, Error, Result}; use const_format::{concatcp, formatcp}; use lychee_lib::{ - Base, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, - DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, + Base, Input, FileType, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, + DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, }; use secrecy::{ExposeSecret, SecretString}; use serde::Deserialize; @@ -122,6 +122,11 @@ impl LycheeOptions { // but we'd get no access to `glob_ignore_case`. /// Get parsed inputs from options. pub(crate) fn inputs(&self) -> Result> { + let file_type_hint = if self.config.html { + Some(FileType::Html) + } else { + None + }; let excluded = if self.config.exclude_path.is_empty() { None } else { @@ -129,7 +134,7 @@ impl LycheeOptions { }; self.raw_inputs .iter() - .map(|s| Input::new(s, None, self.config.glob_ignore_case, excluded.clone())) + .map(|s| Input::new(s, file_type_hint, self.config.glob_ignore_case, excluded.clone())) .collect::>() .context("Cannot parse inputs from arguments") } @@ -319,6 +324,11 @@ pub(crate) struct Config { #[serde(default)] pub(crate) glob_ignore_case: bool, + /// Treat the input as HTML + #[structopt(long)] + #[serde(default)] + pub(crate) html: bool, + /// Output file of status report #[structopt(short, long, parse(from_os_str))] #[serde(default)] @@ -393,6 +403,7 @@ impl Config { skip_missing: false; include_verbatim: false; glob_ignore_case: false; + html: false; output: None; require_https: false; } diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 7281f63fac..544495cdbc 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -3,7 +3,7 @@ use crate::{helpers, ErrorKind, Result}; use async_stream::try_stream; use futures::stream::Stream; use glob::glob_with; -use jwalk::WalkDir; +use jwalk::WalkDirGeneric; use reqwest::Url; use serde::Serialize; use shellexpand::tilde; @@ -198,7 +198,7 @@ impl Input { } InputSource::FsPath(ref path) => { if path.is_dir() { - for entry in WalkDir::new(path).skip_hidden(true) + for entry in WalkDirGeneric::<((usize), (Option))>::new(path).skip_hidden(true) .process_read_dir(move |_, _, _, children| { children.retain(|child| { let entry = match child.as_ref() { @@ -224,19 +224,24 @@ impl Input { } return valid_extension(&entry.path()); }); + children.first_mut().map(|child| { + if let Ok(entry) = child { + entry.client_state = self.file_type_hint; + } + }); }) { let entry = entry?; if entry.file_type().is_dir() { continue; } - let content = Self::path_content(entry.path()).await?; + let content = Self::path_content(entry.path(), entry.client_state).await?; yield content } } else { if self.is_excluded_path(path) { return (); } - let content = Self::path_content(path).await; + let content = Self::path_content(path, self.file_type_hint).await; match content { Err(_) if skip_missing => (), Err(e) => Err(e)?, @@ -301,7 +306,7 @@ impl Input { if self.is_excluded_path(&path) { continue; } - let content: InputContent = Self::path_content(&path).await?; + let content: InputContent = Self::path_content(&path, self.file_type_hint).await?; yield content; } Err(e) => eprintln!("{e:?}"), @@ -325,13 +330,19 @@ impl Input { /// Will return `Err` if file contents can't be read pub async fn path_content + AsRef + Clone>( path: P, + file_type_hint: Option, ) -> Result { let path = path.into(); let content = tokio::fs::read_to_string(&path) .await .map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?; + let file_type = if file_type_hint.is_none() { + FileType::from(&path) + } else { + file_type_hint.unwrap_or_default() + }; let input_content = InputContent { - file_type: FileType::from(&path), + file_type: file_type, source: InputSource::FsPath(path), content, }; diff --git a/lychee.example.toml b/lychee.example.toml index 88425d8821..a86ed6595a 100644 --- a/lychee.example.toml +++ b/lychee.example.toml @@ -86,6 +86,9 @@ include_verbatim = false # Ignore case of paths when matching glob patterns. glob_ignore_case = false +# Treat the input as HTML. +html = false + # Exclude URLs from checking (supports regex). exclude = [ '.*\.github.com\.*' ] From 88120e980dabe4acd8f8f5961ab7139ad7604907 Mon Sep 17 00:00:00 2001 From: Duncan Garmonsway Date: Sun, 3 Jul 2022 22:36:07 +0100 Subject: [PATCH 2/3] Update README section USAGE --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index de55c411dd..81516c8692 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,7 @@ FLAGS: --exclude-private Exclude private IP address ranges from checking --glob-ignore-case Ignore case when expanding filesystem path glob inputs --help Prints help information + --html Treat the input as HTML --include-verbatim Find links in verbatim sections like `pre`- and `code` blocks -i, --insecure Proceed for server connections considered insecure (invalid TLS) -n, --no-progress Do not show progress bar. @@ -273,9 +274,9 @@ ARGS: ### Ignoring links You can exclude links from getting checked by specifying regex patterns -with `--exclude` (e.g. `--exclude example\.(com|org)`). +with `--exclude` (e.g. `--exclude example\.(com|org)`). If a file named `.lycheeignore` exists in the current working directory, its -contents are excluded as well. The file allows you to list multiple regular +contents are excluded as well. The file allows you to list multiple regular expressions for exclusion (one pattern per line). ### Caching From d87b68777e387eda28949810268ddea1c7605e54 Mon Sep 17 00:00:00 2001 From: Duncan Garmonsway Date: Sun, 3 Jul 2022 23:14:07 +0100 Subject: [PATCH 3/3] Lint --- lychee-bin/src/options.rs | 11 +++++++++-- lychee-lib/src/types/input.rs | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 4374756c88..79f45b6a5d 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -2,7 +2,7 @@ use crate::parse::{parse_base, parse_statuscodes}; use anyhow::{anyhow, Context, Error, Result}; use const_format::{concatcp, formatcp}; use lychee_lib::{ - Base, Input, FileType, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, + Base, FileType, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, }; use secrecy::{ExposeSecret, SecretString}; @@ -134,7 +134,14 @@ impl LycheeOptions { }; self.raw_inputs .iter() - .map(|s| Input::new(s, file_type_hint, self.config.glob_ignore_case, excluded.clone())) + .map(|s| { + Input::new( + s, + file_type_hint, + self.config.glob_ignore_case, + excluded.clone(), + ) + }) .collect::>() .context("Cannot parse inputs from arguments") } diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 544495cdbc..a11909ede7 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -342,7 +342,7 @@ impl Input { file_type_hint.unwrap_or_default() }; let input_content = InputContent { - file_type: file_type, + file_type, source: InputSource::FsPath(path), content, };