diff --git a/Cargo.lock b/Cargo.lock index e202275..0fa5bf0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -181,6 +181,7 @@ dependencies = [ "reqwest", "serde", "serde_json", + "sha2", "tokio", "toml", "tracing", @@ -1916,6 +1917,17 @@ dependencies = [ "digest", ] +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" diff --git a/Cargo.toml b/Cargo.toml index 0aeb2b5..a669e04 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ markup5ever_rcdom = "0.3.0" rayon = "1.10.0" serde = { version = "1.0.210", features = ["derive"] } serde_json = "1.0.128" +sha2 = "0.10.8" tokio = { version = "1.40.0", features = ["full"] } toml = "0.8.19" tracing = "0.1.40" diff --git a/src/attachments.rs b/src/attachments.rs index 35c023c..1e29304 100644 --- a/src/attachments.rs +++ b/src/attachments.rs @@ -5,7 +5,8 @@ use std::{ use jane_eyre::eyre::{self, bail}; use reqwest::redirect::Policy; -use tracing::{debug, trace}; +use sha2::{digest::generic_array::functional::FunctionalSequence, Digest, Sha256}; +use tracing::{debug, trace, warn}; use crate::{ cohost::attachment_id_to_url, @@ -13,12 +14,25 @@ use crate::{ }; pub trait AttachmentsContext { + fn cache_imported(&self, url: &str, post_id: usize) -> eyre::Result; fn cache_cohost_file(&self, id: &str) -> eyre::Result; fn cache_cohost_thumb(&self, id: &str) -> eyre::Result; } pub struct RealAttachmentsContext; impl AttachmentsContext for RealAttachmentsContext { + #[tracing::instrument(skip(self))] + fn cache_imported(&self, url: &str, post_id: usize) -> eyre::Result { + let mut hash = Sha256::new(); + hash.update(url); + let hash = hash.finalize().map(|o| format!("{o:02x}")).join(""); + let path = AttachmentsPath::ROOT.join(&format!("imported-{post_id}-{hash}"))?; + trace!(?path); + create_dir_all(&path)?; + + cache_imported_attachment(url, &path)?.site_path() + } + #[tracing::instrument(skip(self))] fn cache_cohost_file(&self, id: &str) -> eyre::Result { let url = attachment_id_to_url(id); @@ -58,6 +72,48 @@ fn cached_attachment_url(id: &str, dir: &AttachmentsPath) -> eyre::Result eyre::Result { + // if the attachment id directory exists... + if let Ok(mut entries) = read_dir(&path) { + // and the directory contains a file... + if let Some(entry) = entries.next() { + // and we can open the file... + // TODO: move this logic into path module + let path = path.join_dir_entry(&entry?)?; + if let Ok(mut file) = File::open(&path) { + trace!("cache hit: {url}"); + // check if we can read the file. + let mut result = Vec::default(); + file.read_to_end(&mut result)?; + return Ok(path); + } + } + } + + trace!("cache miss"); + debug!("downloading attachment"); + + let response = reqwest::blocking::get(url)?; + let extension = match response.headers().get("Content-Type") { + Some(x) if x == "image/gif" => "gif", + Some(x) if x == "image/jpeg" => "jpg", + Some(x) if x == "image/png" => "png", + Some(x) if x == "image/svg+xml" => "svg", + Some(x) if x == "image/webp" => "webp", + other => { + warn!("unknown attachment mime type: {other:?}"); + "bin" + } + }; + let path = path.join(&format!("file.{extension}"))?; + debug!(?path); + + let result = response.bytes()?.to_vec(); + File::create(&path)?.write_all(&result)?; + + Ok(path) +} + /// given a cohost attachment redirect (`url`) and path to a uuid dir (`path`), /// return the cached attachment path (`path/original-filename.ext`). /// diff --git a/src/command/cohost2autost.rs b/src/command/cohost2autost.rs index 5420812..46d7acb 100644 --- a/src/command/cohost2autost.rs +++ b/src/command/cohost2autost.rs @@ -436,6 +436,9 @@ fn process_chost_fragment( fn test_render_markdown_block() -> eyre::Result<()> { struct TestAttachmentsContext {} impl AttachmentsContext for TestAttachmentsContext { + fn cache_imported(&self, _url: &str, _post_id: usize) -> eyre::Result { + unreachable!(); + } fn cache_cohost_file(&self, id: &str) -> eyre::Result { Ok(SitePath::ATTACHMENTS.join(&format!("{id}"))?) } diff --git a/src/command/import.rs b/src/command/import.rs index bd61148..b161290 100644 --- a/src/command/import.rs +++ b/src/command/import.rs @@ -5,14 +5,17 @@ use std::{ }; use askama::Template; +use html5ever::Attribute; use jane_eyre::eyre::{self, bail, OptionExt}; use markup5ever_rcdom::{Handle, NodeData}; use tracing::{debug, info, trace}; use url::Url; use crate::{ + attachments::{AttachmentsContext, RealAttachmentsContext}, dom::{ - attr_value, make_html_tag_name, parse_html_document, serialize_node, text_content, Traverse, + attr_value, find_attr_mut, make_attribute_name, make_html_tag_name, parse, + parse_html_document, serialize, serialize_node, tendril_to_str, text_content, Traverse, }, migrations::run_migrations, path::PostsPath, @@ -114,7 +117,7 @@ pub async fn main(mut args: impl Iterator) -> eyre::Result<()> { let path = PostsPath::imported_post_path(post_id); match File::create_new(&path) { Ok(file) => { - result = Some((file, path)); + result = Some((post_id, path, file)); break; } Err(error) if error.kind() == io::ErrorKind::AlreadyExists => { @@ -129,12 +132,12 @@ pub async fn main(mut args: impl Iterator) -> eyre::Result<()> { Err(other) => Err(other)?, } } - let (mut file, path) = result.ok_or_eyre("too many posts :(")?; + let (post_id, path, mut file) = result.ok_or_eyre("too many posts :(")?; info!("writing {path:?}"); file.write_all(meta.render()?.as_bytes())?; file.write_all(b"\n\n")?; - let unsafe_html = e_content; + let unsafe_html = process_content(&e_content, post_id, &base_href, &RealAttachmentsContext)?; let post = TemplatedPost::filter(&unsafe_html, Some(path.clone()))?; file.write_all(post.safe_html.as_bytes())?; @@ -143,6 +146,52 @@ pub async fn main(mut args: impl Iterator) -> eyre::Result<()> { Ok(()) } +fn process_content( + content: &str, + post_id: usize, + base_href: &Url, + context: &dyn AttachmentsContext, +) -> eyre::Result { + let dom = parse(content.as_bytes())?; + + // rewrite attachment urls to relative cached paths. + for node in Traverse::new(dom.document.clone()) { + match &node.data { + NodeData::Element { name, attrs, .. } => { + let element_attr_names = match name { + name if name == &make_html_tag_name("img") => Some(("img", "src")), + _ => None, + }; + if let Some((element_name, attr_name)) = element_attr_names { + let mut attrs = attrs.borrow_mut(); + if let Some(attr) = find_attr_mut(&mut attrs, &attr_name) { + let old_url = tendril_to_str(&attr.value)?.to_owned(); + let fetch_url = base_href.join(&old_url)?; + trace!("found attachment url in <{element_name} {attr_name}>: {old_url}"); + attr.value = context + .cache_imported(&fetch_url.to_string(), post_id)? + .base_relative_url() + .into(); + attrs.push(Attribute { + name: make_attribute_name(&format!("data-import-{attr_name}")), + value: old_url.into(), + }); + } + if element_name == "img" { + attrs.push(Attribute { + name: make_attribute_name("loading"), + value: "lazy".into(), + }); + } + } + } + _ => {} + } + } + + Ok(serialize(dom)?) +} + fn mf2_e(node: Handle, class: &str) -> eyre::Result> { // TODO: handle full return value in let Some(node) = mf2_find(node, class) else { diff --git a/src/command/server.rs b/src/command/server.rs index ab898b6..6ce57d7 100644 --- a/src/command/server.rs +++ b/src/command/server.rs @@ -21,7 +21,7 @@ use warp::{ Filter, }; -use crate::SETTINGS; +use crate::{path::AttachmentsPath, SETTINGS}; use crate::{ path::{PostsPath, SitePath}, render_markdown, PostMeta, TemplatedPost, Thread, ThreadsContentTemplate, @@ -158,8 +158,17 @@ pub async fn main(mut _args: impl Iterator) -> eyre::Result<()> { let default_route = warp::filters::method::get() .and(warp::filters::path::peek()) .and_then(|peek: Peek| async move { - let mut path = PathBuf::from("site"); - for component in peek.segments() { + let mut segments = peek.segments().peekable(); + // serve attachments out of main attachment store, in case we need to preview a post + // that refers to an attachment for the first time. otherwise they will 404, since + // render won’t have hard-linked it into the site output dir. + let mut path: PathBuf = if segments.peek() == Some(&"attachments") { + segments.next(); + (&*AttachmentsPath::ROOT).as_ref().to_owned() + } else { + (&*SitePath::ROOT).as_ref().to_owned() + }; + for component in segments { let component = urlencoding::decode(component) .wrap_err("failed to decode url path component") .map_err(BadRequest)?; diff --git a/src/lib.rs b/src/lib.rs index 0442ea2..6f18e2a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -280,6 +280,7 @@ impl TemplatedPost { let safe_html = ammonia::Builder::default() .add_generic_attributes(["style", "id"]) .add_generic_attributes(["data-cohost-href", "data-cohost-src"]) // cohost2autost + .add_generic_attributes(["data-import-src"]) // autost import .add_tag_attributes("a", ["target"]) .add_tag_attributes("audio", ["controls", "src"]) .add_tag_attributes("details", ["open"])