Skip to content

Commit

Permalink
import: cache embedded images as attachments (#16)
Browse files Browse the repository at this point in the history
  • Loading branch information
delan committed Oct 4, 2024
1 parent a1b7b21 commit 8b81edf
Show file tree
Hide file tree
Showing 7 changed files with 139 additions and 8 deletions.
12 changes: 12 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ markup5ever_rcdom = "0.3.0"
rayon = "1.10.0"
serde = { version = "1.0.210", features = ["derive"] }
serde_json = "1.0.128"
sha2 = "0.10.8"
tokio = { version = "1.40.0", features = ["full"] }
toml = "0.8.19"
tracing = "0.1.40"
Expand Down
58 changes: 57 additions & 1 deletion src/attachments.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,34 @@ use std::{

use jane_eyre::eyre::{self, bail};
use reqwest::redirect::Policy;
use tracing::{debug, trace};
use sha2::{digest::generic_array::functional::FunctionalSequence, Digest, Sha256};
use tracing::{debug, trace, warn};

use crate::{
cohost::attachment_id_to_url,
path::{AttachmentsPath, SitePath},
};

/// Abstraction over attachment caching, so callers can be handed either the real
/// implementation or a test double. Every method returns the [`SitePath`] of the
/// cached attachment, or an error.
pub trait AttachmentsContext {
/// Cache the attachment found at `url` on behalf of the imported post `post_id`.
fn cache_imported(&self, url: &str, post_id: usize) -> eyre::Result<SitePath>;
/// Cache the file for the cohost attachment with the given `id`.
// NOTE(review): the exact caching behaviour is implementation-defined and not fully
// visible here — confirm against the implementations.
fn cache_cohost_file(&self, id: &str) -> eyre::Result<SitePath>;
/// Cache the thumbnail for the cohost attachment with the given `id`.
// NOTE(review): presumably a reduced-size variant of the same attachment — confirm.
fn cache_cohost_thumb(&self, id: &str) -> eyre::Result<SitePath>;
}

/// [`AttachmentsContext`] implementation used outside of tests; its `cache_imported`
/// stores downloads in a directory under `AttachmentsPath::ROOT`.
pub struct RealAttachmentsContext;
impl AttachmentsContext for RealAttachmentsContext {
/// Cache the attachment at `url` for the imported post `post_id`, returning its site path.
///
/// The attachment lives in a directory named `imported-{post_id}-{hash}` under the
/// attachments root, where `hash` is the lowercase hex sha256 digest of `url`. The
/// directory is created if needed before the attachment is fetched or reused.
#[tracing::instrument(skip(self))]
fn cache_imported(&self, url: &str, post_id: usize) -> eyre::Result<SitePath> {
    // one-shot digest of the url, rendered octet by octet as two hex digits.
    let digest = Sha256::digest(url);
    let hex = digest.map(|octet| format!("{octet:02x}")).join("");

    let dir_name = format!("imported-{post_id}-{hex}");
    let path = AttachmentsPath::ROOT.join(&dir_name)?;
    trace!(?path);
    create_dir_all(&path)?;

    let cached = cache_imported_attachment(url, &path)?;
    cached.site_path()
}

#[tracing::instrument(skip(self))]
fn cache_cohost_file(&self, id: &str) -> eyre::Result<SitePath> {
let url = attachment_id_to_url(id);
Expand Down Expand Up @@ -58,6 +72,48 @@ fn cached_attachment_url(id: &str, dir: &AttachmentsPath) -> eyre::Result<Attach
Ok(path.join_dir_entry(&entry?)?)
}

/// given an attachment `url` and the path to its cache directory (`path`), return the
/// path of the cached attachment inside that directory, downloading it if necessary.
///
/// if `path` can be listed and its first entry can be opened, that entry is treated as
/// the cached attachment: it is read through once to check that it is readable, then
/// returned. otherwise `url` is fetched with a blocking GET and stored as
/// `file.<extension>`, where the extension is derived from the response’s `Content-Type`
/// (`bin` when the type is missing or unrecognised).
///
/// # Errors
///
/// fails if the cached file cannot be read, if the request fails or the server answers
/// with an HTTP error status, or if the downloaded file cannot be written.
fn cache_imported_attachment(url: &str, path: &AttachmentsPath) -> eyre::Result<AttachmentsPath> {
    // if the attachment directory exists...
    if let Ok(mut entries) = read_dir(&path) {
        // and the directory contains a file...
        if let Some(entry) = entries.next() {
            // and we can open the file...
            // TODO: move this logic into path module
            let path = path.join_dir_entry(&entry?)?;
            if let Ok(mut file) = File::open(&path) {
                trace!("cache hit: {url}");
                // check if we can read the file, streaming it into a sink so that large
                // attachments are not buffered in memory just to be thrown away.
                std::io::copy(&mut file, &mut std::io::sink())?;
                return Ok(path);
            }
        }
    }

    trace!("cache miss");
    debug!("downloading attachment");

    // treat 4xx/5xx responses as failures. without this, the body of an error page would
    // be written to disk and then served as a cache hit on every later run.
    let response = reqwest::blocking::get(url)?.error_for_status()?;

    // media types are case-insensitive and may carry parameters (`image/svg+xml;
    // charset=utf-8`), so compare only the lowercased essence.
    let content_type = response.headers().get("Content-Type");
    let mime_type = content_type
        .and_then(|value| value.to_str().ok())
        .map(|value| {
            value
                .split(';')
                .next()
                .unwrap_or(value)
                .trim()
                .to_ascii_lowercase()
        });
    let extension = match mime_type.as_deref() {
        Some("image/gif") => "gif",
        Some("image/jpeg") => "jpg",
        Some("image/png") => "png",
        Some("image/svg+xml") => "svg",
        Some("image/webp") => "webp",
        _ => {
            warn!("unknown attachment mime type: {content_type:?}");
            "bin"
        }
    };
    let path = path.join(&format!("file.{extension}"))?;
    debug!(?path);

    // fetch the whole body before creating the file, so a failed download never leaves
    // an empty file behind that would later look like a cache hit.
    let body = response.bytes()?;
    File::create(&path)?.write_all(&body)?;

    Ok(path)
}

/// given a cohost attachment redirect (`url`) and path to a uuid dir (`path`),
/// return the cached attachment path (`path/original-filename.ext`).
///
Expand Down
3 changes: 3 additions & 0 deletions src/command/cohost2autost.rs
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,9 @@ fn process_chost_fragment(
fn test_render_markdown_block() -> eyre::Result<()> {
struct TestAttachmentsContext {}
impl AttachmentsContext for TestAttachmentsContext {
fn cache_imported(&self, _url: &str, _post_id: usize) -> eyre::Result<SitePath> {
unreachable!();
}
fn cache_cohost_file(&self, id: &str) -> eyre::Result<SitePath> {
Ok(SitePath::ATTACHMENTS.join(&format!("{id}"))?)
}
Expand Down
57 changes: 53 additions & 4 deletions src/command/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ use std::{
};

use askama::Template;
use html5ever::Attribute;
use jane_eyre::eyre::{self, bail, OptionExt};
use markup5ever_rcdom::{Handle, NodeData};
use tracing::{debug, info, trace};
use url::Url;

use crate::{
attachments::{AttachmentsContext, RealAttachmentsContext},
dom::{
attr_value, make_html_tag_name, parse_html_document, serialize_node, text_content, Traverse,
attr_value, find_attr_mut, make_attribute_name, make_html_tag_name, parse,
parse_html_document, serialize, serialize_node, tendril_to_str, text_content, Traverse,
},
migrations::run_migrations,
path::PostsPath,
Expand Down Expand Up @@ -114,7 +117,7 @@ pub async fn main(mut args: impl Iterator<Item = String>) -> eyre::Result<()> {
let path = PostsPath::imported_post_path(post_id);
match File::create_new(&path) {
Ok(file) => {
result = Some((file, path));
result = Some((post_id, path, file));
break;
}
Err(error) if error.kind() == io::ErrorKind::AlreadyExists => {
Expand All @@ -129,12 +132,12 @@ pub async fn main(mut args: impl Iterator<Item = String>) -> eyre::Result<()> {
Err(other) => Err(other)?,
}
}
let (mut file, path) = result.ok_or_eyre("too many posts :(")?;
let (post_id, path, mut file) = result.ok_or_eyre("too many posts :(")?;

info!("writing {path:?}");
file.write_all(meta.render()?.as_bytes())?;
file.write_all(b"\n\n")?;
let unsafe_html = e_content;
let unsafe_html = process_content(&e_content, post_id, &base_href, &RealAttachmentsContext)?;
let post = TemplatedPost::filter(&unsafe_html, Some(path.clone()))?;
file.write_all(post.safe_html.as_bytes())?;

Expand All @@ -143,6 +146,52 @@ pub async fn main(mut args: impl Iterator<Item = String>) -> eyre::Result<()> {
Ok(())
}

/// Rewrite the attachment urls in an imported post’s html so they point at cached copies.
///
/// `content` is parsed as html, and for every `<img>` element with a `src` attribute:
/// - the `src` is resolved against `base_href` and cached via `context` for `post_id`,
/// - `src` is replaced with the cached attachment’s base-relative url, and
/// - the original value is kept in a new `data-import-src` attribute.
///
/// Every `<img>` element, with or without `src`, also gains `loading="lazy"`.
/// Returns the serialized html of the modified document.
fn process_content(
    content: &str,
    post_id: usize,
    base_href: &Url,
    context: &dyn AttachmentsContext,
) -> eyre::Result<String> {
    let dom = parse(content.as_bytes())?;

    // rewrite attachment urls to relative cached paths.
    for node in Traverse::new(dom.document.clone()) {
        // only elements can carry attachment urls.
        let NodeData::Element { name, attrs, .. } = &node.data else {
            continue;
        };

        // (element, attribute) pair whose url we cache; only <img src> for now.
        let target = if name == &make_html_tag_name("img") {
            Some(("img", "src"))
        } else {
            None
        };
        let Some((element_name, attr_name)) = target else {
            continue;
        };

        let mut attrs = attrs.borrow_mut();
        if let Some(attr) = find_attr_mut(&mut attrs, &attr_name) {
            let old_url = tendril_to_str(&attr.value)?.to_owned();
            let absolute_url = base_href.join(&old_url)?;
            trace!("found attachment url in <{element_name} {attr_name}>: {old_url}");

            let cached = context.cache_imported(absolute_url.as_str(), post_id)?;
            attr.value = cached.base_relative_url().into();

            // remember where the attachment originally came from.
            let import_attr = Attribute {
                name: make_attribute_name(&format!("data-import-{attr_name}")),
                value: old_url.into(),
            };
            attrs.push(import_attr);
        }

        if element_name == "img" {
            let lazy_attr = Attribute {
                name: make_attribute_name("loading"),
                value: "lazy".into(),
            };
            attrs.push(lazy_attr);
        }
    }

    let html = serialize(dom)?;
    Ok(html)
}

fn mf2_e(node: Handle, class: &str) -> eyre::Result<Option<String>> {
// TODO: handle full return value in <https://microformats.org/wiki/microformats2-parsing#parsing_an_e-_property>
let Some(node) = mf2_find(node, class) else {
Expand Down
15 changes: 12 additions & 3 deletions src/command/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use warp::{
Filter,
};

use crate::SETTINGS;
use crate::{path::AttachmentsPath, SETTINGS};
use crate::{
path::{PostsPath, SitePath},
render_markdown, PostMeta, TemplatedPost, Thread, ThreadsContentTemplate,
Expand Down Expand Up @@ -158,8 +158,17 @@ pub async fn main(mut _args: impl Iterator<Item = String>) -> eyre::Result<()> {
let default_route = warp::filters::method::get()
.and(warp::filters::path::peek())
.and_then(|peek: Peek| async move {
let mut path = PathBuf::from("site");
for component in peek.segments() {
let mut segments = peek.segments().peekable();
// serve attachments out of main attachment store, in case we need to preview a post
// that refers to an attachment for the first time. otherwise they will 404, since
// render won’t have hard-linked it into the site output dir.
let mut path: PathBuf = if segments.peek() == Some(&"attachments") {
segments.next();
(&*AttachmentsPath::ROOT).as_ref().to_owned()
} else {
(&*SitePath::ROOT).as_ref().to_owned()
};
for component in segments {
let component = urlencoding::decode(component)
.wrap_err("failed to decode url path component")
.map_err(BadRequest)?;
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ impl TemplatedPost {
let safe_html = ammonia::Builder::default()
.add_generic_attributes(["style", "id"])
.add_generic_attributes(["data-cohost-href", "data-cohost-src"]) // cohost2autost
.add_generic_attributes(["data-import-src"]) // autost import
.add_tag_attributes("a", ["target"])
.add_tag_attributes("audio", ["controls", "src"])
.add_tag_attributes("details", ["open"])
Expand Down

0 comments on commit 8b81edf

Please sign in to comment.