feat: logging for xtask, scrape more links #134

Merged 1 commit on Jan 1, 2025

69 changes: 60 additions & 9 deletions Cargo.lock

(Generated file; full diff not shown.)

8 changes: 7 additions & 1 deletion Cargo.toml
@@ -20,11 +20,17 @@ default-members = [
     "mwp-web"
 ]
 
+[profile.dev]
+opt-level = 0
+
 [profile.release]
-lto = "thin"
+lto = "fat"
+opt-level = 3
 
 [workspace.dependencies]
 tantivy = { version = "0.22.0", features = ["mmap"] }
 url = { version = "2.5.4", features = ["serde"] }
 time = "0.3.37"
 rusqlite = { version = "0.31.0", features = ["time", "url", "bundled"]}
+log = "0.4.22"
+tokio = { version = "1.42.0", features= ["full"]}
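
Two things change here: the new [profile.dev] pins debug builds at opt-level = 0 for fast compiles, while the release profile moves from "thin" to "fat" LTO plus opt-level = 3, trading longer release link times for whole-program optimization across the dependency graph. log and tokio also become workspace dependencies, so member crates can inherit a single pinned version with workspace = true, as the manifests below do.
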
Binary file modified db.db3
1 change: 1 addition & 0 deletions mwp-scraper/Cargo.toml
@@ -17,6 +17,7 @@ rusqlite = { workspace = true }
 tantivy = { workspace = true }
 url = { workspace = true }
 time = { workspace = true }
+log = { workspace = true }
 
 mwp-content = { path="../mwp-content" }
 mwp-search = { path="../mwp-search" }
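
Worth noting: log is only a logging facade. Library crates like mwp-scraper emit records through its macros, but nothing is printed until some binary installs a concrete logger, which this PR does in xtask with pretty_env_logger (see the last file below). A minimal sketch of how the two halves pair up, assuming both crates are on the dependency list:

    fn main() {
        // Binary side: install the logger once at startup.
        // pretty_env_logger reads the RUST_LOG environment variable.
        pretty_env_logger::init();

        // Library side: emit through the facade macros.
        log::info!("logging is live");
    }
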
59 changes: 33 additions & 26 deletions mwp-scraper/src/lib.rs
@@ -4,6 +4,9 @@ use rusqlite::Connection;
 use time::OffsetDateTime;
 use url::Url;
 
+#[macro_use]
+extern crate log;
+
 mod parser;
 
 use crate::parser::{DomParser, DomParserResult};
@@ -20,6 +23,8 @@ pub async fn scrape(link: &Url) -> Result<DomParserResult, Box<dyn std::error::Error>> {
 }
 
 pub async fn scrape_all() -> Result<(), Box<dyn std::error::Error>> {
+    info!("scraping all links");
+
     let conn = Connection::open("./db.db3")?;
     conn.execute(
         r#"
@@ -38,13 +43,14 @@ pub async fn scrape_all() -> Result<(), Box<dyn std::error::Error>> {
 
     let content = mwp_content::Content::from_dir("../wiki").await;
 
-    // only needed before links from content are migrated to bookmarking system
     let links = content
         .all()
         .values()
         .flat_map(|p| p.links.clone())
         .collect::<Vec<Link>>();
 
+    info!("collected {} links", links.len());
+
     let mut stmt = conn.prepare("SELECT * FROM links WHERE url = ?1")?;
     for Link {
         title, url, tags, ..
@@ -63,6 +69,8 @@ pub async fn scrape_all() -> Result<(), Box<dyn std::error::Error>> {
         // }
         // };
 
+        info!("inserting new link {}", url);
+
         // println!("Link {} to {}", link.title, link.url);
         let mut doc = Doc::new(
             title,
@@ -86,43 +94,42 @@ pub async fn scrape_all() -> Result<(), Box<dyn std::error::Error>> {
         )?;
     }
 
-    loop {
-        let link = match conn.query_row(
-            "SELECT title, url, domain, body, tags, created_at, scraped_at FROM links WHERE body IS NULL AND scraped_at IS NULL LIMIT 1",
-            [],
-            |row| Ok(Doc {
-                title: row.get(0)?,
-                url: row.get(1)?,
-                domain: row.get(2)?,
-                body: row.get(3)?,
-                tags: row.get::<usize, Option<String>>(4).map(|res| res.map(|s| s.split(';').map(|s| s.into()).collect::<Vec<String>>()))?,
-                created_at: row.get(5)?,
-                scraped_at: row.get(6)?,
-            }),
-        ) {
-            Ok(link) => link,
-            Err(e) => {
-                println!("query link: {:?}", e);
-                break;
-            },
-        };
-
-        let data = scrape(&link.url).await;
+    let docs = conn.prepare(
+        "SELECT title, url, domain, body, tags, created_at, scraped_at FROM links WHERE body IS NULL AND scraped_at IS NULL",
+    )?.query_map(
+        [],
+        |row| Ok(Doc {
+            title: row.get(0)?,
+            url: row.get(1)?,
+            domain: row.get(2)?,
+            body: row.get(3)?,
+            tags: row.get::<usize, Option<String>>(4).map(|res| res.map(|s| s.split(';').map(|s| s.into()).collect::<Vec<String>>()))?,
+            created_at: row.get(5)?,
+            scraped_at: row.get(6)?,
+        }),
+    )?.collect::<Result<Vec<_>, _>>()?;
+
+    info!("will scrape {} documents", docs.len());
+
+    for doc in docs {
+        info!("scraping link {}, tags {:?}", doc.url, doc.tags);
+
+        let data = scrape(&doc.url).await;
         let data = match data {
             Ok(data) => data,
             Err(err) => {
-                println!("Scrape {}: {}", link.url, err);
+                error!("scrape {}: {}", doc.url, err);
                 conn.execute(
                     "UPDATE links SET scraped_at = datetime('now') WHERE url = ?1",
-                    rusqlite::params![link.url],
+                    rusqlite::params![doc.url],
                 )?;
                 continue;
             }
         };
 
         conn.execute(
             "UPDATE links SET body = ?1, scraped_at = datetime('now') WHERE url = ?2",
-            rusqlite::params![data.digest, link.url],
+            rusqlite::params![data.digest, doc.url],
         )?;
     }
 
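
The behavioral change in this file: the old code re-queried one unscraped row at a time with a LIMIT 1 query inside a loop, and stopped only when query_row returned an error (typically QueryReturnedNoRows once the table was exhausted). The new code materializes every pending row up front with query_map and then iterates, which also yields a total count for the new log lines. A minimal, self-contained sketch of that pattern, using a hypothetical table rather than the project's schema:

    use rusqlite::Connection;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let conn = Connection::open_in_memory()?;
        conn.execute("CREATE TABLE links (url TEXT, scraped_at TEXT)", [])?;
        conn.execute("INSERT INTO links (url) VALUES ('https://example.com')", [])?;

        // Collect all pending rows first, so the reads and the
        // UPDATEs issued in the loop below don't interleave.
        let urls = conn
            .prepare("SELECT url FROM links WHERE scraped_at IS NULL")?
            .query_map([], |row| row.get::<_, String>(0))?
            .collect::<Result<Vec<_>, _>>()?;

        for url in urls {
            conn.execute(
                "UPDATE links SET scraped_at = datetime('now') WHERE url = ?1",
                rusqlite::params![url],
            )?;
        }
        Ok(())
    }
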
1 change: 1 addition & 0 deletions mwp-search/src/lib.rs
@@ -6,6 +6,7 @@ pub struct SearchIndex {
     pub index: Index,
 }
 
+#[derive(Clone)]
 pub struct Doc {
     pub title: String,
     pub url: Url,
14 changes: 6 additions & 8 deletions mwp-web/src/static/styles.scss
@@ -141,13 +141,13 @@ form {
 
 .layout {
   display: grid;
-  grid-template-columns: 16em minmax(0, 1fr);
+  grid-template-columns: 16em minmax(0, 1fr) 16em;
   grid-template-rows: auto auto 1fr auto;
   grid-template-areas:
-    'nav nav'
-    'sidebar meta'
-    'sidebar content'
-    'footer footer';
+    'nav nav nav'
+    'sidebar meta blank'
+    'sidebar content blank'
+    'footer footer footer';
 
   @media (width <=1000px) {
     grid-template-columns: auto;
@@ -168,7 +168,6 @@
   justify-content: space-between;
   align-items: center;
   padding: var(--spacings-kilo) var(--spacings-giga);
-  border-bottom: 2px solid var(--light);
 
   .burger {
     display: none;
@@ -237,7 +236,6 @@ footer {
 }
 
 article {
-  max-width: 920px;
   width: 100%;
 
   img {
@@ -388,4 +386,4 @@ article {
       display: block;
     }
   }
-}
+}
4 changes: 3 additions & 1 deletion xtask/Cargo.toml
@@ -11,6 +11,8 @@ clap = { version = "4.5.23", features = [
     "env",
     "wrap_help",
 ] }
-tokio = { version = "1.42.0", features= ["full"]}
+pretty_env_logger = "0.5.0"
 
+tokio = { workspace = true }
+
 mwp-scraper = { path="../mwp-scraper" }
8 changes: 2 additions & 6 deletions xtask/src/main.rs
@@ -12,14 +12,10 @@ enum Xtask {
 
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    pretty_env_logger::init();
     let xtask = Xtask::parse();
 
     match xtask {
-        Xtask::Scrape => scrape().await,
+        Xtask::Scrape => mwp_scraper::scrape_all().await,
     }
 }
-
-async fn scrape() -> Result<(), Box<dyn std::error::Error>> {
-    mwp_scraper::scrape_all().await?;
-    Ok(())
-}
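
Since pretty_env_logger defaults to showing only errors, running the task with the RUST_LOG environment variable set, for example RUST_LOG=info cargo run -p xtask -- scrape (the exact invocation depends on how the repo's xtask alias is wired up), should now surface the info! and error! lines added in mwp-scraper.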