feat: logging for xtask, scrape more links #134

Merged 1 commit on Jan 1, 2025

69 changes: 60 additions & 9 deletions Cargo.lock

(Generated file; full diff not shown.)

8 changes: 7 additions & 1 deletion Cargo.toml
@@ -20,11 +20,17 @@ default-members = [
     "mwp-web"
 ]
 
+[profile.dev]
+opt-level = 0
+
 [profile.release]
-lto = "thin"
+lto = "fat"
+opt-level = 3
 
 [workspace.dependencies]
 tantivy = { version = "0.22.0", features = ["mmap"] }
 url = { version = "2.5.4", features = ["serde"] }
 time = "0.3.37"
 rusqlite = { version = "0.31.0", features = ["time", "url", "bundled"]}
+log = "0.4.22"
+tokio = { version = "1.42.0", features= ["full"]}
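
Two things change here: the new [profile.dev] pins debug builds at opt-level = 0 for fast compiles, while the release profile moves from "thin" to "fat" LTO plus opt-level = 3, trading longer release link times for whole-program optimization across the dependency graph. log and tokio also become workspace dependencies, so member crates can inherit a single pinned version with workspace = true, as the manifests below do.
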
Binary file modified db.db3
1 change: 1 addition & 0 deletions mwp-scraper/Cargo.toml
@@ -17,6 +17,7 @@ rusqlite = { workspace = true }
 tantivy = { workspace = true }
 url = { workspace = true }
 time = { workspace = true }
+log = { workspace = true }
 
 mwp-content = { path="../mwp-content" }
 mwp-search = { path="../mwp-search" }
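
Worth noting: log is only a logging facade. Library crates like mwp-scraper emit records through its macros, but nothing is printed until some binary installs a concrete logger, which this PR does in xtask with pretty_env_logger (see the last file below). A minimal sketch of how the two halves pair up, assuming both crates are on the dependency list:

    fn main() {
        // Binary side: install the logger once at startup.
        // pretty_env_logger reads the RUST_LOG environment variable.
        pretty_env_logger::init();

        // Library side: emit through the facade macros.
        log::info!("logging is live");
    }
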
59 changes: 33 additions & 26 deletions mwp-scraper/src/lib.rs
@@ -4,6 +4,9 @@ use rusqlite::Connection;
 use time::OffsetDateTime;
 use url::Url;
 
+#[macro_use]
+extern crate log;
+
 mod parser;
 
 use crate::parser::{DomParser, DomParserResult};
@@ -20,6 +23,8 @@ pub async fn scrape(link: &Url) -> Result<DomParserResult, Box<dyn std::error::Error>> {
 }
 
 pub async fn scrape_all() -> Result<(), Box<dyn std::error::Error>> {
+    info!("scraping all links");
+
     let conn = Connection::open("./db.db3")?;
     conn.execute(
         r#"
@@ -38,13 +43,14 @@ pub async fn scrape_all() -> Result<(), Box<dyn std::error::Error>> {
 
     let content = mwp_content::Content::from_dir("../wiki").await;
 
-    // only needed before links from content are migrated to bookmarking system
     let links = content
         .all()
         .values()
         .flat_map(|p| p.links.clone())
         .collect::<Vec<Link>>();
 
+    info!("collected {} links", links.len());
+
     let mut stmt = conn.prepare("SELECT * FROM links WHERE url = ?1")?;
     for Link {
         title, url, tags, ..
@@ -63,6 +69,8 @@ pub async fn scrape_all() -> Result<(), Box<dyn std::error::Error>> {
         // }
         // };
 
+        info!("inserting new link {}", url);
+
         // println!("Link {} to {}", link.title, link.url);
         let mut doc = Doc::new(
             title,
@@ -86,43 +94,42 @@ pub async fn scrape_all() -> Result<(), Box<dyn std::error::Error>> {
         )?;
     }
 
-    loop {
-        let link = match conn.query_row(
-            "SELECT title, url, domain, body, tags, created_at, scraped_at FROM links WHERE body IS NULL AND scraped_at IS NULL LIMIT 1",
-            [],
-            |row| Ok(Doc {
-                title: row.get(0)?,
-                url: row.get(1)?,
-                domain: row.get(2)?,
-                body: row.get(3)?,
-                tags: row.get::<usize, Option<String>>(4).map(|res| res.map(|s| s.split(';').map(|s| s.into()).collect::<Vec<String>>()))?,
-                created_at: row.get(5)?,
-                scraped_at: row.get(6)?,
-            }),
-        ) {
-            Ok(link) => link,
-            Err(e) => {
-                println!("query link: {:?}", e);
-                break;
-            },
-        };
-
-        let data = scrape(&link.url).await;
+    let docs = conn.prepare(
+        "SELECT title, url, domain, body, tags, created_at, scraped_at FROM links WHERE body IS NULL AND scraped_at IS NULL",
+    )?.query_map(
+        [],
+        |row| Ok(Doc {
+            title: row.get(0)?,
+            url: row.get(1)?,
+            domain: row.get(2)?,
+            body: row.get(3)?,
+            tags: row.get::<usize, Option<String>>(4).map(|res| res.map(|s| s.split(';').map(|s| s.into()).collect::<Vec<String>>()))?,
+            created_at: row.get(5)?,
+            scraped_at: row.get(6)?,
+        }),
+    )?.collect::<Result<Vec<_>, _>>()?;
+
+    info!("will scrape {} documents", docs.len());
+
+    for doc in docs {
+        info!("scraping link {}, tags {:?}", doc.url, doc.tags);
+
+        let data = scrape(&doc.url).await;
         let data = match data {
             Ok(data) => data,
             Err(err) => {
-                println!("Scrape {}: {}", link.url, err);
+                error!("scrape {}: {}", doc.url, err);
                 conn.execute(
                     "UPDATE links SET scraped_at = datetime('now') WHERE url = ?1",
-                    rusqlite::params![link.url],
+                    rusqlite::params![doc.url],
                 )?;
                 continue;
             }
         };
 
         conn.execute(
             "UPDATE links SET body = ?1, scraped_at = datetime('now') WHERE url = ?2",
-            rusqlite::params![data.digest, link.url],
+            rusqlite::params![data.digest, doc.url],
         )?;
     }
 
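
The behavioral change in this file: the old code re-queried one unscraped row at a time with a LIMIT 1 query inside a loop, and stopped only when query_row returned an error (typically QueryReturnedNoRows once the table was exhausted). The new code materializes every pending row up front with query_map and then iterates, which also yields a total count for the new log lines. A minimal, self-contained sketch of that pattern, using a hypothetical table rather than the project's schema:

    use rusqlite::Connection;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let conn = Connection::open_in_memory()?;
        conn.execute("CREATE TABLE links (url TEXT, scraped_at TEXT)", [])?;
        conn.execute("INSERT INTO links (url) VALUES ('https://example.com')", [])?;

        // Collect all pending rows first, so the reads and the
        // UPDATEs issued in the loop below don't interleave.
        let urls = conn
            .prepare("SELECT url FROM links WHERE scraped_at IS NULL")?
            .query_map([], |row| row.get::<_, String>(0))?
            .collect::<Result<Vec<_>, _>>()?;

        for url in urls {
            conn.execute(
                "UPDATE links SET scraped_at = datetime('now') WHERE url = ?1",
                rusqlite::params![url],
            )?;
        }
        Ok(())
    }
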
1 change: 1 addition & 0 deletions mwp-search/src/lib.rs
@@ -6,6 +6,7 @@ pub struct SearchIndex {
     pub index: Index,
 }
 
+#[derive(Clone)]
 pub struct Doc {
     pub title: String,
     pub url: Url,
14 changes: 6 additions & 8 deletions mwp-web/src/static/styles.scss
@@ -141,13 +141,13 @@ form {
 
 .layout {
   display: grid;
-  grid-template-columns: 16em minmax(0, 1fr);
+  grid-template-columns: 16em minmax(0, 1fr) 16em;
   grid-template-rows: auto auto 1fr auto;
   grid-template-areas:
-    'nav nav'
-    'sidebar meta'
-    'sidebar content'
-    'footer footer';
+    'nav nav nav'
+    'sidebar meta blank'
+    'sidebar content blank'
+    'footer footer footer';
 
   @media (width <=1000px) {
     grid-template-columns: auto;
@@ -168,7 +168,6 @@
   justify-content: space-between;
   align-items: center;
   padding: var(--spacings-kilo) var(--spacings-giga);
-  border-bottom: 2px solid var(--light);
 
   .burger {
     display: none;
@@ -237,7 +236,6 @@ footer {
 }
 
 article {
-  max-width: 920px;
   width: 100%;
 
   img {
@@ -388,4 +386,4 @@ article {
       display: block;
     }
   }
-}
+}
4 changes: 3 additions & 1 deletion xtask/Cargo.toml
@@ -11,6 +11,8 @@ clap = { version = "4.5.23", features = [
     "env",
     "wrap_help",
 ] }
-tokio = { version = "1.42.0", features= ["full"]}
+pretty_env_logger = "0.5.0"
 
+tokio = { workspace = true }
+
 mwp-scraper = { path="../mwp-scraper" }
8 changes: 2 additions & 6 deletions xtask/src/main.rs
@@ -12,14 +12,10 @@ enum Xtask {
 
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    pretty_env_logger::init();
     let xtask = Xtask::parse();
 
     match xtask {
-        Xtask::Scrape => scrape().await,
+        Xtask::Scrape => mwp_scraper::scrape_all().await,
     }
 }
-
-async fn scrape() -> Result<(), Box<dyn std::error::Error>> {
-    mwp_scraper::scrape_all().await?;
-    Ok(())
-}
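
Since pretty_env_logger defaults to showing only errors, running the task with the RUST_LOG environment variable set, for example RUST_LOG=info cargo run -p xtask -- scrape (the exact invocation depends on how the repo's xtask alias is wired up), should now surface the info! and error! lines added in mwp-scraper.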