Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
maniyar1 committed Jun 6, 2021
0 parents commit 0f685c2
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/target
Cargo.lock
wiki/
18 changes: 18 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[package]
name = "wikipedia-category-downloader"
version = "0.1.0"
authors = ["maniyar"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
select = "0.5.0"
url = "2.2.2"
rayon = "1.5"
getopts = "0.2"

[dependencies.reqwest]
version = "0.11"
default-features = false
features = ["blocking", "rustls-tls"]
25 changes: 25 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <https://unlicense.org>

9 changes: 9 additions & 0 deletions README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
A simple tool to download a Wikipedia category. Example usage:

Download the pages in this category and, by default, in the sub-categories one level below it:

./wikipedia-category-downloader https://en.wikipedia.org/wiki/Category:Marxism

Download only the pages in this category, without any sub-categories:

./wikipedia-category-downloader https://en.wikipedia.org/wiki/Category:Marxism -l 0
135 changes: 135 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// Imports: command-line parsing, parallel iteration, HTML scraping, file output and URL handling
use getopts::Options;
use rayon::prelude::*;
use select::document::Document;
use select::predicate::{Attr, Name, Predicate};
use std::env;
use std::fs::File;
use std::io::prelude::*;
use url::Url;

/// Entry point: parses the command line and downloads the requested category into `./wiki/`.
///
/// Usage: `PROGRAM URL [options]`
///
/// * `-l`, `--levels N` - how many category levels to go down (defaults to 1 when absent).
/// * `-a`, `--append-html` - append `.html` to the saved file names.
/// * `-h`, `--help` - print the help menu and exit.
///
/// The URL is taken from the first non-option argument, so options may appear before or
/// after it. Invalid options, a non-numeric level, an unparsable URL or a failure to create
/// the output directory are reported on stderr and the process exits with status 1.
fn main() {
    let args: Vec<String> = env::args().collect();
    let program = args
        .first()
        .map(String::as_str)
        .unwrap_or("wikipedia-category-downloader");

    let mut opts = Options::new();
    opts.optopt("l", "levels", "how many category levels to go down", "1");
    opts.optflag(
        "a",
        "append-html",
        "append .html to the end of file names, will break links",
    );
    opts.optflag("h", "help", "print this help menu");

    let matches = match opts.parse(args.get(1..).unwrap_or_default()) {
        Ok(m) => m,
        Err(f) => {
            eprintln!("{}", f);
            print_usage(program, opts);
            std::process::exit(1);
        }
    };

    // The URL is the first free (non-option) argument; reading args[1] directly would
    // pick up an option such as "-l" whenever the options are given first.
    let input = match matches.free.first() {
        Some(url) if !matches.opt_present("h") => url,
        _ => {
            print_usage(program, opts);
            return;
        }
    };

    let issue_list_url = match Url::parse(input) {
        Ok(url) => url,
        Err(e) => {
            eprintln!("invalid URL {:?}: {}", input, e);
            std::process::exit(1);
        }
    };

    // Levels default to 1 when -l is not supplied.
    let max_category_level = match matches.opt_get::<i32>("l") {
        Ok(levels) => levels.unwrap_or(1),
        Err(e) => {
            eprintln!("invalid value for --levels: {}", e);
            std::process::exit(1);
        }
    };
    let append_html = matches.opt_present("a");

    if let Err(e) = std::fs::create_dir_all("./wiki/") {
        eprintln!("could not create output directory ./wiki/: {}", e);
        std::process::exit(1);
    }

    // Only the path of the given URL is used; the starting category is level 0.
    get_category(issue_list_url.path(), max_category_level, 0, append_html);
}

/// Downloads one category page into `wiki/`, then downloads the pages it lists and,
/// while the level limit allows, recurses into its sub-categories.
///
/// * `rel_url` - site-relative link to the category, e.g. `/wiki/Category:Libertarian_socialism`.
/// * `max_category_level` - how many levels of sub-categories to descend below the
///   starting category; `0` downloads only the starting category's own pages.
/// * `current_category_level` - depth of this category; the starting category is `0`.
/// * `append_html` - when true, `.html` is appended to the saved file names.
///
/// Sub-categories and pages are each processed through rayon parallel iterators.
///
/// # Panics
///
/// Panics if the request fails, the response status is not a success, the body cannot
/// be read, or the output file cannot be created or written.
fn get_category(
    rel_url: &str,
    max_category_level: i32,
    current_category_level: i32,
    append_html: bool,
) {
    std::thread::sleep(std::time::Duration::from_millis(50)); // wikipedia is pretty cool, don't wanna spam em

    // Links scraped from the page (and Url::path()) already start with '/'; strip it so
    // the request does not go to "https://wikipedia.org//wiki/...".
    let url = format!("https://wikipedia.org/{}", rel_url.trim_start_matches('/'));
    let resp = reqwest::blocking::get(&url)
        .unwrap_or_else(|e| panic!("request for {} failed: {}", url, e));
    assert!(
        resp.status().is_success(),
        "{} returned HTTP status {}",
        url,
        resp.status()
    );
    let body = resp
        .text()
        .unwrap_or_else(|e| panic!("could not read the body of {}: {}", url, e));

    // Save file, named after the last path segment of the URL
    let category_parsed_url =
        Url::parse(&url).unwrap_or_else(|e| panic!("could not parse {}: {}", url, e));
    let last_segment = category_parsed_url
        .path_segments()
        .and_then(|segments| segments.last())
        .unwrap_or_else(|| panic!("{} has no path to derive a file name from", url));
    let file_name = if append_html {
        format!("wiki/{}.html", last_segment) // Eg. https://en.wikipedia.org/wiki/Category:Libertarian_socialism -> wiki/Category:Libertarian_socialism.html
    } else {
        format!("wiki/{}", last_segment) // Eg. https://en.wikipedia.org/wiki/Category:Libertarian_socialism -> wiki/Category:Libertarian_socialism
    };
    let mut file = File::create(&file_name)
        .unwrap_or_else(|e| panic!("could not create {}: {}", file_name, e));
    file.write_all(body.as_bytes())
        .unwrap_or_else(|e| panic!("could not write {}: {}", file_name, e));
    println!("{} Written", file_name);

    let document = Document::from(&body[..]);

    // Strictly less-than: a category at depth `max_category_level` is the deepest one
    // fetched, so a limit of 0 never follows sub-category links.
    if current_category_level < max_category_level {
        let subcats: Vec<&str> = document
            .find(Attr("id", "mw-subcategories").descendant(Name("a"))) // Get subcategories under <div id="mw-subcategories"> <a href="..."> </a> </div>
            .filter_map(|n| n.attr("href"))
            .collect();

        subcats.par_iter().for_each(|x| {
            get_category(x, max_category_level, current_category_level + 1, append_html)
        });
    }

    let pages: Vec<&str> = document
        .find(Attr("id", "mw-pages").descendant(Name("a"))) // Get pages under <div id="mw-pages"> <a href="..."> </a> </div>
        .filter_map(|n| n.attr("href"))
        .collect();
    pages.par_iter().for_each(|x| get_page(x, append_html));
}

/// Downloads a single page and saves its body under `wiki/`.
///
/// * `rel_url` - site-relative link to the page, e.g. `/wiki/Libertarian_socialism`.
/// * `append_html` - when true, `.html` is appended to the saved file name.
///
/// The file is named after the last path segment of the requested URL.
///
/// # Panics
///
/// Panics if the request fails, the response status is not a success, the body cannot
/// be read, or the output file cannot be created or written.
fn get_page(rel_url: &str, append_html: bool) {
    std::thread::sleep(std::time::Duration::from_millis(50)); // wikipedia is pretty cool, don't wanna spam em

    // Scraped hrefs already start with '/'; strip it so the request does not go to
    // "https://wikipedia.org//wiki/...".
    let url = format!("https://wikipedia.org/{}", rel_url.trim_start_matches('/'));
    let resp = reqwest::blocking::get(&url)
        .unwrap_or_else(|e| panic!("request for {} failed: {}", url, e));
    assert!(
        resp.status().is_success(),
        "{} returned HTTP status {}",
        url,
        resp.status()
    );
    let body = resp
        .text()
        .unwrap_or_else(|e| panic!("could not read the body of {}: {}", url, e));

    // Save file, named after the last path segment of the URL
    let page_parsed_url =
        Url::parse(&url).unwrap_or_else(|e| panic!("could not parse {}: {}", url, e));
    let last_segment = page_parsed_url
        .path_segments()
        .and_then(|segments| segments.last())
        .unwrap_or_else(|| panic!("{} has no path to derive a file name from", url));
    let file_name = if append_html {
        format!("wiki/{}.html", last_segment) // Eg. https://en.wikipedia.org/wiki/Category:Libertarian_socialism -> wiki/Category:Libertarian_socialism.html
    } else {
        format!("wiki/{}", last_segment) // Eg. https://en.wikipedia.org/wiki/Category:Libertarian_socialism -> wiki/Category:Libertarian_socialism
    };
    let mut file = File::create(&file_name)
        .unwrap_or_else(|e| panic!("could not create {}: {}", file_name, e));
    file.write_all(body.as_bytes())
        .unwrap_or_else(|e| panic!("could not write {}: {}", file_name, e));
    println!("{} Written", file_name);
}

/// Prints the help text generated by `opts` to stdout, headed by a one-line usage
/// summary that names the running program.
fn print_usage(program: &str, opts: Options) {
    let usage_text = opts.usage(&format!("Usage: {} URL [options]", program));
    print!("{}", usage_text);
}

0 comments on commit 0f685c2

Please sign in to comment.