-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
lintangsutawika's work on new scraping, squashed together
added propublica add process_map add multiprocess add differentiation between clean and raw directory renamed directory add os added new source update unified process for all news add way to save index run individual news in process.py remove better parsing added notes for each site processes ahref em and strong tags allow both html and url choice to be used add byline and fix html_path update how pages are saved add dependencies failed pages are saved to a new file process to get_page.py, and added news sites add script for processing html set new arguments add filename to jsonfile add args and capture exceptions update get_record removed comments not use wget add list of sites update to split page download and page processing remove duplicates in page_list removed arg fix args moved limit fix typo add process italic fix script to process text tqdm move to inside map moved process to get-text.sh simplify multiprocess simplify multiprocess add empty line add license arg alphabetical order better process to capture bylines and time add header add parameter of searching for date and bylines attrs are searched as regex string update attribute search update method name change header name updated parameters for CC BY sites author then date update search and attrs add readme Create a shared scraping function. This PR adds a shared scraping function to the licensed pile shared library. It has a default user-agent string and smart retries. We should use it when we need to `HTTP GET` a resource from within python.
- Loading branch information
1 parent
28e210b
commit c9a2d14
Showing
8 changed files
with
486 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
Sites included. Source: https://opennewswire.org/feed/ | ||
|
||
CC BY | ||
|
||
- 360info | ||
- Africa is a Country | ||
- Alt News | ||
- Balkan Diskurs | ||
- Factly | ||
- Freedom of the Press Foundation | ||
- Agenzia Fides | ||
- Global Voices | ||
- Meduza | ||
- Mekong Eye | ||
- Milwaukee Neighborhood News Service | ||
- Minority Africa | ||
- New Canadian Media | ||
- SciDev.Net | ||
- The Solutions Journalism Exchange | ||
- Tasnim News Agency | ||
- ZimFact | ||
|
||
CC BY-SA | ||
|
||
- Liberty TV | ||
- Oxpeckers | ||
- Propastop | ||
- The Public Record | ||
|
||
Public Domain | ||
|
||
- Caravanserai | ||
|
||
Use | ||
|
||
`bash news/get-data.sh` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/usr/bin/env sh
# Top-level driver for the news dataset: first build the per-site page
# indexes (get-metadata.sh), then extract article text into Dolma-format
# shards (get-text.sh). Run from the repository root.

bash news/get-metadata.sh
bash news/get-text.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/usr/bin/env sh
# Build the page index (pagelist.jsonl) for each news site, grouped by
# license. NOTE(review): get_page.py only downloads pages when --dl is
# passed; without it this script builds the URL indexes only — confirm
# whether the download step is intentionally deferred to a later stage.

# CC-BY
python news/get_page.py --url https://360info.org/ --output_dir data/news/raw/360info/
python news/get_page.py --url https://africasacountry.com/ --output_dir data/news/raw/africasacountry/
python news/get_page.py --url https://www.altnews.in/ --output_dir data/news/raw/altnews/
python news/get_page.py --url https://balkandiskurs.com/en/ --output_dir data/news/raw/balkandiskurs/
python news/get_page.py --url https://factly.in/ --output_dir data/news/raw/factly/
python news/get_page.py --url https://www.fides.org/en --output_dir data/news/raw/fides/
python news/get_page.py --url https://freedom.press/ --output_dir data/news/raw/freedom/
python news/get_page.py --url https://globalvoices.org/ --output_dir data/news/raw/globalvoices/
python news/get_page.py --url https://meduza.io/en --output_dir data/news/raw/meduza/
python news/get_page.py --url https://www.mekongeye.com/ --output_dir data/news/raw/mekongeye/
python news/get_page.py --url https://milwaukeenns.org/ --output_dir data/news/raw/milwaukeenns/
python news/get_page.py --url https://minorityafrica.org/ --output_dir data/news/raw/minorityafrica/
python news/get_page.py --url https://www.newcanadianmedia.ca/ --output_dir data/news/raw/newcanadianmedia/
# projectmultatuli is CC NC ND — excluded from collection.
# python news/get_page.py --url https://projectmultatuli.org/en/ --output_dir data/news/raw/projectmultatuli/
python news/get_page.py --url http://scidev.net/ --output_dir data/news/raw/scidev/
python news/get_page.py --url https://sojoexchange.solutionsjournalism.org/ --output_dir data/news/raw/solutionsjournalism/
python news/get_page.py --url https://www.tasnimnews.com/en --output_dir data/news/raw/tasnimnews/
python news/get_page.py --url https://zimfact.org/ --output_dir data/news/raw/zimfact/

# CC BY-SA
python news/get_page.py --url https://educeleb.com/ --output_dir data/news/raw/educeleb/
python news/get_page.py --url https://libertytvradio.com/ --output_dir data/news/raw/libertytvradio/
python news/get_page.py --url https://oxpeckers.org/ --output_dir data/news/raw/oxpeckers/
python news/get_page.py --url https://www.propastop.org/eng/ --output_dir data/news/raw/propastop/
python news/get_page.py --url https://www.thepublicrecord.ca/ --output_dir data/news/raw/thepublicrecord/

# Public Domain
python news/get_page.py --url https://central.asia-news.com/en_GB/ --output_dir data/news/raw/caravanserai/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#!/usr/bin/env sh
# Extract article text from the raw HTML of each site into Dolma-format
# jsonl.gz shards, using a per-site container tag/attrs selector.
#
# BUG FIX: multi-word license values were previously passed unquoted
# (e.g. `--license CC BY`), so argparse received the extra word as an
# unrecognized positional argument and every invocation failed. The
# values are now quoted so they arrive as a single argument.

# CC BY
python news/get_text.py --license "CC BY" --input_dir data/news/raw/360info/ --filename news-360info.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "copy main-copy"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/africasacountry/ --filename news-africasacountry.jsonl.gz --output_dir data/news/ --tag article --attrs '{"class": "po__article"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/altnews/ --filename news-altnews.jsonl.gz --output_dir data/news/ --tag div --attrs '{"id": "primary"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/balkandiskurs/ --filename news-balkandiskurs.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "entry-content"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/factly/ --filename news-factly.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "post-content-right"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/fides/ --filename news-fides.jsonl.gz --output_dir data/news/ --tag div --attrs '{"id": "news"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/freedom/ --filename news-freedom.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "blog-page"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/globalvoices/ --filename news-globalvoices.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "entry-container"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/meduza/ --filename news-meduza.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "article"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/mekongeye/ --filename news-mekongeye.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "main-content"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/milwaukeenns/ --filename news-milwaukeenns.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "entry-content"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/minorityafrica/ --filename news-minorityafrica.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "post-content-container"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/newcanadianmedia/ --filename news-newcanadianmedia.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "content-main"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/scidev/ --filename news-scidev.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "fl-col-content fl-node-content"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/solutionsjournalism/ --filename news-solutionsjournalism.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "sqs-html-content"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/tasnimnews/ --filename news-tasnimnews.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "story"}'
python news/get_text.py --license "CC BY" --input_dir data/news/raw/zimfact/ --filename news-zimfact.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "entry-content"}'

# CC BY-SA
python news/get_text.py --license "CC BY-SA" --input_dir data/news/raw/educeleb/ --filename news-educeleb.jsonl.gz --output_dir data/news/
python news/get_text.py --license "CC BY-SA" --input_dir data/news/raw/libertytvradio/ --filename news-libertytvradio.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "td-post-content"}'
python news/get_text.py --license "CC BY-SA" --input_dir data/news/raw/oxpeckers/ --filename news-oxpeckers.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "post_text_inner"}'
python news/get_text.py --license "CC BY-SA" --input_dir data/news/raw/propastop/ --filename news-propastop.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "post-wrap"}'
python news/get_text.py --license "CC BY-SA" --input_dir data/news/raw/thepublicrecord/ --filename news-thepublicrecord.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "entry-content"}'

# Public Domain
python news/get_text.py --license "Public Domain" --input_dir data/news/raw/caravanserai/ --filename news-caravanserai.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "article__content"}'

# CC NC ND — excluded from collection.
# python news/get_text.py --license "CC NC ND" --input_dir data/news/raw/projectmultatuli/ --filename news-projectmultatuli.jsonl.gz --output_dir data/news/ --tag div --attrs '{"class": "elementor-widget-container"}'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
import os | ||
import wget | ||
import requests | ||
import argparse | ||
import jsonlines | ||
import multiprocessing as mp | ||
|
||
from tqdm import tqdm | ||
from pathlib import Path | ||
from functools import partial | ||
|
||
import utils | ||
|
||
# Command-line interface for the page crawler/downloader.
parser = argparse.ArgumentParser(description="Download News Sites")
# Root URL whose internal links are crawled to build the page index.
parser.add_argument(
    "--url", default="https://www.propublica.org/", help="Base URL"
)
# Optional pre-built jsonl index; when given, crawling is skipped.
parser.add_argument(
    "--index_path",
    default=None,
    help="File that list of all pages",
)
# Raw HTML pages and pagelist.jsonl / failedlist.jsonl are written here.
parser.add_argument(
    "--output_dir",
    default="data/news-propublica/",
    help="Path to output directory where raw pages are downloaded.",
)
# When set, an existing pagelist.jsonl is ignored and rebuilt.
parser.add_argument(
    "--overwrite",
    action="store_true",
    help="Should we overwrite previously downloaded copies?",
)
# Size of the multiprocessing pool used for downloads.
parser.add_argument(
    "--num_workers",
    type=int,
    default=5,
    help="Number of workers",
)
# Optional cap on how many pages of the index are processed.
parser.add_argument(
    "--limit",
    type=int,
    default=None,
    help="Set number of pages",
)
# Downloading only happens when this flag is passed; otherwise only the
# page index is built.
parser.add_argument(
    "--dl",
    action="store_true",
    help="Download pages",
)
|
||
def get_pages(page_index, output_path):
    """Download one page described by *page_index* into *output_path*.

    Args:
        page_index: dict with keys "url" and "filename" (and "idx",
            unused here) as produced by the indexing step in main().
        output_path: directory the raw HTML file is written into.

    Returns:
        A ``(url, 0)`` tuple on success, or ``(url, error_string)`` on
        failure, so the caller can build a failed-pages report.
    """
    url = page_index["url"]
    filename = page_index["filename"]

    # Minimal UA string; some sites reject requests with no User-Agent.
    headers = {"User-Agent": "My User Agent 1.0"}
    page_file_path = os.path.join(output_path, filename)
    try:
        # BUG FIX: a timeout is required — without one a hung server
        # blocks a pool worker forever.
        page = requests.get(url, headers=headers, timeout=30)
        # BUG FIX: report HTTP error responses as failures instead of
        # silently saving the error page body as if it were the article.
        if page.status_code >= 400:
            return (url, f"HTTP {page.status_code}")
        with open(page_file_path, "wb") as fp:
            fp.write(page.content)
        return (url, 0)
    except Exception as err:
        # Best-effort download: any failure is recorded, not raised, so
        # one bad page does not abort the whole crawl.
        return (url, str(err))
|
||
def main(args):
    """Build (or load) the page index for a site and optionally download pages.

    Side effects: creates args.output_dir; may write pagelist.jsonl and
    failedlist.jsonl inside it, plus one .html file per downloaded page.
    Returns 0.
    """
    output_dir = args.output_dir
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    if args.index_path:
        # Caller supplied a ready-made index.
        with jsonlines.open(args.index_path) as reader:
            page_index = list(reader)
    else:
        pagelist_path = os.path.join(output_dir, "pagelist.jsonl")
        if os.path.isfile(pagelist_path) and args.overwrite is False:
            # Reuse the index cached from a previous run.
            with jsonlines.open(pagelist_path) as reader:
                page_index = list(reader)
        else:
            # Crawl the site to discover page URLs.
            page_list = utils.build_url_index(args.url)

            # Remove duplicates while preserving discovery order.
            page_list = list(dict.fromkeys(page_list))
            page_index = [
                {
                    "idx": idx,
                    "url": url,
                    "filename": f"{utils.sanitize_url(url)}.html",
                }
                for idx, url in enumerate(page_list)
            ]
            with jsonlines.open(pagelist_path, mode="w") as writer:
                writer.write_all(page_index)

    if args.limit is not None:
        page_index = page_index[: args.limit]

    if args.dl:
        # Download all pages in parallel.
        download_fn = partial(get_pages, output_path=output_dir)
        num_workers = mp.cpu_count() if args.num_workers is None else args.num_workers
        with mp.Pool(num_workers) as p:
            results = p.map(download_fn, tqdm(page_index))

        # BUG FIX: the previous version wrote *every* result (including
        # successes, whose error field is 0) to failedlist.jsonl. Keep
        # only actual failures.
        failed = [(url, err) for url, err in results if err != 0]
        failedlist_path = os.path.join(output_dir, "failedlist.jsonl")
        with jsonlines.open(failedlist_path, mode="w") as writer:
            writer.write_all(
                [
                    {"idx": idx, "url": url, "error": err}
                    for idx, (url, err) in enumerate(failed)
                ]
            )

    return 0
|
||
|
||
if __name__ == "__main__":
    # Parse CLI arguments and hand them straight to the entry point.
    main(parser.parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
import os | ||
import json | ||
import wget | ||
import requests | ||
import argparse | ||
import jsonlines | ||
import multiprocessing as mp | ||
|
||
from tqdm import tqdm | ||
from pathlib import Path | ||
from functools import partial | ||
from datetime import datetime | ||
|
||
import utils | ||
from licensed_pile.write import to_dolma | ||
|
||
# Command-line interface for the HTML -> Dolma text extraction step.
parser = argparse.ArgumentParser(description="Download News Sites")
# Directory containing the raw HTML pages and pagelist.jsonl.
parser.add_argument(
    "--input_dir",
    default="data/news-propublica/",
    help="Path to output directory where raw pages are downloaded.",
)
# Processed shards are written under <output_dir>/v<version>/.
parser.add_argument(
    "--output_dir",
    default="data/news-propublica/",
    help="Path to output directory for processed data.",
)
parser.add_argument(
    "--version",
    type=int,
    default=1,
    help="Version of the subset",
)
# Optional pre-built jsonl index; when given, pagelist.jsonl is ignored.
parser.add_argument(
    "--index_path",
    default=None,
    help="File that lists all pages",
)
parser.add_argument(
    "--overwrite",
    action="store_true",
    help="Should we overwrite previously downloaded copies?",
)
parser.add_argument(
    "--filename", default="pro.jsonl.gz", help="The base filename for our data."
)
parser.add_argument(
    "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)
parser.add_argument(
    "--license", type=str, default="CC-BY", help="Type of license"
)
# HTML tag (and attrs below) that selects the article body container.
parser.add_argument(
    "--tag", type=str, default="div", help="Tag for the article or content"
)
# JSON-decoded on the command line, e.g. --attrs '{"class": "entry-content"}'.
parser.add_argument(
    "--attrs", type=json.loads, default=None, help="dict of attributes"
)
# BUG FIX: type=int was missing, so a CLI-supplied value arrived as a
# string and crashed mp.Pool; this also matches get_page.py's parser.
parser.add_argument(
    "--num_workers",
    type=int,
    default=5,
    help="Number of workers",
)
|
||
def get_record(page_index, input_dir=None, date=None, license_type=None, tag="div", attrs=None):
    """Turn one index entry into a Dolma-style record dict.

    Looks up the downloaded HTML file named by *page_index* under
    *input_dir*, extracts its text with utils.get_text_from_page using
    the given tag/attrs selector, and returns the record. Returns None
    when the HTML file is not present on disk.
    """
    html_path = os.path.join(input_dir, page_index["filename"])

    # Guard clause: skip entries whose page was never downloaded.
    if not os.path.exists(html_path):
        return None

    extracted_text = utils.get_text_from_page(html_path=html_path, tag=tag, attrs=attrs)
    return {
        "id": page_index["idx"],
        "text": extracted_text,
        "source": page_index["url"],
        "added": date,
        "metadata": {
            "license": license_type,
        },
    }
|
||
def main(args):
    """Extract text from downloaded HTML pages and write Dolma shards.

    Reads the page index (either --index_path or <input_dir>/pagelist.jsonl),
    builds one record per existing HTML file in parallel, and writes the
    results under <output_dir>/v<version>/ via to_dolma. Returns 0.

    Raises:
        FileNotFoundError: when no index is available.
    """
    input_dir = os.path.join(args.input_dir)
    Path(input_dir).mkdir(parents=True, exist_ok=True)
    # Compute the versioned output directory once (was duplicated before).
    output_dir = os.path.join(args.output_dir, f"v{args.version}")
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # ISO 8601, zero-padded (the old manual f-string produced e.g. "2024-3-5").
    date = datetime.now().strftime("%Y-%m-%d")

    if args.index_path:
        with jsonlines.open(args.index_path) as reader:
            page_index = list(reader)
    else:
        pagelist_path = os.path.join(input_dir, "pagelist.jsonl")
        if os.path.isfile(pagelist_path) and args.overwrite is False:
            with jsonlines.open(pagelist_path) as reader:
                page_index = list(reader)
        else:
            # BUG FIX: previously page_index was left undefined here and
            # the code crashed later with a NameError; fail fast with a
            # clear message instead.
            raise FileNotFoundError(
                f"No page index found: pass --index_path or run get_page.py "
                f"first to create {pagelist_path}"
            )

    # Extract one record per page in parallel; missing files yield None.
    get_record_fn = partial(
        get_record,
        input_dir=input_dir,
        date=date,
        license_type=args.license,
        tag=args.tag,
        attrs=args.attrs,
    )
    num_workers = mp.cpu_count() if args.num_workers is None else args.num_workers
    with mp.Pool(num_workers) as p:
        page_data = p.map(get_record_fn, tqdm(page_index))

    page_data = [page for page in page_data if page is not None]

    # Write the cleaned, versioned dataset.
    to_dolma(page_data, output_dir, args.filename, args.shard_size)

    return 0
|
||
|
||
if __name__ == "__main__":
    # Parse CLI arguments and hand them straight to the entry point.
    main(parser.parse_args())
Oops, something went wrong.