-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This PR updates the news parsing code to use some of the new shared utilities and splits it into steps better. It also updates the information extraction steps to have cleaner authors and filters out some pages with little content.
- Loading branch information
1 parent
c9a2d14
commit 55ad138
Showing
19 changed files
with
622 additions
and
378 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
data/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
"""Build out a list of all pages on a website based on their sitemap.""" | ||
|
||
import argparse | ||
import json | ||
import os | ||
|
||
import utils | ||
|
||
from licensed_pile import logs, scrape | ||
|
||
# Command-line interface for the page-index builder.
parser = argparse.ArgumentParser(description="Find all pages on a news site.")
parser.add_argument("--url", help="Base URL", required=True)
parser.add_argument("--index_path", help="File that list of all pages", required=True)
parser.add_argument("--overwrite", help="Create a new index.", action="store_true")
|
||
|
||
def main(args):
    """Build the page index for a site and save it as a JSON Lines file.

    Each output line is a JSON object with the keys ``idx``, ``url`` and
    ``filename``. If the index file already exists and ``args.overwrite`` is
    not set, an error is logged and nothing is written.

    Args:
        args: Parsed command-line arguments; ``url``, ``index_path`` and
            ``overwrite`` are read.
    """
    logger = logs.get_logger("news")
    if os.path.exists(args.index_path) and not args.overwrite:
        logger.error(f"Page Index already exists at {args.index_path}, aborting.")
        return
    logger.info(f"Building page index from {args.url}")
    # De-duplicate and sort so the idx assigned to each URL is deterministic
    # for a given set of URLs.
    page_list = sorted(set(utils.build_url_index(args.url)))
    logger.info(f"Found {len(page_list)} pages.")
    page_index = [
        {"idx": idx, "url": url, "filename": f"{utils.url_to_filename(url)}.html"}
        for idx, url in enumerate(page_list)
    ]
    logger.info(f"Saving page index to {args.index_path}")
    index_dir = os.path.dirname(args.index_path)
    # dirname is "" when index_path is a bare filename, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    if index_dir:
        os.makedirs(index_dir, exist_ok=True)
    with open(args.index_path, "w", encoding="utf-8") as wf:
        wf.write("\n".join(json.dumps(p) for p in page_index) + "\n")
|
||
|
||
if __name__ == "__main__": | ||
args = parser.parse_args() | ||
logs.configure_logging("news") | ||
main(args) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env bash
# Run download_pages.py once for every enabled site in the list below.
#
# Usage: $0 [data_dir] [test_run]
#   data_dir  Root data directory (default: "data"); one trailing slash is removed.
#   test_run  Value forwarded to download_pages.py as --test_run (default: "0").

data_dir=${1:-"data"}
data_dir=${data_dir%/}
test_run=${2:-"0"}

# Sites that are commented out are skipped.
declare -a sites=(
  360info
  # africasacountry
  altnews
  balkandiskurs
  factly
  # fides
  freedom
  globalvoices
  # meduza
  mekongeye
  milwaukeenns
  minorityafrica
  newcanadianmedia
  # scidev
  solutionsjournalism
  # tasnimnews
  zimfact
  educeleb
  libertytvradio
  oxpeckers
  propastop
  thepublicrecord
  # caravanserai
)

# Quote every expansion so a data_dir containing spaces or glob characters is
# passed to python as a single, unmodified argument.
for site in "${sites[@]}"; do
  python download_pages.py --index_path "${data_dir}/pages/${site}/pagelist.jsonl" --test_run "${test_run}"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
"""Download all the files from a site.""" | ||
|
||
import argparse | ||
import functools | ||
import json | ||
import multiprocessing.dummy as mp | ||
import os | ||
import random | ||
import time | ||
|
||
import utils | ||
|
||
from licensed_pile import logs, scrape | ||
|
||
# Command-line interface for the page downloader.
parser = argparse.ArgumentParser(description="Download pages from a news site.")
parser.add_argument("--index_path", help="File that list of all pages", required=True)
parser.add_argument(
    "--output_dir", help="Path to output directory where raw pages are downloaded."
)
parser.add_argument(
    "--overwrite",
    help="Should we overwrite previously downloaded copies?",
    action="store_true",
)
parser.add_argument("--num_workers", help="Number of workers", default=32, type=int)
parser.add_argument("--test_run", help="Set number of pages", type=int)
parser.add_argument(
    "--wait", help="Time to wait between requests.", default=1, type=int
)
parser.add_argument(
    "--dry_run", help="Don't actually download anything.", action="store_true"
)
|
||
|
||
def get_pages(
    page_index, output_dir, overwrite: bool = True, wait: int = 0, dry_run: bool = False
):
    """Download one indexed page and write its raw content under ``output_dir``.

    The page is skipped (nothing is fetched and no wait happens) when
    ``utils.filter_url`` rejects its URL, when the target file already exists
    and ``overwrite`` is false, or when ``dry_run`` is set.

    Args:
        page_index: A page-index entry; its ``url`` and ``filename`` keys are read.
        output_dir: Directory the page file is written into.
        overwrite: Re-download even if the target file already exists.
        wait: Seconds to sleep after a download attempt; falsy disables it.
        dry_run: Only log what would be downloaded.

    Returns:
        None. Download or write failures are logged, not raised.
    """
    url = page_index["url"]
    page_file_path = os.path.join(output_dir, page_index["filename"])
    logger = logs.get_logger("news")

    if not utils.filter_url(url):
        return

    if not overwrite and os.path.exists(page_file_path):
        logger.info(f"{page_file_path} already exists, not downloading.")
        return

    if dry_run:
        logger.info(f"Not downloading {url} as --dry_run was set.")
        return

    try:
        logger.info(f"Downloading {url}")
        page = scrape.get_page(url)
        with open(page_file_path, "wb") as fp:
            fp.write(page.content)
    except Exception as err:
        # Best-effort: one bad page must not abort the whole run, but record
        # the cause so failures can be diagnosed from the log.
        logger.error(f"Failed to fetch {url}: {err!r}")
    # Sleep after every attempt, successful or not.
    if wait:
        time.sleep(wait)
|
||
|
||
def main(args):
    """Download every page listed in the JSON Lines index at ``args.index_path``.

    Args:
        args: Parsed command-line arguments; ``index_path``, ``output_dir``,
            ``overwrite``, ``num_workers``, ``test_run``, ``wait`` and
            ``dry_run`` are read. ``output_dir`` is filled in on ``args`` when
            it was not given.

    Raises:
        ValueError: If the index file contains no entries.
    """
    if args.output_dir is None:
        # Default to the directory holding the index. dirname is "" for a bare
        # filename and os.makedirs("") raises, so fall back to the current dir.
        args.output_dir = os.path.dirname(args.index_path) or os.curdir
    os.makedirs(args.output_dir, exist_ok=True)

    logger = logs.get_logger("news")
    logger.info(f"Downloading pages found in {args.index_path}")
    with open(args.index_path, encoding="utf-8") as f:
        # One JSON object per line; blank lines are ignored.
        page_index = [json.loads(line) for raw in f if (line := raw.strip())]

    if not page_index:
        logger.error(f"{args.index_path} is empty.")
        raise ValueError(f"{args.index_path} is empty.")

    if args.test_run:
        logger.info(f"Test Run, only downloading {args.test_run} random pages.")
        # Fixed seed so repeated test runs pick the same sample.
        random.seed(42)
        random.shuffle(page_index)
        page_index = page_index[: args.test_run]

    # Download all pages
    # We don't process the results, they are just written to disk, so we
    # use map to make sure it actually gets run.
    logger.info(f"Saving pages to {args.output_dir}")
    download = functools.partial(
        get_pages,
        output_dir=args.output_dir,
        overwrite=args.overwrite,
        wait=args.wait,
        dry_run=args.dry_run,
    )
    with mp.Pool(args.num_workers) as p:
        p.map(download, page_index)
|
||
|
||
if __name__ == "__main__": | ||
args = parser.parse_args() | ||
logs.configure_logging("news") | ||
main(args) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"id": 1, | ||
"text": "भारत के अर्धसैनिक बलों (सशस्त्र सैन्य बल) को जानें - FACTLY\nSai Krishna Muthyanolla\nMarch 25, 2016\nThe Central Armed Police Forces ( commonly referred to as Paramilitary forces) play an important role in border security as well as internal security. Along with the Defence Establishment, they play an important role in National Security. This video is an effort to explain the role played by the Seven Central Armed Police Forces that come under the Ministry of Home Affairs.\nThis video is made in association with Inshorts, India’s highest rated news app. You can download the Inshorts app on Play Store & App Store", | ||
"source": "news-factly", | ||
"added": "2024-05-27T07:09:48.522547", | ||
"created": "March 25, 2016", | ||
"metadata": { | ||
"license": "Creative Commons - Attribution - https://creativecommons.org/licenses/by/4.0/", | ||
"url": "https://factly.in/%e0%a4%ad%e0%a4%be%e0%a4%b0%e0%a4%a4-%e0%a4%95%e0%a5%87-%e0%a4%85%e0%a4%b0%e0%a5%8d%e0%a4%a7%e0%a4%b8%e0%a5%88%e0%a4%a8%e0%a4%bf%e0%a4%95-%e0%a4%ac%e0%a4%b2%e0%a5%8b%e0%a4%82-%e0%a4%b8%e0%a4%b6/", | ||
"author": "Sai Krishna Muthyanolla" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"id": 4, | ||
"text": "A bad wrap? Using packaging well to reduce food waste - 360\nTammara Soma\nPublished on June 15, 2022\nVast amounts of food are sent to landfill. But the jury’s out on whether packaging is friend or foe in the fight against the problem.\nSown, watered, raised, harvested – and dumped. Food wasted in the agricultural powerhouse of Indonesia over the last two decades could have fed up to 125 million people.\nIn recognition of the growing worldwide problem, the agricultural chief scientists from the G20 nations will meet at a food loss and waste prevention workshop in Bali in July 2022. Solving the problem means going beyond agriculture and understanding the role of food packaging. There are many controversies around the environmental impact of packaging, and while no packaging at all is sometimes best, packaging can still play a role in solving the food-waste puzzle.\nFrom 2000 to 2019, as much as 184 kilograms of food per person per year was wasted, accounting for 7.29 percent of Indonesia’s greenhouse-gas emissions. Wasted food costs Indonesia up to 551 trillion rupiah (US$38 billion) each year.\nSome studies show packaging may extend shelf life and reduce loss and waste by protecting food during transportation from farm to retail. In Indonesia, poor-quality packaging has been identified as a driver of food loss. Unhusked rice, for example, spoils and spills when it is not packaged properly.\nBut single-use packaging has been blamed for interfering with household food-waste management: plastic waste contaminates rubbish and hinders composting initiatives. Packaging needs to be removed from wasted food if the food is to be disposed of properly — removing it to extract the food waste at an industrial scale can be expensive and energy intensive. 
An audit in Australia found 32 percent of plastic food packaging in the rubbish stream contained food.\nOther studies have found 20 to 25 percent of household food waste is directly or indirectly related to packaging issues. Packaging designs can make it difficult to use all the contents, and best-before dates can confuse consumers. If food is past its best-before date, it is often still safe to eat, but consumers may think it must be thrown away. The date just refers to whether the food is at its best in terms of taste or texture.\nPackaging can also be a source of food waste when it prompts food recalls. In the United States, 36,000 cases of thin-sliced cheese were recalled in 2015 because the plastic film covering the cheese was a choking hazard. In 2017 the Canadian Food Inspection Agency recalled yoghurt because there were plastics in the product.\nZero-packaging grocery stores are emerging around the world, especially in Europe and North America. They encourage reusable packaging in sizes that allow consumers to buy only what they need, and they demonstrate the role that more durable reusable packaging could play in supporting a short food supply chain. This model could be applied in both traditional and modern markets in Indonesia.\nA 2022 study by leading UK sustainability charity WRAP found that selling just five products (apples, bananas, broccoli, cucumber and potatoes) without packaging would save more than 10,300 tonnes of plastic and 100,000 tonnes of food from being wasted annually. Consumers often waste packaged foods because it forces them to buy more than what they need.\nThe role of packaging in food waste reduction and prevention is an important topic for research and is still up for debate, but what is non-debatable are the negative environmental, social and economic impacts of food waste. 
Single-use food packaging is a potential source of pollution, particularly because there are infrastructural gaps in waste management in Indonesia, so a systematic approach to understanding the pros and cons of packaging is needed.\nDr Tammara Soma (ORCID 0000-0002-4273-1165) is an Assistant Professor in the School of Resource and Environmental Management at Simon Fraser University, Canada. Her research in Indonesia was funded by the Social Sciences and Humanities Research Council, the Pierre Elliott Trudeau Foundation Doctoral Scholarship, the Dr David Chu Asia Pacific Scholarship and the International Development Research Council. During her fieldwork in Indonesia she was affiliated with the Bogor Agricultural University (Institut Pertanian Bogor).\nThe author has declared no conflict of interest in relation to this article.\nOriginally published under Creative Commons by 360info™.", | ||
"source": "news-360info", | ||
"added": "2024-05-27T06:57:36.763796", | ||
"created": "Published on June 15, 2022", | ||
"metadata": { | ||
"license": "Creative Commons - Attribution - https://creativecommons.org/licenses/by/4.0/", | ||
"url": "https://360info.org/a-bad-wrap-using-packaging-well-to-reduce-food-waste/", | ||
"author": "Tammara Soma" | ||
} | ||
} |
Oops, something went wrong.