diff --git a/configs/config_greenfo.yaml b/configs/config_greenfo.yaml new file mode 100644 index 0000000..6c08270 --- /dev/null +++ b/configs/config_greenfo.yaml @@ -0,0 +1,12 @@ +"schema": "site_schemas/greenfo_schema.yaml" + +# "output_corpus": "greenfo_corpus.txt" + +"log_file_articles": "logs/greenfo_articles_log.txt" +"log_file_archive": "logs/greenfo_archive_log.txt" +"new_problematic_urls": "logs/greenfo_new_problematic_urls.txt" +"new_good_urls": "logs/greenfo_new_good_urls.txt" +"new_problematic_archive_urls": "logs/greenfo_new_problematic_archive_urls.txt" +"new_good_archive_urls": "logs/greenfo_new_good_archive_urls.txt" + +# "date_from": 2001-01-26 diff --git a/configs/config_neokohn.yaml b/configs/config_neokohn.yaml new file mode 100644 index 0000000..216602c --- /dev/null +++ b/configs/config_neokohn.yaml @@ -0,0 +1,12 @@ +"schema": "site_schemas/neokohn_schema.yaml" + +# "output_corpus": "neokohn_corpus.txt" + +"log_file_articles": "logs/neokohn_articles_log.txt" +"log_file_archive": "logs/neokohn_archive_log.txt" +"new_problematic_urls": "logs/neokohn_new_problematic_urls.txt" +"new_good_urls": "logs/neokohn_new_good_urls.txt" +"new_problematic_archive_urls": "logs/neokohn_new_problematic_archive_urls.txt" +"new_good_archive_urls": "logs/neokohn_new_good_archive_urls.txt" + +# "date_from": 2019-01-20 diff --git a/configs/config_szombat.yaml b/configs/config_szombat.yaml new file mode 100644 index 0000000..fec720a --- /dev/null +++ b/configs/config_szombat.yaml @@ -0,0 +1,10 @@ +"schema": "site_schemas/szombat_schema.yaml" + +# "output_corpus": "szombat_corpus.txt" + +"log_file_articles": "logs/szombat_articles_log.txt" +"log_file_archive": "logs/szombat_archive_log.txt" +"new_problematic_urls": "logs/szombat_new_problematic_urls.txt" +"new_good_urls": "logs/szombat_new_good_urls.txt" +"new_problematic_archive_urls": "logs/szombat_new_problematic_archive_urls.txt" +"new_good_archive_urls": "logs/szombat_new_good_archive_urls.txt" diff --git a/configs/extractors/site_specific_extractor_functions_szng.py b/configs/extractors/site_specific_extractor_functions_szng.py new file mode 100644 index 0000000..03b24a5 --- /dev/null +++ b/configs/extractors/site_specific_extractor_functions_szng.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8, vim: expandtab:ts=4 -*- + +from os.path import abspath, dirname, join as os_path_join +from bs4 import BeautifulSoup +from webarticlecurator import WarcCachingDownloader, Logger + + +# BEGIN SITE SPECIFIC extract_next_page_url FUNCTIONS ################################################################## + +def extract_next_page_url_neokohn(archive_page_raw_html): + """ + extracts and returns next page URL from an HTML code if there is one... + Specific for neokohn.hu + :returns string of url if there is one, None otherwise + """ + ret = None + soup = BeautifulSoup(archive_page_raw_html, 'lxml') + next_page = soup.find('a', class_='next page-numbers') + if next_page is not None and 'href' in next_page.attrs: + ret = next_page['href'] + return ret + + +def extract_next_page_url_szombat(archive_page_raw_html): + """ + Extract next page url from current archive page + Specific for szombat.org + :returns string of url if there is one, None otherwise + """ + ret = None + soup = BeautifulSoup(archive_page_raw_html, 'lxml') + next_page_url = soup.find('a', class_='nextpostslink') + if next_page_url is not None and 'href' in next_page_url.attrs: + ret = next_page_url['href'] + return ret + + +def extract_next_page_url_test(filename, test_logger): + """Quick test for extracting "next page" URLs when needed""" + # This function is intended to be used from this file only as the import of WarcCachingDownloader is local to main() + w = WarcCachingDownloader(filename, None, test_logger, just_cache=True, download_params={'stay_offline': True}) + + # Some of these are intentionally yields None + test_logger.log('INFO', 'Testing neokohn') + text = w.download_url('https://neokohn.hu/2019/02/04/') + assert extract_next_page_url_neokohn(text) == 'https://neokohn.hu/2019/02/04/page/2/' + text = w.download_url('https://neokohn.hu/2022/01/20/') + assert extract_next_page_url_neokohn(text) == 'https://neokohn.hu/2022/01/20/page/2/' + text = w.download_url('https://neokohn.hu/2019/02/09/page/2/') + assert extract_next_page_url_neokohn(text) is None + text = w.download_url('https://neokohn.hu/2019/01/20/') + assert extract_next_page_url_neokohn(text) is None + + test_logger.log('INFO', 'Testing szombat') + text = w.download_url('https://szombat.org/archiv?q=&y=1990&mo=2') + assert extract_next_page_url_szombat(text) == 'https://szombat.org/archiv/page/2?q&y=1990&mo=2' + text = w.download_url('https://szombat.org/archiv/page/2?q&y=2020&mo=2') + assert extract_next_page_url_szombat(text) == 'https://szombat.org/archiv/page/3?q&y=2020&mo=2' + text = w.download_url('https://szombat.org/archiv/page/7?q&y=2022&mo=1') + assert extract_next_page_url_szombat(text) == 'https://szombat.org/archiv/page/8?q&y=2022&mo=1' + text = w.download_url('https://szombat.org/archiv/page/8?q&y=2022&mo=1') + assert extract_next_page_url_szombat(text) is None + + test_logger.log('INFO', 'Test OK!') + + +# END SITE SPECIFIC extract_next_page_url FUNCTIONS #################################################################### + + +def safe_extract_hrefs_from_a_tags(main_container): + """ + Helper function to extract href from a tags + :param main_container: An iterator over Tag()-s + :return: Generator over the extracted links + """ + for a_tag in main_container: + a_tag_a = a_tag.find('a') + if a_tag_a is not None and 'href' in a_tag_a.attrs: + yield a_tag_a['href'] + + +def extract_article_urls_from_page_greenfo(archive_page_raw_html): + """ + extracts and returns as a list the URLs belonging to articles from an HTML code + :param archive_page_raw_html: archive page containing list of articles with their URLs + :return: list that contains URLs + Not all links point to a real article (but the same news from other sites) + hence the use of 'more-link' class instead of the 'h2' tag. + """ + soup = BeautifulSoup(archive_page_raw_html, 'lxml') + main_container = soup.find_all(class_='more-link') + urls = {link['href'] for link in main_container} + return urls + + +def extract_article_urls_from_page_neokohn(archive_page_raw_html): + """ + extracts and returns as a list the URLs belonging to articles from an HTML code + :param archive_page_raw_html: archive page containing list of articles with their URLs + :return: list that contains URLs + """ + soup = BeautifulSoup(archive_page_raw_html, 'lxml') + main_container = soup.find_all('h3', class_='entry-title mh-posts-list-title') + urls = {link for link in safe_extract_hrefs_from_a_tags(main_container)} + return urls + + +def extract_article_urls_from_page_szombat(archive_page_raw_html): + """ + extracts and returns as a list the URLs belonging to articles from an HTML code + :param archive_page_raw_html: archive page containing list of articles with their URLs + :return: list that contains URLs + """ + soup = BeautifulSoup(archive_page_raw_html, 'lxml') + main_container = soup.find_all('h1', class_='title') + urls = {link for link in safe_extract_hrefs_from_a_tags(main_container)} + return urls + + +def extract_article_urls_from_page_test(filename, test_logger): + """Quick test for extracting URLs form an archive page""" + # This function is intended to be used from this file only as the import of WarcCachingDownloader is local to main() + w = WarcCachingDownloader(filename, None, test_logger, just_cache=True, download_params={'stay_offline': True}) + + test_logger.log('INFO', 'Testing greenfo') + text = w.download_url('https://greenfo.hu/page/1/?post_types=all&post_date=2001-01-26%202001-01-26') + extracted = extract_article_urls_from_page_greenfo(text) + expected = {'https://greenfo.hu/hir/tarifaemeles-helyett-esszeru-kozlekedesszervezest_980483759/#more-49915', + 'https://greenfo.hu/hir/a-hulladekgazdalkodasra-400-700-milliard-forint_980483643/#more-49909', + 'https://greenfo.hu/hir/a-4-es-metro-megepitese-sok-fa-elpusztitasaval-jarna_980463600/#more-49910', + 'https://greenfo.hu/hir/nemzeti-okologiai-kutatasi-program-indul_980463600/#more-49911', + 'https://greenfo.hu/hir/illes-zoltan-el-a-kezekkel-a-nemzeti-parkoktol_980463600/#more-49912', + 'https://greenfo.hu/hir/az-unios-tagsag-nem-oldja-meg-a-gondjainkat_980463600/#more-49913', + 'https://greenfo.hu/hir/lanyi-andras-iro-kornyezetvedo_980463600/#more-49914' + } + assert (extracted, len(extracted)) == (expected, 7) + + text = w.download_url('https://greenfo.hu/page/3/?post_types=all&post_date=2010-05-05%202010-05-05') + extracted = extract_article_urls_from_page_greenfo(text) + expected = {'https://greenfo.hu/hir/csikosfeju-nadiposzata-baranyaban_1273049570/#more-74036', + } + assert (extracted, len(extracted)) == (expected, 1) + + text = w.download_url('https://greenfo.hu/page/1/?post_types=all&post_date=2018-08-06%202018-08-06') + extracted = extract_article_urls_from_page_greenfo(text) + expected = {'https://greenfo.hu/hir/egymas-nyelvet-kepesek-megtanulni-a-madarak/#more-48578', + 'https://greenfo.hu/hir/a-magyar-mehallomany-fele-elpusztulhatott/#more-48577' + } + assert (extracted, len(extracted)) == (expected, 2) + + test_logger.log('INFO', 'Testing neokohn') + text = w.download_url('https://neokohn.hu/2019/01/20/') + extracted = extract_article_urls_from_page_neokohn(text) + expected = {'https://neokohn.hu/2019/01/20/new-york-times-eilat-is-a-legjobb-uti-celok-kozott-szerepel/' + } + assert (extracted, len(extracted)) == (expected, 1) + + text = w.download_url('https://neokohn.hu/2021/09/21/') + extracted = extract_article_urls_from_page_neokohn(text) + expected = {'https://neokohn.hu/2021/09/21/biden-kudarcot-vallott-erotlen-vezetese-katasztrofak-sorozatat' + '-okozta-es-artott-az-usa-nak/', + 'https://neokohn.hu/2021/09/21/donald-trump-szerint-hazaarulast-kovetett-el-mark-milley' + '-vezerkari-fonok/', + 'https://neokohn.hu/2021/09/21/izgalmas-reszletek-derultek-ki-az-irani-atomprogram-atyjanak' + '-kiiktatasarol/', + 'https://neokohn.hu/2021/09/21/hogyan-unnepeltek-a-szukkotot-2000-evvel-ezelott/', + 'https://neokohn.hu/2021/09/21/ensz-fotitkar-mindenaron-meg-kell-akadalyoznunk-az-usa' + '-kina-hideghaborut/', + 'https://neokohn.hu/2021/09/21/parizs-hazugsag-ketszinuseg-a-bizalom-sulyos-megsertese' + '-es-megvetese-tortent/' + } + assert (extracted, len(extracted)) == (expected, 6) + + text = w.download_url('https://neokohn.hu/2022/01/20/page/2/') + extracted = extract_article_urls_from_page_neokohn(text) + expected = {'https://neokohn.hu/2022/01/20/80-eve-dontottek-a-zsidokerdes-vegso-megoldasarol/' + } + assert (extracted, len(extracted)) == (expected, 1) + + test_logger.log('INFO', 'Testing szombat') + text = w.download_url('https://szombat.org/archiv/page/5?q&y=2022&mo=1') + extracted = extract_article_urls_from_page_szombat(text) + expected = {'https://szombat.org/hirek-lapszemle/netanjahu-es-a-vadalku-lehetosege-mandelblit-nem-enged-a-het-ev-' + 'kozszolgalattol-valo-eltiltasbol', + 'https://szombat.org/hirek-lapszemle/borrell-szerint-lehetseges-a-megallapodas-az-irani-atomalkurol-' + 'parizs-ketelkedik', + 'https://szombat.org/hagyomany-tortenelem/a-birkanyiras-elseje-misna-magyarul-hulin-11', + 'https://szombat.org/kultura-muveszetek/uri-asaf-ujjaszuletesei', + 'https://szombat.org/politika/macron-unios-vitat-indit-az-istenkaromlasrol-es-dzsihadrol', + 'https://szombat.org/kultura-muveszetek/elhunyt-fenakel-judit', + 'https://szombat.org/tortenelem/lengyelorszag-tobb-mint-tizezer-onkentes-segitett-szaz-zsido-temeto-' + 'helyreallitasaban-2021-ben', + 'https://szombat.org/hirek-lapszemle/heisler-andras-egy-eros-mazsihisz-a-magyarorszagi-zsidosag-erdeke', + 'https://szombat.org/hirek-lapszemle/belgium-tiz-evre-kiutasitottak-a-legnagyobb-mecset-uszitassal-es-' + 'kemkedessel-vadolt-vezetojet', + 'https://szombat.org/hirek-lapszemle/izrael-tobb-mint-250-ezer-virushordozo-14-kneszet-kepviselo-' + 'karantenban', + 'https://szombat.org/hirek-lapszemle/izraeli-baleset-barati-tuz-vegzett-ket-kommandossal', + 'https://szombat.org/politika/zsidozasrol-es-antiszemitazasrol-magyarorszag-ideges-tele-a-valasztasok-' + 'elott', + 'https://szombat.org/kultura-muveszetek/77-eves-koraban-elhunyt-michael-lang', + 'https://szombat.org/kozosseg/tu-bisvat-i-szeder-az-orthodoxian', + 'https://szombat.org/hirek-lapszemle/izrael-sok-a-fertozott-de-tiz-naprol-hetre-csokkentik-a-karanten-' + 'idejet', + 'https://szombat.org/politika/europai-biralat-a-simon-wiesenthal-kozpont-antiszemitizmus-listajanak', + 'https://szombat.org/hirek-lapszemle/magyar-nemzeti-ongyuloletettel-vadolja-az-ellenzeket-szabo-gyorgy', + 'https://szombat.org/hirek-lapszemle/kibovitett-zsido-nepesseg-javaslat-izraeli-bevandorlok-' + 'statuszanak-rendezesere', + 'https://szombat.org/hirek-lapszemle/marki-zay-megprobaltak-engem-antiszemitanak-beallitani', + 'https://szombat.org/hirek-lapszemle/jair-lapid-izraeli-kulugyminiszter-koronavirusos' + } + assert (extracted, len(extracted)) == (expected, 20) + + test_logger.log('INFO', 'Test OK!') + + +# END SITE SPECIFIC extract_article_urls_from_page FUNCTIONS ########################################################### + +# BEGIN SITE SPECIFIC next_page_of_article FUNCTIONS ################################################################### + + +# END SITE SPECIFIC next_page_of_article FUNCTIONS ##################################################################### + +def main_test(): + main_logger = Logger() + + # Relative path from this directory to the files in the project's test directory + choices = {'nextpage': os_path_join(dirname(abspath(__file__)), '../../tests/next_page_url_szng.warc.gz'), + 'article_nextpage': os_path_join(dirname(abspath(__file__)), + '../../tests/next_page_of_article_szng.warc.gz'), + 'archive': os_path_join(dirname(abspath(__file__)), + '../../tests/extract_article_urls_from_page_szng.warc.gz') + } + + # Use the main module to modify the warc files! + extract_next_page_url_test(choices['nextpage'], main_logger) + extract_article_urls_from_page_test(choices['archive'], main_logger) + # next_page_of_article_test(choices['article_nextpage'], main_logger) + + +if __name__ == '__main__': + main_test() diff --git a/configs/site_schemas/greenfo_schema.yaml b/configs/site_schemas/greenfo_schema.yaml new file mode 100644 index 0000000..1328da3 --- /dev/null +++ b/configs/site_schemas/greenfo_schema.yaml @@ -0,0 +1,91 @@ +"site_name": "greenfo" + +"columns": + "beg_0126": + "archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day" + + "date_first_article": 2001-01-26 + "date_last_article": 2001-01-27 + + "initial_pagenum": 1 + + "beg_0224": + "archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day" + + "date_first_article": 2001-02-22 + "date_last_article": 2001-02-24 + + "initial_pagenum": 1 + + "beg_0228": + "archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day" + + "date_first_article": 2001-02-28 + "date_last_article": 2001-02-28 + + "initial_pagenum": 1 + + "beg_0306": + "archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day" + + "date_first_article": 2001-03-06 + "date_last_article": 2001-03-12 + + "initial_pagenum": 1 + + "beg_0321": + "archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day" + + "date_first_article": 2001-03-21 + "date_last_article": 2001-03-24 + + "initial_pagenum": 1 + "min_pagenum": 2 + + "beg_0329": + "archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day" + + "date_first_article": 2001-03-29 + "date_last_article": 2001-03-30 + + "initial_pagenum": 1 + "min_pagenum": 2 + + "beg_0404": + "archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day" + + "date_first_article": 2001-04-04 + "date_last_article": 2001-04-28 + + "initial_pagenum": 1 + "min_pagenum": 2 + + "beg_0505": + "archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day" + + "date_first_article": 2001-05-05 + "date_last_article": 2001-07-08 + + "initial_pagenum": 1 + + "main-column": + "archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day" + + "date_first_article": 2001-07-16 + + "initial_pagenum": 1 + +"portal_specific_exctractor_functions_file": "../extractors/site_specific_extractor_functions_szng.py" +"extract_article_urls_from_page_fun": "extract_article_urls_from_page_greenfo" + +"next_url_by_pagenum": true +"infinite_scrolling": false +"archive_page_urls_by_date": true +"go_reverse_in_archive": false +"verify_request": true +"ignore_archive_cache": false + +# "new_article_url_threshold": 0 + +"corpus_converter_file": "../extractors/corpus_converters.py" +"corpus_converter": "dummy-converter" diff --git a/configs/site_schemas/neokohn_schema.yaml b/configs/site_schemas/neokohn_schema.yaml new file mode 100644 index 0000000..20abc99 --- /dev/null +++ b/configs/site_schemas/neokohn_schema.yaml @@ -0,0 +1,30 @@ +"site_name": "neokohn" + +"columns": + "beginning": + "archive_url_format": "https://neokohn.hu/#year/#month/#day" + + "date_first_article": 2019-01-20 + "date_last_article": 2019-01-20 + + "main-column": + "archive_url_format": "https://neokohn.hu/#year/#month/#day" + + "date_first_article": 2019-02-01 + # There is no article on 02-02 and 02-03. + +"portal_specific_exctractor_functions_file": "../extractors/site_specific_extractor_functions_szng.py" +"extract_next_page_url_fun": "extract_next_page_url_neokohn" +"extract_article_urls_from_page_fun": "extract_article_urls_from_page_neokohn" + +"next_url_by_pagenum": false +"infinite_scrolling": false +"archive_page_urls_by_date": true +"go_reverse_in_archive": false +"verify_request": true +"ignore_archive_cache": false + +# "new_article_url_threshold": 0 + +"corpus_converter_file": "../extractors/corpus_converters.py" +"corpus_converter": "dummy-converter" diff --git a/configs/site_schemas/szombat_schema.yaml b/configs/site_schemas/szombat_schema.yaml new file mode 100644 index 0000000..858d39d --- /dev/null +++ b/configs/site_schemas/szombat_schema.yaml @@ -0,0 +1,23 @@ +"site_name": "szombat" + +"columns": + "main-column": + "archive_url_format": "https://szombat.org/archiv?q=&y=#year&mo=#month" + + "date_first_article": 1989-11-01 + +"portal_specific_exctractor_functions_file": "../extractors/site_specific_extractor_functions_szng.py" +"extract_article_urls_from_page_fun": "extract_article_urls_from_page_szombat" +"extract_next_page_url_fun": "extract_next_page_url_szombat" + +"next_url_by_pagenum": false +"infinite_scrolling": false +"archive_page_urls_by_date": true +"go_reverse_in_archive": false +"verify_request": true +"ignore_archive_cache": false + +# "new_article_url_threshold": 0 + +"corpus_converter_file": "../extractors/corpus_converters.py" +"corpus_converter": "dummy-converter" diff --git a/tests/extract_article_urls_from_page_szng.warc.gz b/tests/extract_article_urls_from_page_szng.warc.gz new file mode 100644 index 0000000..c2ca5e2 Binary files /dev/null and b/tests/extract_article_urls_from_page_szng.warc.gz differ diff --git a/tests/next_page_url_szng.warc.gz b/tests/next_page_url_szng.warc.gz new file mode 100644 index 0000000..d3731fa Binary files /dev/null and b/tests/next_page_url_szng.warc.gz differ