Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add szombat.org, neokohn.hu, greenfo.hu #41

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions configs/config_greenfo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"schema": "site_schemas/greenfo_schema.yaml"

# "output_corpus": "greenfo_corpus.txt"

"log_file_articles": "logs/greenfo_articles_log.txt"
"log_file_archive": "logs/greenfo_archive_log.txt"
"new_problematic_urls": "logs/greenfo_new_problematic_urls.txt"
"new_good_urls": "logs/greenfo_new_good_urls.txt"
"new_problematic_archive_urls": "logs/greenfo_new_problematic_archive_urls.txt"
"new_good_archive_urls": "logs/greenfo_new_good_archive_urls.txt"

# "date_from": 2001-01-26
12 changes: 12 additions & 0 deletions configs/config_neokohn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"schema": "site_schemas/neokohn_schema.yaml"

# "output_corpus": "neokohn_corpus.txt"

"log_file_articles": "logs/neokohn_articles_log.txt"
"log_file_archive": "logs/neokohn_archive_log.txt"
"new_problematic_urls": "logs/neokohn_new_problematic_urls.txt"
"new_good_urls": "logs/neokohn_new_good_urls.txt"
"new_problematic_archive_urls": "logs/neokohn_new_problematic_archive_urls.txt"
"new_good_archive_urls": "logs/neokohn_new_good_archive_urls.txt"

# "date_from": 2019-01-20
10 changes: 10 additions & 0 deletions configs/config_szombat.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"schema": "site_schemas/szombat_schema.yaml"

# "output_corpus": "szombat_corpus.txt"

"log_file_articles": "logs/szombat_articles_log.txt"
"log_file_archive": "logs/szombat_archive_log.txt"
"new_problematic_urls": "logs/szombat_new_problematic_urls.txt"
"new_good_urls": "logs/szombat_new_good_urls.txt"
"new_problematic_archive_urls": "logs/szombat_new_problematic_archive_urls.txt"
"new_good_archive_urls": "logs/szombat_new_good_archive_urls.txt"
243 changes: 243 additions & 0 deletions configs/extractors/site_specific_extractor_functions_szng.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

from os.path import abspath, dirname, join as os_path_join
from bs4 import BeautifulSoup
from webarticlecurator import WarcCachingDownloader, Logger


# BEGIN SITE SPECIFIC extract_next_page_url FUNCTIONS ##################################################################

def extract_next_page_url_neokohn(archive_page_raw_html):
"""
extracts and returns next page URL from an HTML code if there is one...
Specific for neokohn.hu
:returns string of url if there is one, None otherwise
"""
ret = None
soup = BeautifulSoup(archive_page_raw_html, 'lxml')
next_page = soup.find('a', class_='next page-numbers')
if next_page is not None and 'href' in next_page.attrs:
ret = next_page['href']
return ret


def extract_next_page_url_szombat(archive_page_raw_html):
"""
Extract next page url from current archive page
Specific for szombat.org
:returns string of url if there is one, None otherwise
"""
ret = None
soup = BeautifulSoup(archive_page_raw_html, 'lxml')
next_page_url = soup.find('a', class_='nextpostslink')
if next_page_url is not None and 'href' in next_page_url.attrs:
ret = next_page_url['href']
return ret


def extract_next_page_url_test(filename, test_logger):
"""Quick test for extracting "next page" URLs when needed"""
# This function is intended to be used from this file only as the import of WarcCachingDownloader is local to main()
w = WarcCachingDownloader(filename, None, test_logger, just_cache=True, download_params={'stay_offline': True})

# Some of these are intentionally yields None
test_logger.log('INFO', 'Testing neokohn')
text = w.download_url('https://neokohn.hu/2019/02/04/')
assert extract_next_page_url_neokohn(text) == 'https://neokohn.hu/2019/02/04/page/2/'
text = w.download_url('https://neokohn.hu/2022/01/20/')
assert extract_next_page_url_neokohn(text) == 'https://neokohn.hu/2022/01/20/page/2/'
text = w.download_url('https://neokohn.hu/2019/02/09/page/2/')
assert extract_next_page_url_neokohn(text) is None
text = w.download_url('https://neokohn.hu/2019/01/20/')
assert extract_next_page_url_neokohn(text) is None

test_logger.log('INFO', 'Testing szombat')
text = w.download_url('https://szombat.org/archiv?q=&y=1990&mo=2')
assert extract_next_page_url_szombat(text) == 'https://szombat.org/archiv/page/2?q&y=1990&mo=2'
text = w.download_url('https://szombat.org/archiv/page/2?q&y=2020&mo=2')
assert extract_next_page_url_szombat(text) == 'https://szombat.org/archiv/page/3?q&y=2020&mo=2'
text = w.download_url('https://szombat.org/archiv/page/7?q&y=2022&mo=1')
assert extract_next_page_url_szombat(text) == 'https://szombat.org/archiv/page/8?q&y=2022&mo=1'
text = w.download_url('https://szombat.org/archiv/page/8?q&y=2022&mo=1')
assert extract_next_page_url_szombat(text) is None

test_logger.log('INFO', 'Test OK!')


# END SITE SPECIFIC extract_next_page_url FUNCTIONS ####################################################################


def safe_extract_hrefs_from_a_tags(main_container):
"""
Helper function to extract href from a tags
:param main_container: An iterator over Tag()-s
:return: Generator over the extracted links
"""
for a_tag in main_container:
a_tag_a = a_tag.find('a')
if a_tag_a is not None and 'href' in a_tag_a.attrs:
yield a_tag_a['href']


def extract_article_urls_from_page_greenfo(archive_page_raw_html):
"""
extracts and returns as a list the URLs belonging to articles from an HTML code
:param archive_page_raw_html: archive page containing list of articles with their URLs
:return: list that contains URLs
Not all links point to a real article (but the same news from other sites)
hence the use of 'more-link' class instead of the 'h2' tag.
"""
soup = BeautifulSoup(archive_page_raw_html, 'lxml')
main_container = soup.find_all(class_='more-link')
urls = {link['href'] for link in main_container}
return urls


def extract_article_urls_from_page_neokohn(archive_page_raw_html):
"""
extracts and returns as a list the URLs belonging to articles from an HTML code
:param archive_page_raw_html: archive page containing list of articles with their URLs
:return: list that contains URLs
"""
soup = BeautifulSoup(archive_page_raw_html, 'lxml')
main_container = soup.find_all('h3', class_='entry-title mh-posts-list-title')
urls = {link for link in safe_extract_hrefs_from_a_tags(main_container)}
return urls


def extract_article_urls_from_page_szombat(archive_page_raw_html):
"""
extracts and returns as a list the URLs belonging to articles from an HTML code
:param archive_page_raw_html: archive page containing list of articles with their URLs
:return: list that contains URLs
"""
soup = BeautifulSoup(archive_page_raw_html, 'lxml')
main_container = soup.find_all('h1', class_='title')
urls = {link for link in safe_extract_hrefs_from_a_tags(main_container)}
return urls


def extract_article_urls_from_page_test(filename, test_logger):
"""Quick test for extracting URLs form an archive page"""
# This function is intended to be used from this file only as the import of WarcCachingDownloader is local to main()
w = WarcCachingDownloader(filename, None, test_logger, just_cache=True, download_params={'stay_offline': True})

test_logger.log('INFO', 'Testing greenfo')
text = w.download_url('https://greenfo.hu/page/1/?post_types=all&post_date=2001-01-26%202001-01-26')
extracted = extract_article_urls_from_page_greenfo(text)
expected = {'https://greenfo.hu/hir/tarifaemeles-helyett-esszeru-kozlekedesszervezest_980483759/#more-49915',
'https://greenfo.hu/hir/a-hulladekgazdalkodasra-400-700-milliard-forint_980483643/#more-49909',
'https://greenfo.hu/hir/a-4-es-metro-megepitese-sok-fa-elpusztitasaval-jarna_980463600/#more-49910',
'https://greenfo.hu/hir/nemzeti-okologiai-kutatasi-program-indul_980463600/#more-49911',
'https://greenfo.hu/hir/illes-zoltan-el-a-kezekkel-a-nemzeti-parkoktol_980463600/#more-49912',
'https://greenfo.hu/hir/az-unios-tagsag-nem-oldja-meg-a-gondjainkat_980463600/#more-49913',
'https://greenfo.hu/hir/lanyi-andras-iro-kornyezetvedo_980463600/#more-49914'
}
assert (extracted, len(extracted)) == (expected, 7)

text = w.download_url('https://greenfo.hu/page/3/?post_types=all&post_date=2010-05-05%202010-05-05')
extracted = extract_article_urls_from_page_greenfo(text)
expected = {'https://greenfo.hu/hir/csikosfeju-nadiposzata-baranyaban_1273049570/#more-74036',
}
assert (extracted, len(extracted)) == (expected, 1)

text = w.download_url('https://greenfo.hu/page/1/?post_types=all&post_date=2018-08-06%202018-08-06')
extracted = extract_article_urls_from_page_greenfo(text)
expected = {'https://greenfo.hu/hir/egymas-nyelvet-kepesek-megtanulni-a-madarak/#more-48578',
'https://greenfo.hu/hir/a-magyar-mehallomany-fele-elpusztulhatott/#more-48577'
}
assert (extracted, len(extracted)) == (expected, 2)

test_logger.log('INFO', 'Testing neokohn')
text = w.download_url('https://neokohn.hu/2019/01/20/')
extracted = extract_article_urls_from_page_neokohn(text)
expected = {'https://neokohn.hu/2019/01/20/new-york-times-eilat-is-a-legjobb-uti-celok-kozott-szerepel/'
}
assert (extracted, len(extracted)) == (expected, 1)

text = w.download_url('https://neokohn.hu/2021/09/21/')
extracted = extract_article_urls_from_page_neokohn(text)
expected = {'https://neokohn.hu/2021/09/21/biden-kudarcot-vallott-erotlen-vezetese-katasztrofak-sorozatat'
'-okozta-es-artott-az-usa-nak/',
'https://neokohn.hu/2021/09/21/donald-trump-szerint-hazaarulast-kovetett-el-mark-milley'
'-vezerkari-fonok/',
'https://neokohn.hu/2021/09/21/izgalmas-reszletek-derultek-ki-az-irani-atomprogram-atyjanak'
'-kiiktatasarol/',
'https://neokohn.hu/2021/09/21/hogyan-unnepeltek-a-szukkotot-2000-evvel-ezelott/',
'https://neokohn.hu/2021/09/21/ensz-fotitkar-mindenaron-meg-kell-akadalyoznunk-az-usa'
'-kina-hideghaborut/',
'https://neokohn.hu/2021/09/21/parizs-hazugsag-ketszinuseg-a-bizalom-sulyos-megsertese'
'-es-megvetese-tortent/'
}
assert (extracted, len(extracted)) == (expected, 6)

text = w.download_url('https://neokohn.hu/2022/01/20/page/2/')
extracted = extract_article_urls_from_page_neokohn(text)
expected = {'https://neokohn.hu/2022/01/20/80-eve-dontottek-a-zsidokerdes-vegso-megoldasarol/'
}
assert (extracted, len(extracted)) == (expected, 1)

test_logger.log('INFO', 'Testing szombat')
text = w.download_url('https://szombat.org/archiv/page/5?q&y=2022&mo=1')
extracted = extract_article_urls_from_page_szombat(text)
expected = {'https://szombat.org/hirek-lapszemle/netanjahu-es-a-vadalku-lehetosege-mandelblit-nem-enged-a-het-ev-'
'kozszolgalattol-valo-eltiltasbol',
'https://szombat.org/hirek-lapszemle/borrell-szerint-lehetseges-a-megallapodas-az-irani-atomalkurol-'
'parizs-ketelkedik',
'https://szombat.org/hagyomany-tortenelem/a-birkanyiras-elseje-misna-magyarul-hulin-11',
'https://szombat.org/kultura-muveszetek/uri-asaf-ujjaszuletesei',
'https://szombat.org/politika/macron-unios-vitat-indit-az-istenkaromlasrol-es-dzsihadrol',
'https://szombat.org/kultura-muveszetek/elhunyt-fenakel-judit',
'https://szombat.org/tortenelem/lengyelorszag-tobb-mint-tizezer-onkentes-segitett-szaz-zsido-temeto-'
'helyreallitasaban-2021-ben',
'https://szombat.org/hirek-lapszemle/heisler-andras-egy-eros-mazsihisz-a-magyarorszagi-zsidosag-erdeke',
'https://szombat.org/hirek-lapszemle/belgium-tiz-evre-kiutasitottak-a-legnagyobb-mecset-uszitassal-es-'
'kemkedessel-vadolt-vezetojet',
'https://szombat.org/hirek-lapszemle/izrael-tobb-mint-250-ezer-virushordozo-14-kneszet-kepviselo-'
'karantenban',
'https://szombat.org/hirek-lapszemle/izraeli-baleset-barati-tuz-vegzett-ket-kommandossal',
'https://szombat.org/politika/zsidozasrol-es-antiszemitazasrol-magyarorszag-ideges-tele-a-valasztasok-'
'elott',
'https://szombat.org/kultura-muveszetek/77-eves-koraban-elhunyt-michael-lang',
'https://szombat.org/kozosseg/tu-bisvat-i-szeder-az-orthodoxian',
'https://szombat.org/hirek-lapszemle/izrael-sok-a-fertozott-de-tiz-naprol-hetre-csokkentik-a-karanten-'
'idejet',
'https://szombat.org/politika/europai-biralat-a-simon-wiesenthal-kozpont-antiszemitizmus-listajanak',
'https://szombat.org/hirek-lapszemle/magyar-nemzeti-ongyuloletettel-vadolja-az-ellenzeket-szabo-gyorgy',
'https://szombat.org/hirek-lapszemle/kibovitett-zsido-nepesseg-javaslat-izraeli-bevandorlok-'
'statuszanak-rendezesere',
'https://szombat.org/hirek-lapszemle/marki-zay-megprobaltak-engem-antiszemitanak-beallitani',
'https://szombat.org/hirek-lapszemle/jair-lapid-izraeli-kulugyminiszter-koronavirusos'
}
assert (extracted, len(extracted)) == (expected, 20)

test_logger.log('INFO', 'Test OK!')


# END SITE SPECIFIC extract_article_urls_from_page FUNCTIONS ###########################################################

# BEGIN SITE SPECIFIC next_page_of_article FUNCTIONS ###################################################################


# END SITE SPECIFIC next_page_of_article FUNCTIONS #####################################################################

def main_test():
main_logger = Logger()

# Relative path from this directory to the files in the project's test directory
choices = {'nextpage': os_path_join(dirname(abspath(__file__)), '../../tests/next_page_url_szng.warc.gz'),
'article_nextpage': os_path_join(dirname(abspath(__file__)),
'../../tests/next_page_of_article_szng.warc.gz'),
'archive': os_path_join(dirname(abspath(__file__)),
'../../tests/extract_article_urls_from_page_szng.warc.gz')
}

# Use the main module to modify the warc files!
extract_next_page_url_test(choices['nextpage'], main_logger)
extract_article_urls_from_page_test(choices['archive'], main_logger)
# next_page_of_article_test(choices['article_nextpage'], main_logger)


if __name__ == '__main__':
main_test()
91 changes: 91 additions & 0 deletions configs/site_schemas/greenfo_schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"site_name": "greenfo"

"columns":
"beg_0126":
"archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day"

"date_first_article": 2001-01-26
"date_last_article": 2001-01-27

"initial_pagenum": 1

"beg_0224":
"archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day"

"date_first_article": 2001-02-22
"date_last_article": 2001-02-24

"initial_pagenum": 1

"beg_0228":
"archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day"

"date_first_article": 2001-02-28
"date_last_article": 2001-02-28

"initial_pagenum": 1

"beg_0306":
"archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day"

"date_first_article": 2001-03-06
"date_last_article": 2001-03-12

"initial_pagenum": 1

"beg_0321":
"archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day"

"date_first_article": 2001-03-21
"date_last_article": 2001-03-24

"initial_pagenum": 1
"min_pagenum": 2

"beg_0329":
"archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day"

"date_first_article": 2001-03-29
"date_last_article": 2001-03-30

"initial_pagenum": 1
"min_pagenum": 2

"beg_0404":
"archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day"

"date_first_article": 2001-04-04
"date_last_article": 2001-04-28

"initial_pagenum": 1
"min_pagenum": 2

"beg_0505":
"archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day"

"date_first_article": 2001-05-05
"date_last_article": 2001-07-08

"initial_pagenum": 1

"main-column":
"archive_url_format": "https://greenfo.hu/page/#pagenum/?post_types=all&post_date=#year-#month-#day%20#year-#month-#day"

"date_first_article": 2001-07-16

"initial_pagenum": 1

"portal_specific_exctractor_functions_file": "../extractors/site_specific_extractor_functions_szng.py"
"extract_article_urls_from_page_fun": "extract_article_urls_from_page_greenfo"

"next_url_by_pagenum": true
"infinite_scrolling": false
"archive_page_urls_by_date": true
"go_reverse_in_archive": false
"verify_request": true
"ignore_archive_cache": false

# "new_article_url_threshold": 0

"corpus_converter_file": "../extractors/corpus_converters.py"
"corpus_converter": "dummy-converter"
Loading