Skip to content

Commit

Permalink
Filestore instead of urls
Browse files Browse the repository at this point in the history
  • Loading branch information
mcarans committed Feb 18, 2020
1 parent 1fa2df0 commit 26e54e2
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 70 deletions.
63 changes: 23 additions & 40 deletions acled.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,26 @@
"""
import logging
from urllib.parse import quote_plus

from hdx.data.dataset import Dataset
from hdx.data.resource_view import ResourceView
from hdx.data.showcase import Showcase
from hdx.location.country import Country
from slugify import slugify

logger = logging.getLogger(__name__)

hxlate = '&name=ACLEDHXL&tagger-match-all=on&tagger-03-header=event_id_cnty&tagger-03-tag=%23event%2Bcode&tagger-05-header=event_date&tagger-05-tag=%23date%2Boccurred&tagger-06-header=year&tagger-06-tag=%23date%2Byear&tagger-08-header=event_type&tagger-08-tag=%23event%2Btype&tagger-09-header=actor1&tagger-09-tag=%23group%2Bname%2Bfirst&tagger-10-header=assoc_actor_1&tagger-10-tag=%23group%2Bname%2Bfirst%2Bassoc&tagger-12-header=actor2&tagger-12-tag=%23group%2Bname%2Bsecond&tagger-13-header=assoc_actor_2&tagger-13-tag=%23group%2Bname%2Bsecond%2Bassoc&tagger-16-header=region&tagger-16-tag=%23region%2Bname&tagger-17-header=country&tagger-17-tag=%23country%2Bname&tagger-18-header=admin1&tagger-18-tag=%23adm1%2Bname&tagger-19-header=admin2&tagger-19-tag=%23adm2%2Bname&tagger-20-header=admin3&tagger-20-tag=%23adm3%2Bname&tagger-21-header=location&tagger-21-tag=%23loc%2Bname&tagger-22-header=latitude&tagger-22-tag=%23geo%2Blat&tagger-23-header=longitude&tagger-23-tag=%23geo%2Blon&tagger-25-header=source&tagger-25-tag=%23meta%2Bsource&tagger-27-header=notes&tagger-27-tag=%23description&tagger-28-header=fatalities&tagger-28-tag=%23affected%2Bkilled&tagger-30-header=iso3&tagger-30-tag=%23country%2Bcode&header-row=1'
resource_name = 'Conflict Data for %s'
# Mapping from a column header in the downloaded ACLED CSV to the HXL hashtag
# for that column. The header/tag pairs mirror the ``tagger-NN-header`` /
# ``tagger-NN-tag`` pairs of the HXL Proxy query string used before this change.
# It is passed to ``dataset.download_and_generate_resource(...)`` in
# ``generate_dataset_and_showcase``.
# NOTE(review): presumably that call uses the mapping to add an HXL tag row to
# the generated file - confirm against the hdx-python-api documentation.
hxltags = {'event_id_cnty': '#event+code', 'event_date': '#date+occurred', 'year': '#date+year',
'event_type': '#event+type', 'actor1': '#group+name+first', 'assoc_actor_1': '#group+name+first+assoc',
'actor2': '#group+name+second', 'assoc_actor_2': '#group+name+second+assoc', 'region': '#region+name',
'country': '#country+name', 'admin1': '#adm1+name', 'admin2': '#adm2+name', 'admin3': '#adm3+name',
'location': '#loc+name', 'latitude': '#geo+lat', 'longitude': '#geo+lon', 'source': '#meta+source',
'notes': '#description', 'fatalities': '#affected+killed', 'iso3': '#country+code'}


def get_countriesdata(countries_url, downloader):
def get_countries(countries_url, downloader):
countries = list()
for row in downloader.get_tabular_rows(countries_url, dict_rows=True, headers=1, format='xlsx'):
# country = row['Name']
# iso3, _ = Country.get_iso3_country_code_fuzzy(country, exception=ValueError)
# m49 = Country.get_m49_from_iso3(iso3)
headers, iterator = downloader.get_tabular_rows(countries_url, headers=1, dict_form=True, format='xlsx')
for row in iterator:
m49 = row['ISO Code']
if not m49:
continue
Expand All @@ -37,16 +37,12 @@ def get_countriesdata(countries_url, downloader):
return countries


def generate_dataset_and_showcase(base_url, hxlproxy_url, downloader, countrydata):
"""
Create HXLated URLs to ACLED API
eg. https://data.humdata.org/hxlproxy/data.csv?name=ACLEDHXL&url=https%3A//api.acleddata.com/acled/read.csv%3Flimit%3D0%26iso%3D120&tagger-match-all=on&tagger-02-header=iso&tagger-02-tag=%23country%2Bcode&tagger-03-header=event_id_cnty&tagger-03-tag=%23event%2Bcode&tagger-05-header=event_date&tagger-05-tag=%23date%2Boccurred+&tagger-08-header=event_type&tagger-08-tag=%23event%2Btype&tagger-09-header=actor1&tagger-09-tag=%23group%2Bname%2Bfirst&tagger-10-header=assoc_actor_1&tagger-10-tag=%23group%2Bname%2Bfirst%2Bassoc&tagger-12-header=actor2&tagger-12-tag=%23group%2Bname%2Bsecond&tagger-13-header=assoc_actor_2&tagger-13-tag=%23group%2Bname%2Bsecond%2Bassoc&tagger-16-header=region&tagger-16-tag=%23region%2Bname&tagger-17-header=country&tagger-17-tag=%23country%2Bname&tagger-18-header=admin1&tagger-18-tag=%23adm1%2Bname&tagger-19-header=admin2&tagger-19-tag=%23adm2%2Bname&tagger-20-header=admin3&tagger-20-tag=%23adm3%2Bname&tagger-21-header=location&tagger-21-tag=%23loc%2Bname&tagger-22-header=latitude&tagger-22-tag=%23geo%2Blat&tagger-23-header=longitude&tagger-23-tag=%23geo%2Blon&tagger-25-header=source&tagger-25-tag=%23meta%2Bsource&tagger-27-header=notes&tagger-27-tag=%23description&tagger-28-header=fatalities&tagger-28-tag=%23affected%2Bkilled&header-row=1
"""
countryname = countrydata['countryname']
def generate_dataset_and_showcase(base_url, downloader, folder, country):
countryname = country['countryname']
title = '%s - Conflict Data' % countryname
logger.info('Creating dataset: %s' % title)
slugified_name = slugify('ACLED Data for %s' % countryname).lower()
countryiso = countrydata['iso3']
countryiso = country['iso3']
dataset = Dataset({
'name': slugified_name,
'title': title,
Expand All @@ -59,35 +55,22 @@ def generate_dataset_and_showcase(base_url, hxlproxy_url, downloader, countrydat
tags = ['hxl', 'violence and conflict', 'protests', 'security incidents']
dataset.add_tags(tags)

acled_country_url = '%siso=%d' % (base_url, countrydata['m49'])
url = '%s%s.csv?url=%s%s' % (hxlproxy_url, resource_name % countryname, quote_plus(acled_country_url), hxlate)
earliest_year = 10000
latest_year = 0
for row in downloader.get_tabular_rows(acled_country_url, dict_rows=True, headers=1):
year = int(row['year'])
if year < earliest_year:
earliest_year = year
if year > latest_year:
latest_year = year

if latest_year == 0:
url = '%siso=%d' % (base_url, country['m49'])
filename = 'conflict_data_%s.csv' % countryiso
resourcedata = {
'name': 'Conflict Data for %s' % countryname,
'description': 'Conflict data with HXL tags'
}
success, results = dataset.download_and_generate_resource(downloader, url, hxltags, folder, filename, resourcedata, yearcol='year')
if success is False:
logger.warning('%s has no data!' % countryname)
return None, None

resource = {
'name': resource_name % countryname,
'description': 'Conflict data with HXL tags',
'format': 'csv',
'url': url
}
dataset.add_update_resource(resource)
dataset.set_dataset_year_range(earliest_year, latest_year)

showcase = Showcase({
'name': '%s-showcase' % slugified_name,
'title': 'Dashboard for %s' % countrydata['countryname'],
'notes': 'Conflict Data Dashboard for %s' % countrydata['countryname'],
'url': 'https://www.acleddata.com/dashboard/#%03d' % countrydata['m49'],
'title': 'Dashboard for %s' % country['countryname'],
'notes': 'Conflict Data Dashboard for %s' % country['countryname'],
'url': 'https://www.acleddata.com/dashboard/#%03d' % country['m49'],
'image_url': 'https://www.acleddata.com/wp-content/uploads/2018/01/dash.png'
})
showcase.add_tags(tags)
Expand Down
1 change: 0 additions & 1 deletion config/project_configuration.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# Collector specific configuration
base_url: "https://api.acleddata.com/acled/read.csv?limit=0&terms=accept&"
countries_url: "https://www.acleddata.com/download/3987/"
hxlproxy_url: "https://proxy.hxlstandard.org/data/download/"
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
python-slugify==3.0.2
hdx-python-api==3.8.3
python-slugify==4.0.0
hdx-python-api==4.2.5
-r docker-requirements.txt
15 changes: 7 additions & 8 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@

from hdx.hdx_configuration import Configuration
from hdx.utilities.downloader import Download
from hdx.utilities.path import progress_storing_tempdir

from acled import get_countriesdata, generate_dataset_and_showcase
from acled import get_countries, generate_dataset_and_showcase

from hdx.facades.simple import facade

Expand All @@ -26,20 +27,18 @@ def main():
configuration = Configuration.read()
base_url = configuration['base_url']
countries_url = configuration['countries_url']
hxlproxy_url = configuration['hxlproxy_url']
with Download() as downloader:
countriesdata = get_countriesdata(countries_url, downloader)
logger.info('Number of datasets to upload: %d' % len(countriesdata))
for countrydata in sorted(countriesdata, key=lambda x: x['iso3']):
dataset, showcase = generate_dataset_and_showcase(base_url, hxlproxy_url, downloader, countrydata)
countries = get_countries(countries_url, downloader)
logger.info('Number of datasets to upload: %d' % len(countries))
for folder, country in progress_storing_tempdir('ACLED', sorted(countries, key=lambda x: x['iso3']), 'iso3'):
dataset, showcase = generate_dataset_and_showcase(base_url, downloader, folder, country)
if dataset:
dataset.update_from_yaml()
dataset['license_other'] = dataset['license_other'].replace('\n', ' \n') # ensure markdown has line breaks
dataset.create_in_hdx(hxl_update=False)
dataset.create_in_hdx(remove_additional_resources=True, hxl_update=False, updated_by_script='HDX Scraper: ACLED')
dataset.generate_resource_view()
showcase.create_in_hdx()
showcase.add_dataset(dataset)
sleep(1)


if __name__ == '__main__':
Expand Down
39 changes: 20 additions & 19 deletions tests/test_acled.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
from hdx.hdx_configuration import Configuration
from hdx.hdx_locations import Locations
from hdx.location.country import Country
from hdx.utilities.path import temp_dir

from acled import generate_dataset_and_showcase, get_countriesdata
from acled import generate_dataset_and_showcase, get_countries


class TestAcled():
Expand All @@ -23,8 +24,7 @@ class TestAcled():
'owner_org': 'b67e6c74-c185-4f43-b561-0e114a736f19', 'data_update_frequency': '0',
'title': 'Cameroon - Conflict Data', 'subnational': '1'}
resource = {'description': 'Conflict data with HXL tags', 'name': 'Conflict Data for Cameroon',
'format': 'csv', 'resource_type': 'api', 'url_type': 'api',
'url': 'https://proxy.hxlstandard.org/data/download/Conflict Data for Cameroon.csv?url=http%3A%2F%2Flala%3Fiso%3D120&name=ACLEDHXL&tagger-match-all=on&tagger-03-header=event_id_cnty&tagger-03-tag=%23event%2Bcode&tagger-05-header=event_date&tagger-05-tag=%23date%2Boccurred&tagger-06-header=year&tagger-06-tag=%23date%2Byear&tagger-08-header=event_type&tagger-08-tag=%23event%2Btype&tagger-09-header=actor1&tagger-09-tag=%23group%2Bname%2Bfirst&tagger-10-header=assoc_actor_1&tagger-10-tag=%23group%2Bname%2Bfirst%2Bassoc&tagger-12-header=actor2&tagger-12-tag=%23group%2Bname%2Bsecond&tagger-13-header=assoc_actor_2&tagger-13-tag=%23group%2Bname%2Bsecond%2Bassoc&tagger-16-header=region&tagger-16-tag=%23region%2Bname&tagger-17-header=country&tagger-17-tag=%23country%2Bname&tagger-18-header=admin1&tagger-18-tag=%23adm1%2Bname&tagger-19-header=admin2&tagger-19-tag=%23adm2%2Bname&tagger-20-header=admin3&tagger-20-tag=%23adm3%2Bname&tagger-21-header=location&tagger-21-tag=%23loc%2Bname&tagger-22-header=latitude&tagger-22-tag=%23geo%2Blat&tagger-23-header=longitude&tagger-23-tag=%23geo%2Blon&tagger-25-header=source&tagger-25-tag=%23meta%2Bsource&tagger-27-header=notes&tagger-27-tag=%23description&tagger-28-header=fatalities&tagger-28-tag=%23affected%2Bkilled&tagger-30-header=iso3&tagger-30-tag=%23country%2Bcode&header-row=1'}
'format': 'csv', 'resource_type': 'file.upload', 'url_type': 'upload'}

@pytest.fixture(scope='function')
def configuration(self):
Expand All @@ -44,32 +44,33 @@ def json():

class Download:
@staticmethod
def get_tabular_rows(url, dict_rows, headers, format=None):
def get_tabular_rows(url, **kwargs):
if url == 'http://haha':
return [{'Name': 'Cameroon', 'ACLED country-code': 'CMR', 'ISO Code': 120, 'Region-code': 'Middle Africa'}]
return ['Name', 'ACLED country-code', 'ISO Code', 'Region-code'], \
[{'Name': 'Cameroon', 'ACLED country-code': 'CMR', 'ISO Code': 120, 'Region-code': 'Middle Africa'}]
elif url == 'http://lala?iso=120':
return [{'year': '1997'}, {'year': '2018'}]
return ['year'], [{'year': '1997'}, {'year': '2018'}]
elif url == 'http://lala?iso=4':
return list()
return None, list()

return Download()

def test_get_countriesdata(self, downloader):
countriesdata = get_countriesdata('http://haha', downloader)
countriesdata = get_countries('http://haha', downloader)
assert countriesdata == [TestAcled.countrydata]

def test_generate_dataset_and_showcase(self, configuration, downloader):
hxlproxy_url = Configuration.read()['hxlproxy_url']
dataset, showcase = generate_dataset_and_showcase('http://lala?', hxlproxy_url, downloader, TestAcled.countrydata)
assert dataset == TestAcled.dataset
with temp_dir('ACLED') as folder:
dataset, showcase = generate_dataset_and_showcase('http://lala?', downloader, folder, TestAcled.countrydata)
assert dataset == TestAcled.dataset

resources = dataset.get_resources()
assert resources == [TestAcled.resource]
resources = dataset.get_resources()
assert resources == [TestAcled.resource]

assert showcase == {'name': 'acled-data-for-cameroon-showcase', 'notes': 'Conflict Data Dashboard for Cameroon',
'url': 'https://www.acleddata.com/dashboard/#120',
'tags': [{'name': 'hxl', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'violence and conflict', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'protests', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'security incidents', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}],
'title': 'Dashboard for Cameroon', 'image_url': 'https://www.acleddata.com/wp-content/uploads/2018/01/dash.png'}
assert showcase == {'name': 'acled-data-for-cameroon-showcase', 'notes': 'Conflict Data Dashboard for Cameroon',
'url': 'https://www.acleddata.com/dashboard/#120',
'tags': [{'name': 'hxl', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'violence and conflict', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'protests', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'security incidents', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}],
'title': 'Dashboard for Cameroon', 'image_url': 'https://www.acleddata.com/wp-content/uploads/2018/01/dash.png'}

dataset, showcase = generate_dataset_and_showcase('http://lala?', hxlproxy_url, downloader, {'m49': 4, 'iso3': 'AFG', 'countryname': 'Afghanistan'})
assert dataset is None
dataset, showcase = generate_dataset_and_showcase('http://lala?', downloader, folder, {'m49': 4, 'iso3': 'AFG', 'countryname': 'Afghanistan'})
assert dataset is None

0 comments on commit 26e54e2

Please sign in to comment.