Use filestore instead of URLs

OCHA-DAP · Feb 18, 2020 · 26e54e2
1 parent 1fa2df0
commit 26e54e2
Show file tree

Hide file tree

Showing 5 changed files with 52 additions and 70 deletions.
diff --git a/acled.py b/acled.py
@@ -8,26 +8,26 @@
 
 """
 import logging
-from urllib.parse import quote_plus
 
 from hdx.data.dataset import Dataset
-from hdx.data.resource_view import ResourceView
 from hdx.data.showcase import Showcase
 from hdx.location.country import Country
 from slugify import slugify
 
 logger = logging.getLogger(__name__)
 
-hxlate = '&name=ACLEDHXL&tagger-match-all=on&tagger-03-header=event_id_cnty&tagger-03-tag=%23event%2Bcode&tagger-05-header=event_date&tagger-05-tag=%23date%2Boccurred&tagger-06-header=year&tagger-06-tag=%23date%2Byear&tagger-08-header=event_type&tagger-08-tag=%23event%2Btype&tagger-09-header=actor1&tagger-09-tag=%23group%2Bname%2Bfirst&tagger-10-header=assoc_actor_1&tagger-10-tag=%23group%2Bname%2Bfirst%2Bassoc&tagger-12-header=actor2&tagger-12-tag=%23group%2Bname%2Bsecond&tagger-13-header=assoc_actor_2&tagger-13-tag=%23group%2Bname%2Bsecond%2Bassoc&tagger-16-header=region&tagger-16-tag=%23region%2Bname&tagger-17-header=country&tagger-17-tag=%23country%2Bname&tagger-18-header=admin1&tagger-18-tag=%23adm1%2Bname&tagger-19-header=admin2&tagger-19-tag=%23adm2%2Bname&tagger-20-header=admin3&tagger-20-tag=%23adm3%2Bname&tagger-21-header=location&tagger-21-tag=%23loc%2Bname&tagger-22-header=latitude&tagger-22-tag=%23geo%2Blat&tagger-23-header=longitude&tagger-23-tag=%23geo%2Blon&tagger-25-header=source&tagger-25-tag=%23meta%2Bsource&tagger-27-header=notes&tagger-27-tag=%23description&tagger-28-header=fatalities&tagger-28-tag=%23affected%2Bkilled&tagger-30-header=iso3&tagger-30-tag=%23country%2Bcode&header-row=1'
-resource_name = 'Conflict Data for %s'
+hxltags = {'event_id_cnty': '#event+code', 'event_date': '#date+occurred', 'year': '#date+year',
+           'event_type': '#event+type', 'actor1': '#group+name+first', 'assoc_actor_1': '#group+name+first+assoc',
+           'actor2': '#group+name+second', 'assoc_actor_2': '#group+name+second+assoc', 'region': '#region+name',
+           'country': '#country+name', 'admin1': '#adm1+name', 'admin2': '#adm2+name', 'admin3': '#adm3+name',
+           'location': '#loc+name', 'latitude': '#geo+lat', 'longitude': '#geo+lon', 'source': '#meta+source',
+           'notes': '#description', 'fatalities': '#affected+killed', 'iso3': '#country+code'}
 
 
-def get_countriesdata(countries_url, downloader):
+def get_countries(countries_url, downloader):
     countries = list()
-    for row in downloader.get_tabular_rows(countries_url, dict_rows=True, headers=1, format='xlsx'):
-        # country = row['Name']
-        # iso3, _ = Country.get_iso3_country_code_fuzzy(country, exception=ValueError)
-        # m49 = Country.get_m49_from_iso3(iso3)
+    headers, iterator = downloader.get_tabular_rows(countries_url, headers=1, dict_form=True, format='xlsx')
+    for row in iterator:
         m49 = row['ISO Code']
         if not m49:
             continue
@@ -37,16 +37,12 @@ def get_countriesdata(countries_url, downloader):
     return countries
 
 
-def generate_dataset_and_showcase(base_url, hxlproxy_url, downloader, countrydata):
-    """
-      Create HXLated URLs to ACLED API
-      eg. https://data.humdata.org/hxlproxy/data.csv?name=ACLEDHXL&url=https%3A//api.acleddata.com/acled/read.csv%3Flimit%3D0%26iso%3D120&tagger-match-all=on&tagger-02-header=iso&tagger-02-tag=%23country%2Bcode&tagger-03-header=event_id_cnty&tagger-03-tag=%23event%2Bcode&tagger-05-header=event_date&tagger-05-tag=%23date%2Boccurred+&tagger-08-header=event_type&tagger-08-tag=%23event%2Btype&tagger-09-header=actor1&tagger-09-tag=%23group%2Bname%2Bfirst&tagger-10-header=assoc_actor_1&tagger-10-tag=%23group%2Bname%2Bfirst%2Bassoc&tagger-12-header=actor2&tagger-12-tag=%23group%2Bname%2Bsecond&tagger-13-header=assoc_actor_2&tagger-13-tag=%23group%2Bname%2Bsecond%2Bassoc&tagger-16-header=region&tagger-16-tag=%23region%2Bname&tagger-17-header=country&tagger-17-tag=%23country%2Bname&tagger-18-header=admin1&tagger-18-tag=%23adm1%2Bname&tagger-19-header=admin2&tagger-19-tag=%23adm2%2Bname&tagger-20-header=admin3&tagger-20-tag=%23adm3%2Bname&tagger-21-header=location&tagger-21-tag=%23loc%2Bname&tagger-22-header=latitude&tagger-22-tag=%23geo%2Blat&tagger-23-header=longitude&tagger-23-tag=%23geo%2Blon&tagger-25-header=source&tagger-25-tag=%23meta%2Bsource&tagger-27-header=notes&tagger-27-tag=%23description&tagger-28-header=fatalities&tagger-28-tag=%23affected%2Bkilled&header-row=1
-    """
-    countryname = countrydata['countryname']
+def generate_dataset_and_showcase(base_url, downloader, folder, country):
+    countryname = country['countryname']
     title = '%s - Conflict Data' % countryname
     logger.info('Creating dataset: %s' % title)
     slugified_name = slugify('ACLED Data for %s' % countryname).lower()
-    countryiso = countrydata['iso3']
+    countryiso = country['iso3']
     dataset = Dataset({
         'name': slugified_name,
         'title': title,
@@ -59,35 +55,22 @@ def generate_dataset_and_showcase(base_url, hxlproxy_url, downloader, countrydat
     tags = ['hxl', 'violence and conflict', 'protests', 'security incidents']
     dataset.add_tags(tags)
 
-    acled_country_url = '%siso=%d' % (base_url, countrydata['m49'])
-    url = '%s%s.csv?url=%s%s' % (hxlproxy_url, resource_name % countryname, quote_plus(acled_country_url), hxlate)
-    earliest_year = 10000
-    latest_year = 0
-    for row in downloader.get_tabular_rows(acled_country_url, dict_rows=True, headers=1):
-        year = int(row['year'])
-        if year < earliest_year:
-            earliest_year = year
-        if year > latest_year:
-            latest_year = year
-
-    if latest_year == 0:
+    url = '%siso=%d' % (base_url, country['m49'])
+    filename = 'conflict_data_%s.csv' % countryiso
+    resourcedata = {
+        'name': 'Conflict Data for %s' % countryname,
+        'description': 'Conflict data with HXL tags'
+    }
+    success, results = dataset.download_and_generate_resource(downloader, url, hxltags, folder, filename, resourcedata, yearcol='year')
+    if success is False:
         logger.warning('%s has no data!' % countryname)
         return None, None
 
-    resource = {
-        'name': resource_name % countryname,
-        'description': 'Conflict data with HXL tags',
-        'format': 'csv',
-        'url': url
-    }
-    dataset.add_update_resource(resource)
-    dataset.set_dataset_year_range(earliest_year, latest_year)
-
     showcase = Showcase({
         'name': '%s-showcase' % slugified_name,
-        'title': 'Dashboard for %s' % countrydata['countryname'],
-        'notes': 'Conflict Data Dashboard for %s' % countrydata['countryname'],
-        'url': 'https://www.acleddata.com/dashboard/#%03d' % countrydata['m49'],
+        'title': 'Dashboard for %s' % country['countryname'],
+        'notes': 'Conflict Data Dashboard for %s' % country['countryname'],
+        'url': 'https://www.acleddata.com/dashboard/#%03d' % country['m49'],
         'image_url': 'https://www.acleddata.com/wp-content/uploads/2018/01/dash.png'
     })
     showcase.add_tags(tags)

diff --git a/config/project_configuration.yml b/config/project_configuration.yml
@@ -1,4 +1,3 @@
 # Collector specific configuration
 base_url: "https://api.acleddata.com/acled/read.csv?limit=0&terms=accept&"
 countries_url: "https://www.acleddata.com/download/3987/"
-hxlproxy_url: "https://proxy.hxlstandard.org/data/download/"
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,3 @@
-python-slugify==3.0.2
-hdx-python-api==3.8.3
+python-slugify==4.0.0
+hdx-python-api==4.2.5
 -r docker-requirements.txt
diff --git a/run.py b/run.py
@@ -10,8 +10,9 @@
 
 from hdx.hdx_configuration import Configuration
 from hdx.utilities.downloader import Download
+from hdx.utilities.path import progress_storing_tempdir
 
-from acled import get_countriesdata, generate_dataset_and_showcase
+from acled import get_countries, generate_dataset_and_showcase
 
 from hdx.facades.simple import facade
 
@@ -26,20 +27,18 @@ def main():
     configuration = Configuration.read()
     base_url = configuration['base_url']
     countries_url = configuration['countries_url']
-    hxlproxy_url = configuration['hxlproxy_url']
     with Download() as downloader:
-        countriesdata = get_countriesdata(countries_url, downloader)
-        logger.info('Number of datasets to upload: %d' % len(countriesdata))
-        for countrydata in sorted(countriesdata, key=lambda x: x['iso3']):
-            dataset, showcase = generate_dataset_and_showcase(base_url, hxlproxy_url, downloader, countrydata)
+        countries = get_countries(countries_url, downloader)
+        logger.info('Number of datasets to upload: %d' % len(countries))
+        for folder, country in progress_storing_tempdir('ACLED', sorted(countries, key=lambda x: x['iso3']), 'iso3'):
+            dataset, showcase = generate_dataset_and_showcase(base_url, downloader, folder, country)
             if dataset:
                 dataset.update_from_yaml()
                 dataset['license_other'] = dataset['license_other'].replace('\n', '  \n')  # ensure markdown has line breaks
-                dataset.create_in_hdx(hxl_update=False)
+                dataset.create_in_hdx(remove_additional_resources=True, hxl_update=False, updated_by_script='HDX Scraper: ACLED')
                 dataset.generate_resource_view()
                 showcase.create_in_hdx()
                 showcase.add_dataset(dataset)
-                sleep(1)
 
 
 if __name__ == '__main__':

diff --git a/tests/test_acled.py b/tests/test_acled.py
@@ -11,8 +11,9 @@
 from hdx.hdx_configuration import Configuration
 from hdx.hdx_locations import Locations
 from hdx.location.country import Country
+from hdx.utilities.path import temp_dir
 
-from acled import generate_dataset_and_showcase, get_countriesdata
+from acled import generate_dataset_and_showcase, get_countries
 
 
 class TestAcled():
@@ -23,8 +24,7 @@ class TestAcled():
                'owner_org': 'b67e6c74-c185-4f43-b561-0e114a736f19', 'data_update_frequency': '0',
                'title': 'Cameroon - Conflict Data', 'subnational': '1'}
     resource = {'description': 'Conflict data with HXL tags', 'name': 'Conflict Data for Cameroon',
-                'format': 'csv', 'resource_type': 'api', 'url_type': 'api',
-                'url': 'https://proxy.hxlstandard.org/data/download/Conflict Data for Cameroon.csv?url=http%3A%2F%2Flala%3Fiso%3D120&name=ACLEDHXL&tagger-match-all=on&tagger-03-header=event_id_cnty&tagger-03-tag=%23event%2Bcode&tagger-05-header=event_date&tagger-05-tag=%23date%2Boccurred&tagger-06-header=year&tagger-06-tag=%23date%2Byear&tagger-08-header=event_type&tagger-08-tag=%23event%2Btype&tagger-09-header=actor1&tagger-09-tag=%23group%2Bname%2Bfirst&tagger-10-header=assoc_actor_1&tagger-10-tag=%23group%2Bname%2Bfirst%2Bassoc&tagger-12-header=actor2&tagger-12-tag=%23group%2Bname%2Bsecond&tagger-13-header=assoc_actor_2&tagger-13-tag=%23group%2Bname%2Bsecond%2Bassoc&tagger-16-header=region&tagger-16-tag=%23region%2Bname&tagger-17-header=country&tagger-17-tag=%23country%2Bname&tagger-18-header=admin1&tagger-18-tag=%23adm1%2Bname&tagger-19-header=admin2&tagger-19-tag=%23adm2%2Bname&tagger-20-header=admin3&tagger-20-tag=%23adm3%2Bname&tagger-21-header=location&tagger-21-tag=%23loc%2Bname&tagger-22-header=latitude&tagger-22-tag=%23geo%2Blat&tagger-23-header=longitude&tagger-23-tag=%23geo%2Blon&tagger-25-header=source&tagger-25-tag=%23meta%2Bsource&tagger-27-header=notes&tagger-27-tag=%23description&tagger-28-header=fatalities&tagger-28-tag=%23affected%2Bkilled&tagger-30-header=iso3&tagger-30-tag=%23country%2Bcode&header-row=1'}
+                'format': 'csv', 'resource_type': 'file.upload', 'url_type': 'upload'}
 
     @pytest.fixture(scope='function')
     def configuration(self):
@@ -44,32 +44,33 @@ def json():
 
         class Download:
             @staticmethod
-            def get_tabular_rows(url, dict_rows, headers, format=None):
+            def get_tabular_rows(url, **kwargs):
                 if url == 'http://haha':
-                    return [{'Name': 'Cameroon', 'ACLED country-code': 'CMR', 'ISO Code': 120, 'Region-code': 'Middle Africa'}]
+                    return ['Name', 'ACLED country-code', 'ISO Code', 'Region-code'], \
+                           [{'Name': 'Cameroon', 'ACLED country-code': 'CMR', 'ISO Code': 120, 'Region-code': 'Middle Africa'}]
                 elif url == 'http://lala?iso=120':
-                    return [{'year': '1997'}, {'year': '2018'}]
+                    return ['year'], [{'year': '1997'}, {'year': '2018'}]
                 elif url == 'http://lala?iso=4':
-                    return list()
+                    return None, list()
 
         return Download()
 
     def test_get_countriesdata(self, downloader):
-        countriesdata = get_countriesdata('http://haha', downloader)
+        countriesdata = get_countries('http://haha', downloader)
         assert countriesdata == [TestAcled.countrydata]
 
     def test_generate_dataset_and_showcase(self, configuration, downloader):
-        hxlproxy_url = Configuration.read()['hxlproxy_url']
-        dataset, showcase = generate_dataset_and_showcase('http://lala?', hxlproxy_url, downloader, TestAcled.countrydata)
-        assert dataset == TestAcled.dataset
+        with temp_dir('ACLED') as folder:
+            dataset, showcase = generate_dataset_and_showcase('http://lala?', downloader, folder, TestAcled.countrydata)
+            assert dataset == TestAcled.dataset
 
-        resources = dataset.get_resources()
-        assert resources == [TestAcled.resource]
+            resources = dataset.get_resources()
+            assert resources == [TestAcled.resource]
 
-        assert showcase == {'name': 'acled-data-for-cameroon-showcase', 'notes': 'Conflict Data Dashboard for Cameroon',
-                            'url': 'https://www.acleddata.com/dashboard/#120',
-                            'tags': [{'name': 'hxl', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'violence and conflict', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'protests', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'security incidents', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}],
-                            'title': 'Dashboard for Cameroon', 'image_url': 'https://www.acleddata.com/wp-content/uploads/2018/01/dash.png'}
+            assert showcase == {'name': 'acled-data-for-cameroon-showcase', 'notes': 'Conflict Data Dashboard for Cameroon',
+                                'url': 'https://www.acleddata.com/dashboard/#120',
+                                'tags': [{'name': 'hxl', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'violence and conflict', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'protests', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}, {'name': 'security incidents', 'vocabulary_id': '4e61d464-4943-4e97-973a-84673c1aaa87'}],
+                                'title': 'Dashboard for Cameroon', 'image_url': 'https://www.acleddata.com/wp-content/uploads/2018/01/dash.png'}
 
-        dataset, showcase = generate_dataset_and_showcase('http://lala?', hxlproxy_url, downloader, {'m49': 4, 'iso3': 'AFG', 'countryname': 'Afghanistan'})
-        assert dataset is None
+            dataset, showcase = generate_dataset_and_showcase('http://lala?', downloader, folder, {'m49': 4, 'iso3': 'AFG', 'countryname': 'Afghanistan'})
+            assert dataset is None