Skip to content

Commit

Permalink
Convert PA to a timeseries (covidatlas#837)
Browse files Browse the repository at this point in the history
* Turn PA scraper into a timeseries

* Hacked version of ArcGIS JSON paginated fetch

* Change type to JSON; stop fetching CSVs now that we can build a timeseries

* Add asserts to test my aggregator

* Modified for new fetch calls

* Pass CI test
  • Loading branch information
shaperilio authored Apr 16, 2020
1 parent 27c32c0 commit b7b0159
Showing 1 changed file with 102 additions and 49 deletions.
151 changes: 102 additions & 49 deletions src/shared/scrapers/PA/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import maintainers from '../../lib/maintainers.js';
import log from '../../lib/log.js';
import provinceToIso2 from './province-to-iso2.json';
import * as transform from '../../lib/transform.js';
import datetime from '../../lib/datetime/index.js';

// const patientStatus = {
// FALLECIDO: 'deaths',
Expand Down Expand Up @@ -151,6 +152,27 @@ function sum(dataArray, key) {
return result;
}

async function TEMPfetchArcGISJSON(obj, featureURL, date) {
// temporary handling of pagination here until Quentin's pull request is brought in
let offset = 0;
const recordCount = 50000;
const result = [];
// eslint-disable-next-line no-constant-condition
while (true) {
const query = `where=0%3D0&outFields=*&resultOffset=${offset}&resultRecordCount=${recordCount}&f=json`;
const theURL = `${featureURL}query?${query}`;
const cacheKey = `arcGISJSON_cases_${offset}`;
const response = await fetch.json(obj, theURL, cacheKey, date);
if (!response) throw new Error(`Response was null for "${theURL}`);
if (response.features && response.features.length === 0) break;
const n = response.features.length;
log(`${n} records from "${theURL}`);
offset += n;
result.push(...response.features.map(({ attributes }) => attributes));
}
return result;
}

const scraper = {
priority: 1,
country: 'iso1:PA',
Expand All @@ -164,22 +186,23 @@ const scraper = {
// URLs were obtained by snooping. See
// https://docs.google.com/document/d/1__rE8LbiB0pK4qqG3vIbjbgnT8SrTx86A_KI7Xakgjk/edit#

// List of "corregimientos", which we will consider counties.
// List of "corregimientos" (smaller than a county)
_corregimientosListUrl: 'https://opendata.arcgis.com/datasets/61980f00977b4dcdad46eda02268ab48_0.csv',

// List of tests per day
_testsUrl: 'https://opendata.arcgis.com/datasets/f966620f339241e9833f111969da8e83_0.csv',

// List of cases, this has most of the data that we want.
_caseListUrl: 'https://opendata.arcgis.com/datasets/898c63fc068745d98fea01b9bf4f05ea_0.csv',
_caseListFeatureURL:
'https://services5.arcgis.com/aqOddbAz6HewRw8I/ArcGIS/rest/services/Casos_Covid19_PA/FeatureServer/0/',

// Time series at national level.
_timeSeriesUrl: 'https://opendata.arcgis.com/datasets/6b7f17658fd845058f7516d6fc591530_0.csv',

// Road blocks (disease related?)
// https://opendata.arcgis.com/datasets/91325f0051a84c72a704725a962a8bc7_0.csv

type: 'csv',
type: 'json',
aggregate: 'county',
maintainers: [maintainers.shaperilio],

Expand Down Expand Up @@ -237,8 +260,9 @@ const scraper = {
// }
// ]

// Cache this.
// TODO: How do we deal with multiple source for the same country?
// Just create a second .js with a lower priority, but we will get a timeseries from the patient list.
// Here we could double-check with this data, but I'm not going to do that now.
// i.e If I wanted to make a Panama nation-level timeseries scraper, how would I do it
// and still keep this one which has greater granularity?
/* const timeseries = */ await fetch.csv(this, this._timeSeriesUrl, 'timeseries');
Expand All @@ -256,8 +280,18 @@ const scraper = {
// FID: '1'
// }

// This is the one we actually get the data from.
const caseList = await fetch.csv(this, this._caseListUrl, 'cases');
// This is the source we actually get the data from.
this.url = this._caseListFeatureURL;
const scrapeDate = datetime.getYYYYMMDD(datetime.scrapeDate());
let caseList;
// use datetime.old here, just like the caching system does.
if (datetime.dateIsBefore(scrapeDate, datetime.old.getDate())) {
// treat the data as a timeseries, so don't cache it.
caseList = await TEMPfetchArcGISJSON(this, this._caseListFeatureURL, false);
} else {
// fetch it the normal way so it gets cached.
caseList = await TEMPfetchArcGISJSON(this, this._caseListFeatureURL);
}
// Array of:
// {
// 'numero_CASO': '201', - case number
Expand All @@ -283,8 +317,14 @@ const scraper = {

const data = [];
// log(`Panama has ${caseList.length} cumulative cases.`);
this.url = this._caseListUrl; // required for source rating.
let rejectedByDate = 0;
for (const patient of caseList) {
const confirmedDate = datetime.getYYYYMMDD(patient.FECHA_de_CONFIRMACION);
if (!datetime.dateIsBeforeOrEqualTo(confirmedDate, scrapeDate)) {
rejectedByDate++;
continue;
} // this turns this scraper into a timeseries.

const location = patientLocation[patient.UBICACIÓN_DEL_PACIENTE];
if (location === patientLocation.UNKNOWN) {
log.warn(`Patient location "${location}" is unparsable; this patient will not be counted.`);
Expand Down Expand Up @@ -329,63 +369,76 @@ const scraper = {
}
}

// tests, can be moved, changed to asserts, etc.
log(`Counting up to ${scrapeDate}: ${rejectedByDate} out of ${caseList.length} rejected by date.`);

if (data.length === 0) return data;

// TODO: move these to a test.
let data2 = [...data];
log(
`With corregimientos: ${sum(data2, 'cases')} cases, ${sum(data2, 'deaths')} deaths, ${sum(
data2,
'recovered'
)} recovered, with ${data.length} items.`
);
const corrC = sum(data2, 'cases');
const corrD = sum(data2, 'deaths');
const corrR = sum(data2, 'recovered');
log(`With corregimientos: ${corrC} cases, ${corrD} deaths, ${corrR} recovered, with ${data2.length} items.`);

data2 = aggregateItems(data, ['corregimiento']); // our system won't accept this.
log(
`With counties : ${sum(data2, 'cases')} cases, ${sum(data2, 'deaths')} deaths, ${sum(
data2,
'recovered'
)} recovered, with ${data2.length} items.`
);
const distC = sum(data2, 'cases');
const distD = sum(data2, 'deaths');
const distR = sum(data2, 'recovered');
log(`With counties : ${distC} cases, ${distD} deaths, ${distR} recovered, with ${data2.length} items.`);
// See https://es.wikipedia.org/wiki/Organizaci%C3%B3n_territorial_de_Panam%C3%A1
assert(data2.length <= 81); // Panama has 81 districts (counties)
assert(corrC === distC);
assert(corrD === distD);
assert(corrR === distR);

data2 = aggregateItems(data2, ['county']); // our system won't accept this.
log(
`With states : ${sum(data2, 'cases')} cases, ${sum(data2, 'deaths')} deaths, ${sum(
data2,
'recovered'
)} recovered, with ${data2.length} items.`
);
data2 = aggregateItems(data, ['county', 'corregimiento']); // our system won't accept this.
log(
`With states : ${sum(data2, 'cases')} cases, ${sum(data2, 'deaths')} deaths, ${sum(
data2,
'recovered'
)} recovered, with ${data2.length} items.`
);
data2 = aggregateItems(data2, ['county']);
const provC = sum(data2, 'cases');
const provD = sum(data2, 'deaths');
const provR = sum(data2, 'recovered');
log(`With states : ${provC} cases, ${provD} deaths, ${provR} recovered, with ${data2.length} items.`);
assert(provC === distC);
assert(provD === distD);
assert(provR === distR);

data2 = aggregateItems(data, ['county', 'corregimiento']);
const provC2 = sum(data2, 'cases');
const provD2 = sum(data2, 'deaths');
const provR2 = sum(data2, 'recovered');
log(`With states : ${provC2} cases, ${provD2} deaths, ${provR2} recovered, with ${data2.length} items.`);
assert(data2.length <= 10 + 3); // Panama has 10 provinces and 3 provincial comarcas
data2 = aggregateItems(data2, ['state']); // our system won't accept this.
log(
`With nothing : ${sum(data2, 'cases')} cases, ${sum(data2, 'deaths')} deaths, ${sum(
data2,
'recovered'
)} recovered, with ${data2.length} items.`
);
assert(provC === provC2);
assert(provD === provD2);
assert(provR === provR2);

data2 = aggregateItems(data2, ['state']);
const natC = sum(data2, 'cases');
const natD = sum(data2, 'deaths');
const natR = sum(data2, 'recovered');
log(`With nothing : ${natC} cases, ${natD} deaths, ${natR} recovered, with ${data2.length} items.`);
assert(data2.length === 1);
assert(natC === provC2);
assert(natD === provD2);
assert(natR === provR2);

data2 = aggregateItems(data2, ['state', 'county', 'corregimiento']); // our system won't accept this.
log(
`With nothing : ${sum(data2, 'cases')} cases, ${sum(data2, 'deaths')} deaths, ${sum(
data2,
'recovered'
)} recovered, with ${data2.length} items.`
);
const natC2 = sum(data2, 'cases');
const natD2 = sum(data2, 'deaths');
const natR2 = sum(data2, 'recovered');
log(`With nothing : ${natC2} cases, ${natD2} deaths, ${natR2} recovered, with ${data2.length} items.`);
assert(data2.length === 1);
assert(natC === natC2);
assert(natD === natD2);
assert(natR === natR2);
// end tests.

const countyLevel = aggregateItems(data, ['corregimiento']);
const provinceLevel = aggregateItems(countyLevel, ['county']);
const nationLevel = aggregateItems(provinceLevel, ['state']);
assert(nationLevel.length === 1);
addEmptyStates(provinceLevel);
assert(provinceLevel.length === 13); // Panama has 10 provinces and 3 provincial comarcas
// return [...provinceLevel, ...countyLevel]; // Uncomment this to reproduce #798
// As of 2020-04-16, counties are just rejected by the system:
// ❌ location.county is not a country-level id: iso2:PA-8, /Users/barf/coronadatascraper/src/shared/scrapers/PA/index.js
// return [...nationLevel, ...provinceLevel, ...countyLevel]; // Uncomment this to reproduce #798
return [...nationLevel, ...provinceLevel];
}
};
Expand Down

0 comments on commit b7b0159

Please sign in to comment.