Skip to content

Commit

Permalink
Use ISO codes for all states (covidatlas#803)
Browse files: browse the repository at this point in the history
* convert US states to iso2 form
* NYT fixes
* rename nyt-counties
* deleted USA CDC
* update covidtracking
* Brazil state ISO
* fix Korea unassigned
* fix unassigned in US/AR
* fix unassigned in US/FL
* normalization fixes
* lint fix
  • Loading branch information
hyperknot authored Apr 14, 2020
1 parent 2fd1c93 commit 2144dfe
Show file tree
Hide file tree
Showing 102 changed files with 200 additions and 229 deletions.
6 changes: 3 additions & 3 deletions docs/sources.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ Here's the scraper for Indiana that gets data from a CSV:
{
url: 'https://opendata.arcgis.com/datasets/d14de7e28b0448ab82eb36d6f25b1ea1_0.csv',
country: 'iso1:US',
state: 'IN',
state: 'iso2:US-IN',
scraper: async function() {
let data = await fetch.csv(this.url);
Expand Down Expand Up @@ -259,7 +259,7 @@ Here's the scraper for Oregon that pulls data from a HTML table:

```javascript
{
state: 'OR',
state: 'iso2:US-OR',
country: 'iso1:US',
url: 'https://www.oregon.gov/oha/PH/DISEASESCONDITIONS/DISEASESAZ/Pages/emerging-respiratory-infections.aspx',
scraper: async function() {
Expand Down Expand Up @@ -298,7 +298,7 @@ Scrapers need to be able to operate correctly on old data, so updates to scraper
```javascript
{
state: 'LA',
state: 'iso2:US-LA',
country: 'iso1:US',
aggregate: 'county',
_countyMap: { 'La Salle Parish': 'LaSalle Parish' },
Expand Down
48 changes: 26 additions & 22 deletions src/events/crawler/scrape-data/normalize-locations.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import path from 'path';
// eslint-disable-next-line import/no-extraneous-dependencies
import fipsCodes from 'country-levels/fips.json';
// eslint-disable-next-line import/no-extraneous-dependencies
import iso2Codes from 'country-levels/iso2.json';
import { isId } from '../../../shared/lib/geography/country-levels.js';
import path from 'path';
import * as countryLevels from '../../../shared/lib/geography/country-levels.js';
import { isId } from '../../../shared/lib/geography/country-levels.js';
import * as geography from '../../../shared/lib/geography/index.js';

import log from '../../../shared/lib/log.js';
Expand All @@ -13,7 +11,7 @@ const UNASSIGNED = '(unassigned)';

function findCountryLevelID(location) {
for (const [, properties] of Object.entries(fipsCodes)) {
if (properties.state_code_postal === location.state && properties.name === location.county) {
if (`iso2:${properties.state_code_iso}` === location.state && properties.name === location.county) {
return properties.countrylevel_id;
}
}
Expand All @@ -24,29 +22,36 @@ const normalizeLocations = args => {
log('⏳ Normalizing locations...');

const { locations } = args;
const filteredLocations = [];

// Normalize data
for (const location of locations) {
// make sure location.country is always in country-level id form
// make sure location.country and state is always in country-level id form
if (!isId(location.country)) {
log.error(` ❌ location.country not in country-level id: ${location.country}, ${location._path}`);
log.error(` ❌ location.country is not a country-level id: ${location.country}, ${location._path}`);
continue;
}

if (location.state && !isId(location.state)) {
log.error(` ❌ location.state is not a country-level id: ${location.state}, ${location._path}`);
continue;
}

if (!countryLevels.getIdFromLocation(location)) {
if (location.country === 'iso1:US') {
// Normalize states
location.state = geography.toUSStateAbbreviation(location.state);
if (location.county === UNASSIGNED) {
continue;
}

if (location.county && location.county !== UNASSIGNED) {
if (location.county) {
// Find county FIPS ID
if (Array.isArray(location.county)) {
const aggregatedCounty = [];
let fipsFound = true;
for (const subCounty of location.county) {
const subLocation = {
county: subCounty,
state: location.state,
country: location.country
state: location.state
};
const countryLevelId = findCountryLevelID(subLocation);
if (countryLevelId) {
Expand All @@ -62,6 +67,8 @@ const normalizeLocations = args => {
}
if (fipsFound) {
location.county = aggregatedCounty.join('+');
} else {
continue;
}
} else {
let fipsFound = false;
Expand All @@ -72,18 +79,13 @@ const normalizeLocations = args => {
}
if (!fipsFound) {
log.error(' ❌ Failed to find FIPS code for %s, %s', location.county, location.state);
continue;
}
}
}

// Find state ID
if (location.state) {
if (iso2Codes[`US-${location.state}`]) {
location.state = iso2Codes[`US-${location.state}`].countrylevel_id;
} else {
log.error(' ❌ Failed to find FIPS code for state %s', location.state);
}
}
} else {
log.error(` ❌ location.county is not a country-level id: ${location.state}, ${location._path}`);
continue;
}
}

Expand All @@ -95,9 +97,11 @@ const normalizeLocations = args => {
if (!location.type && path.extname(location.url).substr(1)) {
location.type = path.extname(location.url).substr(1);
}

filteredLocations.push(location);
}

return { ...args, locations };
return { ...args, locations: filteredLocations };
};

export default normalizeLocations;
54 changes: 27 additions & 27 deletions src/shared/scrapers/BR/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,33 +29,33 @@ const scraper = {
}
],
_ufs: {
Acre: ['AC', 881935, [-9.0238, -70.812]],
Alagoas: ['AL', 3337357, [-9.5713, -36.782]],
Amapá: ['AP', 845731, [0.902, -52.003]],
Amazonas: ['AM', 4144597, [-3.4168, -65.8561]],
Bahia: ['BA', 14873064, [-12.5797, -41.7007]],
Ceará: ['CE', 9132078, [-5.4984, -39.3206]],
'Distrito Federal': ['DF', 3015268, [-15.7998, -47.8645]],
'Espírito Santo': ['ES', 4018650, [-19.1834, -40.3089]],
Goiás: ['GO', 7018354, [-15.827, -49.8362]],
Maranhão: ['MA', 7075181, [-4.9609, -45.2744]],
'Mato Grosso': ['MT', 3484466, [-12.6819, -56.9211]],
'Mato Grosso do Sul': ['MS', 2778986, [-20.7722, -54.7852]],
'Minas Gerais': ['MG', 21168791, [-18.5122, -44.555]],
Paraná: ['PR', 11433957, [-25.2521, -52.0215]],
Paraíba: ['PB', 4018127, [-7.24, -36.782]],
Pará: ['PA', 11433957, [-1.9981, -54.9306]],
Pernambuco: ['PE', 9557071, [-8.8137, -36.9541]],
Piauí: ['PI', 3273227, [-7.7183, -42.7289]],
'Rio Grande do Norte': ['RN', 3506853, [-5.4026, -36.9541]],
'Rio Grande do Sul': ['RS', 11377239, [-30.0346, -51.2177]],
'Rio de Janeiro': ['RJ', 17264943, [-22.9099, -43.2095]],
Rondônia: ['RO', 1777225, [-11.5057, -63.5806]],
Roraima: ['RR', 60576, [2.7376, -62.0751]],
'Santa Catarina': ['SC', 7164788, [-27.2423, -50.2189]],
Sergipe: ['SE', 2298696, [-10.5741, -37.3857]],
'São Paulo': ['SP', 45919049, [-23.5505, -46.6333]],
Tocantins: ['TO', 1572866, [-10.1753, -48.2982]]
Acre: ['iso2:BR-AC', 881935, [-9.0238, -70.812]],
Alagoas: ['iso2:BR-AL', 3337357, [-9.5713, -36.782]],
Amapá: ['iso2:BR-AP', 845731, [0.902, -52.003]],
Amazonas: ['iso2:BR-AM', 4144597, [-3.4168, -65.8561]],
Bahia: ['iso2:BR-BA', 14873064, [-12.5797, -41.7007]],
Ceará: ['iso2:BR-CE', 9132078, [-5.4984, -39.3206]],
'Distrito Federal': ['iso2:BR-DF', 3015268, [-15.7998, -47.8645]],
'Espírito Santo': ['iso2:BR-ES', 4018650, [-19.1834, -40.3089]],
Goiás: ['iso2:BR-GO', 7018354, [-15.827, -49.8362]],
Maranhão: ['iso2:BR-MA', 7075181, [-4.9609, -45.2744]],
'Mato Grosso': ['iso2:BR-MT', 3484466, [-12.6819, -56.9211]],
'Mato Grosso do Sul': ['iso2:BR-MS', 2778986, [-20.7722, -54.7852]],
'Minas Gerais': ['iso2:BR-MG', 21168791, [-18.5122, -44.555]],
Paraná: ['iso2:BR-PR', 11433957, [-25.2521, -52.0215]],
Paraíba: ['iso2:BR-PB', 4018127, [-7.24, -36.782]],
Pará: ['iso2:BR-PA', 11433957, [-1.9981, -54.9306]],
Pernambuco: ['iso2:BR-PE', 9557071, [-8.8137, -36.9541]],
Piauí: ['iso2:BR-PI', 3273227, [-7.7183, -42.7289]],
'Rio Grande do Norte': ['iso2:BR-RN', 3506853, [-5.4026, -36.9541]],
'Rio Grande do Sul': ['iso2:BR-RS', 11377239, [-30.0346, -51.2177]],
'Rio de Janeiro': ['iso2:BR-RJ', 17264943, [-22.9099, -43.2095]],
Rondônia: ['iso2:BR-RO', 1777225, [-11.5057, -63.5806]],
Roraima: ['iso2:BR-RR', 60576, [2.7376, -62.0751]],
'Santa Catarina': ['iso2:BR-SC', 7164788, [-27.2423, -50.2189]],
Sergipe: ['iso2:BR-SE', 2298696, [-10.5741, -37.3857]],
'São Paulo': ['iso2:BR-SP', 45919049, [-23.5505, -46.6333]],
Tocantins: ['iso2:BR-TO', 1572866, [-10.1753, -48.2982]]
},
async scraper() {
const response = [];
Expand Down
4 changes: 3 additions & 1 deletion src/shared/scrapers/KR/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ const scraper = {
const $ = await fetch.page(this.url);
const $table = $('table.num');

const states = [];
let states = [];
const $headings = $table.find('thead tr:last-child th');
const headingOffset = $table.find('thead tr:first-child th[rowspan="2"]').length - 1; // -1 for the th in each province row.

Expand Down Expand Up @@ -100,6 +100,8 @@ const scraper = {

const summedData = transform.sumData(states);
states.push(summedData);

states = states.filter(s => s.state !== UNASSIGNED);
assert(summedData.cases > 0, 'Cases is not reasonable');

return states;
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/AK/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import * as parse from '../../../lib/parse.js';
import * as transform from '../../../lib/transform.js';

const scraper = {
state: 'AK',
state: 'iso2:US-AK',
country: 'iso1:US',
sources: [
{
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/AL/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import * as geography from '../../../lib/geography/index.js';
// const UNASSIGNED = '(unassigned)';

const scraper = {
state: 'AL',
state: 'iso2:US-AL',
country: 'iso1:US',
url: 'http://www.alabamapublichealth.gov/infectiousdiseases/2019-coronavirus.html',
type: 'table',
Expand Down
3 changes: 2 additions & 1 deletion src/shared/scrapers/US/AR/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';
const UNASSIGNED = '(unassigned)';

const scraper = {
state: 'AR',
state: 'iso2:US-AR',
country: 'iso1:US',
url:
'https://services.arcgis.com/PwY9ZuZRDiI5nXUB/ArcGIS/rest/services/ADH_COVID19_Positive_Test_Results/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*',
Expand Down Expand Up @@ -120,6 +120,7 @@ const scraper = {

counties.push(transform.sumData(counties));
counties = geography.addEmptyRegions(counties, this._counties, 'county');
counties = counties.filter(c => c.county !== UNASSIGNED);
return counties;
}
};
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/AZ/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import * as geography from '../../../lib/geography/index.js';
// const UNASSIGNED = '(unassigned)';

const scraper = {
state: 'AZ',
state: 'iso2:US-AZ',
country: 'iso1:US',
sources: [
{
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/alameda-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Alameda County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
sources: [
{
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/butte-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Butte County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
sources: [
{
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/calaveras-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Calaveras County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
url: 'https://covid19.calaverasgov.us/',
maintainers: [maintainers.jbencina],
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/colusa-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Colusa County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
url: 'http://www.countyofcolusa.org/99/Public-Health',
maintainers: [maintainers.jbencina],
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/contra-costa-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Contra Costa County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
url: 'https://www.coronavirus.cchealth.org/',
maintainers: [maintainers.jbencina],
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/del-norte-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Del Norte County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
url: 'http://www.co.del-norte.ca.us/departments/health-human-services/public-health',
maintainers: [maintainers.jbencina],
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/fresno-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Fresno County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
url: 'https://www.co.fresno.ca.us/departments/public-health/covid-19',
maintainers: [maintainers.jbencina],
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/glenn-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Glenn County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
maintainers: [maintainers.jbencina],
async scraper() {
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import * as geography from '../../../lib/geography/index.js';
// const UNASSIGNED = '(unassigned)';

const scraper = {
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
priority: 1,
type: 'csv',
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/kern-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import * as parse from '../../../lib/parse.js';

const scraper = {
county: 'Kern County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
url: 'https://kernpublichealth.com/2019-novel-coronavirus/',
type: 'table',
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/kings-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Kings County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
maintainers: [maintainers.jbencina],
url:
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/los-angeles-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ const scraper = {
url: 'http://www.publichealth.lacounty.gov'
}
],
state: 'CA',
state: 'iso2:US-CA',
type: 'table',
url: 'http://www.publichealth.lacounty.gov/media/Coronavirus/js/casecounter.js',
scraper: {
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/madera-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Madera County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
maintainers: [maintainers.jbencina],
url: 'https://www.maderacounty.com/government/public-health/health-updates/corona-virus',
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/marin-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Marin County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
maintainers: [maintainers.jbencina],
url: 'https://coronavirus.marinhhs.org/surveillance',
Expand Down
2 changes: 1 addition & 1 deletion src/shared/scrapers/US/CA/mendocino-county.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import maintainers from '../../../lib/maintainers.js';

const scraper = {
county: 'Mendocino County',
state: 'CA',
state: 'iso2:US-CA',
country: 'iso1:US',
maintainers: [maintainers.jbencina],
url: 'https://www.mendocinocounty.org/community/novel-coronavirus',
Expand Down
Loading

0 comments on commit 2144dfe

Please sign in to comment.