Skip to content

Commit

Permalink
aplicando dumpOsm pretty-osm, ver issue #9
Browse files Browse the repository at this point in the history
  • Loading branch information
ppKrauss committed Jun 28, 2018
1 parent b126a0a commit 7e43829
Show file tree
Hide file tree
Showing 2 changed files with 14,292 additions and 88 deletions.
14,179 changes: 14,091 additions & 88 deletions data/dump_osm/SP.geojson

Large diffs are not rendered by default.

201 changes: 201 additions & 0 deletions src/dumpOsm-lix.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
-- Generating DUMPS --

grep P402 data/wikidata/SP/* | wc -l

<?php
// usage: php dumpWikidata.php [geo][err]

// CONFIGS
$urlWd_tpl = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&ids=';
$urlOsm_tpl = 'http://polygons.openstreetmap.fr/get_geojson.py?id=';
$UF=''; $localCsv = false; $stopAt=0;


$saveFolder = realpath( dirname(__FILE__)."/../data" );
$url = $localCsv
? "$saveFolder/br-state-codes.csv"
: 'https://github.com/datasets-br/state-codes/raw/master/data/br-state-codes.csv'
;
// cols 0=subdivision, 1=name_prefix, 2=name, 3=id, 4=idIBGE, 5=wdId, 6=lexLabel
$uf_idx=0; $wdId_idx = 5; $lexLabel_idx = 6;


$modos = ['geo'=>'GEO', 'err'=>'FIX-ERR', 'pretty-wd'=>'PRETTY-WIKIDATA', 'pretty-osm'=>'PRETTY-OSM'];
$modo = ($argc>=2)? $argv[1]: '';
if (isset($modos[$modo])) $modo=$modos[$modo];
else die("\nERRO modo $modo desconhecido, use: ". join(', ',array_keys($modos)). "\n");

$ext = ($modo=='GEO')? 'geojson': 'json';
print "\n USANDO $modo $url ... \n";


// LOAD DATA:

$R = []; // [fname]= wdId
if (($handle = fopen($url, "r")) !== FALSE) {
for($i=0; ($row=fgetcsv($handle)) && (!$stopAt || $i<$stopAt); $i++)
if ( $i && isset($row[1]) ) //state
$R[ $row[$uf_idx] ] = $row[$wdId_idx];
} else
exit("\nERRO ao abrir planilha das cidades em \n\t$url\n");


if ($modo=='FIX-ERR') foreach($R as $fname=>$wdId) {
$fs = splitFilename($fname,true); // [$fp,$uf,$fname2,$saveFolder2,$size];
if ($fs[4]>50) unset($R[$fname]);
//print "\n-- debug $saveFolder/dump_wikidata/$fs[1]/$fs[2].json";
}

// WGET AND SAVE JSON:
$i=1;
$n=count($R);
$ERR=[];

switch($modo) {

case 'PRETTY-WIKIDATA':
foreach($R as $fname=>$wdId) {
$fs = splitFilename($fname,true); // [$fp,$uf,$fname2,$saveFolder2,$size];
$f = "$saveFolder/dump_wikidata/$fs[1]/$fs[2].json";
if ( file_exists($f) ) {
$jold = file_get_contents($f);
$j = json_decode( $jold, JSON_BIGINT_AS_STRING|JSON_OBJECT_AS_ARRAY);
$jnew = json_encode($j,JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES);
if ($jold!==$jnew) {
print "\n-- changing $fname";
file_put_contents($f,$jnew);
} else print " OK ";
} else print " EMP ";
}
break;


case 'PRETTY-OSM':
foreach($R as $fname=>$wdId) {
//$fs = splitFilename($fname,true); // [$fp,$uf,$fname2,$saveFolder2,$size];
$f = "$saveFolder/dump_osm/$fname.geojson";
if ( file_exists($f) ) {
$jold = file_get_contents($f);
$j = json_decode( $jold, JSON_BIGINT_AS_STRING|JSON_OBJECT_AS_ARRAY);
$jnew = json_encode($j,JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES);
if ($jold!==$jnew) {
print "\n-- changing $fname";
file_put_contents($f,$jnew);
} else print " OK ";
} else print " EMP ";
}
break;

case '':
case 'FIX-ERR':
foreach($R as $fname=>$wdId) {
print "\n\t($i of $n) $fname: $wdId ";
$json = file_get_contents("$urlWd_tpl$wdId");
if ($json) {
$out = json_stdWikidata($json);
if ($out) {
$savedBytes = file_put_contents( "$saveFolder/dump_wikidata/$fname.$ext", $out );
print "saved ($savedBytes bytes) with fresh $wdId";
} else
ERRset($fname,"invalid Wikidata structure");
} else
ERRset($fname,"empty json");
$i++;
}
break;

case 'GEO':
foreach($R as $fname=>$wdId) {
print "\n\t($i of $n) $fname: $wdId ";
$osmId= getOsmId($fname,$wdId); // usa wdId?
$json='';
if ($osmId) $json = file_get_contents("$urlOsm_tpl$osmId");
else ERRset($fname,"no osmId or P402");
if ($json) {
$out = json_stdOsm($json);
if ($out) {
$savedBytes = file_put_contents( "$saveFolder/dump_osm/$fname.$ext", $out );
print "saved ($savedBytes bytes) with fresh OSM/$osmId";
} else
ERRset($fname,"invalid OSM structure");
} else
ERRset($fname,"empty json");
$i++;
}
break;

default:
die("\n Modo $modo DESCONHECIDO.\n");

} // end switch


if (count($ERR)) { print "\n ----------- ERRORS ---------\n"; foreach($ERR as $msg) print "\n * $msg"; }


///// LIB

function ERRset($fname,$msg) {
global $ERR;
$msg = "ERROR, $msg for $fname.";
print $msg;
$ERR[] = $msg;
}

function json_stdOsm($jstr) {
if (!trim($jstr)) return '';
$j = json_decode($jstr,JSON_BIGINT_AS_STRING|JSON_OBJECT_AS_ARRAY);
if ( !isset($j['type']) ) return '';
return json_encode($j,JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES);
}

function json_stdWikidata($jstr) {
if (!trim($jstr)) return '';
$j = json_decode($jstr,JSON_BIGINT_AS_STRING|JSON_OBJECT_AS_ARRAY);
if ( !isset($j['entities']) ) return '';
$ks=array_keys($j['entities']);
$j = $j['entities'][$ks[0]];
if ( !isset($j['claims']) ) return '';
foreach(['lastrevid','modified','labels','descriptions','title','aliases','sitelinks'] as $r) unset($j[$r]);
$a = [];
foreach($j['claims'] as $k=>$r) {
$a[$k] = [];
foreach($j['claims'][$k] as $r2)
$a[$k][] = $r2['mainsnak']['datavalue'];
}
$j['claims'] = $a;
return json_encode($j,JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES);
}

function getOsmId($fname) {
global $saveFolder;
$f = "$saveFolder/dump_wikidata/$fname.json";
$j = json_decode( file_get_contents($f), JSON_BIGINT_AS_STRING|JSON_OBJECT_AS_ARRAY);
if (isset($j['claims']['P402'][0]['value']) )
return $j['claims']['P402'][0]['value'];
else
return 0;
}


function lex2filename($s) {
$s=ucwords( str_replace('.',' ',$s) );
return preg_replace('/ D | /','',$s); // elimina preposicao contraida (bug norma lexml)
}


function splitFilename($f,$checkSize=false) {
global $saveFolder;
$uf = substr($f,0,2);
$fname2 = substr($f,3);
$saveFolder2 = "$saveFolder/$uf";
$fp = "$saveFolder2/$fname2.json";
$size = $checkSize? (file_exists($fp)? filesize($fp): 0): null;
return [$fp,$uf,$fname2,$saveFolder2,$size];
}

?>

... Check git status and do git add.


0 comments on commit 7e43829

Please sign in to comment.