Skip to content

Commit

Permalink
Merge pull request #45 from arfc/coordinate_merge
Browse files Browse the repository at this point in the history
Coordinate merge(pris, webscrape)
  • Loading branch information
gyutaepark authored Apr 22, 2018
2 parents 26ea7cc + d7b0524 commit e0249a2
Show file tree
Hide file tree
Showing 4 changed files with 1,763 additions and 780 deletions.
4 changes: 0 additions & 4 deletions analysis/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@
from pyne import nucname


if len(sys.argv) < 2:
print('Usage: python analysis.py [cylus_output_file]')


def get_cursor(file_name):
""" Connects and returns a cursor to an sqlite output file
Expand Down
211 changes: 211 additions & 0 deletions import_data/merge_coordinates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
from fuzzywuzzy import fuzz
import numpy as np
import pandas as pd
import sqlite3 as sql
import sys

if len(sys.argv) < 3:
print('Usage: python merge_coordinates.py [pris_link] [webscrape_link]')


def get_cursor(file_name):
""" Connects and returns a cursor to an sqlite output file
Parameters
----------
file_name: str
name of the sqlite file
Returns
-------
sqlite cursor
"""
con = sql.connect(file_name)
con.row_factory = sql.Row
return con.cursor()


def import_pris(pris_link):
""" Opens pris_csv using Pandas. Adds Latitude and Longitude
columns
Parameters
----------
pris_link: str
path to reactors_pris_2016.original.csv file
Returns
-------
pris: pd.Dataframe
pris database
"""
pris = pd.read_csv(pris_link,
delimiter=',',
encoding='iso-8859-1'
)
pris.insert(13, 'Latitude', np.nan)
pris.insert(14, 'Longitude', np.nan)
pris = pris.replace(np.nan, '')
return pris


def import_webscrape_data(scrape_link):
""" Returns sqlite content of webscrape by performing an
sqlite query
Parameters
----------
scrape_link: str
path to webscrape.sqlite file
Returns
-------
coords: sqlite cursor
sqlite cursor containing webscrape data
"""
cur = get_cursor(scrape_link)
coords = cur.execute("SELECT name, long, lat FROM reactors_coordinates")
return coords


def get_edge_cases():
""" Returns a dictionary of edge cases that fuzzywuzzy is
unable to catch. This could be because PRIS database stores
reactor names and Webscrape database fetches power plant names,
or because PRIS reactor names are abbreviated.
Parameters
----------
Returns
-------
others: dict
dictionary of edge cases with "key=pris_reactor_name, and
value=webscrape_plant_name"
"""
others = {'OHI-': 'Ōi',
'ASCO-': 'Ascó',
'ROVNO-': 'Rivne',
'SHIN-KORI-': 'Kori',
'ANO-': 'Arkansas One',
'HANBIT-': 'Yeonggwang',
'FERMI-': 'Enrico Fermi',
'BALTIC-': 'Kaliningrad',
'COOK-': 'Donald C. Cook',
'HATCH-': 'Edwin I. Hatch',
'HARRIS-': 'Shearon Harris',
'SHIN-WOLSONG-': 'Wolseong',
'ST. ALBAN-': 'Saint-Alban',
'LASALLE-': 'LaSalle County',
'SUMMER-': 'Virgil C. Summer',
'FARLEY-': 'Joseph M. Farley',
'ST. LAURENT ': 'Saint-Laurent',
'HADDAM NECK': 'Connecticut Yankee',
'HIGASHI DORI-1 (TOHOKU)': 'Higashidōri',
}
return others


def sanitize_webscrape_name(name):
""" Sanitizes webscrape powerplant names by removing unwanted
strings (listed in blacklist), applying lower case, and deleting
trailing whitespace.
Parameters
----------
name: str
webscrape plant name
Returns
-------
name: str
sanitized name for use with fuzzywuzzy
"""
blacklist = ['nuclear', 'power',
'plant', 'generating',
'station', 'reactor', 'atomic',
'energy', 'center', 'electric']
name = name.lower()
for blacklisted in blacklist:
name = name.replace(blacklisted, '')
name = name.strip()
name = ' '.join(name.split())
return name


def merge_coordinates(pris_link, scrape_link):
""" Merges webscrape data with pris data performed by string
comparison of reactor names from pris and webscrape. Returns
updated pris database with coordinates.
Parameters
----------
pris_link: str
path to reactors_pris_2016.original.csv file
scrape_link: str
path to webscrape.sqlite file
Returns
-------
pris: pd.DataFrame
updated PRIS database with latitude and longitude info
"""
others = get_edge_cases()
pris = import_pris(pris_link)
coords = import_webscrape_data(scrape_link)
for web in coords:
for idx, prs in pris.iterrows():
webscrape_name = sanitize_webscrape_name(web['name'])
pris_name = prs[1].lower()
if fuzz.ratio(webscrape_name, pris_name) > 64:
prs[13] = web['lat']
prs[14] = web['long']
else:
for other in others.keys():
edge_case_key = other.lower()
edge_case_value = others[other].lower()
if (fuzz.ratio(pris_name, edge_case_key) > 80 and
fuzz.ratio(webscrape_name, edge_case_value) > 75):
prs[13] = web['lat']
prs[14] = web['long']
return pris


def save_output(pris):
""" Saves updated PRIS database as 'reactors_pris_2016.csv'
Parameters
----------
pris: pd.DataFrame
updated PRIS database with latitude and longitude info
Returns
-------
"""
pris.to_csv('reactors_pris_2016.csv',
index=False,
sep=',',
)


def main(pris_link, scrape_link):
""" Calls all required functions to merge PRIS and webscrape
Parameters
----------
pris_link: str
path to reactors_pris_2016.original.csv file
scrape_link: str
path to webscrape.sqlite file
Returns
-------
"""
pris = merge_coordinates(pris_link, scrape_link)
save_output(pris)


if __name__ == "__main__":
main(sys.argv[1], sys.argv[2])
Loading

0 comments on commit e0249a2

Please sign in to comment.