Initial implementation

symerio · Jan 13, 2018 · ce6edce · ce6edce
1 parent 36cb30b
commit ce6edce
Show file tree

Hide file tree

Showing 6 changed files with 572 additions and 3 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2018, Roman Yurchak
+Copyright (c) 2018, Symerio
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/README.md b/README.md
diff --git a/README.rst b/README.rst
@@ -0,0 +1,95 @@
+pgeocode
+========
+
+Postal code geocoding and distance calculations
+
+pgeocode is a Python library for high performance off-line querying of GPS coordinates, region name and municipality name
+from postal codes. Distances between postal codes as well as general
+distance queries are also supported.
+The underlying `GeoNames <http://download.geonames.org/export/zip/>`_ database includes postal codes for 83 countries.
+
+Currently, only queries within the same country are supported.
+
+
+Installation
+------------
+
+pgeocode requires Python 2.7 or 3.5+ as well as ``numpy`` and ``pandas`` packages. It can be installed with,
+
+.. code::
+
+    pip install pgeocode
+
+Quickstart
+----------
+
+**Postal code queries**
+
+.. code:: python
+
+    >>> import pgeocode
+
+    >>> nomi = pgeocode.Nominatim('fr')
+    >>> nomi.query_postal_code("75013")
+    postal_code               75013
+    country code                 FR
+    place_name             Paris 13
+    state_name        Île-de-France
+    state_code                   11
+    county_name               Paris
+    county_code                  75
+    community_name            Paris
+    community_code              751
+    latitude                48.8322
+    longitude                2.3561
+    accuracy                      5
+
+    >>> nomi.query_postal_code(["75013", "69006"])
+          postal_code place_name            state_name  latitude  longitude
+    0       75013   Paris 13         Île-de-France   48.8322     2.3561
+    1       69006    Lyon 06  Auvergne-Rhône-Alpes   45.7679     4.8506
+
+**Distance calculations**
+
+.. code:: python
+
+    >>> dist = pgeocode.GeoDistance('fr')
+    >>> dist.query_postal_code("75013", "69006")
+    389.156
+    >>> dist.query_postal_code(["75013", "75014", "75015"], ["69006", "69005", "69004"])
+    array([ 389.15648697,  390.12577967,  390.49857655])
+
+
+
+Geocoding format
+----------------
+
+The result of a geo-localisation query is a ``pandas.DataFrame`` with the following columns,
+
+* ``country code``: iso country code, 2 characters
+* ``postal_code`` : postal code
+* ``place_name``  : place name (e.g. town, city etc)
+* ``state_name`` : name of the 1st order subdivision (state)
+* ``state_code`` : code of the 1st order subdivision (state)
+* ``county_name`` : name of the 2nd order subdivision (county/province)
+* ``county_code`` : code of the 2nd order subdivision (county/province)
+* ``community_name`` : name of the 3rd order subdivision (community)
+* ``community_code`` : code of the 3rd order subdivision (community)
+* ``latitude``    : estimated latitude (wgs84)
+* ``longitude``   : estimated longitude (wgs84)
+* ``accuracy``    : accuracy of lat/lng from 1=estimated to 6=centroid
+
+License
+-------
+
+The pgeocode package is distributed under the 3-clause BSD license.
+
+
+Supported countries
+-------------------
+
+The list of countries available in the GeoNames database, with the corresponding country codes, is given below,
+
+Andorra (AD), Argentina (AR), American Samoa (AS), Austria (AT), Australia (AU), Åland Islands (AX), Bangladesh (BD), Belgium (BE), Bulgaria (BG), Bermuda (BM), Brazil (BR), Belarus (BY), Canada (CA), Switzerland (CH), Colombia (CO), Costa Rica (CR), Czechia (CZ), Germany (DE), Denmark (DK), Dominican Republic (DO), Algeria (DZ), Spain (ES), Finland (FI), Faroe Islands (FO), France (FR), United Kingdom of Great Britain and Northern Ireland (GB), French Guiana (GF), Guernsey (GG), Greenland (GL), Guadeloupe (GP), Guatemala (GT), Guam (GU), Croatia (HR), Hungary (HU), Ireland (IE), Isle of Man (IM), India (IN), Iceland (IS), Italy (IT), Jersey (JE), Japan (JP), Liechtenstein (LI), Sri Lanka (LK), Lithuania (LT), Luxembourg (LU), Latvia (LV), Monaco (MC), Republic of Moldova (MD), Marshall Islands (MH), The former Yugoslav Republic of Macedonia (MK), Northern Mariana Islands (MP), Martinique (MQ), Malta (MT), Mexico (MX), Malaysia (MY), New Caledonia (NC), Netherlands (NL), Norway (NO), New Zealand (NZ), Philippines (PH), Pakistan (PK), Poland (PL), Saint Pierre and Miquelon (PM), Puerto Rico (PR), Portugal (PT), Réunion (RE), Romania (RO), Russian Federation (RU), Sweden (SE), Slovenia (SI), Svalbard and Jan Mayen Islands (SJ), Slovakia (SK), San Marino (SM), Thailand (TH), Turkey (TR), Ukraine (UA), United States of America (US), Uruguay (UY), Holy See (VA), United States Virgin Islands (VI), Wallis and Futuna Islands (WF), Mayotte (YT), South Africa (ZA)
+
+See `GeoNames database <http://download.geonames.org/export/zip/>`_ for more information.
diff --git a/pgeocode.py b/pgeocode.py
@@ -0,0 +1,273 @@
+# License 3-clause BSD
+#
+# Authors: Roman Yurchak <[email protected]>
+
+import os
+import warnings
+
+import numpy as np
+import pandas as pd
+
+__version__ = '0.1.dev0'
+
+# Directory holding the per-country CSV files that Nominatim._get_data and
+# Nominatim._index_postal_codes write and later re-read.
+STORAGE_DIR = os.path.join(os.path.expanduser('~'),
+                           'pgeocode_data')
+
+# URL template of the GeoNames zip archive; ``{country}`` is filled in with
+# the country code by Nominatim._get_data.
+DOWNLOAD_URL = "http://download.geonames.org/export/zip/{country}.zip"
+
+# Column names assigned, in this order, to the downloaded tab-separated data.
+# NOTE(review): 'country code' contains a space while every other name uses
+# an underscore -- presumably unintentional; confirm before renaming since
+# the name is visible in query results.
+DATA_FIELDS = ['country code', 'postal_code', 'place_name',
+               'state_name', 'state_code', 'county_name', 'county_code',
+               'community_name', 'community_code',
+               'latitude', 'longitude', 'accuracy']
+
+# Upper-case country codes accepted by Nominatim.__init__; any other value
+# raises ValueError there.
+COUNTRIES_VALID = ["AD", "AR", "AS", "AT", "AU", "AX", "BD", "BE", "BG", "BM",
+                   "BR", "BY", "CA", "CH", "CO", "CR", "CZ", "DE", "DK", "DO",
+                   "DZ", "ES", "FI", "FO", "FR", "GB", "GF", "GG", "GL", "GP",
+                   "GT", "GU", "HR", "HU", "IE", "IM", "IN", "IS", "IT", "JE",
+                   "JP", "LI", "LK", "LT", "LU", "LV", "MC", "MD", "MH", "MK",
+                   "MP", "MQ", "MT", "MX", "MY", "NC", "NL", "NO", "NZ", "PH",
+                   "PK", "PL", "PM", "PR", "PT", "RE", "RO", "RU", "SE", "SI",
+                   "SJ", "SK", "SM", "TH", "TR", "UA", "US", "UY", "VA", "VI",
+                   "WF", "YT", "ZA"]
+
+
+class Nominatim(object):
+    """Query geographical location from a city name or a postal code
+
+    Parameters
+    ----------
+    country: str, default='fr'
+       country code (case-insensitive). See the documentation for a list of
+       supported countries.
+
+    Raises
+    ------
+    ValueError
+       if ``country`` is not one of ``COUNTRIES_VALID``
+    """
+    def __init__(self, country='fr'):
+
+        country = country.upper()
+        if country not in COUNTRIES_VALID:
+            raise ValueError(('country={} is not a known country code. '
+                              'See the README for a list of supported '
+                              'countries')
+                             .format(country))
+        if country == 'AR':
+            warnings.warn("The Argentina data file contains 4-digit postal "
+                          "codes which were replaced with a new system "
+                          "in 1999.")
+        self.country = country
+        self._data_path, self._data = self._get_data(country)
+        self._data_unique = self._index_postal_codes()
+
+    @staticmethod
+    def _get_data(country):
+        """Load the data from disk; otherwise download and save it
+
+        Parameters
+        ----------
+        country: str
+          country code; it is upper-cased before being used in the file
+          name and in the download URL
+
+        Returns
+        -------
+        data_path : str
+          path of the CSV file inside ``STORAGE_DIR``
+        data : pandas.DataFrame
+          the data, with ``DATA_FIELDS`` as columns
+        """
+        from contextlib import closing
+        from io import BytesIO
+        from zipfile import ZipFile
+        try:
+            from urllib.request import urlopen
+        except ImportError:  # Python 2.7
+            from urllib2 import urlopen
+
+        # Normalise once so that the cache file name, the archive member
+        # name and the download URL all agree.
+        country = country.upper()
+        data_path = os.path.join(STORAGE_DIR, country + '.txt')
+        if os.path.exists(data_path):
+            data = pd.read_csv(data_path,
+                               dtype={'postal_code': str})
+        else:
+            url = DOWNLOAD_URL.format(country=country)
+            # Fetch the archive with the standard library instead of
+            # relying on private pandas helpers.
+            with closing(urlopen(url)) as response:
+                archive = BytesIO(response.read())
+            with ZipFile(archive) as fh_zip:
+                with fh_zip.open(country + '.txt') as fh:
+                    # The GeoNames files have no header row: header=None
+                    # keeps the first record, which header=0 would have
+                    # consumed as column names.
+                    data = pd.read_csv(fh,
+                                       sep='\t', header=None,
+                                       names=DATA_FIELDS,
+                                       dtype={'postal_code': str})
+            if not os.path.exists(STORAGE_DIR):
+                os.mkdir(STORAGE_DIR)
+            data.to_csv(data_path, index=None)
+
+        return data_path, data
+
+    def _index_postal_codes(self):
+        """ Create a dataframe with unique postal codes
+
+        The result is cached next to the data file, in a file with an
+        ``-index`` suffix, and read back from there when it exists.
+        Places sharing a postal code are merged: coordinates are averaged,
+        place names are joined with ', ' and the first value is kept for
+        every other field.
+        """
+        root, ext = os.path.splitext(self._data_path)
+        data_path_unique = root + '-index' + ext
+
+        if os.path.exists(data_path_unique):
+            data_unique = pd.read_csv(data_path_unique,
+                                      dtype={'postal_code': str})
+        else:
+
+            # group together places with the same postal code
+            df_unique_cp_group = self._data.groupby('postal_code')
+            data_unique = df_unique_cp_group[['latitude', 'longitude']].mean()
+            # Fields handled explicitly must be excluded here, otherwise
+            # the loop below overwrites them with the first value of the
+            # group (this happened to the mean latitude).
+            valid_keys = set(DATA_FIELDS).difference(
+                    ['place_name', 'latitude', 'longitude', 'postal_code'])
+            data_unique['place_name'] = df_unique_cp_group['place_name'].apply(', '.join)  # noqa
+            for key in valid_keys:
+                data_unique[key] = df_unique_cp_group[key].first()
+            data_unique = data_unique.reset_index()[DATA_FIELDS]
+            data_unique.to_csv(data_path_unique, index=None)
+        return data_unique
+
+    def _normalize_postal_code(self, codes):
+        """Normalize postal codes to the values contained in the database
+
+        For instance, take into account only first letters when applicable.
+        Takes in a pd.DataFrame with a ``postal_code`` column and returns a
+        normalized copy; the input is left unmodified.
+        """
+        # Work on a copy so that a DataFrame supplied by the caller is not
+        # altered as a side effect of a query.
+        codes = codes.copy()
+        codes['postal_code'] = codes.postal_code.str.upper()
+
+        # For these countries only the leading, space-separated part of
+        # the postal code is looked up.
+        if self.country in ('GB', 'IE', 'CA'):
+            codes['postal_code'] = codes.postal_code.str.split().str.get(0)
+
+        return codes
+
+    def query_postal_code(self, codes):
+        """Get locations information from postal codes
+
+        Parameters
+        ----------
+        codes: array, list or int
+          an array of strings containing postal codes
+
+        Returns
+        -------
+        df : pandas.DataFrame
+          a pandas.DataFrame with the relevant information. When a single
+          postal code (str or int) is given, the matching row is returned
+          as a pandas.Series instead. Unknown postal codes yield NaN
+          values.
+        """
+        if isinstance(codes, int):
+            codes = str(codes)
+
+        if isinstance(codes, str):
+            codes = [codes]
+            single_entry = True
+        else:
+            single_entry = False
+
+        if not isinstance(codes, pd.DataFrame):
+            codes = pd.DataFrame(codes, columns=['postal_code'])
+
+        codes = self._normalize_postal_code(codes)
+        # Left merge: keep one row per queried code, in query order.
+        response = pd.merge(codes, self._data_unique, on='postal_code',
+                            how='left')
+        if single_entry:
+            response = response.iloc[0]
+        return response
+
+    def query_location(self, name):
+        """Get locations information from a community/municipality name
+
+        Not implemented yet: the method currently does nothing and returns
+        None.
+        """
+        pass
+
+
+class GeoDistance(Nominatim):
+    """ Distance calculation from a city name or a postal code
+
+    Parameters
+    ----------
+    country: str, default='fr'
+      country code, passed on to ``Nominatim``
+    errors: str, default='ignore'
+      how to handle not found elements. Only the 'ignore' behaviour is
+      currently implemented (unknown postal codes have NaN coordinates and
+      therefore NaN distances); the value is stored in ``self.errors`` but
+      not otherwise used.
+    """
+    def __init__(self, country='fr', errors='ignore'):
+        super(GeoDistance, self).__init__(country)
+        # Kept for inspection; no alternative error handling exists yet.
+        self.errors = errors
+
+    @staticmethod
+    def _as_code_list(codes):
+        """Return ``(codes, is_single)`` with a lone int/str wrapped in a list
+        """
+        if isinstance(codes, int):
+            codes = str(codes)
+        if isinstance(codes, str):
+            return [codes], True
+        return codes, False
+
+    def query_postal_code(self, x, y):
+        """ Get distance (in km) between postal codes
+
+        Parameters
+        ----------
+        x: array, list or int
+          a list  of postal codes
+        y: array, list or int
+          a list  of postal codes
+
+        Returns
+        -------
+        d : array or int
+          the calculated distances; a single value when both ``x`` and
+          ``y`` are a single postal code, an array otherwise
+
+        Raises
+        ------
+        ValueError
+          if ``x`` and ``y`` have different lengths and neither of them
+          has exactly one element
+        """
+        x, single_x_entry = self._as_code_list(x)
+        y, single_y_entry = self._as_code_list(y)
+
+        # Lists are passed on, so the parent always returns a DataFrame.
+        df_x = super(GeoDistance, self).query_postal_code(x)
+        df_y = super(GeoDistance, self).query_postal_code(y)
+
+        x_coords = df_x[['latitude', 'longitude']].values
+        y_coords = df_y[['latitude', 'longitude']].values
+
+        # A single point on one side is compared against every point of
+        # the other side.
+        if x_coords.shape[0] == y_coords.shape[0]:
+            pass
+        elif x_coords.shape[0] == 1:
+            x_coords = np.repeat(x_coords, y_coords.shape[0], axis=0)
+        elif y_coords.shape[0] == 1:
+            y_coords = np.repeat(y_coords, x_coords.shape[0], axis=0)
+        else:
+            raise ValueError('x and y must have the same number of elements')
+
+        dist = haversine_distance(x_coords, y_coords)
+        if single_x_entry and single_y_entry:
+            return dist[0]
+        else:
+            return dist
+
+
+# Copied from geopy
+# IUGG mean earth radius in kilometers, from
+# https://en.wikipedia.org/wiki/Earth_radius#Mean_radius.  Using a
+# sphere with this radius results in an error of up to about 0.5%.
+EARTH_RADIUS = 6371.009
+
+
+def haversine_distance(x, y):
+    """Haversine (great circle) distance
+
+    Calculate the great circle distance between two points
+    on the earth (specified in decimal degrees)
+
+    Parameters
+    ----------
+    x : array, shape=(n_samples, 2)
+      the first list of (latitude, longitude) coordinates (degrees).
+      A single pair of shape (2,) is accepted as well.
+    y : array, shape=(n_samples, 2)
+      the second list of (latitude, longitude) coordinates (degrees)
+
+    Returns
+    -------
+    d : array, shape=(n_samples,)
+      the distance between coordinates (km); a scalar when both inputs
+      are single pairs
+
+    References
+    ----------
+    https://en.wikipedia.org/wiki/Great-circle_distance
+    """
+    x_rad = np.radians(x)
+    y_rad = np.radians(y)
+
+    d = y_rad - x_rad
+
+    # Index on the last axis so that both (n_samples, 2) arrays and a
+    # single (2,) pair are handled.
+    dlat = d[..., 0]
+    dlon = d[..., 1]
+    x_lat = x_rad[..., 0]
+    y_lat = y_rad[..., 0]
+
+    a = np.sin(dlat/2.0)**2 + \
+        np.cos(x_lat) * np.cos(y_lat) * np.sin(dlon/2.0)**2
+
+    # Rounding can push ``a`` marginally outside [0, 1] for (nearly)
+    # antipodal points, which would make arcsin return NaN.
+    a = np.clip(a, 0.0, 1.0)
+
+    c = 2 * np.arcsin(np.sqrt(a))
+    return EARTH_RADIUS * c