From 8b885e2d5f19ba05dd3742254027493d857b738d Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 5 Mar 2019 17:24:09 +0000 Subject: [PATCH 001/138] Add SpatialMapping classes for polygon, admin and grid --- .../flowmachine/core/spatial_mapping.py | 212 ++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 flowmachine/flowmachine/core/spatial_mapping.py diff --git a/flowmachine/flowmachine/core/spatial_mapping.py b/flowmachine/flowmachine/core/spatial_mapping.py new file mode 100644 index 0000000000..769dd14ad8 --- /dev/null +++ b/flowmachine/flowmachine/core/spatial_mapping.py @@ -0,0 +1,212 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +Classes that map cells (or towers or sites) to a spatial unit +(e.g. versioned-cell, admin*, grid, ...). +""" +from typing import List + +from . import Query, GeoTable, Grid + + +class SpatialMapping(Query): + """ + Class that provides a mapping from cell/site data in the location table to + spatial regions defined by geography information in a table. + + Parameters + ---------- + column_name : str or list, optional + The name of the column to fetch from the geometry + table in the database. Can also be a list of names. + geom_table : str or flowmachine.Query, optional + name of the table containing the geography information. + Can be either the name of a table, with the schema, or + a flowmachine.Query object. + geom_col : str, default 'geom' + column that defines the geography. + """ + + def __init__(self, *, column_name, geom_table, geom_col="geom"): + if type(column_name) is str: + self.column_name = [column_name] + else: + self.column_name = column_name + if issubclass(geom_table.__class__, Query): + self.geom_table = geom_table + else: + self.geom_table = GeoTable(name=geom_table, geom_column=geom_col) + self.geom_col = geom_col + self.location_info_table_fqn = self.connection.location_table + # if the subscriber wants to select a geometry from the sites table there + # is no need to join the table with itself. + self.requires_join = not ( + hasattr(self.geo_table, "fully_qualified_table_name") + and ( + self.location_info_table_fqn + == self.geom_table.fully_qualified_table_name + ) + ) + + super().__init__() + + # Need a method to check whether the required data can be found in the DB + + def _other_columns(self): + """ + Helper function which returns the list of returned column names, + excluding self.location_columns. + """ + return [ + "location_id", + "version", + "date_of_first_service", + "date_of_last_service", + ] + + @property + def location_columns(self) -> List[str]: + return self.column_name + + @property + def location_columns_string(self) -> str: + return ", ".join(self.location_columns) + + @property + def column_names(self) -> List[str]: + return self._other_columns() + self.location_columns + + def _join_clause(self): + if self.requires_join: + table_name = "polygon" + join = f""" + INNER JOIN + ({self.geom_table.get_query()}) AS polygon + ON ST_within( + locinfo.geom_point::geometry, + ST_SetSRID(polygon.{self.geom_col}, 4326)::geometry + ) + """ + else: + # if the subscriber wants to select a geometry from the sites table + # there is no need to join the table with itself. 
+ table_name = "locinfo" + join = "" + + def _make_query(self): + table_name, join = self._join_clause() + columns = ", ".join(f"{table_name}.{c}" for c in self.column_name) + + # Create a table + sql = f""" + SELECT + locinfo.id AS location_id, + locinfo.version, + locinfo.date_of_first_service, + locinfo.date_of_last_service, + {columns} + FROM + {self.location_info_table_fqn} AS locinfo + {join} + """ + + return sql + + +class AdminSpatialMapping(SpatialMapping): + """ + Maps all cells (aka sites) to an admin region. This is a thin wrapper to + the more general class SpatialMapping, which assumes that you have + the standard set-up. + + Parameters + ---------- + level : int + Admin level (e.g. 1 for admin1, 2 for admin2, etc.) + column_name : str, optional + Pass a string of the column to use as the + identifier of the admin region. By default + this will be admin*pcod. But you may wish + to use something else, such as admin3name. + """ + + def __init__(self, *, level, column_name=None): + self.level = level + # If there is no column_name passed then we can use + # the default, which is of the form admin3pcod. + if column_name is None: + col_name = self._get_standard_name() + else: + col_name = column_name + table = f"geography.admin{self.level}" + + super().__init__(column_name=col_name, geom_table=table) + + def _get_standard_name(self): + """ + Returns the standard name of the column that identifies + the name of the region. + """ + + return f"admin{self.level}pcod" + + @property + def location_columns(self) -> List[str]: + # If the user has asked for the standard column_name + # then we will alias this column as 'pcod', otherwise + # we'll won't alias it at all. + if self.column_name[0] == self._get_standard_name(): + columns = ["pcod"] + else: + columns = self.column_name + return columns + + def _make_query(self): + table_name, join = self._join_clause() + # If the user has asked for the standard column_name + # then we will alias this column as 'pcod', otherwise + # we'll won't alias it at all. + if self.column_name[0] == self._get_standard_name(): + col_name = f"{table_name}.{self.column_name[0]} AS pcod" + else: + col_name = f"{table_name}.{self.column_name[0]}" + + # Create a table + sql = f""" + SELECT + locinfo.id AS location_id, + locinfo.version, + locinfo.date_of_first_service, + locinfo.date_of_last_service, + {col_name} + FROM + {self.location_info_table_fqn} AS locinfo + {join} + """ + + return sql + + +class GridSpatialMapping(SpatialMapping): + """ + Query representing a mapping between all the sites in the database + and a grid of arbitrary size. + + Parameters + ---------- + size : float or int + Size of the grid in kilometres + """ + + def __init__(self, *, size): + self.size = size + self.grid = Grid(self.size) + super().__init__( + column_name=["grid_id"], geom_table=self.grid, geom_col="geom_square" + ) + + +# class LatLonSpatialMapping(SpatialMapping): + From b93361781f5d4e3a9985df5d3e0c9b12dd9fb118 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 6 Mar 2019 10:36:30 +0000 Subject: [PATCH 002/138] Add SpatialMapping class for lat-lon --- .../flowmachine/core/spatial_mapping.py | 84 ++++++++++++++----- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_mapping.py b/flowmachine/flowmachine/core/spatial_mapping.py index 769dd14ad8..281a4d3eaa 100644 --- a/flowmachine/flowmachine/core/spatial_mapping.py +++ b/flowmachine/flowmachine/core/spatial_mapping.py @@ -11,6 +11,25 @@ from . 
import Query, GeoTable, Grid +def get_alias(column_name): + """ + Given a column name string, return the alias (if there is one), + or return the provided column name if there is no alias. + + Examples + -------- + >>> get_alias("col AS alias") + "alias" + >>> get_alias("col") + "col" + """ + column_name_split = column_name.split() + if len(column_name_split) == 3 and column_name_split[1].lower() == "as": + return column_name_split[2] + else: + return column_name + + class SpatialMapping(Query): """ Class that provides a mapping from cell/site data in the location table to @@ -29,6 +48,13 @@ class SpatialMapping(Query): column that defines the geography. """ + _columns_from_locinfo_table = ( + "id AS location_id", + "version", + "date_of_first_service", + "date_of_last_service", + ) + def __init__(self, *, column_name, geom_table, geom_col="geom"): if type(column_name) is str: self.column_name = [column_name] @@ -43,7 +69,7 @@ def __init__(self, *, column_name, geom_table, geom_col="geom"): # if the subscriber wants to select a geometry from the sites table there # is no need to join the table with itself. self.requires_join = not ( - hasattr(self.geo_table, "fully_qualified_table_name") + hasattr(self.geom_table, "fully_qualified_table_name") and ( self.location_info_table_fqn == self.geom_table.fully_qualified_table_name @@ -59,12 +85,7 @@ def _other_columns(self): Helper function which returns the list of returned column names, excluding self.location_columns. """ - return [ - "location_id", - "version", - "date_of_first_service", - "date_of_last_service", - ] + return [get_alias(c) for c in self._columns_from_locinfo_table] @property def location_columns(self) -> List[str]: @@ -80,7 +101,7 @@ def column_names(self) -> List[str]: def _join_clause(self): if self.requires_join: - table_name = "polygon" + joined_name = "polygon" join = f""" INNER JOIN ({self.geom_table.get_query()}) AS polygon @@ -92,20 +113,20 @@ def _join_clause(self): else: # if the subscriber wants to select a geometry from the sites table # there is no need to join the table with itself. - table_name = "locinfo" + joined_name = "locinfo" join = "" + return joined_name, join + def _make_query(self): - table_name, join = self._join_clause() + joined_name, join = self._join_clause() + other_cols = ", ".join(f"locinfo.{c}" for c in self._columns_from_locinfo_table) columns = ", ".join(f"{table_name}.{c}" for c in self.column_name) # Create a table sql = f""" SELECT - locinfo.id AS location_id, - locinfo.version, - locinfo.date_of_first_service, - locinfo.date_of_last_service, + {other_cols}, {columns} FROM {self.location_info_table_fqn} AS locinfo @@ -165,6 +186,7 @@ def location_columns(self) -> List[str]: def _make_query(self): table_name, join = self._join_clause() + other_cols = ", ".join(f"locinfo.{c}" for c in self._columns_from_locinfo_table) # If the user has asked for the standard column_name # then we will alias this column as 'pcod', otherwise # we'll won't alias it at all. 
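For illustration (not part of the diff itself), the get_alias helper introduced in this commit strips a trailing "AS <alias>", which is how _other_columns recovers the column names the query actually returns. Assuming get_alias and SpatialMapping have been imported from flowmachine.core.spatial_mapping, the expected behaviour is:

>>> get_alias("id AS location_id")
'location_id'
>>> get_alias("date_of_first_service")
'date_of_first_service'
>>> [get_alias(c) for c in SpatialMapping._columns_from_locinfo_table]
['location_id', 'version', 'date_of_first_service', 'date_of_last_service']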
@@ -176,10 +198,7 @@ def _make_query(self): # Create a table sql = f""" SELECT - locinfo.id AS location_id, - locinfo.version, - locinfo.date_of_first_service, - locinfo.date_of_last_service, + {other_cols}, {col_name} FROM {self.location_info_table_fqn} AS locinfo @@ -208,5 +227,32 @@ def __init__(self, *, size): ) -# class LatLonSpatialMapping(SpatialMapping): +class LatLonSpatialMapping(SpatialMapping): + + _columns_from_locinfo_table = ( + "id AS location_id", + "date_of_first_service", + "date_of_last_service", + ) + + def __init__(self): + super().__init__( + column_name=[ + "ST_X(geom_point::geometry) AS lon", + "ST_Y(geom_point::geometry) AS lat", + ], + geom_table=self.connection.location_table, + ) + + @property + def location_columns(self) -> List[str]: + return ["lon", "lat"] + def _make_query(self): + other_cols = ", ".join(self._columns_from_locinfo_table) + columns = ", ".join(self.column_name) + sql = f""" + SELECT + {other_cols}, + {columns} + FROM {self.location_table_fqn}""" From aa385339d1ff0c6bd10c72e111454af54efbc058 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 8 Mar 2019 17:00:35 +0000 Subject: [PATCH 003/138] Rename SpatialMapping back to SpatialUnit; simplify base class; add classes for versioned-cell and versioned-site --- .../{spatial_mapping.py => spatial_unit.py} | 167 +++++++++++++----- 1 file changed, 126 insertions(+), 41 deletions(-) rename flowmachine/flowmachine/core/{spatial_mapping.py => spatial_unit.py} (63%) diff --git a/flowmachine/flowmachine/core/spatial_mapping.py b/flowmachine/flowmachine/core/spatial_unit.py similarity index 63% rename from flowmachine/flowmachine/core/spatial_mapping.py rename to flowmachine/flowmachine/core/spatial_unit.py index 281a4d3eaa..3946a9003b 100644 --- a/flowmachine/flowmachine/core/spatial_mapping.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -7,6 +7,7 @@ (e.g. versioned-cell, admin*, grid, ...). """ from typing import List +import re from . 
import Query, GeoTable, Grid @@ -23,24 +24,139 @@ def get_alias(column_name): >>> get_alias("col") "col" """ - column_name_split = column_name.split() - if len(column_name_split) == 3 and column_name_split[1].lower() == "as": - return column_name_split[2] - else: - return column_name + return re.split(" as ", column_name, flags=re.IGNORECASE)[-1] -class SpatialMapping(Query): +class SpatialUnit(Query): + def __init__(self, *, selected_column_names, location_column_names, location_info_table=None, join_clause=""): + if type(selected_column_names) is str: + self._cols = [selected_column_names] + else: + self._cols = selected_column_names + + if type(other_column_names) is str: + self._loc_cols = [location_column_names] + else: + self._loc_cols = location_column_names + + missing_cols = [c for c in self._loc_cols if not (c in self.column_names)] + if missing_cols: + raise ValueError( + f"Location columns {missing_cols} are not in returned columns" + ) + + if location_info_table: + self.location_info_table = location_info_table + else: + self.location_info_table = self.connection.location_table + + self._join_clause = join_clause + + super().__init__() + + # TODO: Need a method to check whether the required data can be found in the DB + + @property + def location_columns(self) -> List[str]: + return self._loc_cols + + @property + def column_names(self) -> List[str]: + return [get_alias(c) for c in self._cols] + + def _make_query(self): + columns = ", ".join(self._cols) + sql = f""" + SELECT + {columns} + FROM {self.location_info_table} + {self._join_clause} + """ + + return sql + + +class LatLonSpatialUnit(SpatialUnit) + def __init__(self): + super().__init__( + selected_column_names=[ + "id AS location_id", + "date_of_first_service", + "date_of_last_service", + "ST_X(geom_point::geometry) AS lon", + "ST_Y(geom_point::geometry) AS lat", + ], + location_column_names=["lat", "lon"], + ) + + +class VersionedCellSpatialUnit(SpatialUnit): + def __init__(self): + if self.connection.location_table != "infrastructure.cells": + raise ValueError("Versioned cell spatial unit is unavailable.") + + super().__init__( + selected_column_names=[ + "id AS location_id", + "date_of_first_service", + "date_of_last_service", + "version", + "ST_X(geom_point::geometry) AS lon", + "ST_Y(geom_point::geometry) AS lat", + ], + location_column_names=["location_id", "version", "lon", "lat"], + location_info_table="infrastructure.cells", + ) + + +class VersionedSiteSpatialUnit(SpatialUnit): + def __init__(self): + location_table = self.connection.location_table + + sites_alias = "s" + if location_table == "infrastructure.sites": + cells_alias = sites_alias + join_clause = f""" + RIGHT JOIN + infrastructure.cells AS {cells_alias} + ON {sites_alias}.id = {cells_alias}.site_id + """ + elif location_table == "infrastructure.cells": + cells_alias = "c" + join_clause = "" + else: + raise ValueError( + f"Expected location table to be 'infrastructure.cells' " + f"or 'infrastructure.sites', not '{location_table}''" + ) + + super().__init__( + selected_column_names=[ + f"{cells_alias}.id AS location_id", + f"{sites_alias}.id AS site_id", + f"{sites_alias}.date_of_first_service AS date_of_first_service", + f"{sites_alias}.date_of_last_service AS date_of_last_service", + f"{sites_alias}.version as version", + f"ST_X({sites_alias}.geom_point::geometry) AS lon", + f"ST_Y({sites_alias}.geom_point::geometry) AS lat", + ], + location_column_names=["location_id", "version", "lon", "lat"], + location_info_table=f"infrastructure.sites AS 
{sites_alias}", + join_clause=join_clause, + ) + + +class PolygonSpatialUnit(SpatialUnit): """ Class that provides a mapping from cell/site data in the location table to spatial regions defined by geography information in a table. Parameters ---------- - column_name : str or list, optional + column_name : str or list The name of the column to fetch from the geometry table in the database. Can also be a list of names. - geom_table : str or flowmachine.Query, optional + geom_table : str or flowmachine.Query name of the table containing the geography information. Can be either the name of a table, with the schema, or a flowmachine.Query object. @@ -136,7 +252,7 @@ def _make_query(self): return sql -class AdminSpatialMapping(SpatialMapping): +class AdminSpatialMapping(PolygonSpatialMapping): """ Maps all cells (aka sites) to an admin region. This is a thin wrapper to the more general class SpatialMapping, which assumes that you have @@ -208,7 +324,7 @@ def _make_query(self): return sql -class GridSpatialMapping(SpatialMapping): +class GridSpatialMapping(PolygonSpatialMapping): """ Query representing a mapping between all the sites in the database and a grid of arbitrary size. @@ -225,34 +341,3 @@ def __init__(self, *, size): super().__init__( column_name=["grid_id"], geom_table=self.grid, geom_col="geom_square" ) - - -class LatLonSpatialMapping(SpatialMapping): - - _columns_from_locinfo_table = ( - "id AS location_id", - "date_of_first_service", - "date_of_last_service", - ) - - def __init__(self): - super().__init__( - column_name=[ - "ST_X(geom_point::geometry) AS lon", - "ST_Y(geom_point::geometry) AS lat", - ], - geom_table=self.connection.location_table, - ) - - @property - def location_columns(self) -> List[str]: - return ["lon", "lat"] - - def _make_query(self): - other_cols = ", ".join(self._columns_from_locinfo_table) - columns = ", ".join(self.column_name) - sql = f""" - SELECT - {other_cols}, - {columns} - FROM {self.location_table_fqn}""" From ae7e8b1e639262642c34826c5e6ffe0ef456edb6 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 8 Mar 2019 18:00:41 +0000 Subject: [PATCH 004/138] Restructure PolygonSpatialUnit to inherit from SpatialUnit, and corresponding changes to AdminSpatialUnit and GridSpatialUnit --- flowmachine/flowmachine/core/spatial_unit.py | 163 +++++-------------- 1 file changed, 44 insertions(+), 119 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 3946a9003b..768128eed6 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -153,10 +153,10 @@ class PolygonSpatialUnit(SpatialUnit): Parameters ---------- - column_name : str or list + polygon_column_names : str or list The name of the column to fetch from the geometry table in the database. Can also be a list of names. - geom_table : str or flowmachine.Query + polygon_table : str or flowmachine.Query name of the table containing the geography information. Can be either the name of a table, with the schema, or a flowmachine.Query object. @@ -164,95 +164,52 @@ class PolygonSpatialUnit(SpatialUnit): column that defines the geography. 
""" - _columns_from_locinfo_table = ( - "id AS location_id", - "version", - "date_of_first_service", - "date_of_last_service", - ) - - def __init__(self, *, column_name, geom_table, geom_col="geom"): - if type(column_name) is str: - self.column_name = [column_name] - else: - self.column_name = column_name - if issubclass(geom_table.__class__, Query): - self.geom_table = geom_table + def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): + if issubclass(polygon_table.__class__, Query): + self.polygon_table = polygon_table else: - self.geom_table = GeoTable(name=geom_table, geom_column=geom_col) + self.polygon_table = GeoTable(name=polygon_table, geom_column=geom_col) + self.geom_col = geom_col - self.location_info_table_fqn = self.connection.location_table - # if the subscriber wants to select a geometry from the sites table there - # is no need to join the table with itself. - self.requires_join = not ( - hasattr(self.geom_table, "fully_qualified_table_name") - and ( - self.location_info_table_fqn - == self.geom_table.fully_qualified_table_name - ) - ) - - super().__init__() - - # Need a method to check whether the required data can be found in the DB - def _other_columns(self): - """ - Helper function which returns the list of returned column names, - excluding self.location_columns. - """ - return [get_alias(c) for c in self._columns_from_locinfo_table] - - @property - def location_columns(self) -> List[str]: - return self.column_name - - @property - def location_columns_string(self) -> str: - return ", ".join(self.location_columns) + location_info_table = self.connection.location_table - @property - def column_names(self) -> List[str]: - return self._other_columns() + self.location_columns - - def _join_clause(self): - if self.requires_join: - joined_name = "polygon" - join = f""" + locinfo_alias = "locinfo" + if hasattr(self.polygon_table, "fully_qualified_table_name") and ( + location_info_table == self.polygon_table.fully_qualified_table_name + ): + # if the subscriber wants to select a geometry from the sites table + # there is no need to join the table with itself. + joined_alias = locinfo_alias + join_clause = "" + else: + joined_alias = "polygon" + join_clause = f""" INNER JOIN - ({self.geom_table.get_query()}) AS polygon + ({self.polygon_table.get_query()}) AS {joined_alias} ON ST_within( - locinfo.geom_point::geometry, - ST_SetSRID(polygon.{self.geom_col}, 4326)::geometry + {locinfo_alias}.geom_point::geometry, + ST_SetSRID({joined_alias}.{self.geom_col}, 4326)::geometry ) """ - else: - # if the subscriber wants to select a geometry from the sites table - # there is no need to join the table with itself. 
- joined_name = "locinfo" - join = "" - - return joined_name, join - - def _make_query(self): - joined_name, join = self._join_clause() - other_cols = ", ".join(f"locinfo.{c}" for c in self._columns_from_locinfo_table) - columns = ", ".join(f"{table_name}.{c}" for c in self.column_name) - # Create a table - sql = f""" - SELECT - {other_cols}, - {columns} - FROM - {self.location_info_table_fqn} AS locinfo - {join} - """ + locinfo_column_names = [ + f"{locinfo_alias}.id AS location_id", + f"{locinfo_alias}.version AS version", + f"{locinfo_alias}.date_of_first_service AS date_of_first_service", + f"{locinfo_alias}.date_of_last_service AS date_of_last_service", + ] + if type(polygon_columns) is str: + polygon_cols = [f"{joined_alias}.{polygon_column_names}"] + else: + polygon_cols = [f"{joined_alias}.{c}" for c in polygon_column_names] + all_column_names = locinfo_column_names + polygon_cols + location_column_names = [get_alias(c) for c in polygon_cols] - return sql + super().__init__(selected_column_names=all_column_names, location_column_names=location_column_names, location_info_table=f"{location_info_table} AS {locinfo_alias}", join_clause=join_clause) -class AdminSpatialMapping(PolygonSpatialMapping): +class AdminSpatialUnit(PolygonSpatialUnit): """ Maps all cells (aka sites) to an admin region. This is a thin wrapper to the more general class SpatialMapping, which assumes that you have @@ -273,13 +230,16 @@ def __init__(self, *, level, column_name=None): self.level = level # If there is no column_name passed then we can use # the default, which is of the form admin3pcod. - if column_name is None: + # If the user has asked for the standard column_name + # then we will alias this column as 'pcod', otherwise + # we'll won't alias it at all. + if (column_name is None) or (column_name == self._get_standard_name()): col_name = self._get_standard_name() else: - col_name = column_name + col_name = f"{column_name} AS pcod" table = f"geography.admin{self.level}" - super().__init__(column_name=col_name, geom_table=table) + super().__init__(polygon_column_names=col_name, polygon_table=table) def _get_standard_name(self): """ @@ -289,42 +249,8 @@ def _get_standard_name(self): return f"admin{self.level}pcod" - @property - def location_columns(self) -> List[str]: - # If the user has asked for the standard column_name - # then we will alias this column as 'pcod', otherwise - # we'll won't alias it at all. - if self.column_name[0] == self._get_standard_name(): - columns = ["pcod"] - else: - columns = self.column_name - return columns - - def _make_query(self): - table_name, join = self._join_clause() - other_cols = ", ".join(f"locinfo.{c}" for c in self._columns_from_locinfo_table) - # If the user has asked for the standard column_name - # then we will alias this column as 'pcod', otherwise - # we'll won't alias it at all. - if self.column_name[0] == self._get_standard_name(): - col_name = f"{table_name}.{self.column_name[0]} AS pcod" - else: - col_name = f"{table_name}.{self.column_name[0]}" - - # Create a table - sql = f""" - SELECT - {other_cols}, - {col_name} - FROM - {self.location_info_table_fqn} AS locinfo - {join} - """ - - return sql - -class GridSpatialMapping(PolygonSpatialMapping): +class GridSpatialUnit(PolygonSpatialUnit): """ Query representing a mapping between all the sites in the database and a grid of arbitrary size. 
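The intended usage of the restructured classes, mirrored by the tests added later in this series, is sketched below. This is illustrative only: it assumes flowmachine.connect() has been called, and the aliasing of the standard admin pcod column is still being settled in the following commits.

# Illustrative sketch, not part of the patch.
from flowmachine.core.spatial_unit import (
    AdminSpatialUnit,
    GridSpatialUnit,
    PolygonSpatialUnit,
)

admin3 = AdminSpatialUnit(level=3)   # geography.admin3, standard admin3pcod column
grid = GridSpatialUnit(size=5)       # joins cells to a 5 km Grid on geom_square
sites = PolygonSpatialUnit(
    polygon_column_names=["id"],
    polygon_table="infrastructure.sites",
    geom_col="geom_point",
)
# Expected location columns: grid -> ["grid_id"], sites -> ["id"]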
@@ -337,7 +263,6 @@ class GridSpatialMapping(PolygonSpatialMapping): def __init__(self, *, size): self.size = size - self.grid = Grid(self.size) super().__init__( - column_name=["grid_id"], geom_table=self.grid, geom_col="geom_square" + polygon_column_names=["grid_id"], polygon_table=Grid(self.size), geom_col="geom_square" ) From f3ead37c235461af8af701bfdd37e26a64145632 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 11 Mar 2019 09:56:31 +0000 Subject: [PATCH 005/138] Add docstrings to SpatialUnit classes --- flowmachine/flowmachine/core/spatial_unit.py | 86 ++++++++++++++++---- 1 file changed, 69 insertions(+), 17 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 768128eed6..009b875788 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -28,7 +28,34 @@ def get_alias(column_name): class SpatialUnit(Query): - def __init__(self, *, selected_column_names, location_column_names, location_info_table=None, join_clause=""): + """ + Base class for all spatial units. Selects columns from the location table, + and optionally joins to data in another table. + + Parameters + ---------- + selected_column_names : str or list + The name(s) of the column(s) to fetch from the location + table in the database. + location_column_names : str or list + Name(s) of the location-related column(s). + Must be a subset of the column_names for this query. + location_info_table : str, optional + Fully qualified name of the location info table to select from. + Defaults to self.connection.location_table + join_clause : str, optional + Optionally provide a SQL join clause to join data from the + location info table to spatial regions in another table. + """ + + def __init__( + self, + *, + selected_column_names, + location_column_names, + location_info_table=None, + join_clause="", + ): if type(selected_column_names) is str: self._cols = [selected_column_names] else: @@ -39,6 +66,7 @@ def __init__(self, *, selected_column_names, location_column_names, location_inf else: self._loc_cols = location_column_names + # Check that _loc_cols is a subset of column_names missing_cols = [c for c in self._loc_cols if not (c in self.column_names)] if missing_cols: raise ValueError( @@ -49,7 +77,7 @@ def __init__(self, *, selected_column_names, location_column_names, location_inf self.location_info_table = location_info_table else: self.location_info_table = self.connection.location_table - + self._join_clause = join_clause super().__init__() @@ -58,6 +86,9 @@ def __init__(self, *, selected_column_names, location_column_names, location_inf @property def location_columns(self) -> List[str]: + """ + List of the location-related column names. + """ return self._loc_cols @property @@ -76,7 +107,11 @@ def _make_query(self): return sql -class LatLonSpatialUnit(SpatialUnit) +class LatLonSpatialUnit(SpatialUnit): + """ + Class that maps cell location_id to lat-lon coordinates. + """ + def __init__(self): super().__init__( selected_column_names=[ @@ -91,10 +126,14 @@ def __init__(self): class VersionedCellSpatialUnit(SpatialUnit): + """ + Class that maps cell location_id to a cell version and lat-lon coordinates. 
+ """ + def __init__(self): if self.connection.location_table != "infrastructure.cells": raise ValueError("Versioned cell spatial unit is unavailable.") - + super().__init__( selected_column_names=[ "id AS location_id", @@ -110,6 +149,10 @@ def __init__(self): class VersionedSiteSpatialUnit(SpatialUnit): + """ + Class that maps cell location_id to a site version and lat-lon coordinates. + """ + def __init__(self): location_table = self.connection.location_table @@ -129,7 +172,7 @@ def __init__(self): f"Expected location table to be 'infrastructure.cells' " f"or 'infrastructure.sites', not '{location_table}''" ) - + super().__init__( selected_column_names=[ f"{cells_alias}.id AS location_id", @@ -169,14 +212,14 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): self.polygon_table = polygon_table else: self.polygon_table = GeoTable(name=polygon_table, geom_column=geom_col) - + self.geom_col = geom_col location_info_table = self.connection.location_table locinfo_alias = "locinfo" if hasattr(self.polygon_table, "fully_qualified_table_name") and ( - location_info_table == self.polygon_table.fully_qualified_table_name + location_info_table == self.polygon_table.fully_qualified_table_name ): # if the subscriber wants to select a geometry from the sites table # there is no need to join the table with itself. @@ -200,20 +243,27 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): f"{locinfo_alias}.date_of_last_service AS date_of_last_service", ] if type(polygon_columns) is str: - polygon_cols = [f"{joined_alias}.{polygon_column_names}"] + polygon_cols = [polygon_column_names] else: - polygon_cols = [f"{joined_alias}.{c}" for c in polygon_column_names] - all_column_names = locinfo_column_names + polygon_cols + polygon_cols = polygon_column_names + all_column_names = locinfo_column_names + [ + f"{joined_alias}.{c}" for c in polygon_cols + ] location_column_names = [get_alias(c) for c in polygon_cols] - super().__init__(selected_column_names=all_column_names, location_column_names=location_column_names, location_info_table=f"{location_info_table} AS {locinfo_alias}", join_clause=join_clause) + super().__init__( + selected_column_names=all_column_names, + location_column_names=location_column_names, + location_info_table=f"{location_info_table} AS {locinfo_alias}", + join_clause=join_clause, + ) class AdminSpatialUnit(PolygonSpatialUnit): """ - Maps all cells (aka sites) to an admin region. This is a thin wrapper to - the more general class SpatialMapping, which assumes that you have - the standard set-up. + Class that maps all cells (aka sites) to an admin region. This is a thin + wrapper to the more general class PolygonSpatialUnit, which assumes that + you have the standard set-up. Parameters ---------- @@ -234,9 +284,9 @@ def __init__(self, *, level, column_name=None): # then we will alias this column as 'pcod', otherwise # we'll won't alias it at all. 
if (column_name is None) or (column_name == self._get_standard_name()): - col_name = self._get_standard_name() + col_name = f"{self._get_standard_name()} AS pcod" else: - col_name = f"{column_name} AS pcod" + col_name = column_name table = f"geography.admin{self.level}" super().__init__(polygon_column_names=col_name, polygon_table=table) @@ -264,5 +314,7 @@ class GridSpatialUnit(PolygonSpatialUnit): def __init__(self, *, size): self.size = size super().__init__( - polygon_column_names=["grid_id"], polygon_table=Grid(self.size), geom_col="geom_square" + polygon_column_names=["grid_id"], + polygon_table=Grid(self.size), + geom_col="geom_square", ) From 0ab2408e7bf8209dd527ce6270b1862a212c2be6 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 11 Mar 2019 12:28:49 +0000 Subject: [PATCH 006/138] Move get_alias to utils.py; add tests --- flowmachine/flowmachine/core/spatial_unit.py | 48 ++++---- flowmachine/flowmachine/utils.py | 16 +++ flowmachine/tests/test_spatial_unit.py | 123 +++++++++++++++++++ flowmachine/tests/test_utils.py | 9 ++ 4 files changed, 170 insertions(+), 26 deletions(-) create mode 100644 flowmachine/tests/test_spatial_unit.py diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 009b875788..0320c16716 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -7,26 +7,11 @@ (e.g. versioned-cell, admin*, grid, ...). """ from typing import List -import re +from flowmachine.utils import get_alias from . import Query, GeoTable, Grid -def get_alias(column_name): - """ - Given a column name string, return the alias (if there is one), - or return the provided column name if there is no alias. - - Examples - -------- - >>> get_alias("col AS alias") - "alias" - >>> get_alias("col") - "col" - """ - return re.split(" as ", column_name, flags=re.IGNORECASE)[-1] - - class SpatialUnit(Query): """ Base class for all spatial units. Selects columns from the location table, @@ -201,25 +186,22 @@ class PolygonSpatialUnit(SpatialUnit): table in the database. Can also be a list of names. polygon_table : str or flowmachine.Query name of the table containing the geography information. - Can be either the name of a table, with the schema, or - a flowmachine.Query object. + Can be either the name of a table, with the schema, a flowmachine.Query + object, or a string representing a query. geom_col : str, default 'geom' column that defines the geography. """ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): - if issubclass(polygon_table.__class__, Query): - self.polygon_table = polygon_table - else: - self.polygon_table = GeoTable(name=polygon_table, geom_column=geom_col) - + self.polygon_table = polygon_table self.geom_col = geom_col location_info_table = self.connection.location_table locinfo_alias = "locinfo" - if hasattr(self.polygon_table, "fully_qualified_table_name") and ( - location_info_table == self.polygon_table.fully_qualified_table_name + if ( + isinstance(self.polygon_table, str) + and location_info_table == self.polygon_table.lower().strip() ): # if the subscriber wants to select a geometry from the sites table # there is no need to join the table with itself. 
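With this change polygon_table may be a flowmachine Query object, a schema-qualified table name, or a raw SELECT statement (the raw-SQL form is removed again later in the series). An illustrative sketch of the three forms, assuming an active flowmachine connection:

# Illustrative only, not part of the patch.
from flowmachine.core import Grid
from flowmachine.core.spatial_unit import PolygonSpatialUnit

by_table = PolygonSpatialUnit(
    polygon_column_names="admin3name", polygon_table="geography.admin3"
)
by_sql_string = PolygonSpatialUnit(
    polygon_column_names="id",
    polygon_table="SELECT * FROM infrastructure.sites",
    geom_col="geom_point",
)
by_query = PolygonSpatialUnit(
    polygon_column_names=["grid_id"], polygon_table=Grid(5), geom_col="geom_square"
)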
@@ -229,7 +211,7 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): joined_alias = "polygon" join_clause = f""" INNER JOIN - ({self.polygon_table.get_query()}) AS {joined_alias} + {self._get_subtable()} AS {joined_alias} ON ST_within( {locinfo_alias}.geom_point::geometry, ST_SetSRID({joined_alias}.{self.geom_col}, 4326)::geometry @@ -258,6 +240,20 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): join_clause=join_clause, ) + def _get_subtable(self): + """ + Private method which takes the table and returns a query + representing the object. This is necessary as the table can + be passed in a variety of ways. + """ + + if issubclass(self.polygon_table.__class__, Query): + return f"({self.polygon_table.get_query()})" + elif "select " in self.polygon_table.lower(): + return f"({self.polygon_table})" + else: + return self.polygon_table + class AdminSpatialUnit(PolygonSpatialUnit): """ diff --git a/flowmachine/flowmachine/utils.py b/flowmachine/flowmachine/utils.py index a5b29cb1c7..e292ad2769 100644 --- a/flowmachine/flowmachine/utils.py +++ b/flowmachine/flowmachine/utils.py @@ -9,6 +9,7 @@ import datetime import logging +import re from contextlib import contextmanager from pathlib import Path from pglast import prettify @@ -331,3 +332,18 @@ def _makesafe(x): Function that converts input into a PostgreSQL readable. """ return adapt(x).getquoted().decode() + + +def get_alias(column_name): + """ + Given a column name string, return the alias (if there is one), + or return the provided column name if there is no alias. + + Examples + -------- + >>> get_alias("col AS alias") + "alias" + >>> get_alias("col") + "col" + """ + return re.split(" as ", column_name, flags=re.IGNORECASE)[-1] diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py new file mode 100644 index 0000000000..3eb6b794b0 --- /dev/null +++ b/flowmachine/tests/test_spatial_unit.py @@ -0,0 +1,123 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +from flowmachine.core.spatial_unit import ( + SpatialUnit, + LatLonSpatialUnit, + VersionedCellSpatialUnit, + VersionedSiteSpatialUnit, + PolygonSpatialUnit, + AdminSpatialUnit, + GridSpatialUnit, +) +import pytest + + +@pytest.mark.parametrize( + "spatial_unit, args", + [ + (LatLonSpatialUnit, {}), + (VersionedCellSpatialUnit, {}), + (VersionedSiteSpatialUnit, {}), + ( + PolygonSpatialUnit, + {"polygon_column_names": "admin3name", "polygon_table": "geography.admin3"}, + ), + ( + PolygonSpatialUnit, + { + "polygon_column_names": "id", + "polygon_table": "infrastructure.sites", + "geom_col": "geom_point", + }, + ), + ( + PolygonSpatialUnit, + { + "polygon_column_names": "id", + "polygon_table": "SELECT * FROM infrastructure.sites", + "geom_col": "geom_point", + }, + ), + (AdminSpatialUnit, {"level": 3}), + (AdminSpatialUnit, {"level": 3, "column_name": "admin3name"}), + (GridSpatialUnit, {"size": 5}), + ], +) +def test_spatial_unit_column_names(spatial_unit, args): + """ + Test that the SpatialUnit classes have accurate column_names properties. 
+ """ + instance = spatial_unit(**args) + assert instance.head(0).columns.tolist() == instance.column_names + + +@pytest.mark.parametrize( + "spatial_unit, args, loc_cols", + [ + (LatLonSpatialUnit, {}, ["lat", "lon"]), + (VersionedCellSpatialUnit, {}, ["location_id", "version", "lon", "lat"]), + (VersionedSiteSpatialUnit, {}, ["site_id", "version", "lon", "lat"]), + ( + PolygonSpatialUnit, + { + "polygon_column_names": "id", + "polygon_table": "infrastructure.sites", + "geom_col": "geom_point", + }, + ["id"], + ), + ( + PolygonSpatialUnit, + { + "polygon_column_names": ["id"], + "polygon_table": "infrastructure.sites", + "geom_col": "geom_point", + }, + ["id"], + ), + (AdminSpatialUnit, {"level": 3}, ["pcod"]), + (AdminSpatialUnit, {"level": 3, "column_name": "admin3pcod"}, ["pcod"]), + (AdminSpatialUnit, {"level": 3, "column_name": "admin3name"}, ["admin3name"]), + (GridSpatialUnit, {"size": 5}, ["grid_id"]), + ], +) +def test_spatial_unit_location_columns(spatial_unit, args, loc_cols): + """ + Test that the SpatialUnit classes have the correct location_columns properties. + """ + instance = spatial_unit(**args) + assert loc_cols == instance.location_columns + + +def test_polygon_spatial_unit_column_list(): + """ + Test that, when supplying polygon_column_names to PolygonSpatialUnit as a + list, location_columns returns it as a new list. + """ + passed_cols = ["id"] + psu = PolygonSpatialUnit( + polygon_column_names=passed_cols, + polygon_table="infrastructure.sites", + geom_col="geom_point", + ) + loc_cols = psu.location_columns + assert passed_cols == loc_cols + assert id(passed_cols) != id(loc_cols) + + +def test_missing_location_columns_raises_error(): + """ + Test that a ValueError is raised if the location_column_names passed to + SpatialUnit are not a subset of column_names. 
+ """ + with pytest.raises(ValueError, match="['NOT_A_COLUMN']"): + su = SpatialUnit( + selected_column_names=[ + "id AS location_id", + "date_of_first_service", + "date_of_last_service", + ], + location_column_names=["location_id", "NOT_A_COLUMN"], + ) diff --git a/flowmachine/tests/test_utils.py b/flowmachine/tests/test_utils.py index 55da2df055..9aa0992342 100644 --- a/flowmachine/tests/test_utils.py +++ b/flowmachine/tests/test_utils.py @@ -19,6 +19,7 @@ getsecret, pretty_sql, _makesafe, + get_alias, ) from flowmachine.utils import time_period_add @@ -158,3 +159,11 @@ def test_get_secrets_default(monkeypatch): the_secret_name = "SECRET" secret = getsecret(the_secret_name, the_secret) assert the_secret == secret + + +@pytest.mark.parametrize( + "column_name, alias", + [("column", "column"), ("column AS alias", "alias"), ("column as alias", "alias")], +) +def test_get_alias(column_name, alias): + assert alias == get_alias(column_name) From 4a371246e16455f6445a08b7765cbb991a190c1e Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 11 Mar 2019 15:41:17 +0000 Subject: [PATCH 007/138] Update JoinToLocation to take a SpatialUnit object instead of a string --- flowmachine/flowmachine/core/__init__.py | 2 + .../flowmachine/core/join_to_location.py | 190 ++---------------- 2 files changed, 14 insertions(+), 178 deletions(-) diff --git a/flowmachine/flowmachine/core/__init__.py b/flowmachine/flowmachine/core/__init__.py index dc31ac82c4..6122844d28 100644 --- a/flowmachine/flowmachine/core/__init__.py +++ b/flowmachine/flowmachine/core/__init__.py @@ -14,6 +14,7 @@ from .init import connect from .join_to_location import JoinToLocation from .custom_query import CustomQuery +from .spatial_unit import SpatialUnit sub_modules = ["errors", "mixins", "api"] @@ -25,6 +26,7 @@ "connect", "JoinToLocation", "CustomQuery", + "SpatialUnit", ] __all__ = methods + sub_modules diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index 063e44945e..11d1800833 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -10,10 +10,9 @@ """ from typing import List -from flowmachine.utils import get_columns_for_level from .query import Query from .custom_query import CustomQuery -from .errors import BadLevelError +from .spatial_unit import SpatialUnit class JoinToLocation(Query): @@ -33,35 +32,12 @@ class JoinToLocation(Query): This represents a table that can be joined to the cell information table. This must have a date column (called time) and a location column call 'location_id'. - level : str - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. 
+ spatial_unit : flowmachine.SpatialUnit + A query which maps cell identifiers in the CDR to a different spatial + unit (e.g. versioned site or admin region) time_col : str, default 'time': The name of the column that identifies the time in the source table e.g. 'time', 'date', 'start_time' etc. - column_name : str, optional - Name of the column that identifies the region. This is only relevant - for admin region levels or polygon. See Also -------- @@ -72,38 +48,10 @@ class JoinToLocation(Query): """ - allowed_levels = [ - "admin0", - "admin1", - "admin2", - "admin3", - "polygon", - "grid", - "lat-lon", - "versioned-site", - "versioned-cell", - ] - - def __init__( - self, - left, - *, - level, - time_col="time", - column_name=None, - size=None, - polygon_table=None, - geom_col="geom", - ): - """ - - """ - - if level not in self.allowed_levels: - raise BadLevelError(level, self.allowed_levels) - self.level = level - if self.level == "polygon" and column_name is None: - raise ValueError("Must pass a column_name for level=polygon") + def __init__(self, left, *, spatial_unit, time_col="time"): + if not isinstance(spatial_unit, SpatialUnit): + raise TypeError("spatial_unit must be a SpatialUnit object") + self.spatial_unit = spatial_unit # If the user passes a string, rather than a flowmachine.Query object # then we'll simply turn this string into a flowmachine.Query object # and proceed as normal. @@ -112,11 +60,6 @@ def __init__( else: self.left = left self.time_col = time_col - self.column_name = column_name - self.location_table_fqn = self.connection.location_table - self.right_query = self._get_site_query( - size=size, polygon_table=polygon_table, geom_col=geom_col - ) super().__init__() def __getattr__(self, name): @@ -127,127 +70,18 @@ def __getattr__(self, name): try: return self.left.__getattribute__(name) except AttributeError: - return self.right_query.__getattribute__(name) - - def _get_site_query(self, *, size, polygon_table, geom_col): - """ - Returns the appropriate object to join on - to the right. - """ - from ..features.spatial.cell_mappings import ( - CellToAdmin, - CellToPolygon, - CellToGrid, - ) - - # The logic here finds a query that represents the mapping - # of cells to an region of interest. 
- if self.level.startswith("admin"): - return CellToAdmin(level=self.level, column_name=self.column_name) - elif self.level == "polygon": - return CellToPolygon( - column_name=self.column_name, - polygon_table=polygon_table, - geom_col=geom_col, - ) - elif self.level == "grid": - return CellToGrid(size=size) - elif self.level == "lat-lon": - sql = f""" - SELECT - id AS location_id, - date_of_first_service, - date_of_last_service, - ST_X(geom_point::geometry) AS lon, - ST_Y(geom_point::geometry) AS lat - FROM {self.location_table_fqn}""" - return CustomQuery( - sql, - [ - "location_id", - "date_of_first_service", - "date_of_last_service", - "lon", - "lat", - ], - ) - elif self.level == "versioned-site": - if self.location_table_fqn == "infrastructure.sites": - sql = """ - SELECT - id AS location_id, - id AS site_id, - date_of_first_service, - date_of_last_service, - version, - ST_X(geom_point::geometry) AS lon, - ST_Y(geom_point::geometry) AS lat - FROM infrastructure.sites - """ - elif self.location_table_fqn == "infrastructure.cells": - sql = """ - SELECT - c.id AS location_id, - s.id AS site_id, - s.date_of_first_service AS date_of_first_service, - s.date_of_last_service AS date_of_last_service, - s.version as version, - ST_X(s.geom_point::geometry) AS lon, - ST_Y(s.geom_point::geometry) AS lat - FROM infrastructure.sites AS s - RIGHT JOIN - infrastructure.cells AS c - ON s.id = c.site_id - """ - return CustomQuery( - sql, - [ - "location_id", - "site_id", - "date_of_first_service", - "date_of_last_service", - "version", - "lon", - "lat", - ], - ) - elif self.level == "versioned-cell": - if self.location_table_fqn == "infrastructure.cells": - sql = """ - SELECT - id AS location_id, - date_of_first_service, - date_of_last_service, - version, - ST_X(geom_point::geometry) AS lon, - ST_Y(geom_point::geometry) AS lat - FROM infrastructure.cells - """ - return CustomQuery( - sql, - [ - "location_id", - "version", - "date_of_first_service", - "date_of_last_service", - "lon", - "lat", - ], - ) - else: - raise ValueError("Versioned cell level is unavailable.") + return self.spatial_unit.__getattribute__(name) @property def column_names(self) -> List[str]: - right_columns = get_columns_for_level(self.level, self.column_name) + right_columns = self.spatial_unit.location_columns left_columns = self.left.column_names if "location_id" in right_columns and "location_id" in left_columns: left_columns.remove("location_id") return left_columns + right_columns def _make_query(self): - - right_columns = get_columns_for_level(self.level, self.column_name) + right_columns = self.spatial_unit.location_columns left_columns = self.left.column_names if "location_id" in right_columns and "location_id" in left_columns: left_columns.remove("location_id") @@ -262,7 +96,7 @@ def _make_query(self): FROM ({self.left.get_query()}) AS l INNER JOIN - ({self.right_query.get_query()}) AS sites + ({self.spatial_unit.get_query()}) AS sites ON l.location_id = sites.location_id AND From 88ae520472df005a4b2b4f33eed42596579400af Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 11 Mar 2019 16:31:00 +0000 Subject: [PATCH 008/138] Remove the option to pass SQL strings to JoinToLocation --- flowmachine/flowmachine/core/join_to_location.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index 11d1800833..f7ff7fa541 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ 
b/flowmachine/flowmachine/core/join_to_location.py @@ -11,7 +11,6 @@ from typing import List from .query import Query -from .custom_query import CustomQuery from .spatial_unit import SpatialUnit @@ -27,8 +26,7 @@ class JoinToLocation(Query): Parameters ---------- - left : str or flowmachine.Query - String to table (with the schema) or else a flowmachine.Query object. + left : flowmachine.Query This represents a table that can be joined to the cell information table. This must have a date column (called time) and a location column call 'location_id'. @@ -52,13 +50,7 @@ def __init__(self, left, *, spatial_unit, time_col="time"): if not isinstance(spatial_unit, SpatialUnit): raise TypeError("spatial_unit must be a SpatialUnit object") self.spatial_unit = spatial_unit - # If the user passes a string, rather than a flowmachine.Query object - # then we'll simply turn this string into a flowmachine.Query object - # and proceed as normal. - if type(left) is str: - self.left = CustomQuery(left, left.column_names) - else: - self.left = left + self.left = left self.time_col = time_col super().__init__() From 2a76b532ddb499302616ec707d2d5269999d0dfc Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 11 Mar 2019 16:39:31 +0000 Subject: [PATCH 009/138] Remove option to specify polygon_table as a SQL string in PolygonSpatialUnit --- flowmachine/flowmachine/core/spatial_unit.py | 27 ++++++-------------- flowmachine/tests/test_spatial_unit.py | 8 ------ 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 0320c16716..091003d7d8 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -193,15 +193,18 @@ class PolygonSpatialUnit(SpatialUnit): """ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): - self.polygon_table = polygon_table + if isinstance(polygon_table, Query): + self.polygon_table = polygon_table + else: + self.polygon_table = GeoTable(name=polygon_table, geom_column=geom_col) + self.geom_col = geom_col location_info_table = self.connection.location_table locinfo_alias = "locinfo" - if ( - isinstance(self.polygon_table, str) - and location_info_table == self.polygon_table.lower().strip() + if hasattr(self.polygon_table, "fully_qualified_table_name") and ( + location_info_table == self.polygon_table.fully_qualified_table_name ): # if the subscriber wants to select a geometry from the sites table # there is no need to join the table with itself. @@ -211,7 +214,7 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): joined_alias = "polygon" join_clause = f""" INNER JOIN - {self._get_subtable()} AS {joined_alias} + ({self.polygon_table.get_query()}) AS {joined_alias} ON ST_within( {locinfo_alias}.geom_point::geometry, ST_SetSRID({joined_alias}.{self.geom_col}, 4326)::geometry @@ -240,20 +243,6 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): join_clause=join_clause, ) - def _get_subtable(self): - """ - Private method which takes the table and returns a query - representing the object. This is necessary as the table can - be passed in a variety of ways. 
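After this commit polygon_table must again be either a flowmachine Query object or the name of a table (raw "SELECT ..." strings are no longer accepted). For orientation, a sketch of building a spatial unit and handing it to the reworked JoinToLocation follows; it is illustrative only, assumes an active connection, and "events" stands for any query with location_id and time columns (for example the subscriber_locations query used in the tests below).

# Illustrative only, not part of the patch.
from flowmachine.core import JoinToLocation
from flowmachine.core.spatial_unit import PolygonSpatialUnit

spatial_unit = PolygonSpatialUnit(
    polygon_column_names="admin3pcod",
    polygon_table="geography.admin3",   # wrapped in a GeoTable internally
)
joined = JoinToLocation(events, spatial_unit=spatial_unit, time_col="time")
# joined.column_names ends with spatial_unit.location_columns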
- """ - - if issubclass(self.polygon_table.__class__, Query): - return f"({self.polygon_table.get_query()})" - elif "select " in self.polygon_table.lower(): - return f"({self.polygon_table})" - else: - return self.polygon_table - class AdminSpatialUnit(PolygonSpatialUnit): """ diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 3eb6b794b0..88771a1947 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -32,14 +32,6 @@ "geom_col": "geom_point", }, ), - ( - PolygonSpatialUnit, - { - "polygon_column_names": "id", - "polygon_table": "SELECT * FROM infrastructure.sites", - "geom_col": "geom_point", - }, - ), (AdminSpatialUnit, {"level": 3}), (AdminSpatialUnit, {"level": 3, "column_name": "admin3name"}), (GridSpatialUnit, {"size": 5}), From ab2f65866264521228c7051c186b1e5a51014313 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 12 Mar 2019 10:03:46 +0000 Subject: [PATCH 010/138] Add geo_augment() method to SpatialUnit classes --- flowmachine/flowmachine/core/spatial_unit.py | 91 ++++++++++++++++++-- flowmachine/flowmachine/utils.py | 20 +++-- flowmachine/tests/test_utils.py | 6 +- 3 files changed, 98 insertions(+), 19 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 091003d7d8..8b6ecdc840 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -7,12 +7,13 @@ (e.g. versioned-cell, admin*, grid, ...). """ from typing import List +from abc import ABCMeta, abstractmethod -from flowmachine.utils import get_alias +from flowmachine.utils import get_name_and_alias from . import Query, GeoTable, Grid -class SpatialUnit(Query): +class SpatialUnit(Query, metaclass=ABCMeta): """ Base class for all spatial units. Selects columns from the location table, and optionally joins to data in another table. @@ -78,7 +79,28 @@ def location_columns(self) -> List[str]: @property def column_names(self) -> List[str]: - return [get_alias(c) for c in self._cols] + return [get_name_and_alias(c)[1] for c in self._cols] + + @abstractmethod + def geo_augment(self, query): + """ + Given a query object (which is assumed to be a JoinToLocation object, + joined to this spatial unit), return a version of the query augmented + with a geom column and a gid column. 
+ + Parameters + ---------- + query : flowmachine.Query + The query to augment with geom and gid columns + + Returns + ------- + str + A version of this query with geom and gid columns + list + The columns this query contains + """ + raise NotImplementedError def _make_query(self): columns = ", ".join(self._cols) @@ -109,6 +131,17 @@ def __init__(self): location_column_names=["lat", "lon"], ) + def geo_augment(self, query): + sql = f""" + SELECT + row_number() over() AS gid, + *, + ST_SetSRID(ST_Point(lon, lat), 4326) AS geom + FROM ({query.get_query()}) AS L + """ + cols = list(set(query.column_names + ["gid", "geom"])) + return sql, cols + class VersionedCellSpatialUnit(SpatialUnit): """ @@ -132,6 +165,20 @@ def __init__(self): location_info_table="infrastructure.cells", ) + def geo_augment(self, query): + sql = f""" + SELECT + row_number() OVER () AS gid, + geom_point AS geom, + U.* + FROM ({query.get_query()}) AS U + LEFT JOIN infrastructure.cells AS S + ON U.location_id = S.id AND + U.version = S.version + """ + cols = list(set(query.column_names + ["gid", "geom"])) + return sql, cols + class VersionedSiteSpatialUnit(SpatialUnit): """ @@ -173,6 +220,20 @@ def __init__(self): join_clause=join_clause, ) + def geo_augment(self, query): + sql = f""" + SELECT + row_number() OVER () AS gid, + geom_point AS geom, + U.* + FROM ({query.get_query()}) AS U + LEFT JOIN infrastructure.sites AS S + ON U.site_id = S.id AND + U.version = S.version + """ + cols = list(set(query.column_names + ["gid", "geom"])) + return sql, cols + class PolygonSpatialUnit(SpatialUnit): """ @@ -227,14 +288,14 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): f"{locinfo_alias}.date_of_first_service AS date_of_first_service", f"{locinfo_alias}.date_of_last_service AS date_of_last_service", ] - if type(polygon_columns) is str: - polygon_cols = [polygon_column_names] + if type(polygon_column_names) is str: + self.polygon_column_names = [polygon_column_names] else: - polygon_cols = polygon_column_names + self.polygon_column_names = polygon_column_names all_column_names = locinfo_column_names + [ - f"{joined_alias}.{c}" for c in polygon_cols + f"{joined_alias}.{c}" for c in self.polygon_column_names ] - location_column_names = [get_alias(c) for c in polygon_cols] + location_column_names = [get_name_and_alias(c)[1] for c in self.polygon_column_names] super().__init__( selected_column_names=all_column_names, @@ -242,6 +303,20 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): location_info_table=f"{location_info_table} AS {locinfo_alias}", join_clause=join_clause, ) + + def geo_augment(self, query): + r_col_name, l_col_name = get_name_and_alias(self.polygon_column_names[0]) + sql = f""" + SELECT + row_number() OVER () as gid, + {self.geom_col} AS geom, + U.* + FROM ({query.get_query()}) AS U + LEFT JOIN ({self.polygon_table.get_query()}) AS G + ON U.{l_col_name} = G.{r_col_name} + """ + cols = list(set(query.column_names + ["gid", "geom"])) + return sql, cols class AdminSpatialUnit(PolygonSpatialUnit): diff --git a/flowmachine/flowmachine/utils.py b/flowmachine/flowmachine/utils.py index e292ad2769..ffde4cd4d3 100644 --- a/flowmachine/flowmachine/utils.py +++ b/flowmachine/flowmachine/utils.py @@ -334,16 +334,20 @@ def _makesafe(x): return adapt(x).getquoted().decode() -def get_alias(column_name): +def get_name_and_alias(column_name): """ - Given a column name string, return the alias (if there is one), - or return the provided column name if there is no alias. 
+ Given a column name string, return the column name and alias (if there is + one), or return the provided column name twice if there is no alias. Examples -------- - >>> get_alias("col AS alias") - "alias" - >>> get_alias("col") - "col" + >>> get_name_and_alias("col AS alias") + ('col', 'alias') + >>> get_name_and_alias("col") + ('col', 'col') """ - return re.split(" as ", column_name, flags=re.IGNORECASE)[-1] + column_name_split = re.split(" as ", column_name, flags=re.IGNORECASE) + if len(column_name_split) == 1: + return column_name_split[0].strip(), column_name_split[0].strip() + else: + return column_name_split[0].strip(), column_name_split[-1].strip() diff --git a/flowmachine/tests/test_utils.py b/flowmachine/tests/test_utils.py index 9aa0992342..658e5766ff 100644 --- a/flowmachine/tests/test_utils.py +++ b/flowmachine/tests/test_utils.py @@ -19,7 +19,7 @@ getsecret, pretty_sql, _makesafe, - get_alias, + get_name_and_alias, ) from flowmachine.utils import time_period_add @@ -165,5 +165,5 @@ def test_get_secrets_default(monkeypatch): "column_name, alias", [("column", "column"), ("column AS alias", "alias"), ("column as alias", "alias")], ) -def test_get_alias(column_name, alias): - assert alias == get_alias(column_name) +def test_get_name_and_alias(column_name, alias): + assert ("column", alias) == get_name_and_alias(column_name) From de68a23a33e82d351482bb3d7a1f883066fc2c2f Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 12 Mar 2019 10:09:01 +0000 Subject: [PATCH 011/138] Update GeoDataMixin._geo_augmented_query to call SpatialUnit.geo_augment --- .../flowmachine/core/mixins/geodata_mixin.py | 81 +------------------ 1 file changed, 1 insertion(+), 80 deletions(-) diff --git a/flowmachine/flowmachine/core/mixins/geodata_mixin.py b/flowmachine/flowmachine/core/mixins/geodata_mixin.py index af0604f48a..ddec346f01 100644 --- a/flowmachine/flowmachine/core/mixins/geodata_mixin.py +++ b/flowmachine/flowmachine/core/mixins/geodata_mixin.py @@ -81,86 +81,7 @@ def _geo_augmented_query(self): The columns this query contains """ loc_join = self._get_location_join() - level = loc_join.level - if level == "lat-lon": - # Need to recreate a point - joined_query = """ - - SELECT - row_number() over() AS gid, - *, - ST_SetSRID(ST_Point(lon, lat), 4326) AS geom - FROM ({}) AS L - - """.format( - self.get_query() - ) - elif level == "versioned-site": - joined_query = """ - - SELECT - row_number() OVER () AS gid, - geom_point AS geom, - U.* - FROM ({qur}) AS U - LEFT JOIN infrastructure.sites AS S - ON U.site_id = S.id AND - U.version = S.version - - """.format( - qur=self.get_query() - ) - elif level == "versioned-cell": - joined_query = """ - - SELECT - row_number() OVER () AS gid, - geom_point AS geom, - U.* - FROM ({qur}) AS U - LEFT JOIN infrastructure.cells AS S - ON U.location_id = S.id AND - U.version = S.version - - """.format( - qur=self.get_query() - ) - else: - mapping = loc_join.right_query.mapping - col_name = mapping.column_name[0] - geom_col = mapping.geom_col - poly_query = mapping.polygon_table - - # - # Same as comment above on JoinToLocations. - # This also needs to deal with grids. This - # solution isn't good. 
- # - if isinstance(poly_query, Query): - sql_polygon_query = poly_query.get_query() - - else: - sql_polygon_query = "SELECT * FROM {}".format(poly_query) - - joined_query = """ - - SELECT - row_number() OVER () as gid, - {geom_col} as geom, - U.* - FROM ({qur}) AS U - LEFT JOIN ({poly_query}) AS G - ON U.{l_col_name} = G.{r_col_name} - - """.format( - qur=self.get_query(), - poly_query=sql_polygon_query, - geom_col=geom_col, - l_col_name=self.column_names[0], - r_col_name=col_name, - ) - cols = list(set(self.column_names + ["gid", "geom"])) - return joined_query, cols + return loc_join.spatial_unit.geo_augment(self) def geojson_query(self, crs=None): """ From db6bb13d9b568030405394a15b3957e6706862bf Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 12 Mar 2019 10:28:41 +0000 Subject: [PATCH 012/138] Update JoinToLocation tests --- flowmachine/tests/test_join_to_location.py | 39 ++++++++++++++-------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 02024b4396..38b7fca0b8 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -12,14 +12,24 @@ from flowmachine.core import JoinToLocation -def test_join_to_location_column_names(exemplar_level_param): +@pytest.mark.parametrize( + "spatial_unit", + [ + AdminSpatialUnit(level=2), + AdminSpatialUnit(level=2, column_name="admin2name"), + VersionedSiteSpatialUnit(), + VersionedCellSpatialUnit(), + LatLonSpatialUnit(), + GridSpatialUnit(size=5), + PolygonSpatialUnit( + polygon_column_names="admin3pcod", polygon_table="geography.admin3" + ), + ], +) +def test_join_to_location_column_names(spatial_unit): """ Test that JoinToLocation's column_names property is accurate.""" - if "cell" == exemplar_level_param["level"]: - pytest.skip( - "Cell level not valid for JoinToLocation" - ) # cell level not valid for JoinToLocation table = subscriber_locations("2016-01-05", "2016-01-07", level="cell") - joined = JoinToLocation(table, **exemplar_level_param) + joined = JoinToLocation(table, spatial_unit=spatial_unit) assert joined.head(0).columns.tolist() == joined.column_names @@ -42,7 +52,7 @@ def test_join_with_versioned_cells(get_dataframe, get_length): Test that flowmachine.JoinToLocation can fetch the cell version. 
""" ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell") - df = get_dataframe(JoinToLocation(ul, level="versioned-cell")) + df = get_dataframe(JoinToLocation(ul, spatial_unit=VersionedCellSpatialUnit())) # As our database is complete we should not drop any rows assert len(df) == get_length(ul) # These should all be version zero, these are the towers before the changeover date, or those that @@ -65,7 +75,7 @@ def test_join_with_lat_lon(get_dataframe): Test that flowmachine.JoinToLocation can get the lat-lon values of the cell """ ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell") - df = get_dataframe(JoinToLocation(ul, level="lat-lon")) + df = get_dataframe(JoinToLocation(ul, spatial_unit=LatLonSpatialUnit())) expected_cols = sorted(["subscriber", "time", "location_id", "lat", "lon"]) assert sorted(df.columns) == expected_cols @@ -91,10 +101,11 @@ def test_join_with_polygon(get_dataframe, get_length): ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell") j = JoinToLocation( ul, - level="polygon", - column_name="admin3pcod", - polygon_table="geography.admin3", - geom_col="geom", + spatial_unit=PolygonSpatialUnit( + polygon_column_names="admin3pcod", + polygon_table="geography.admin3", + geom_col="geom", + ), ) df = get_dataframe(j) @@ -108,7 +119,7 @@ def test_join_to_admin(get_dataframe, get_length): Test that flowmachine.JoinToLocation can join to a admin region. """ ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell") - df = get_dataframe(JoinToLocation(ul, level="admin3")) + df = get_dataframe(JoinToLocation(ul, spatial_unit=AdminSpatialUnit(level=3))) assert len(df) == get_length(ul) expected_cols = sorted(["subscriber", "time", "location_id", "pcod"]) assert sorted(df.columns) == expected_cols @@ -119,5 +130,5 @@ def test_join_to_grid(get_dataframe, get_length): Test that we can join to a grid square """ ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell") - df = get_dataframe(JoinToLocation(ul, level="grid", size=50)) + df = get_dataframe(JoinToLocation(ul, spatial_unit=GridSpatialUnit(size=50))) assert len(df) == get_length(ul) From 4e6abaffc73fb6cba80e88a30c978bb698c166f7 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 18 Mar 2019 13:09:06 +0000 Subject: [PATCH 013/138] Move Grid into flowmachine.core to avoid circular import --- flowmachine/flowmachine/core/__init__.py | 4 +-- .../{features/spatial => core}/grid.py | 2 +- flowmachine/flowmachine/core/spatial_unit.py | 35 ++++++++++--------- .../flowmachine/features/spatial/__init__.py | 2 -- .../features/spatial/cell_mappings.py | 3 +- .../subscriber/meaningful_locations.py | 3 +- flowmachine/tests/test_grid.py | 2 +- 7 files changed, 25 insertions(+), 26 deletions(-) rename flowmachine/flowmachine/{features/spatial => core}/grid.py (98%) diff --git a/flowmachine/flowmachine/core/__init__.py b/flowmachine/flowmachine/core/__init__.py index 6122844d28..01fc736fe3 100644 --- a/flowmachine/flowmachine/core/__init__.py +++ b/flowmachine/flowmachine/core/__init__.py @@ -14,7 +14,7 @@ from .init import connect from .join_to_location import JoinToLocation from .custom_query import CustomQuery -from .spatial_unit import SpatialUnit +from .grid import Grid sub_modules = ["errors", "mixins", "api"] @@ -26,7 +26,7 @@ "connect", "JoinToLocation", "CustomQuery", - "SpatialUnit", + "Grid", ] __all__ = methods + sub_modules diff --git a/flowmachine/flowmachine/features/spatial/grid.py b/flowmachine/flowmachine/core/grid.py similarity index 98% rename from 
flowmachine/flowmachine/features/spatial/grid.py rename to flowmachine/flowmachine/core/grid.py index 88c36235fd..68f482101d 100644 --- a/flowmachine/flowmachine/features/spatial/grid.py +++ b/flowmachine/flowmachine/core/grid.py @@ -10,7 +10,7 @@ from flowmachine.core.mixins import GeoDataMixin -from ...core.query import Query +from .query import Query class Grid(GeoDataMixin, Query): diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 8b6ecdc840..4d1052f9d1 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -10,7 +10,8 @@ from abc import ABCMeta, abstractmethod from flowmachine.utils import get_name_and_alias -from . import Query, GeoTable, Grid +from . import Query, GeoTable +from .grid import Grid class SpatialUnit(Query, metaclass=ABCMeta): @@ -80,7 +81,7 @@ def location_columns(self) -> List[str]: @property def column_names(self) -> List[str]: return [get_name_and_alias(c)[1] for c in self._cols] - + @abstractmethod def geo_augment(self, query): """ @@ -166,18 +167,18 @@ def __init__(self): ) def geo_augment(self, query): - sql = f""" - SELECT - row_number() OVER () AS gid, - geom_point AS geom, - U.* - FROM ({query.get_query()}) AS U - LEFT JOIN infrastructure.cells AS S - ON U.location_id = S.id AND - U.version = S.version - """ - cols = list(set(query.column_names + ["gid", "geom"])) - return sql, cols + sql = f""" + SELECT + row_number() OVER () AS gid, + geom_point AS geom, + U.* + FROM ({query.get_query()}) AS U + LEFT JOIN infrastructure.cells AS S + ON U.location_id = S.id AND + U.version = S.version + """ + cols = list(set(query.column_names + ["gid", "geom"])) + return sql, cols class VersionedSiteSpatialUnit(SpatialUnit): @@ -295,7 +296,9 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): all_column_names = locinfo_column_names + [ f"{joined_alias}.{c}" for c in self.polygon_column_names ] - location_column_names = [get_name_and_alias(c)[1] for c in self.polygon_column_names] + location_column_names = [ + get_name_and_alias(c)[1] for c in self.polygon_column_names + ] super().__init__( selected_column_names=all_column_names, @@ -303,7 +306,7 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): location_info_table=f"{location_info_table} AS {locinfo_alias}", join_clause=join_clause, ) - + def geo_augment(self, query): r_col_name, l_col_name = get_name_and_alias(self.polygon_column_names[0]) sql = f""" diff --git a/flowmachine/flowmachine/features/spatial/__init__.py b/flowmachine/flowmachine/features/spatial/__init__.py index 4aeb8febc4..2bf3dacfc9 100644 --- a/flowmachine/flowmachine/features/spatial/__init__.py +++ b/flowmachine/flowmachine/features/spatial/__init__.py @@ -9,7 +9,6 @@ from .distance_matrix import DistanceMatrix from .location_cluster import LocationCluster from .versioned_infrastructure import VersionedInfrastructure -from .grid import Grid from .cell_mappings import CellToAdmin, CellToPolygon, CellToGrid from .circles import Circle, CircleGeometries @@ -18,7 +17,6 @@ "DistanceMatrix", "LocationCluster", "VersionedInfrastructure", - "Grid", "CellToAdmin", "CellToPolygon", "CellToGrid", diff --git a/flowmachine/flowmachine/features/spatial/cell_mappings.py b/flowmachine/flowmachine/features/spatial/cell_mappings.py index e508411ed0..754f81068c 100644 --- a/flowmachine/flowmachine/features/spatial/cell_mappings.py +++ b/flowmachine/flowmachine/features/spatial/cell_mappings.py @@ -9,8 
+9,7 @@ """ from typing import List -from ...core import Query -from .grid import Grid +from ...core import Query, Grid class CellToPolygon(Query): diff --git a/flowmachine/flowmachine/features/subscriber/meaningful_locations.py b/flowmachine/flowmachine/features/subscriber/meaningful_locations.py index 4ae2588cda..c7bc792ff1 100644 --- a/flowmachine/flowmachine/features/subscriber/meaningful_locations.py +++ b/flowmachine/flowmachine/features/subscriber/meaningful_locations.py @@ -5,9 +5,8 @@ from typing import Dict, Any, List, Union from flowmachine.core.errors import BadLevelError -from ...core import GeoTable, Query +from ...core import GeoTable, Query, Grid from . import LabelEventScore, HartiganCluster, EventScore -from ..spatial import Grid from flowmachine.utils import get_columns_for_level diff --git a/flowmachine/tests/test_grid.py b/flowmachine/tests/test_grid.py index 7416cf27aa..686dd6e368 100644 --- a/flowmachine/tests/test_grid.py +++ b/flowmachine/tests/test_grid.py @@ -3,7 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. -from flowmachine.features import Grid +from flowmachine.core import Grid def test_grid_column_names(): From e87e7b44bc81ee39c502b6640a574a8c2fbb39f2 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 18 Mar 2019 13:09:36 +0000 Subject: [PATCH 014/138] Import spatial units in test_join_to_location.py --- flowmachine/tests/test_join_to_location.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 38b7fca0b8..1547d7b17e 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -10,6 +10,14 @@ from flowmachine.features import subscriber_locations from flowmachine.core import JoinToLocation +from flowmachine.core.spatial_unit import ( + AdminSpatialUnit, + VersionedSiteSpatialUnit, + VersionedCellSpatialUnit, + LatLonSpatialUnit, + GridSpatialUnit, + PolygonSpatialUnit, +) @pytest.mark.parametrize( From a50b4de229bdfc089dee3c07f730fffd39a45c72 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 18 Mar 2019 13:10:33 +0000 Subject: [PATCH 015/138] Remove unused "date" kwarg in DistanceMatrix --- .../features/spatial/distance_matrix.py | 21 +++---------------- flowmachine/flowmachine/models/pwo.py | 2 +- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 589400decd..1230ed6fc2 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -25,24 +25,10 @@ class DistanceMatrix(GraphMixin, Query): Parameters ---------- - locations_table : str - Locations table where to find the locations to compute + level : str, default "versioned-cell" + Point locations (either versioned-site or versioned-cell) to compute distances for. - id_column : str - The column with the unique ID for each location. - The default parameter is 'id'. - - geom_column : str - Geometry column for calculating distances. - The default is 'geom_point'. - - date : str - Date string in ISO format (e.g. '2016-01-22') - for retrieving a VersionedInfrastructure() - object. If nothing is passed, that object - will be instantiated with the current date. - return_geometry : bool If True, geometries are returned in query (represented as WKB in a dataframe). 
This @@ -61,12 +47,11 @@ class DistanceMatrix(GraphMixin, Query): """ - def __init__(self, level="versioned-cell", date=None, return_geometry=False): + def __init__(self, level="versioned-cell", return_geometry=False): if level not in {"versioned-site", "versioned-cell"}: raise ValueError("Only point locations are supported at this time.") self.level = level - self.date = date self.return_geometry = return_geometry super().__init__() diff --git a/flowmachine/flowmachine/models/pwo.py b/flowmachine/flowmachine/models/pwo.py index 3d1045bd90..750019105a 100644 --- a/flowmachine/flowmachine/models/pwo.py +++ b/flowmachine/flowmachine/models/pwo.py @@ -271,7 +271,7 @@ def __init__( self.method = method self.level = level self.distance_matrix = DistanceMatrix( - date=self.stop, level=level, return_geometry=True, **kwargs + level=level, return_geometry=True, **kwargs ) if self.method == "home-location": From 3804c899961edb7fe943de6de6a7eef6ea1dacfd Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 18 Mar 2019 17:00:40 +0000 Subject: [PATCH 016/138] Refactor DistanceMatrix, moving the query construction into SpatialUnit --- .../flowmachine/core/join_to_location.py | 4 +- flowmachine/flowmachine/core/spatial_unit.py | 114 ++++++++++++++++-- .../features/spatial/distance_matrix.py | 110 +++-------------- .../tests/test_spatial_distancematrix.py | 5 +- 4 files changed, 130 insertions(+), 103 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index f7ff7fa541..99aa86811e 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -11,7 +11,7 @@ from typing import List from .query import Query -from .spatial_unit import SpatialUnit +from .spatial_unit import BaseSpatialUnit class JoinToLocation(Query): @@ -47,7 +47,7 @@ class JoinToLocation(Query): """ def __init__(self, left, *, spatial_unit, time_col="time"): - if not isinstance(spatial_unit, SpatialUnit): + if not isinstance(spatial_unit, BaseSpatialUnit): raise TypeError("spatial_unit must be a SpatialUnit object") self.spatial_unit = spatial_unit self.left = left diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 4d1052f9d1..013ff58e1e 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -14,7 +14,7 @@ from .grid import Grid -class SpatialUnit(Query, metaclass=ABCMeta): +class BaseSpatialUnit(Query, metaclass=ABCMeta): """ Base class for all spatial units. Selects columns from the location table, and optionally joins to data in another table. @@ -48,7 +48,7 @@ def __init__( else: self._cols = selected_column_names - if type(other_column_names) is str: + if type(location_column_names) is str: self._loc_cols = [location_column_names] else: self._loc_cols = location_column_names @@ -103,6 +103,38 @@ def geo_augment(self, query): """ raise NotImplementedError + def distance_matrix_query(self, return_geometry): + """ + A query that calculates the complete distance matrix between all + elements of this spatial unit. Distance is returned in km. + + Parameters + ---------- + return_geometry : bool + If True, geometries are returned in query + (represented as WKB in a dataframe) + + Returns + ------- + str + SQL query string + + """ + raise NotImplementedError( + f"Spatial units of type {type(self).__name__} do not support distance_matrix_query at this time." 
+ ) + + def distance_matrix_columns(self, return_geometry=False): + """ + List of columns for self.distance_matrix_query + """ + col_names = [f"{c}_from" for c in self.location_columns] + col_names += [f"{c}_to" for c in self.location_columns] + col_names += ["distance"] + if return_geometry: + col_names += ["geom_origin", "geom_destination"] + return col_names + def _make_query(self): columns = ", ".join(self._cols) sql = f""" @@ -115,7 +147,7 @@ def _make_query(self): return sql -class LatLonSpatialUnit(SpatialUnit): +class LatLonSpatialUnit(BaseSpatialUnit): """ Class that maps cell location_id to lat-lon coordinates. """ @@ -144,7 +176,7 @@ def geo_augment(self, query): return sql, cols -class VersionedCellSpatialUnit(SpatialUnit): +class VersionedCellSpatialUnit(BaseSpatialUnit): """ Class that maps cell location_id to a cell version and lat-lon coordinates. """ @@ -180,8 +212,41 @@ def geo_augment(self, query): cols = list(set(query.column_names + ["gid", "geom"])) return sql, cols + def distance_matrix_query(self, return_geometry=False): + return_geometry_statement = "" + if return_geometry: + return_geometry_statement = """ + , + A.geom_point AS geom_origin, + B.geom_point AS geom_destination + """ + + sql = f""" + + SELECT + A.id AS location_id_from, + A.version AS version_from, + B.id AS location_id_to, + B.version AS version_to, + ST_X(A.geom_point::geometry) AS lon_from, + ST_Y(A.geom_point::geometry) AS lat_from, + ST_X(B.geom_point::geometry) AS lon_to, + ST_Y(B.geom_point::geometry) AS lat_to, + ST_Distance( + A.geom_point::geography, + B.geom_point::geography + ) / 1000 AS distance + {return_geometry_statement} + FROM infrastructure.cells AS A + CROSS JOIN infrastructure.cells AS B + ORDER BY distance DESC + + """ + + return sql + -class VersionedSiteSpatialUnit(SpatialUnit): +class VersionedSiteSpatialUnit(BaseSpatialUnit): """ Class that maps cell location_id to a site version and lat-lon coordinates. 
""" @@ -212,11 +277,11 @@ def __init__(self): f"{sites_alias}.id AS site_id", f"{sites_alias}.date_of_first_service AS date_of_first_service", f"{sites_alias}.date_of_last_service AS date_of_last_service", - f"{sites_alias}.version as version", + f"{sites_alias}.version AS version", f"ST_X({sites_alias}.geom_point::geometry) AS lon", f"ST_Y({sites_alias}.geom_point::geometry) AS lat", ], - location_column_names=["location_id", "version", "lon", "lat"], + location_column_names=["site_id", "version", "lon", "lat"], location_info_table=f"infrastructure.sites AS {sites_alias}", join_clause=join_clause, ) @@ -235,8 +300,41 @@ def geo_augment(self, query): cols = list(set(query.column_names + ["gid", "geom"])) return sql, cols + def distance_matrix_query(self, return_geometry=False): + return_geometry_statement = "" + if return_geometry: + return_geometry_statement = """ + , + A.geom_point AS geom_origin, + B.geom_point AS geom_destination + """ + + sql = f""" + + SELECT + A.id AS site_id_from, + A.version AS version_from, + B.id AS site_id_to, + B.version AS version_to, + ST_X(A.geom_point::geometry) AS lon_from, + ST_Y(A.geom_point::geometry) AS lat_from, + ST_X(B.geom_point::geometry) AS lon_to, + ST_Y(B.geom_point::geometry) AS lat_to, + ST_Distance( + A.geom_point::geography, + B.geom_point::geography + ) / 1000 AS distance + {return_geometry_statement} + FROM infrastructure.sites AS A + CROSS JOIN infrastructure.sites AS B + ORDER BY distance DESC + + """ + + return sql + -class PolygonSpatialUnit(SpatialUnit): +class PolygonSpatialUnit(BaseSpatialUnit): """ Class that provides a mapping from cell/site data in the location table to spatial regions defined by geography information in a table. diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 1230ed6fc2..c4de53cb8e 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -12,6 +12,7 @@ from flowmachine.utils import get_columns_for_level from ...core.query import Query from ...core.mixins import GraphMixin +from ...core.spatial_unit import VersionedCellSpatialUnit class DistanceMatrix(GraphMixin, Query): @@ -21,13 +22,17 @@ class DistanceMatrix(GraphMixin, Query): computation of distance travelled, area of influence, and other features. - Distance is returned in Km. + This calls the SpatialUnit.distance_matrix_query method. + Note: this method is only implemented for the VersionedCellSpatialUnit and + VersionedSiteSpatialUnit at this time. + + Distance is returned in km. Parameters ---------- - level : str, default "versioned-cell" - Point locations (either versioned-site or versioned-cell) to compute - distances for. + spatial_unit : SpatialUnit or None, default None + Locations to compute distances for. + If None, defaults to VersionedCellSpatialUnit(). 
return_geometry : bool If True, geometries are returned in query @@ -35,101 +40,24 @@ class DistanceMatrix(GraphMixin, Query): is an useful option if one is computing other geographic properties out of the - Examples - -------- - >>> DistanceMatrix().get_dataframe() - origin destination distance - 0 8wPojr GN2k0G 789.232397 - 1 GN2k0G 8wPojr 789.232397 - 2 8wPojr j1m77j 786.102789 - 3 j1m77j 8wPojr 786.102789 - 4 DbWg4K 8wPojr 757.977718 - """ - def __init__(self, level="versioned-cell", return_geometry=False): - - if level not in {"versioned-site", "versioned-cell"}: - raise ValueError("Only point locations are supported at this time.") - self.level = level + def __init__(self, spatial_unit=None, return_geometry=False): + if spatial_unit is None: + self.spatial_unit = VersionedCellSpatialUnit() + else: + self.spatial_unit = spatial_unit self.return_geometry = return_geometry super().__init__() @property def column_names(self) -> List[str]: - cols = get_columns_for_level(self.level) - - try: - cols.remove("lat") - cols.remove("lon") - except ValueError: - pass # Nothing to remove - - col_names = [f"{c}_from" for c in cols] - col_names += [f"{c}_to" for c in cols] - col_names += [f"{c}_from" for c in ("lon", "lat")] - col_names += [f"{c}_to" for c in ("lon", "lat")] - col_names += ["distance"] - if self.return_geometry: - col_names += ["geom_origin", "geom_destination"] - return col_names - - def _make_query(self): - cols = get_columns_for_level(self.level) - sql_location_table = "SELECT * FROM infrastructure." + ( - "sites" if self.level == "versioned-site" else "cells" + return self.spatial_unit.distance_matrix_columns( + return_geometry=self.return_geometry ) - try: - cols.remove("lat") - cols.remove("lon") - except ValueError: - pass # Nothing to remove - from_cols = ", ".join( - "A.{c_id_safe} AS {c}_from".format( - c_id_safe="id" if c.endswith("id") else c, c=c - ) - for c in cols - ) - to_cols = ", ".join( - "B.{c_id_safe} AS {c}_to".format( - c_id_safe="id" if c.endswith("id") else c, c=c - ) - for c in cols - ) - - return_geometry_statement = "" - if self.return_geometry: - return_geometry_statement = """ - , - A.geom_point AS geom_origin, - B.geom_point AS geom_destination - """ - - sql = """ - - SELECT - {froms}, - {tos}, - ST_X(A.geom_point::geometry) AS lon_from, - ST_Y(A.geom_point::geometry) AS lat_from, - ST_X(B.geom_point::geometry) AS lon_to, - ST_Y(B.geom_point::geometry) AS lat_to, - ST_Distance( - A.geom_point::geography, - B.geom_point::geography - ) / 1000 AS distance - {return_geometry_statement} - FROM ({location_table_statement}) AS A - CROSS JOIN ({location_table_statement}) AS B - ORDER BY distance DESC - - """.format( - location_table_statement=sql_location_table, - froms=from_cols, - tos=to_cols, - return_geometry_statement=return_geometry_statement, + def _make_query(self): + return self.spatial_unit.distance_matrix_query( + return_geometry=self.return_geometry ) - - return sql diff --git a/flowmachine/tests/test_spatial_distancematrix.py b/flowmachine/tests/test_spatial_distancematrix.py index 6cd997a39c..faad0b8bac 100644 --- a/flowmachine/tests/test_spatial_distancematrix.py +++ b/flowmachine/tests/test_spatial_distancematrix.py @@ -8,13 +8,14 @@ from flowmachine.features.spatial import DistanceMatrix +from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit def test_some_results(get_dataframe): """ DistanceMatrix() returns a dataframe that contains hand-picked results. 
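For review purposes, a usage sketch of the refactored interface (not part of the patch): callers now pass a spatial unit object rather than a level string. It assumes flowmachine.connect() has been run against a populated FlowDB; the choice of versioned sites is arbitrary.

# Illustrative sketch only; assumes an established FlowDB connection.
from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit
from flowmachine.features.spatial import DistanceMatrix

dm = DistanceMatrix(spatial_unit=VersionedSiteSpatialUnit(), return_geometry=False)
df = dm.get_dataframe()
# The frame contains *_from and *_to columns for site_id, version, lon and lat,
# plus the pairwise distance in km.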
""" - c = DistanceMatrix(level="versioned-site") + c = DistanceMatrix(spatial_unit=VersionedSiteSpatialUnit()) df = get_dataframe(c) set_df = df.set_index("site_id_from") assert round(set_df.loc["8wPojr"]["distance"].values[0]) == 789 @@ -26,5 +27,5 @@ def test_result_has_correct_length(get_length): """ DistanceMatrix() has the correct length. """ - c = DistanceMatrix(level="versioned-site") + c = DistanceMatrix(spatial_unit=VersionedSiteSpatialUnit()) assert get_length(c) == 35 ** 2 From f8aabba997e13c2c155de985a8cbba5bba02f809 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 09:58:22 +0000 Subject: [PATCH 017/138] Fix SpatialUnit column_names property --- flowmachine/flowmachine/core/spatial_unit.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 013ff58e1e..4b9d9ce12f 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -57,7 +57,7 @@ def __init__( missing_cols = [c for c in self._loc_cols if not (c in self.column_names)] if missing_cols: raise ValueError( - f"Location columns {missing_cols} are not in returned columns" + f"Location columns {missing_cols} are not in returned columns." ) if location_info_table: @@ -80,7 +80,7 @@ def location_columns(self) -> List[str]: @property def column_names(self) -> List[str]: - return [get_name_and_alias(c)[1] for c in self._cols] + return [get_name_and_alias(c)[1].split(".").pop() for c in self._cols] @abstractmethod def geo_augment(self, query): @@ -257,14 +257,14 @@ def __init__(self): sites_alias = "s" if location_table == "infrastructure.sites": cells_alias = sites_alias + join_clause = "" + elif location_table == "infrastructure.cells": + cells_alias = "c" join_clause = f""" RIGHT JOIN infrastructure.cells AS {cells_alias} ON {sites_alias}.id = {cells_alias}.site_id """ - elif location_table == "infrastructure.cells": - cells_alias = "c" - join_clause = "" else: raise ValueError( f"Expected location table to be 'infrastructure.cells' " From 1c8d65339579529699697943dbbc034f838ff90d Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 10:27:02 +0000 Subject: [PATCH 018/138] Add tests for geo_augment and distance_matrix_query methods --- flowmachine/tests/test_spatial_unit.py | 67 ++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 88771a1947..c4fdea7e19 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -2,8 +2,9 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +from flowmachine.core import CustomQuery from flowmachine.core.spatial_unit import ( - SpatialUnit, + BaseSpatialUnit, LatLonSpatialUnit, VersionedCellSpatialUnit, VersionedSiteSpatialUnit, @@ -15,7 +16,7 @@ @pytest.mark.parametrize( - "spatial_unit, args", + "spatial_unit, kwargs", [ (LatLonSpatialUnit, {}), (VersionedCellSpatialUnit, {}), @@ -37,16 +38,16 @@ (GridSpatialUnit, {"size": 5}), ], ) -def test_spatial_unit_column_names(spatial_unit, args): +def test_spatial_unit_column_names(spatial_unit, kwargs): """ Test that the SpatialUnit classes have accurate column_names properties. 
""" - instance = spatial_unit(**args) - assert instance.head(0).columns.tolist() == instance.column_names + su = spatial_unit(**kwargs) + assert su.head(0).columns.tolist() == su.column_names @pytest.mark.parametrize( - "spatial_unit, args, loc_cols", + "spatial_unit, kwargs, loc_cols", [ (LatLonSpatialUnit, {}, ["lat", "lon"]), (VersionedCellSpatialUnit, {}, ["location_id", "version", "lon", "lat"]), @@ -75,12 +76,12 @@ def test_spatial_unit_column_names(spatial_unit, args): (GridSpatialUnit, {"size": 5}, ["grid_id"]), ], ) -def test_spatial_unit_location_columns(spatial_unit, args, loc_cols): +def test_spatial_unit_location_columns(spatial_unit, kwargs, loc_cols): """ Test that the SpatialUnit classes have the correct location_columns properties. """ - instance = spatial_unit(**args) - assert loc_cols == instance.location_columns + su = spatial_unit(**kwargs) + assert loc_cols == su.location_columns def test_polygon_spatial_unit_column_list(): @@ -104,8 +105,13 @@ def test_missing_location_columns_raises_error(): Test that a ValueError is raised if the location_column_names passed to SpatialUnit are not a subset of column_names. """ + + class TestSpatialUnit(BaseSpatialUnit): + def geo_augment(self, query): + pass + with pytest.raises(ValueError, match="['NOT_A_COLUMN']"): - su = SpatialUnit( + su = TestSpatialUnit( selected_column_names=[ "id AS location_id", "date_of_first_service", @@ -113,3 +119,44 @@ def test_missing_location_columns_raises_error(): ], location_column_names=["location_id", "NOT_A_COLUMN"], ) + + +@pytest.mark.parametrize( + "spatial_unit, kwargs", + [ + (LatLonSpatialUnit, {}), + (VersionedCellSpatialUnit, {}), + (VersionedSiteSpatialUnit, {}), + ( + PolygonSpatialUnit, + {"polygon_column_names": "admin3pcod", "polygon_table": "geography.admin3"}, + ), + (AdminSpatialUnit, {"level": 2}), + (AdminSpatialUnit, {"level": 2, "column_name": "admin2name"}), + (GridSpatialUnit, {"size": 5}), + ], +) +def test_geo_augment_columns(spatial_unit, kwargs): + """ + Test that the columns returned by the geo_augment method are correct. + """ + su = spatial_unit(**kwargs) + sql, cols = su.geo_augment(su) + cq = CustomQuery(sql, cols) + assert cq.head(0).columns.tolist() == cols + + +@pytest.mark.parametrize( + "spatial_unit", [VersionedCellSpatialUnit, VersionedSiteSpatialUnit] +) +@pytest.mark.parametrize("return_geometry", [True, False]) +def test_distance_matrix_columns(spatial_unit, return_geometry): + """ + Test that the columns returned by the distance_matrix_columns method match + the columns of the distance_matrix_query. 
+ """ + su = spatial_unit() + sql = su.distance_matrix_query(return_geometry=return_geometry) + cols = su.distance_matrix_columns(return_geometry=return_geometry) + cq = CustomQuery(sql, cols) + assert cq.head(0).columns.tolist() == cols From 86cab6b6b4755b1881362b653db3e7ad8bf67c98 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 10:30:06 +0000 Subject: [PATCH 019/138] Fix JoinToLocation column names test --- flowmachine/tests/test_join_to_location.py | 24 ++++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 1547d7b17e..9e55c4878b 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -21,23 +21,25 @@ @pytest.mark.parametrize( - "spatial_unit", + "spatial_unit, kwargs", [ - AdminSpatialUnit(level=2), - AdminSpatialUnit(level=2, column_name="admin2name"), - VersionedSiteSpatialUnit(), - VersionedCellSpatialUnit(), - LatLonSpatialUnit(), - GridSpatialUnit(size=5), - PolygonSpatialUnit( - polygon_column_names="admin3pcod", polygon_table="geography.admin3" + (LatLonSpatialUnit, {}), + (VersionedCellSpatialUnit, {}), + (VersionedSiteSpatialUnit, {}), + ( + PolygonSpatialUnit, + {"polygon_column_names": "admin3pcod", "polygon_table": "geography.admin3"}, ), + (AdminSpatialUnit, {"level": 2}), + (AdminSpatialUnit, {"level": 2, "column_name": "admin2name"}), + (GridSpatialUnit, {"size": 5}), ], ) -def test_join_to_location_column_names(spatial_unit): +def test_join_to_location_column_names(spatial_unit, kwargs): """ Test that JoinToLocation's column_names property is accurate.""" + su = spatial_unit(**kwargs) table = subscriber_locations("2016-01-05", "2016-01-07", level="cell") - joined = JoinToLocation(table, spatial_unit=spatial_unit) + joined = JoinToLocation(table, spatial_unit=su) assert joined.head(0).columns.tolist() == joined.column_names From e8f7c068129ac67b2481e76bc799821381345cf3 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 11:28:56 +0000 Subject: [PATCH 020/138] Update docstrings --- .../flowmachine/core/join_to_location.py | 2 +- flowmachine/flowmachine/core/spatial_unit.py | 22 +++++++++++++++++-- .../features/spatial/distance_matrix.py | 2 +- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index 99aa86811e..f2cafff0a5 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -30,7 +30,7 @@ class JoinToLocation(Query): This represents a table that can be joined to the cell information table. This must have a date column (called time) and a location column call 'location_id'. - spatial_unit : flowmachine.SpatialUnit + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit A query which maps cell identifiers in the CDR to a different spatial unit (e.g. versioned site or admin region) time_col : str, default 'time': diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 4b9d9ce12f..4d4eb232e5 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -3,8 +3,26 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. """ -Classes that map cells (or towers or sites) to a spatial unit -(e.g. versioned-cell, admin*, grid, ...). 
+Classes that map cells (or towers or sites) to a spatial unit. + +The available spatial units are: + VersionedCellSpatialUnit: + The identifier as found in the CDR combined with the version from the + cells table. + VersionedSiteSpatialUnit: + The ID found in the sites table, coupled with the version number. + PolygonSpatialUnit: + A custom set of polygons that live in the database. Takes the + parameters polygon_column_names, which is the columns you want to + return after the join, and polygon_table, the table where the polygons + reside (with the schema), and additionally geom_col which is the column + with the geometry information (will default to 'geom'). + AdminSpatialUnit: + An admin region of interest, such as admin3. Must live in the database + in the standard location. + GridSpatialUnit: + A square in a regular grid, in addition pass size to determine the size + of the polygon. """ from typing import List from abc import ABCMeta, abstractmethod diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index c4de53cb8e..34bd5efda0 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -30,7 +30,7 @@ class DistanceMatrix(GraphMixin, Query): Parameters ---------- - spatial_unit : SpatialUnit or None, default None + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, default None Locations to compute distances for. If None, defaults to VersionedCellSpatialUnit(). From e64c4d8ade951f84cec499a444206e7dbd82c3e4 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 11:29:49 +0000 Subject: [PATCH 021/138] Update pwo model to take a SpatialUnit argument --- flowmachine/flowmachine/models/pwo.py | 83 ++++++++++++--------------- 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/flowmachine/flowmachine/models/pwo.py b/flowmachine/flowmachine/models/pwo.py index 750019105a..596e4e2f17 100644 --- a/flowmachine/flowmachine/models/pwo.py +++ b/flowmachine/flowmachine/models/pwo.py @@ -31,10 +31,11 @@ import pandas as pd from flowmachine.features import daily_location -from flowmachine.utils import get_columns_for_level, list_of_dates +from flowmachine.utils import list_of_dates from ..features import ModalLocation from ..core.query import Query from ..core.model import Model, model_result +from ..core.spatial_unit import VersionedSiteSpatialUnit from ..features.spatial.distance_matrix import DistanceMatrix logger = logging.getLogger("flowmachine").getChild(__name__) @@ -48,38 +49,16 @@ class _populationBuffer(Query): Parameters ---------- - level : str - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. 
+ spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit + Spatial unit to which subscriber locations are mapped population_object : flowmachine.features.utilities.spatial_aggregates.SpatialAggregate An aggregated subscriber locating object distance_matrix : flowmachine.features.spatial.distance_matrix.DistanceMatrix A distance matrix """ - def __init__(self, level, population_object, distance_matrix): - - self.level = level + def __init__(self, spatial_unit, population_object, distance_matrix): + self.spatial_unit = spatial_unit self.population_object = population_object self.distance_matrix = distance_matrix @@ -92,7 +71,7 @@ def __get_location_buffer(self): (i..e an origin) and all its possible counterparts (i.e. destinations). """ - cols = get_columns_for_level(self.level) + cols = self.spatial_unit.location_columns from_cols = ", ".join("{c}_from".format(c=c) for c in cols) to_cols = ", ".join("{c}_to".format(c=c) for c in cols) @@ -117,7 +96,7 @@ def __get_location_buffer(self): @property def column_names(self) -> List[str]: - cols = get_columns_for_level(self.level) + cols = self.spatial_unit.location_columns return ( ["id"] @@ -132,7 +111,7 @@ def _make_query(self): that calculates the population that is covered by a buffer. """ - cols = get_columns_for_level(self.level) + cols = self.spatial_unit.location_columns from_cols = ", ".join("B.{c}_from".format(c=c) for c in cols) outer_from_cols = ", ".join("C.{c}_from".format(c=c) for c in cols) @@ -210,12 +189,13 @@ class PopulationWeightedOpportunities(Model): default method used. Refer to the Population() documentation for other available methods. - level : str - {levels} + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None + Spatial unit. If None, defaults to VersionedSiteSpatialUnit(). + Note: DistanceMatrix only supports spatial units + VersionedCellSpatialUnit and VersionedSiteSpatialUnit at this time. **kwargs : arguments - Used to pass custom arguments to the DistanceMatrix() - and ModalLocation() objects. + Used to pass custom arguments to the ModalLocation() objects. 
Examples -------- @@ -256,7 +236,7 @@ class PopulationWeightedOpportunities(Model): """ def __init__( - self, start, stop, method="home-location", level="versioned-site", **kwargs + self, start, stop, method="home-location", spatial_unit=None, **kwargs ): warnings.warn( @@ -269,21 +249,24 @@ def __init__( self.start = start self.stop = stop self.method = method - self.level = level + if spatial_unit is None: + self.spatial_unit = VersionedSiteSpatialUnit() + else: + self.spatial_unit = spatial_unit self.distance_matrix = DistanceMatrix( - level=level, return_geometry=True, **kwargs + spatial_unit=self.spatial_unit, return_geometry=True ) if self.method == "home-location": self.population_object = ModalLocation( *[ - daily_location(d, level=self.level, **kwargs) + daily_location(d, spatial_unit=self.spatial_unit, **kwargs) for d in list_of_dates(self.start, self.stop) ] ).aggregate() self.population_buffer_object = _populationBuffer( - level=self.level, + spatial_unit=self.spatial_unit, population_object=self.population_object, distance_matrix=self.distance_matrix, ) @@ -380,16 +363,21 @@ def run( population_df = self.population_object.get_dataframe() population_buffer = self.population_buffer_object.get_dataframe() - ix = get_columns_for_level(self.level) - ix = ["{}_{}".format(c, d) for d in ("from", "to") for c in ix] + ix = [ + "{}_{}".format(c, d) + for d in ("from", "to") + for c in self.spatial_unit.location_columns + ] population_buffer.set_index(ix, inplace=True) M = population_df["total"].sum() - N = len(population_df[get_columns_for_level(self.level)].drop_duplicates()) + N = len(population_df[self.spatial_unit.location_columns].drop_duplicates()) beta = 1 / M - locations = population_df[get_columns_for_level(self.level)].values.tolist() - population_df.set_index(get_columns_for_level(self.level), inplace=True) + locations = population_df[ + self.spatial_unit.location_columns + ].values.tolist() + population_df.set_index(self.spatial_unit.location_columns, inplace=True) if not departure_rate_vector: logger.warning( @@ -439,8 +427,11 @@ def run( probability = 0 results.append(i + j + [T_ij, probability]) - ix = get_columns_for_level(self.level) - ix = ["{}_{}".format(c, d) for d in ("from", "to") for c in ix] + ix = [ + "{}_{}".format(c, d) + for d in ("from", "to") + for c in self.spatial_unit.location_columns + ] ix += ["prediction", "probability"] res = pd.DataFrame(results, columns=ix) return res From 551cca66b87e222ed1ebffde487ab043a0c46292 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 13:34:13 +0000 Subject: [PATCH 022/138] Update dialy_location, locate_subscribers, Displacement and Multilocation --- .../features/subscriber/daily_location.py | 98 ++++--------------- .../features/subscriber/displacement.py | 23 ++++- .../features/utilities/multilocation.py | 11 +-- flowmachine/flowmachine/models/pwo.py | 13 +-- 4 files changed, 50 insertions(+), 95 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/daily_location.py b/flowmachine/flowmachine/features/subscriber/daily_location.py index 0cbb2e9769..9a86580013 100644 --- a/flowmachine/flowmachine/features/subscriber/daily_location.py +++ b/flowmachine/flowmachine/features/subscriber/daily_location.py @@ -13,6 +13,7 @@ """ import datetime +from ...core.spatial_unit import AdminSpatialUnit from .last_location import LastLocation from .most_frequent_location import MostFrequentLocation @@ -20,17 +21,14 @@ def locate_subscribers( start, stop, - level="admin3", + spatial_unit="default", 
hours="all", method="last", table="all", subscriber_identifier="msisdn", - column_name=None, *, ignore_nulls=True, subscriber_subset=None, - polygon_table=None, - size=None, radius=None, ): """ @@ -45,29 +43,11 @@ def locate_subscribers( start, stop : str iso format date range for the the time frame, e.g. 2016-01-01 or 2016-01-01 14:03:01 - level : str, default 'admin3' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, + default AdminSpatialUnit(level=3) + Spatial unit to which subscriber locations will be mapped. See the + docstring of spatial_unit.py for more information. Use None for no + location join (i.e. just the cell identifier in the CDR itself). hours : tuple of ints, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -89,14 +69,8 @@ def locate_subscribers( If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. - column_name : str, optional - Option, none-standard, name of the column that identifies the - spatial level, i.e. could pass admin3pcod to use the admin 3 pcode - as opposed to the name of the region. kwargs : Eventually passed to flowmachine.spatial_metrics.spatial_helpers. - JoinToLocation. Here you can specify a non standard set of polygons. - See the doc string of JoinToLocation for more details. Notes ----- @@ -122,39 +96,35 @@ def locate_subscribers( . . 
""" + if spatial_unit == "default": + spatial_unit = AdminSpatialUnit(level=3) if method == "last": return LastLocation( start, stop, - level, + spatial_unit, hours, table=table, subscriber_identifier=subscriber_identifier, - column_name=column_name, ignore_nulls=ignore_nulls, subscriber_subset=subscriber_subset, - polygon_table=polygon_table, - size=size, radius=radius, ) elif method == "most-common": return MostFrequentLocation( start, stop, - level, + spatial_unit, hours, table=table, - column_name=column_name, subscriber_identifier=subscriber_identifier, ignore_nulls=ignore_nulls, subscriber_subset=subscriber_subset, - polygon_table=polygon_table, - size=size, radius=radius, ) # elif self.method == 'first': - # _obj = FirstLocation(start, stop, level, hours) + # _obj = FirstLocation(start, stop, spatial_unit, hours) else: raise ValueError( f"Unrecognised method '{method}', must be either 'most-common' or 'last'" @@ -165,16 +135,13 @@ def daily_location( date, stop=None, *, - level="admin3", + spatial_unit="default", hours="all", method="last", table="all", subscriber_identifier="msisdn", - column_name=None, ignore_nulls=True, subscriber_subset=None, - polygon_table=None, - size=None, radius=None, ): """ @@ -188,29 +155,11 @@ def daily_location( stop : str optionally specify a stop datetime in iso format date for the day in question, e.g. 2016-01-02 06:00:00 - level : str, default 'admin3' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, + default AdminSpatialUnit(level=3) + Spatial unit to which subscriber locations will be mapped. See the + docstring of spatial_unit.py for more information. Use None for no + location join (i.e. just the cell identifier in the CDR itself). hours : tuple of ints, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -232,10 +181,6 @@ def daily_location( If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. - column_name : str, optional - Option, none-standard, name of the column that identifies the - spatial level, i.e. could pass admin3pcod to use the admin 3 pcode - as opposed to the name of the region. Notes ----- @@ -246,6 +191,8 @@ def daily_location( * Use 24 hr format! 
""" + if spatial_unit == "default": + spatial_unit = AdminSpatialUnit(level=3) if stop is None: # 'cast' the date object as a date d1 = datetime.date(*map(int, date.split("-"))) @@ -255,15 +202,12 @@ def daily_location( return locate_subscribers( start=date, stop=stop, - level=level, + spatial_unit=spatial_unit, hours=hours, method=method, table=table, subscriber_identifier=subscriber_identifier, - column_name=column_name, ignore_nulls=ignore_nulls, subscriber_subset=subscriber_subset, - polygon_table=polygon_table, - size=size, radius=radius, ) diff --git a/flowmachine/flowmachine/features/subscriber/displacement.py b/flowmachine/flowmachine/features/subscriber/displacement.py index c8149f7a9b..c06e5ed69f 100644 --- a/flowmachine/flowmachine/features/subscriber/displacement.py +++ b/flowmachine/flowmachine/features/subscriber/displacement.py @@ -17,6 +17,11 @@ from . import ModalLocation from ..utilities.subscriber_locations import subscriber_locations from flowmachine.utils import parse_datestring, get_dist_string, list_of_dates +from flowmachine.core.spatial_unit import ( + LatLonSpatialUnit, + VersionedCellSpatialUnit, + VersionedSiteSpatialUnit, +) from dateutil.relativedelta import relativedelta @@ -79,26 +84,34 @@ def __init__( self.start = start - allowed_levels = ["lat-lon", "versioned-cell", "versioned-site"] + allowed_spatial_units = [ + LatLonSpatialUnit, + VersionedCellSpatialUnit, + VersionedSiteSpatialUnit, + ] if modal_locations: if ( isinstance(modal_locations, ModalLocation) - and modal_locations.level in allowed_levels + and type(modal_locations.spatial_unit) in allowed_spatial_units ): hl = modal_locations else: raise ValueError( - f"Argument 'modal_locations' should be an instance of ModalLocation class with level in {allowed_levels}" + "Argument 'modal_locations' should be an instance of " + "ModalLocation class with type(spatial_unit) in " + f"{su.__name__ for su in allowed_spatial_units}" ) else: hl = ModalLocation( *[ - daily_location(date, level="lat-lon", **kwargs) + daily_location(date, spatial_unit=LatLonSpatialUnit(), **kwargs) for date in list_of_dates(self.start, self.stop_hl) ] ) - sl = subscriber_locations(self.start, self.stop_sl, level="lat-lon", **kwargs) + sl = subscriber_locations( + self.start, self.stop_sl, spatial_unit=LatLonSpatialUnit(), **kwargs + ) self.statistic = statistic.lower() if self.statistic not in valid_stats: diff --git a/flowmachine/flowmachine/features/utilities/multilocation.py b/flowmachine/flowmachine/features/utilities/multilocation.py index f6cb5d8115..61fd02e366 100644 --- a/flowmachine/flowmachine/features/utilities/multilocation.py +++ b/flowmachine/flowmachine/features/utilities/multilocation.py @@ -13,7 +13,7 @@ import logging -from flowmachine.utils import parse_datestring, get_columns_for_level +from flowmachine.utils import parse_datestring from ...core import CustomQuery @@ -62,9 +62,8 @@ def __init__(self, *daily_locations): # Importing daily_location inputs # from first daily_location object. 
- self.level = self._all_dls[0].level + self.spatial_unit = self._all_dls[0].spatial_unit self.subscriber_identifier = self._all_dls[0].subscriber_identifier - self.column_name = self._all_dls[0].column_name super().__init__() def _append_date(self, dl): @@ -80,13 +79,11 @@ def _append_date(self, dl): date_string = f"to_date('{dl.start}','YYYY-MM-DD') AS date" sql = f"SELECT *, {date_string} FROM ({dl.get_query()}) AS dl" - return CustomQuery( - sql, get_columns_for_level(self.level, self.column_name) + ["date"] - ) + return CustomQuery(sql, self.spatial_unit.location_columns + ["date"]) def _get_relevant_columns(self): """ Get a string of the location related columns """ - return ", ".join(get_columns_for_level(self.level, self.column_name)) + return ", ".join(self.spatial_unit.location_columns) diff --git a/flowmachine/flowmachine/models/pwo.py b/flowmachine/flowmachine/models/pwo.py index 596e4e2f17..621b2ae580 100644 --- a/flowmachine/flowmachine/models/pwo.py +++ b/flowmachine/flowmachine/models/pwo.py @@ -50,7 +50,8 @@ class _populationBuffer(Query): Parameters ---------- spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit - Spatial unit to which subscriber locations are mapped + Spatial unit to which subscriber locations are mapped. See the + docstring of spatial_unit.py for more information. population_object : flowmachine.features.utilities.spatial_aggregates.SpatialAggregate An aggregated subscriber locating object distance_matrix : flowmachine.features.spatial.distance_matrix.DistanceMatrix @@ -189,10 +190,10 @@ class PopulationWeightedOpportunities(Model): default method used. Refer to the Population() documentation for other available methods. - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None - Spatial unit. If None, defaults to VersionedSiteSpatialUnit(). + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, + default VersionedSiteSpatialUnit() Note: DistanceMatrix only supports spatial units - VersionedCellSpatialUnit and VersionedSiteSpatialUnit at this time. + VersionedCellSpatialUnit() and VersionedSiteSpatialUnit() at this time. **kwargs : arguments Used to pass custom arguments to the ModalLocation() objects. 
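A model-level sketch of the same change: PopulationWeightedOpportunities now accepts a spatial unit object directly. Not part of the patch; it assumes a connected FlowDB, the import path is inferred from the file path in this diff, and the date range and departure rate keyword value are arbitrary example inputs.

# Illustrative sketch only; assumes flowmachine.connect() has been called.
from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit
from flowmachine.models.pwo import PopulationWeightedOpportunities

pwo_model = PopulationWeightedOpportunities(
    "2016-01-01",
    "2016-01-07",
    spatial_unit=VersionedSiteSpatialUnit(),
)
flows = pwo_model.run(departure_rate_vector=0.5)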
@@ -236,7 +237,7 @@ class PopulationWeightedOpportunities(Model): """ def __init__( - self, start, stop, method="home-location", spatial_unit=None, **kwargs + self, start, stop, method="home-location", spatial_unit="default", **kwargs ): warnings.warn( @@ -249,7 +250,7 @@ def __init__( self.start = start self.stop = stop self.method = method - if spatial_unit is None: + if spatial_unit == "default": self.spatial_unit = VersionedSiteSpatialUnit() else: self.spatial_unit = spatial_unit From d883bedd8fe8b5d2f47376d70b58e9c4bfec8efc Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 16:33:29 +0000 Subject: [PATCH 023/138] Update DayTrajectories and ModalLocation --- .../features/subscriber/day_trajectories.py | 16 ++++++++-------- .../features/subscriber/modal_location.py | 3 +-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/day_trajectories.py b/flowmachine/flowmachine/features/subscriber/day_trajectories.py index 7bae91c40b..0cf034ccf2 100644 --- a/flowmachine/flowmachine/features/subscriber/day_trajectories.py +++ b/flowmachine/flowmachine/features/subscriber/day_trajectories.py @@ -16,7 +16,6 @@ from flowmachine.core import Query from flowmachine.features.utilities.subscriber_locations import BaseLocation -from flowmachine.utils import get_columns_for_level from ..utilities.multilocation import MultiLocation @@ -26,8 +25,13 @@ class DayTrajectories(MultiLocation, BaseLocation, Query): Examples -------- - >>> dt = DayTrajectories('2016-01-01', '2016-01-04', - level = 'admin3', method = 'last', hours = (5,17)) + >>> dt = DayTrajectories( + '2016-01-01', + '2016-01-04', + spatial_unit = AdminSpatialUnit(level=3), + method = 'last', + hours = (5,17), + ) >>> dt.head(4) subscriber name date 0 038OVABN11Ak4W5P Dolpa 2016-01-01 @@ -38,11 +42,7 @@ class DayTrajectories(MultiLocation, BaseLocation, Query): @property def column_names(self) -> List[str]: - return ( - ["subscriber"] - + get_columns_for_level(self.level, self.column_name) - + ["date"] - ) + return ["subscriber"] + self.spatial_unit.location_columns + ["date"] def _make_query(self): """ diff --git a/flowmachine/flowmachine/features/subscriber/modal_location.py b/flowmachine/flowmachine/features/subscriber/modal_location.py index 5439582d98..73a850d74c 100644 --- a/flowmachine/flowmachine/features/subscriber/modal_location.py +++ b/flowmachine/flowmachine/features/subscriber/modal_location.py @@ -16,7 +16,6 @@ from flowmachine.core import Query from flowmachine.features.utilities.subscriber_locations import BaseLocation -from flowmachine.utils import get_columns_for_level from ..utilities.multilocation import MultiLocation @@ -30,7 +29,7 @@ class ModalLocation(MultiLocation, BaseLocation, Query): @property def column_names(self) -> List[str]: - return ["subscriber"] + get_columns_for_level(self.level, self.column_name) + return ["subscriber"] + self.spatial_unit.location_columns def _make_query(self): """ From d6a1a6f6c878719ec482ff13bca9e64028fc2755 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 16:33:58 +0000 Subject: [PATCH 024/138] Add exemplar_spatial_unit_param fixture --- flowmachine/tests/conftest.py | 41 +++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 2689f47fe7..66c6a1f531 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -17,6 +17,14 @@ import flowmachine from flowmachine.core import Query from 
flowmachine.core.cache import reset_cache +from flowmachine.core.spatial_unit import ( + LatLonSpatialUnit, + VersionedCellSpatialUnit, + VersionedSiteSpatialUnit, + PolygonSpatialUnit, + AdminSpatialUnit, + GridSpatialUnit, +) from flowmachine.features import EventTableSubset logger = logging.getLogger() @@ -56,6 +64,39 @@ def exemplar_level_param(request): yield request.param +@pytest.fixture( + params=[ + (AdminSpatialUnit, {"level": 2}), + (AdminSpatialUnit, {"level": 2, "column_name": "admin2name"}), + (VersionedSiteSpatialUnit, {}), + (VersionedCellSpatialUnit, {}), + (LatLonSpatialUnit, {}), + (GridSpatialUnit, {"size": 5}), + ( + PolygonSpatialUnit, + {"polygon_column_names": "admin3pcod", "polygon_table": "geography.admin3"}, + ), + (lambda: None, {}), + ], + ids=lambda x: str(x[0]), +) +def exemplar_spatial_unit_param(request): + """ + A fixture which yields a succession of plausible default parameter + combinations for levels. + + Parameters + ---------- + request + + Yields + ------ + dict + + """ + yield request.param[0](**request.param[1]) + + def get_string_with_test_parameter_values(item): """ If `item` corresponds to a parametrized pytest test, return a string From 8d18674e3e711433dbef0dbd31a15c830a05727e Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 16:35:06 +0000 Subject: [PATCH 025/138] Update tests for daily_location, Displacement, Multilocation, DayTrajectories and ModalLocation --- .../test_sql_strings_and_results.py | 19 +++--- flowmachine/tests/test_async.py | 12 ++-- flowmachine/tests/test_daily_location.py | 11 ++-- flowmachine/tests/test_day_trajectories.py | 17 ++++-- flowmachine/tests/test_displacement.py | 9 +-- flowmachine/tests/test_flows.py | 36 +++++++---- flowmachine/tests/test_geomixin.py | 61 ++++++++++++++----- flowmachine/tests/test_indexes.py | 8 +-- flowmachine/tests/test_inoutflows.py | 5 +- flowmachine/tests/test_location_visits.py | 11 ++-- .../tests/test_most_frequent_locations.py | 7 ++- flowmachine/tests/test_radius_of_gyration.py | 5 +- flowmachine/tests/test_spatial_aggregate.py | 8 ++- .../test_daily_location_results.py | 4 +- 14 files changed, 139 insertions(+), 74 deletions(-) diff --git a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py index 46d93d9ebc..79467e62cd 100644 --- a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py +++ b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py @@ -7,6 +7,7 @@ from approvaltests.approvals import verify from flowmachine.core import CustomQuery from flowmachine.features import daily_location +from flowmachine.core.spatial_unit import AdminSpatialUnit def test_daily_location_1_sql(diff_reporter): @@ -33,11 +34,10 @@ def test_daily_location_2_sql(diff_reporter): """ dl = daily_location( "2016-01-04", - level="admin2", + spatial_unit=AdminSpatialUnit(level=2, column_name="admin2pcod"), hours=(3, 9), method="most-common", subscriber_identifier="imei", - column_name="admin2pcod", ignore_nulls=False, subscriber_subset=[ "2GJxeNazvlgZbqj6", @@ -56,11 +56,10 @@ def test_daily_location_2_df(get_dataframe, diff_reporter): """ dl = daily_location( "2016-01-04", - level="admin2", + spatial_unit=AdminSpatialUnit(level=2), hours=(3, 9), method="most-common", # subscriber_identifier="imei", - # column_name="admin2pcod", ignore_nulls=False, subscriber_subset=[ "2GJxeNazvlgZbqj6", @@ -83,11 +82,10 @@ def test_daily_location_3_sql(diff_reporter): ) dl = daily_location( 
"2016-01-05", - level="cell", + spatial_unit=None, hours=(23, 5), method="last", # subscriber_identifier="imei", - # column_name="admin2pcod", # ignore_nulls=False, subscriber_subset=subset_query, ) @@ -105,11 +103,10 @@ def test_daily_location_3_df(get_dataframe, diff_reporter): ) dl = daily_location( "2016-01-05", - level="cell", + spatial_unit=None, hours=(23, 5), method="last", # subscriber_identifier="imei", - # column_name="admin2pcod", # ignore_nulls=False, subscriber_subset=subset_query, ) @@ -163,11 +160,10 @@ def test_daily_location_5_sql(diff_reporter): ) dl = daily_location( "2016-01-05", - level="cell", + spatial_unit=None, hours=(23, 5), method="last", # subscriber_identifier="imei", - # column_name="admin2pcod", # ignore_nulls=False, subscriber_subset=subset_query, ) @@ -191,11 +187,10 @@ def test_daily_location_5_df(get_dataframe, diff_reporter): dl = daily_location( "2016-01-02", - level="admin3", + spatial_unit=AdminSpatialUnit(level=3), hours=(4, 9), method="most-common", # subscriber_identifier="imei", - # column_name="admin2pcod", # ignore_nulls=False, subscriber_subset=subset_query, ) diff --git a/flowmachine/tests/test_async.py b/flowmachine/tests/test_async.py index ffb87f37c4..0d7190a906 100644 --- a/flowmachine/tests/test_async.py +++ b/flowmachine/tests/test_async.py @@ -34,7 +34,7 @@ def test_double_store(): Storing a query twice doesn't raise an error. """ - dl = daily_location("2016-01-01", level="cell") + dl = daily_location("2016-01-01", spatial_unit=None) dl.store().result() dl.store().result() @@ -45,12 +45,12 @@ def test_store_async(): """ schema = "cache" - dl = daily_location("2016-01-01", level="cell") + dl = daily_location("2016-01-01", spatial_unit=None) table_name = dl.fully_qualified_table_name.split(".")[1] store_future = dl.store() store_future.result() assert dl.connection.has_table(table_name, schema=schema) - dl = daily_location("2016-01-01", level="cell") + dl = daily_location("2016-01-01", spatial_unit=None) assert table_name in dl.get_query() @@ -58,7 +58,7 @@ def test_get_query_blocks_on_store(): """ If a store is running get_query should block. """ - dl = daily_location("2016-01-01", level="cell") + dl = daily_location("2016-01-01", spatial_unit=None) dl.store().result() timer = [] @@ -79,8 +79,8 @@ def test_blocks_on_store_cascades(): If a store is running on a query that is used in a another query, that query should wait. """ - dl = daily_location("2016-01-01", level="cell") - dl2 = daily_location("2016-01-02", level="cell") + dl = daily_location("2016-01-01", spatial_unit=None) + dl2 = daily_location("2016-01-02", spatial_unit=None) store_future = dl.store() store_future.result() hl = ModalLocation(dl, dl2) diff --git a/flowmachine/tests/test_daily_location.py b/flowmachine/tests/test_daily_location.py index 67958fbd82..e177140d2f 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -5,6 +5,7 @@ import pytest from flowmachine.core.errors import MissingDateError +from flowmachine.core.spatial_unit import AdminSpatialUnit from flowmachine.features import daily_location, MostFrequentLocation @@ -41,7 +42,9 @@ def test_works_with_admin_names(get_dataframe): We can get daily locations with admin names rather than pcodes. 
""" - dl = daily_location("2016-01-05", level="admin3", column_name="admin3name") + dl = daily_location( + "2016-01-05", spatial_unit=AdminSpatialUnit(level=3, column_name="admin3name") + ) df = get_dataframe(dl) assert "Lamjung" == df.admin3name[0] @@ -54,9 +57,9 @@ def test_hours(get_length): # Lower level test test that subsetdates handles this correctly # we're just testing that it is passed on in this case. - dl1 = daily_location("2016-01-01", level="cell") - dl2 = daily_location("2016-01-01", level="cell", hours=(19, 23)) - dl3 = daily_location("2016-01-01", level="cell", hours=(19, 20)) + dl1 = daily_location("2016-01-01", spatial_unit=None) + dl2 = daily_location("2016-01-01", spatial_unit=None, hours=(19, 23)) + dl3 = daily_location("2016-01-01", spatial_unit=None, hours=(19, 20)) assert get_length(dl1) > get_length(dl2) > get_length(dl3) diff --git a/flowmachine/tests/test_day_trajectories.py b/flowmachine/tests/test_day_trajectories.py index b42799569e..0514743fe4 100644 --- a/flowmachine/tests/test_day_trajectories.py +++ b/flowmachine/tests/test_day_trajectories.py @@ -4,11 +4,14 @@ from flowmachine.features import DayTrajectories, daily_location +from flowmachine.core.spatial_unit import AdminSpatialUnit -def test_column_names_day_trajectories(exemplar_level_param): +def test_column_names_day_trajectories(exemplar_spatial_unit_param): """ Test that column_names property matches head(0) for DayTrajectories""" - lv = DayTrajectories(daily_location("2016-01-01", **exemplar_level_param)) + lv = DayTrajectories( + daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param) + ) assert lv.head(0).columns.tolist() == lv.column_names @@ -16,9 +19,15 @@ def test_day_trajectories(get_dataframe): """ DailyLocations calculations within DayTrajectories.get_dataframe() are correct. 
""" - traj = DayTrajectories(daily_location("2016-01-01", level="admin3", method="last")) + traj = DayTrajectories( + daily_location( + "2016-01-01", spatial_unit=AdminSpatialUnit(level=3), method="last" + ) + ) df = get_dataframe(traj).drop("date", axis=1) - dldf = daily_location("2016-01-01", level="admin3", method="last").get_dataframe() + dldf = daily_location( + "2016-01-01", spatial_unit=AdminSpatialUnit(level=3), method="last" + ).get_dataframe() assert [df["subscriber"][0], df["pcod"][0]] == [ dldf["subscriber"][0], dldf["pcod"][0], diff --git a/flowmachine/tests/test_displacement.py b/flowmachine/tests/test_displacement.py index 55afe1b039..71ebf62cc5 100644 --- a/flowmachine/tests/test_displacement.py +++ b/flowmachine/tests/test_displacement.py @@ -7,6 +7,7 @@ from numpy import isnan from flowmachine.utils import list_of_dates +from flowmachine.core.spatial_unit import LatLonSpatialUnit @pytest.mark.parametrize( @@ -50,7 +51,7 @@ def test_pass_modal_location(get_dataframe): ml = ModalLocation( *[ - daily_location(d, level="lat-lon") + daily_location(d, spatial_unit=LatLonSpatialUnit()) for d in list_of_dates("2016-01-01", "2016-01-06") ] ) @@ -67,7 +68,7 @@ def test_pass_modal_location(get_dataframe): def test_error_when_modal_location_not_latlong(): """ Test that error is raised if home location passed to class - is not using level lat-lon + is not using lat-lon spatial unit """ ml = ModalLocation( @@ -89,7 +90,7 @@ def test_get_all_users_in_modal_location(get_dataframe): ml = ModalLocation( *[ - daily_location(d, level="lat-lon", hours=(12, 13)) + daily_location(d, spatial_unit=LatLonSpatialUnit(), hours=(12, 13)) for d in list_of_dates(p1[0], p1[1]) ] ) @@ -113,7 +114,7 @@ def test_subscriber_with_home_loc_but_no_calls_is_nan(get_dataframe): ml = ModalLocation( *[ - daily_location(d, level="lat-lon", hours=(12, 13)) + daily_location(d, spatial_unit=LatLonSpatialUnit(), hours=(12, 13)) for d in list_of_dates(p1[0], p1[1]) ] ) diff --git a/flowmachine/tests/test_flows.py b/flowmachine/tests/test_flows.py index 337ae33690..9808dc3f6d 100644 --- a/flowmachine/tests/test_flows.py +++ b/flowmachine/tests/test_flows.py @@ -6,7 +6,7 @@ import pytest - +from flowmachine.core.spatial_unit import AdminSpatialUnit from flowmachine.features import daily_location from flowmachine.features.location.flows import * from flowmachine.features.subscriber.daily_location import locate_subscribers @@ -15,11 +15,11 @@ @pytest.mark.parametrize("query", [InFlow, OutFlow]) -def test_column_names_inout(query, exemplar_level_param): +def test_column_names_inout(query, exemplar_spatial_unit_param): """ Test that column_names property matches head(0) for InFlow & OutFlow""" flow = Flows( - daily_location("2016-01-01", **exemplar_level_param), - daily_location("2016-01-01", **exemplar_level_param), + daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param), + daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param), ) query_instance = query(flow) assert query_instance.head(0).columns.tolist() == query_instance.column_names @@ -29,17 +29,17 @@ def test_flows_raise_error(): """ Flows() raises error if location levels are different. 
""" - dl1 = daily_location("2016-01-01", level="admin3") - dl2 = daily_location("2016-01-01", level="admin2") + dl1 = daily_location("2016-01-01", spatial_unit=AdminSpatialUnit(level=3)) + dl2 = daily_location("2016-01-01", spatial_unit=AdminSpatialUnit(level=3)) with pytest.raises(ValueError): Flows(dl1, dl2) -def test_column_names_flow(exemplar_level_param): +def test_column_names_flow(exemplar_spatial_unit_param): """ Test that column_names property matches head(0) for Flows""" flow = Flows( - daily_location("2016-01-01", **exemplar_level_param), - daily_location("2016-01-01", **exemplar_level_param), + daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param), + daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param), ) assert flow.head(0).columns.tolist() == flow.column_names @@ -48,8 +48,13 @@ def test_calculates_flows(get_dataframe): """ Flows() are correctly calculated """ - dl1 = locate_subscribers("2016-01-01", "2016-01-02", level="admin3", method="last") - dl2 = locate_subscribers("2016-01-02", "2016-01-03", level="admin3", method="last") + spatial_unit = AdminSpatialUnit(level=3) + dl1 = locate_subscribers( + "2016-01-01", "2016-01-02", spatial_unit=spatial_unit, method="last" + ) + dl2 = locate_subscribers( + "2016-01-02", "2016-01-03", spatial_unit=spatial_unit, method="last" + ) flow = Flows(dl1, dl2) df = get_dataframe(flow) assert ( @@ -76,8 +81,13 @@ def test_flows_geojson_correct(): """ Test that flows outputs expected geojson. """ - dl1 = locate_subscribers("2016-01-01", "2016-01-02", level="admin3", method="last") - dl2 = locate_subscribers("2016-01-02", "2016-01-03", level="admin3", method="last") + spatial_unit = AdminSpatialUnit(level=3) + dl1 = locate_subscribers( + "2016-01-01", "2016-01-02", spatial_unit=spatial_unit, method="last" + ) + dl2 = locate_subscribers( + "2016-01-02", "2016-01-03", spatial_unit=spatial_unit, method="last" + ) flow = Flows(dl1, dl2) fl_json = flow.to_geojson() directory = os.path.dirname(os.path.os.path.realpath(__file__)) diff --git a/flowmachine/tests/test_geomixin.py b/flowmachine/tests/test_geomixin.py index c180416ee3..311dfb591d 100644 --- a/flowmachine/tests/test_geomixin.py +++ b/flowmachine/tests/test_geomixin.py @@ -16,6 +16,13 @@ from flowmachine.core import Query from flowmachine.core.mixins import GeoDataMixin +from flowmachine.core.spatial_unit import ( + LatLonSpatialUnit, + VersionedCellSpatialUnit, + VersionedSiteSpatialUnit, + AdminSpatialUnit, + GridSpatialUnit, +) from flowmachine.features import daily_location, Flows from flowmachine.utils import proj4string @@ -75,13 +82,25 @@ def test_valid_geojson(): """ test_geojson = [ daily_location("2016-01-01", "2016-01-02").aggregate(), - daily_location("2016-01-01", "2016-01-02", level="grid", size=100).aggregate(), - daily_location("2016-01-01", "2016-01-02", level="lat-lon").aggregate(), - daily_location("2016-01-01", "2016-01-02", level="versioned-site").aggregate(), - daily_location("2016-01-01", "2016-01-02", level="versioned-cell").aggregate(), - daily_location("2016-01-01", "2016-01-02", level="admin2").aggregate(), daily_location( - "2016-01-01", "2016-01-02", level="admin2", column_name="admin2name" + "2016-01-01", "2016-01-02", spatial_unit=GridSpatialUnit(size=100) + ).aggregate(), + daily_location( + "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + ).aggregate(), + daily_location( + "2016-01-01", "2016-01-02", spatial_unit=VersionedSiteSpatialUnit() + ).aggregate(), + daily_location( + "2016-01-01", "2016-01-02", 
spatial_unit=VersionedCellSpatialUnit() + ).aggregate(), + daily_location( + "2016-01-01", "2016-01-02", spatial_unit=AdminSpatialUnit(level=2) + ).aggregate(), + daily_location( + "2016-01-01", + "2016-01-02", + spatial_unit=AdminSpatialUnit(level=2, column_name="admin2name"), ).aggregate(), ] for o in test_geojson: @@ -93,7 +112,9 @@ def test_correct_geojson(): Check that the geojson actually contains the right features. """ js = ( - daily_location("2016-01-01", "2016-01-02", level="admin2") + daily_location( + "2016-01-01", "2016-01-02", spatial_unit=AdminSpatialUnit(level=2) + ) .aggregate() .to_geojson() ) @@ -118,7 +139,7 @@ def test_geojson_file_output(tmpdir): js_file = tmpdir / "geojson_test.json" daily_location( - "2016-01-01", "2016-01-02", level="admin2" + "2016-01-01", "2016-01-02", spatial_unit=AdminSpatialUnit(level=2) ).aggregate().to_geojson_file(js_file) with open(js_file) as fin: js = json.load(fin) @@ -140,8 +161,12 @@ def test_flows_geojson(get_dataframe): Test geojson works for flows with non-standard column names. """ - dl = daily_location("2016-01-01", level="admin2", column_name="admin2name") - dl2 = daily_location("2016-01-02", level="admin2", column_name="admin2name") + dl = daily_location( + "2016-01-01", spatial_unit=AdminSpatialUnit(level=2, column_name="admin2name") + ) + dl2 = daily_location( + "2016-01-02", spatial_unit=AdminSpatialUnit(level=2, column_name="admin2name") + ) fl = Flows(dl, dl2) js = fl.to_geojson() df = get_dataframe(fl) @@ -160,7 +185,9 @@ def test_reprojection(): Test that in db reprojection works. """ - dl = daily_location("2016-01-01", "2016-01-02", level="lat-lon").aggregate() + dl = daily_location( + "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert js["features"][0]["geometry"]["coordinates"] == [ -8094697.51781301, @@ -173,21 +200,27 @@ def test_geojson_cache(): """ Test geojson is cached locally. """ - dl = daily_location("2016-01-01", "2016-01-02", level="lat-lon").aggregate() + dl = daily_location( + "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert js == dl._geojson[proj4string(dl.connection, 2770)] def test_geojson_cache_exluded_from_pickle(): """Test that cached geojson is not going to get pickled.""" - dl = daily_location("2016-01-01", "2016-01-02", level="lat-lon").aggregate() + dl = daily_location( + "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert "_geojson" not in dl.__getstate__() # Check excluded from pickle def test_geojson_caching_off(): """Test that switching off caching clears the cache, and doesn't add to it.""" - dl = daily_location("2016-01-01", "2016-01-02", level="lat-lon").aggregate() + dl = daily_location( + "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 dl.turn_off_caching() # Check caching for geojson switches off with pytest.raises(KeyError): diff --git a/flowmachine/tests/test_indexes.py b/flowmachine/tests/test_indexes.py index 98253e02ca..8e01d9039d 100644 --- a/flowmachine/tests/test_indexes.py +++ b/flowmachine/tests/test_indexes.py @@ -2,6 +2,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
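
The GeoDataMixin tests above exercise the same aggregate-then-export path for every spatial unit. A condensed version of that path, with an illustrative output filename (not one used by the tests):

    from flowmachine.core.spatial_unit import AdminSpatialUnit, LatLonSpatialUnit
    from flowmachine.features import daily_location

    # Per-subscriber locations aggregated to per-region counts, then exported
    agg = daily_location(
        "2016-01-01", "2016-01-02", spatial_unit=AdminSpatialUnit(level=2)
    ).aggregate()
    geojson_dict = agg.to_geojson()            # FeatureCollection as a plain dict
    agg.to_geojson_file("admin2_counts.json")  # or written straight to disk

    # Point-based spatial units can be reprojected on the way out
    latlon_agg = daily_location(
        "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit()
    ).aggregate()
    reprojected = latlon_agg.to_geojson(crs=2770)  # an EPSG code, as in the tests
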
+from flowmachine.core.spatial_unit import LatLonSpatialUnit from flowmachine.features.subscriber import * @@ -13,10 +14,9 @@ def test_default_indexes(): ["pcod"], '"subscriber"', ] - assert daily_location("2016-01-01", "2016-01-02", level="lat-lon").index_cols == [ - ["lat", "lon"], - '"subscriber"', - ] + assert daily_location( + "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + ).index_cols == [["lat", "lon"], '"subscriber"'] assert SubscriberDegree("2016-01-01", "2016-01-02").index_cols == ['"subscriber"'] diff --git a/flowmachine/tests/test_inoutflows.py b/flowmachine/tests/test_inoutflows.py index f3509d78a3..a8741759ac 100644 --- a/flowmachine/tests/test_inoutflows.py +++ b/flowmachine/tests/test_inoutflows.py @@ -8,6 +8,7 @@ from flowmachine.features import Flows, daily_location +from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit def test_inoutflow_with_double_column_location(): @@ -16,8 +17,8 @@ def test_inoutflow_with_double_column_location(): more than one column. """ - dl1 = daily_location("2016-01-01", level="versioned-site") - dl2 = daily_location("2016-01-02", level="versioned-site") + dl1 = daily_location("2016-01-01", spatial_unit=VersionedSiteSpatialUnit) + dl2 = daily_location("2016-01-02", spatial_unit=VersionedSiteSpatialUnit) flow = Flows(dl1, dl2) expected_columns = ["site_id_to", "version_to", "lon_to", "lat_to", "total"] diff --git a/flowmachine/tests/test_location_visits.py b/flowmachine/tests/test_location_visits.py index fee53aa656..f78e07247f 100644 --- a/flowmachine/tests/test_location_visits.py +++ b/flowmachine/tests/test_location_visits.py @@ -4,12 +4,15 @@ from flowmachine.features import LocationVisits, daily_location, DayTrajectories from flowmachine.utils import list_of_dates +from flowmachine.core.spatial_unit import AdminSpatialUnit -def test_column_names_location_visits(exemplar_level_param): +def test_column_names_location_visits(exemplar_spatial_unit_param): """ Test that column_names property matches head(0) for LocationVisits""" lv = LocationVisits( - DayTrajectories(daily_location("2016-01-01", **exemplar_level_param)) + DayTrajectories( + daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param) + ) ) assert lv.head(0).columns.tolist() == lv.column_names @@ -26,7 +29,7 @@ def test_dl_count_sum_equal_or_less_than_period(get_dataframe): lv = LocationVisits( DayTrajectories( *[ - daily_location(d, level="admin3", method="last") + daily_location(d, spatial_unit=AdminSpatialUnit(level=3), method="last") for d in list_of_dates(start_date, stop_date) ] ) @@ -40,7 +43,7 @@ def test_dl_count_sum_equal_or_less_than_period(get_dataframe): lv = LocationVisits( DayTrajectories( *[ - daily_location(d, level="admin3", method="last") + daily_location(d, spatial_unit=AdminSpatialUnit(level=3), method="last") for d in list_of_dates(start_date, stop_date) ] ) diff --git a/flowmachine/tests/test_most_frequent_locations.py b/flowmachine/tests/test_most_frequent_locations.py index 11bcf150d8..b46c18657e 100644 --- a/flowmachine/tests/test_most_frequent_locations.py +++ b/flowmachine/tests/test_most_frequent_locations.py @@ -4,6 +4,7 @@ import pytest +from flowmachine.core.spatial_unit import AdminSpatialUnit from flowmachine.features import MostFrequentLocation from flowmachine.features.subscriber.daily_location import locate_subscribers @@ -47,9 +48,11 @@ def test_most_fequent_admin(get_dataframe): """ Test that the most frequent admin3 is correctly calculated. 
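
The in/out-flow test above builds both daily locations with the same spatial unit, which is what Flows requires, and shows that multi-column locations (versioned sites) simply yield more columns in the result. A sketch of the same setup with the spatial units instantiated, as they are elsewhere in this series, together with the InFlow/OutFlow wrappers used in test_flows.py:

    from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit
    from flowmachine.features import daily_location, Flows
    from flowmachine.features.location.flows import InFlow, OutFlow

    # Both sides of a flow must share a spatial unit; versioned sites give
    # multi-column locations (site id, version, lon, lat).
    dl1 = daily_location("2016-01-01", spatial_unit=VersionedSiteSpatialUnit())
    dl2 = daily_location("2016-01-02", spatial_unit=VersionedSiteSpatialUnit())
    flow = Flows(dl1, dl2)
    inflows = InFlow(flow)    # totals flowing into each location
    outflows = OutFlow(flow)  # totals flowing out of each location
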
""" - mfl = locate_subscribers( - "2016-01-01", "2016-01-02", level="admin3", method="most-common" + "2016-01-01", + "2016-01-02", + spatial_unit=AdminSpatialUnit(level=3), + method="most-common", ) df = get_dataframe(mfl) # A few hand picked values diff --git a/flowmachine/tests/test_radius_of_gyration.py b/flowmachine/tests/test_radius_of_gyration.py index 0044d1e4fd..7e70646609 100644 --- a/flowmachine/tests/test_radius_of_gyration.py +++ b/flowmachine/tests/test_radius_of_gyration.py @@ -3,6 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. +from flowmachine.core.spatial_unit import AdminSpatialUnit from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.features.subscriber import * @@ -48,7 +49,9 @@ def test_can_be_joined(get_dataframe): RadiusOfGyration() can be joined with a location type metric. """ RoG = RadiusOfGyration("2016-01-01", "2016-01-02") - dl = locate_subscribers("2016-01-01", "2016-01-02", level="admin3") + dl = locate_subscribers( + "2016-01-01", "2016-01-02", spatial_unit=AdminSpatialUnit(level=3) + ) rog_JA = RoG.join_aggregate(dl) df = get_dataframe(rog_JA) assert isinstance(df, pd.DataFrame) diff --git a/flowmachine/tests/test_spatial_aggregate.py b/flowmachine/tests/test_spatial_aggregate.py index 663357391b..5616a2da8e 100644 --- a/flowmachine/tests/test_spatial_aggregate.py +++ b/flowmachine/tests/test_spatial_aggregate.py @@ -2,6 +2,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +from flowmachine.core.spatial_unit import AdminSpatialUnit, LatLonSpatialUnit from flowmachine.features import ModalLocation, daily_location from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.utils import list_of_dates @@ -12,7 +13,10 @@ def test_can_be_aggregated_admin3(get_dataframe): Query can be aggregated to a spatial level with admin3 data. 
""" mfl = locate_subscribers( - "2016-01-01", "2016-01-02", level="admin3", method="most-common" + "2016-01-01", + "2016-01-02", + spatial_unit=AdminSpatialUnit(level=3), + method="most-common", ) agg = mfl.aggregate() df = get_dataframe(agg) @@ -25,7 +29,7 @@ def test_can_be_aggregated_latlong(get_dataframe): """ hl = ModalLocation( *[ - daily_location(d, level="lat-lon", method="last") + daily_location(d, spatial_unit=LatLonSpatialUnit(), method="last") for d in list_of_dates("2016-01-01", "2016-01-03") ] ) diff --git a/integration_tests/tests/flowmachine_tests/test_daily_location_results.py b/integration_tests/tests/flowmachine_tests/test_daily_location_results.py index e2c0f4d514..edf98b87a4 100644 --- a/integration_tests/tests/flowmachine_tests/test_daily_location_results.py +++ b/integration_tests/tests/flowmachine_tests/test_daily_location_results.py @@ -19,7 +19,7 @@ def test_daily_location_1_sql(diff_reporter): ) dl = daily_location( "2016-01-05", - level="cell", + spatial_unit=None, hours=(23, 5), method="last", subscriber_subset=subset_query, @@ -40,7 +40,7 @@ def test_daily_location_1_df(get_dataframe, diff_reporter): ) dl = daily_location( "2016-01-05", - level="cell", + spatial_unit=None, hours=(23, 5), method="last", subscriber_subset=subset_query, From 0b07858395526ccc47766231222f6527993f3eed Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 17:04:37 +0000 Subject: [PATCH 026/138] Update LastLocation and MostFrequentLocation --- .../features/subscriber/last_location.py | 54 +++++------------- .../subscriber/most_frequent_location.py | 56 +++++-------------- .../features/utilities/spatial_aggregates.py | 2 +- flowmachine/tests/test_joined_aggregate.py | 21 +++++-- flowmachine/tests/test_last_location.py | 15 +++-- .../tests/test_most_frequent_locations.py | 24 ++++++-- 6 files changed, 74 insertions(+), 98 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/last_location.py b/flowmachine/flowmachine/features/subscriber/last_location.py index fefc255bdd..ea009e4149 100644 --- a/flowmachine/flowmachine/features/subscriber/last_location.py +++ b/flowmachine/flowmachine/features/subscriber/last_location.py @@ -15,7 +15,7 @@ from flowmachine.core import Query from ..utilities.subscriber_locations import BaseLocation from ..utilities.subscriber_locations import subscriber_locations -from flowmachine.utils import get_columns_for_level +from flowmachine.core.spatial_unit import AdminSpatialUnit class LastLocation(BaseLocation, Query): @@ -30,29 +30,11 @@ class LastLocation(BaseLocation, Query): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - level : str, default 'admin3' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. 
+ spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, + default AdminSpatialUnit(level=3) + Spatial unit to which subscriber locations will be mapped. See the + docstring of spatial_unit.py for more information. Use None for no + location join (i.e. just the cell identifier in the CDR itself). hours : tuple of ints, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -74,10 +56,6 @@ class LastLocation(BaseLocation, Query): If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. - column_name : str, optional - Option, none-standard, name of the column that identifies the - spatial level, i.e. could pass admin3pcod to use the admin 3 pcode - as opposed to the name of the region. Notes ----- @@ -93,52 +71,48 @@ def __init__( self, start, stop, - level="admin3", + spatial_unit="default", hours="all", table="all", subscriber_identifier="msisdn", - column_name=None, *, ignore_nulls=True, subscriber_subset=None, - polygon_table=None, - size=None, radius=None, ): self.start = start self.stop = stop - self.level = level + if spatial_unit == "default": + self.spatial_unit = AdminSpatialUnit(level=3) + else: + self.spatial_unit = spatial_unit self.hours = hours self.table = table self.subscriber_identifier = subscriber_identifier - self.column_name = column_name self.subscriber_locs = subscriber_locations( start=self.start, stop=self.stop, - level=self.level, + spatial_unit=self.spatial_unit, hours=self.hours, table=self.table, subscriber_identifier=self.subscriber_identifier, - column_name=self.column_name, ignore_nulls=ignore_nulls, subscriber_subset=subscriber_subset, - polygon_table=polygon_table, - size=size, radius=radius, ) super().__init__() @property def column_names(self) -> List[str]: - return ["subscriber"] + get_columns_for_level(self.level, self.column_name) + return ["subscriber"] + self.spatial_unit.location_columns def _make_query(self): """ Default query method implemented in the metaclass Query(). """ - relevant_columns = ",".join(get_columns_for_level(self.level, self.column_name)) + relevant_columns = ",".join(self.spatial_unit.location_columns) sql = """ SELECT final_time.subscriber, {rc} diff --git a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py index 40106e389d..8ddd784d41 100644 --- a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py +++ b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py @@ -13,7 +13,7 @@ from flowmachine.core import Query from ..utilities.subscriber_locations import BaseLocation, subscriber_locations -from flowmachine.utils import get_columns_for_level +from flowmachine.core.spatial_unit import AdminSpatialUnit class MostFrequentLocation(BaseLocation, Query): @@ -28,29 +28,11 @@ class MostFrequentLocation(BaseLocation, Query): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - level : str, default 'admin3' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. 
In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, + default AdminSpatialUnit(level=3) + Spatial unit to which subscriber locations will be mapped. See the + docstring of spatial_unit.py for more information. Use None for no + location join (i.e. just the cell identifier in the CDR itself). hours : tuple of int, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -72,10 +54,6 @@ class MostFrequentLocation(BaseLocation, Query): If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. - column_name : str, optional - Option, none-standard, name of the column that identifies the - spatial level, i.e. could pass admin3pcod to use the admin 3 pcode - as opposed to the name of the region. Notes ----- @@ -91,16 +69,13 @@ def __init__( self, start, stop, - level="admin3", + spatial_unit="default", hours="all", table="all", subscriber_identifier="msisdn", - column_name=None, *, ignore_nulls=True, subscriber_subset=None, - polygon_table=None, - size=None, radius=None, ): """ @@ -110,23 +85,22 @@ def __init__( self.start = start self.stop = stop - self.level = level + if spatial_unit == "default": + self.spatial_unit = AdminSpatialUnit(level=3) + else: + self.spatial_unit = spatial_unit self.hours = hours self.table = table self.subscriber_identifier = subscriber_identifier - self.column_name = column_name self.subscriber_locs = subscriber_locations( start=self.start, stop=self.stop, - level=self.level, + spatial_unit=self.spatial_unit, hours=self.hours, table=self.table, subscriber_identifier=self.subscriber_identifier, - column_name=self.column_name, ignore_nulls=ignore_nulls, subscriber_subset=subscriber_subset, - polygon_table=polygon_table, - size=size, radius=radius, ) @@ -134,7 +108,7 @@ def __init__( @property def column_names(self) -> List[str]: - return ["subscriber"] + get_columns_for_level(self.level, self.column_name) + return ["subscriber"] + self.spatial_unit.location_columns def _make_query(self): """ @@ -143,9 +117,7 @@ def _make_query(self): """ subscriber_query = "{} ORDER BY time".format(self.subscriber_locs.get_query()) - relevant_columns = ", ".join( - get_columns_for_level(self.level, self.column_name) - ) + relevant_columns = ", ".join(self.spatial_unit.location_columns) # Create a table which has the total times each subscriber visited # each location diff --git a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py index e6d50df827..73fb5d34b4 100644 --- a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py +++ b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py @@ -83,7 +83,7 @@ class JoinedSpatialAggregate(GeoDataMixin, Query): -------- >>> mfl = subscribers.MostFrequentLocation('2016-01-01', '2016-01-04', - level='admin3') + 
spatial_unit=AdminSpatialUnit(level=3)) >>> rog = subscribers.RadiusOfGyration('2016-01-01', '2016-01-04') >>> sm = JoinedSpatialAggregate( rog, mfl ) diff --git a/flowmachine/tests/test_joined_aggregate.py b/flowmachine/tests/test_joined_aggregate.py index 69d45ddaed..129b86169d 100644 --- a/flowmachine/tests/test_joined_aggregate.py +++ b/flowmachine/tests/test_joined_aggregate.py @@ -6,6 +6,7 @@ import pytest +from flowmachine.core.spatial_unit import AdminSpatialUnit from flowmachine.features import ( MostFrequentLocation, RadiusOfGyration, @@ -17,7 +18,9 @@ def test_joined_aggregate(get_dataframe): """ Test join aggregate. """ - mfl = MostFrequentLocation("2016-01-01", "2016-01-04", level="admin3") + mfl = MostFrequentLocation( + "2016-01-01", "2016-01-04", spatial_unit=AdminSpatialUnit(level=3) + ) joined = mfl.join_aggregate(RadiusOfGyration("2016-01-01", "2016-01-04")) assert ( pytest.approx(203.12391560786) @@ -29,7 +32,9 @@ def test_joined_modal_aggregate(get_dataframe): """ Test join with modal aggregate. """ - mfl = MostFrequentLocation("2016-01-01", "2016-01-04", level="admin3") + mfl = MostFrequentLocation( + "2016-01-01", "2016-01-04", spatial_unit=AdminSpatialUnit(level=3) + ) rog = SubscriberDegree("2016-01-01", "2016-01-04") joined = mfl.join_aggregate(rog, method="mode") rawus_mode = ( @@ -50,7 +55,9 @@ def test_joined_median_aggregate(get_dataframe): """ Test join with median aggregate. """ - mfl = MostFrequentLocation("2016-01-01", "2016-01-04", level="admin3") + mfl = MostFrequentLocation( + "2016-01-01", "2016-01-04", spatial_unit=AdminSpatialUnit(level=3) + ) rog = RadiusOfGyration("2016-01-01", "2016-01-04") joined = mfl.join_aggregate(rog, method="median") rawus_avg = ( @@ -71,7 +78,9 @@ def test_joined_agg_date_mismatch(): """ Test that join aggregate with mismatched dates raises a warning. """ - mfl = MostFrequentLocation("2016-01-01", "2016-01-04", level="admin3") + mfl = MostFrequentLocation( + "2016-01-01", "2016-01-04", spatial_unit=AdminSpatialUnit(level=3) + ) with pytest.warns(UserWarning): mfl.join_aggregate(RadiusOfGyration("2016-01-02", "2016-01-04")) @@ -83,7 +92,9 @@ def test_joined_agg_hours_mismatch(): """ Test that join aggregate with mismatched hours doesn't warn. """ - mfl = MostFrequentLocation("2016-01-01 10:00", "2016-01-04", level="admin3") + mfl = MostFrequentLocation( + "2016-01-01 10:00", "2016-01-04", spatial_unit=AdminSpatialUnit(level=3) + ) with warnings.catch_warnings(record=True) as w: mfl.join_aggregate(RadiusOfGyration("2016-01-01", "2016-01-04")) assert not w diff --git a/flowmachine/tests/test_last_location.py b/flowmachine/tests/test_last_location.py index f78ab9ad32..266a492db8 100644 --- a/flowmachine/tests/test_last_location.py +++ b/flowmachine/tests/test_last_location.py @@ -4,15 +4,18 @@ import pytest +from flowmachine.core.spatial_unit import LatLonSpatialUnit, VersionedSiteSpatialUnit from flowmachine.features import LastLocation -def test_last_location_column_names(exemplar_level_param, get_dataframe): +def test_last_location_column_names(exemplar_spatial_unit_param, get_dataframe): """ LastLocation() is able to return a dataframe. """ - last_loc = LastLocation("2016-01-01", "2016-01-02", **exemplar_level_param) + last_loc = LastLocation( + "2016-01-01", "2016-01-02", spatial_unit=exemplar_spatial_unit_param + ) df = get_dataframe(last_loc) assert df.columns.tolist() == last_loc.column_names @@ -22,7 +25,9 @@ def test_last_loc_vsite(get_dataframe): LastLocation() returns correct last location. 
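
Across LastLocation, MostFrequentLocation and the tests above, the returned columns are now driven entirely by spatial_unit.location_columns rather than by get_columns_for_level. A small sketch that makes the resulting column sets visible (assumes a flowmachine connection; the lists in the comment are inferred from the tests in this series, so treat them as indicative):

    from flowmachine.core.spatial_unit import (
        AdminSpatialUnit,
        LatLonSpatialUnit,
        VersionedSiteSpatialUnit,
    )
    from flowmachine.features import LastLocation

    for su in (
        AdminSpatialUnit(level=3),
        LatLonSpatialUnit(),
        VersionedSiteSpatialUnit(),
    ):
        last_loc = LastLocation("2016-01-01", "2016-01-02", spatial_unit=su)
        print(type(su).__name__, last_loc.column_names)

    # Indicatively:
    #   AdminSpatialUnit          ['subscriber', 'pcod']
    #   LatLonSpatialUnit         ['subscriber', 'lat', 'lon']
    #   VersionedSiteSpatialUnit  ['subscriber', 'site_id', 'version', 'lon', 'lat']
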
""" - last_loc = LastLocation("2016-01-01", "2016-01-02", level="versioned-site") + last_loc = LastLocation( + "2016-01-01", "2016-01-02", spatial_unit=VersionedSiteSpatialUnit() + ) df = get_dataframe(last_loc) df.set_index("subscriber", inplace=True) @@ -35,7 +40,9 @@ def test_last_loc_lat_lon(get_dataframe): LastLocation() can make queries at the lat-lon level. """ - last_loc = LastLocation("2016-01-01", "2016-01-02", level="lat-lon") + last_loc = LastLocation( + "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + ) df = get_dataframe(last_loc) df.set_index("subscriber", inplace=True) assert pytest.approx(29.135638957790576) == float(df.loc["yqw50eNyEwOxNDGL"].lat) diff --git a/flowmachine/tests/test_most_frequent_locations.py b/flowmachine/tests/test_most_frequent_locations.py index b46c18657e..79300a8639 100644 --- a/flowmachine/tests/test_most_frequent_locations.py +++ b/flowmachine/tests/test_most_frequent_locations.py @@ -4,16 +4,24 @@ import pytest -from flowmachine.core.spatial_unit import AdminSpatialUnit +from flowmachine.core.spatial_unit import ( + AdminSpatialUnit, + VersionedSiteSpatialUnit, + LatLonSpatialUnit, +) from flowmachine.features import MostFrequentLocation from flowmachine.features.subscriber.daily_location import locate_subscribers -def test_most_frequent_locations_column_names(get_dataframe, exemplar_level_param): +def test_most_frequent_locations_column_names( + get_dataframe, exemplar_spatial_unit_param +): """ MostFrequentLocations().get_dataframe() returns a dataframe. """ - mfl = MostFrequentLocation("2016-01-01", "2016-01-02", **exemplar_level_param) + mfl = MostFrequentLocation( + "2016-01-01", "2016-01-02", spatial_unit=exemplar_spatial_unit_param + ) df = get_dataframe(mfl) assert df.columns.tolist() == mfl.column_names @@ -23,7 +31,9 @@ def test_vsites(get_dataframe): MostFrequentLocation() returns the correct locations. """ - mfl = MostFrequentLocation("2016-01-01", "2016-01-02", level="versioned-site") + mfl = MostFrequentLocation( + "2016-01-01", "2016-01-02", spatial_unit=VersionedSiteSpatialUnit() + ) df = get_dataframe(mfl) df.set_index("subscriber", inplace=True) @@ -33,10 +43,12 @@ def test_vsites(get_dataframe): def test_lat_lons(get_dataframe): """ - MostFrequentLocations() has the correct values at the lat-lon level. + MostFrequentLocations() has the correct values at the lat-lon spatial unit. 
""" - mfl = MostFrequentLocation("2016-01-01", "2016-01-02", level="lat-lon") + mfl = MostFrequentLocation( + "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + ) df = get_dataframe(mfl) df.set_index("subscriber", inplace=True) From 0914e1e6f6ca82df11653ba1693be5b420ed4ee1 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 19 Mar 2019 17:30:14 +0000 Subject: [PATCH 027/138] Update subscriber_locations --- .../utilities/subscriber_locations.py | 52 ++++--------------- flowmachine/tests/test_calldays.py | 17 ++++-- flowmachine/tests/test_join_to_location.py | 12 ++--- .../tests/test_meaningful_locations.py | 39 ++++++++++---- .../tests/test_subscriber_location_cluster.py | 45 ++++++++++++---- .../tests/test_subscriber_locations.py | 7 +-- .../tests/test_unique_location_counts.py | 4 +- .../tests/test_unique_subscriber_counts.py | 4 +- 8 files changed, 102 insertions(+), 78 deletions(-) diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index ec840d01a8..d7b8a52fdc 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -43,8 +43,7 @@ def __init__( self.table = table self.subscriber_identifier = subscriber_identifier self.ignore_nulls = ignore_nulls - self.level = "cell" - self.column_name = None + self.spatial_unit = None self.tables = table cols = [self.subscriber_identifier, "datetime", "location_id"] @@ -120,15 +119,12 @@ def subscriber_locations( start, stop, *, - level="cell", + spatial_unit=None, hours="all", table="all", subscriber_identifier="msisdn", ignore_nulls=True, - column_name=None, subscriber_subset=None, - polygon_table=None, - size=None, radius=None, ): """ @@ -142,29 +138,11 @@ def subscriber_locations( e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - level : str, default 'cell' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, + default None + Spatial unit to which subscriber locations will be mapped. See the + docstring of spatial_unit.py for more information. Use None for no + location join (i.e. just the cell identifier in the CDR itself). hours : tuple of ints, default 'all' subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -182,9 +160,6 @@ def subscriber_locations( these lines with null cells should still be present, although they contain no information on the subscribers location, they still tell us that the subscriber made a call at that time. 
- column_name : str or list of strings - kwargs : - Eventually passed to flowmachine.JoinToLocation. Notes ----- @@ -198,7 +173,7 @@ def subscriber_locations( -------- >>> subscriber_locs = subscriber_locations('2016-01-01 13:30:30', '2016-01-02 16:25:00' - level = 'cell') + spatial_unit = None) >>> subscriber_locs.head() subscriber time cell subscriberA 2016-01-01 12:42:11 233241 @@ -219,13 +194,8 @@ def subscriber_locations( ignore_nulls=ignore_nulls, ) - if level == "cell": + if spatial_unit is None: return subscriber_cells else: - return JoinToLocation( - subscriber_cells, - level=level, - column_name=column_name, - polygon_table=polygon_table, - size=size, - ) + return JoinToLocation(subscriber_cells, spatial_unit=spatial_unit) + diff --git a/flowmachine/tests/test_calldays.py b/flowmachine/tests/test_calldays.py index 9a6fba63b3..030d037573 100644 --- a/flowmachine/tests/test_calldays.py +++ b/flowmachine/tests/test_calldays.py @@ -11,15 +11,18 @@ import pytest +from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit from flowmachine.features import CallDays, subscriber_locations import numpy as np @pytest.mark.usefixtures("skip_datecheck") -def test_calldays_column_names(exemplar_level_param): +def test_calldays_column_names(exemplar_spatial_unit_param): """Test that CallDays column_names property is correct""" cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-03", **exemplar_level_param) + subscriber_locations( + "2016-01-01", "2016-01-03", spatial_unit=exemplar_spatial_unit_param + ) ) assert cd.head(0).columns.tolist() == cd.column_names @@ -35,7 +38,9 @@ def test_call_days_returns_expected_counts_per_subscriber(get_dataframe): ("038OVABN11Ak4W5P", "2016-01-01", "2016-01-08", 32), ) for (subscriber, start, end, calls) in test_values: - cd = CallDays(subscriber_locations(start, end, level="versioned-site")) + cd = CallDays( + subscriber_locations(start, end, spatial_unit=VersionedSiteSpatialUnit()) + ) df = get_dataframe(cd).query('subscriber == "{}"'.format(subscriber)) assert df.calldays.sum() == calls @@ -51,7 +56,9 @@ def test_call_days_returns_expected_counts_per_subscriber_tower(get_dataframe): ("038OVABN11Ak4W5P", "nWM8R3", "2016-01-01", "2016-01-08", 5), ) for (subscriber, location, start, end, calls) in test_values: - cd = CallDays(subscriber_locations(start, end, level="versioned-site")) + cd = CallDays( + subscriber_locations(start, end, spatial_unit=VersionedSiteSpatialUnit()) + ) df = get_dataframe(cd).query( 'subscriber == "{}" & site_id == "{}"'.format(subscriber, location) ) @@ -63,6 +70,6 @@ def test_locations_are_only_repeated_once_per_subscriber(get_dataframe): Test that each location occurs only once per subscriber. 
""" - cd = CallDays(subscriber_locations("2016-01-01", "2016-01-03", level="cell")) + cd = CallDays(subscriber_locations("2016-01-01", "2016-01-03", spatial_unit=None)) df = get_dataframe(cd) assert not np.any(df.groupby(["subscriber", "location_id"]).count() > 1) diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 9e55c4878b..faa61491ef 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -38,7 +38,7 @@ def test_join_to_location_column_names(spatial_unit, kwargs): """ Test that JoinToLocation's column_names property is accurate.""" su = spatial_unit(**kwargs) - table = subscriber_locations("2016-01-05", "2016-01-07", level="cell") + table = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) joined = JoinToLocation(table, spatial_unit=su) assert joined.head(0).columns.tolist() == joined.column_names @@ -61,7 +61,7 @@ def test_join_with_versioned_cells(get_dataframe, get_length): """ Test that flowmachine.JoinToLocation can fetch the cell version. """ - ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell") + ul = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) df = get_dataframe(JoinToLocation(ul, spatial_unit=VersionedCellSpatialUnit())) # As our database is complete we should not drop any rows assert len(df) == get_length(ul) @@ -84,7 +84,7 @@ def test_join_with_lat_lon(get_dataframe): """ Test that flowmachine.JoinToLocation can get the lat-lon values of the cell """ - ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell") + ul = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) df = get_dataframe(JoinToLocation(ul, spatial_unit=LatLonSpatialUnit())) expected_cols = sorted(["subscriber", "time", "location_id", "lat", "lon"]) @@ -108,7 +108,7 @@ def test_join_with_polygon(get_dataframe, get_length): Test that flowmachine.JoinToLocation can get the (arbitrary) polygon of each cell. """ - ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell") + ul = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) j = JoinToLocation( ul, spatial_unit=PolygonSpatialUnit( @@ -128,7 +128,7 @@ def test_join_to_admin(get_dataframe, get_length): """ Test that flowmachine.JoinToLocation can join to a admin region. 
""" - ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell") + ul = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) df = get_dataframe(JoinToLocation(ul, spatial_unit=AdminSpatialUnit(level=3))) assert len(df) == get_length(ul) expected_cols = sorted(["subscriber", "time", "location_id", "pcod"]) @@ -139,6 +139,6 @@ def test_join_to_grid(get_dataframe, get_length): """ Test that we can join to a grid square """ - ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell") + ul = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) df = get_dataframe(JoinToLocation(ul, spatial_unit=GridSpatialUnit(size=50))) assert len(df) == get_length(ul) diff --git a/flowmachine/tests/test_meaningful_locations.py b/flowmachine/tests/test_meaningful_locations.py index 5b5549c6d7..19c7cf410a 100644 --- a/flowmachine/tests/test_meaningful_locations.py +++ b/flowmachine/tests/test_meaningful_locations.py @@ -4,6 +4,7 @@ import pytest from flowmachine.core.errors import BadLevelError +from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit from flowmachine.features import ( HartiganCluster, CallDays, @@ -34,7 +35,9 @@ def test_column_names_meaningful_locations(get_column_names_from_run): clusters=HartiganCluster( calldays=CallDays( subscriber_locations=subscriber_locations( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=VersionedSiteSpatialUnit(), ) ), radius=1, @@ -62,7 +65,9 @@ def test_column_names_meaningful_locations_aggregate( clusters=HartiganCluster( calldays=CallDays( subscriber_locations=subscriber_locations( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=VersionedSiteSpatialUnit(), ) ), radius=1, @@ -90,7 +95,7 @@ def test_meaningful_locations_aggregate_disallowed_level_raises(): subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - level="versioned-site", + spatial_unit=VersionedSiteSpatialUnit(), ) ), radius=1, @@ -117,7 +122,9 @@ def test_column_names_meaningful_locations_od( clusters=HartiganCluster( calldays=CallDays( subscriber_locations=subscriber_locations( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=VersionedSiteSpatialUnit(), ) ), radius=1, @@ -133,7 +140,9 @@ def test_column_names_meaningful_locations_od( clusters=HartiganCluster( calldays=CallDays( subscriber_locations=subscriber_locations( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=VersionedSiteSpatialUnit(), ) ), radius=1, @@ -167,7 +176,9 @@ def test_meaningful_locations_results( clusters=HartiganCluster( calldays=CallDays( subscriber_locations=subscriber_locations( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=VersionedSiteSpatialUnit(), ) ), radius=1, @@ -199,7 +210,9 @@ def test_meaningful_locations_aggregation_results(exemplar_level_param, get_data clusters=HartiganCluster( calldays=CallDays( subscriber_locations=subscriber_locations( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=VersionedSiteSpatialUnit(), ) ), radius=1, @@ -231,7 +244,9 @@ def test_meaningful_locations_od_raises_for_bad_level( clusters=HartiganCluster( calldays=CallDays( subscriber_locations=subscriber_locations( - 
start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=VersionedSiteSpatialUnit(), ) ), radius=1, @@ -259,7 +274,9 @@ def test_meaningful_locations_od_results(get_dataframe): clusters=HartiganCluster( calldays=CallDays( subscriber_locations=subscriber_locations( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=VersionedSiteSpatialUnit(), ) ), radius=1, @@ -275,7 +292,9 @@ def test_meaningful_locations_od_results(get_dataframe): clusters=HartiganCluster( calldays=CallDays( subscriber_locations=subscriber_locations( - start="2016-01-02", stop="2016-01-03", level="versioned-site" + start="2016-01-02", + stop="2016-01-03", + spatial_unit=VersionedSiteSpatialUnit(), ) ), radius=1, diff --git a/flowmachine/tests/test_subscriber_location_cluster.py b/flowmachine/tests/test_subscriber_location_cluster.py index 6df8a4ed4a..a00a8a377a 100644 --- a/flowmachine/tests/test_subscriber_location_cluster.py +++ b/flowmachine/tests/test_subscriber_location_cluster.py @@ -19,6 +19,7 @@ from flowmachine.core import Table, CustomQuery from flowmachine.core.query import Query from flowmachine.core.mixins import GeoDataMixin +from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit from flowmachine.features import ( CallDays, HartiganCluster, @@ -32,7 +33,9 @@ def test_hartigan_column_names(get_column_names_from_run): """Test that Hartigan has correct column_names property.""" cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) hartigan = HartiganCluster(calldays=cd, radius=50) assert get_column_names_from_run(hartigan) == hartigan.column_names @@ -42,7 +45,9 @@ def test_hartigan_column_names(get_column_names_from_run): def test_joined_hartigan_column_names(get_column_names_from_run): """Test that Hartigan has correct column_names property.""" cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) hartigan = HartiganCluster(calldays=cd, radius=50) es = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site") @@ -60,7 +65,9 @@ def test_hartigan_type_error(): def test_joined_hartigan_type_error(): """Test that joining hartigan to something which isn't query like raises a type error.""" cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) hartigan = HartiganCluster(calldays=cd, radius=50) with pytest.raises(TypeError): @@ -99,7 +106,9 @@ def test_cluster_is_within_envelope(get_dataframe): Test that all the clusters are within the enveloped formed by all the towers in the cluster. """ cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) hartigan = HartiganCluster(calldays=cd, radius=50) @@ -115,7 +124,9 @@ def test_first_call_day_in_first_cluster(get_dataframe): Test that the first ranked call day of each subscriber is in the first cluster of each subscriber. 
""" cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) cd_df = get_dataframe(cd) @@ -140,7 +151,9 @@ def test_bigger_radius_yields_fewer_clusters(get_dataframe): """ radius = [1, 2, 5, 10, 50] cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) h = get_dataframe(HartiganCluster(calldays=cd, radius=radius[0])) @@ -158,7 +171,9 @@ def test_different_call_days_format(get_dataframe): Test whether we can pass different call days format such as table name, SQL query and CallDays class. """ cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) har = get_dataframe(HartiganCluster(calldays=cd, radius=50)) assert isinstance(har, pd.DataFrame) @@ -182,7 +197,9 @@ def test_call_threshold_works(get_dataframe): Test whether a call threshold above 1 limits the number of clusters. """ cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) hartigan = HartiganCluster(calldays=cd, radius=50) @@ -199,7 +216,9 @@ def test_buffered_hartigan(): Test whether Hartigan produces buffered clusters when buffer is larger than 0. """ cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) har = HartiganCluster(calldays=cd, radius=50, buffer=2).to_geopandas() @@ -217,7 +236,9 @@ def test_all_options_hartigan(): Test whether Hartigan works when changing all options. """ cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) har = HartiganCluster( @@ -233,7 +254,9 @@ def test_join_returns_the_same_clusters(): Test whether joining to another table for which the start and stop time are the same yields the same clusters. """ cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site") + subscriber_locations( + "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + ) ) hartigan = HartiganCluster(calldays=cd, radius=50) diff --git a/flowmachine/tests/test_subscriber_locations.py b/flowmachine/tests/test_subscriber_locations.py index 2c5affc793..3b759f9fa7 100644 --- a/flowmachine/tests/test_subscriber_locations.py +++ b/flowmachine/tests/test_subscriber_locations.py @@ -3,6 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+from flowmachine.core.spatial_unit import PolygonSpatialUnit from flowmachine.features.utilities.subscriber_locations import subscriber_locations @@ -14,9 +15,9 @@ def test_can_get_pcods(get_dataframe): subscriber_pcod = subscriber_locations( "2016-01-01 13:30:30", "2016-01-02 16:25:00", - level="polygon", - polygon_table="geography.admin3", - column_name="admin3pcod", + spatial_unit=PolygonSpatialUnit( + polygon_column_names="admin3pcod", polygon_table="geography.admin3" + ), ) df = get_dataframe(subscriber_pcod) assert df.admin3pcod[0].startswith("524") diff --git a/flowmachine/tests/test_unique_location_counts.py b/flowmachine/tests/test_unique_location_counts.py index 5ef0c47be7..a448f2bfcc 100644 --- a/flowmachine/tests/test_unique_location_counts.py +++ b/flowmachine/tests/test_unique_location_counts.py @@ -31,7 +31,9 @@ def test_correct_counts(get_dataframe): ulc = UniqueLocationCounts("2016-01-01", "2016-01-02", level="cell", hours=(5, 17)) df = get_dataframe(ulc) dful = get_dataframe( - subscriber_locations("2016-01-01", "2016-01-02", level="cell", hours=(5, 17)) + subscriber_locations( + "2016-01-01", "2016-01-02", spatial_unit=None, hours=(5, 17) + ) ) assert [ df["unique_location_counts"][0], diff --git a/flowmachine/tests/test_unique_subscriber_counts.py b/flowmachine/tests/test_unique_subscriber_counts.py index 4b1f1de39f..ea447ba5c5 100644 --- a/flowmachine/tests/test_unique_subscriber_counts.py +++ b/flowmachine/tests/test_unique_subscriber_counts.py @@ -38,7 +38,9 @@ def test_correct_counts(get_dataframe): ) df = get_dataframe(usc) dful = get_dataframe( - subscriber_locations("2016-01-01", "2016-01-02", level="cell", hours=(5, 17)) + subscriber_locations( + "2016-01-01", "2016-01-02", spatial_unit=None, hours=(5, 17) + ) ) assert [ df["unique_subscriber_counts"][0], From 081c96eb16f4c3b8b973e0780989e12622cc8ec5 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 20 Mar 2019 11:16:16 +0000 Subject: [PATCH 028/138] Use exemplar_spatial_unit_param in test_join_to_location.py and test_spatial_unit.py --- flowmachine/tests/conftest.py | 30 +++++++++---- flowmachine/tests/test_join_to_location.py | 24 +++-------- flowmachine/tests/test_spatial_unit.py | 50 ++++------------------ 3 files changed, 36 insertions(+), 68 deletions(-) diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 66c6a1f531..51cb44bc1a 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -76,22 +76,36 @@ def exemplar_level_param(request): PolygonSpatialUnit, {"polygon_column_names": "admin3pcod", "polygon_table": "geography.admin3"}, ), + ( + PolygonSpatialUnit, + { + "polygon_column_names": "id", + "polygon_table": "infrastructure.sites", + "geom_col": "geom_point", + }, + ), (lambda: None, {}), ], - ids=lambda x: str(x[0]), + ids=[ + "admin2", + "admin2_column-name", + "versioned-site", + "versioned-cell", + "lat-lon", + "grid", + "polygon", + "polygon_geom-col", + "None", + ], ) def exemplar_spatial_unit_param(request): """ - A fixture which yields a succession of plausible default parameter - combinations for levels. - - Parameters - ---------- - request + A fixture which yields a succession of plausible values for the + spatial_unit parameter. 
Yields ------ - dict + flowmachine.core.spatial_unit.*SpatialUnit or None """ yield request.param[0](**request.param[1]) diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index faa61491ef..94f9bcefd6 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -20,26 +20,14 @@ ) -@pytest.mark.parametrize( - "spatial_unit, kwargs", - [ - (LatLonSpatialUnit, {}), - (VersionedCellSpatialUnit, {}), - (VersionedSiteSpatialUnit, {}), - ( - PolygonSpatialUnit, - {"polygon_column_names": "admin3pcod", "polygon_table": "geography.admin3"}, - ), - (AdminSpatialUnit, {"level": 2}), - (AdminSpatialUnit, {"level": 2, "column_name": "admin2name"}), - (GridSpatialUnit, {"size": 5}), - ], -) -def test_join_to_location_column_names(spatial_unit, kwargs): +def test_join_to_location_column_names(exemplar_spatial_unit_param): """ Test that JoinToLocation's column_names property is accurate.""" - su = spatial_unit(**kwargs) + if exemplar_spatial_unit_param is None: + pytest.skip( + "JoinToLocation does not accept spatial_unit=None (i.e. no location join)" + ) table = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) - joined = JoinToLocation(table, spatial_unit=su) + joined = JoinToLocation(table, spatial_unit=exemplar_spatial_unit_param) assert joined.head(0).columns.tolist() == joined.column_names diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index c4fdea7e19..90ca34093d 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -15,34 +15,13 @@ import pytest -@pytest.mark.parametrize( - "spatial_unit, kwargs", - [ - (LatLonSpatialUnit, {}), - (VersionedCellSpatialUnit, {}), - (VersionedSiteSpatialUnit, {}), - ( - PolygonSpatialUnit, - {"polygon_column_names": "admin3name", "polygon_table": "geography.admin3"}, - ), - ( - PolygonSpatialUnit, - { - "polygon_column_names": "id", - "polygon_table": "infrastructure.sites", - "geom_col": "geom_point", - }, - ), - (AdminSpatialUnit, {"level": 3}), - (AdminSpatialUnit, {"level": 3, "column_name": "admin3name"}), - (GridSpatialUnit, {"size": 5}), - ], -) -def test_spatial_unit_column_names(spatial_unit, kwargs): +def test_spatial_unit_column_names(exemplar_spatial_unit_param): """ Test that the SpatialUnit classes have accurate column_names properties. """ - su = spatial_unit(**kwargs) + if exemplar_spatial_unit_param is None: + pytest.skip("None is not a SpatialUnit object") + su = exemplar_spatial_unit_param assert su.head(0).columns.tolist() == su.column_names @@ -121,26 +100,13 @@ def geo_augment(self, query): ) -@pytest.mark.parametrize( - "spatial_unit, kwargs", - [ - (LatLonSpatialUnit, {}), - (VersionedCellSpatialUnit, {}), - (VersionedSiteSpatialUnit, {}), - ( - PolygonSpatialUnit, - {"polygon_column_names": "admin3pcod", "polygon_table": "geography.admin3"}, - ), - (AdminSpatialUnit, {"level": 2}), - (AdminSpatialUnit, {"level": 2, "column_name": "admin2name"}), - (GridSpatialUnit, {"size": 5}), - ], -) -def test_geo_augment_columns(spatial_unit, kwargs): +def test_geo_augment_columns(exemplar_spatial_unit_param): """ Test that the columns returned by the geo_augment method are correct. 
""" - su = spatial_unit(**kwargs) + if exemplar_spatial_unit_param is None: + pytest.skip("None is not a SpatialUnit object") + su = exemplar_spatial_unit_param sql, cols = su.geo_augment(su) cq = CustomQuery(sql, cols) assert cq.head(0).columns.tolist() == cols From 13283adf8b0f87ec96f360fd12dccd41f8deffae Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 20 Mar 2019 12:03:51 +0000 Subject: [PATCH 029/138] Update UniqueSubscriberCounts --- .../flowmachine/core/join_to_location.py | 2 +- .../location/unique_subscriber_counts.py | 65 +++++-------------- .../utilities/subscriber_locations.py | 10 ++- .../tests/test_unique_subscriber_counts.py | 16 ++--- 4 files changed, 30 insertions(+), 63 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index f2cafff0a5..2d28a32d40 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -33,7 +33,7 @@ class JoinToLocation(Query): spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit A query which maps cell identifiers in the CDR to a different spatial unit (e.g. versioned site or admin region) - time_col : str, default 'time': + time_col : str, default 'time' The name of the column that identifies the time in the source table e.g. 'time', 'date', 'start_time' etc. diff --git a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py index ccf56d657f..4c97ceb3ae 100644 --- a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py @@ -18,7 +18,6 @@ from ...core.query import Query from ...core.mixins import GeoDataMixin -from flowmachine.utils import get_columns_for_level from ..utilities.subscriber_locations import subscriber_locations @@ -26,7 +25,7 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): """ Class that defines counts of unique subscribers for each location. - Each location for the given level is accompanied by the count of unique subscribers. + Each location for the given spatial unit is accompanied by the count of unique subscribers. Parameters ---------- @@ -35,29 +34,11 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - level : str, default 'cell' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, + default None + Spatial unit to which subscriber locations will be mapped. See the + docstring of spatial_unit.py for more information. Use None for no + location join (i.e. 
just the cell identifier in the CDR itself). hours : tuple of ints, default 'all' subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -79,16 +60,14 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): these lines with null cells should still be present, although they contain no information on the subscribers location, they still tell us that the subscriber made a call at that time. - column_name : str, optional - Option, none-standard, name of the column that identifies the - spatial level, i.e. could pass admin3pcod to use the admin 3 pcode - as opposed to the name of the region. - kwargs : + time_col : str, default 'time' + The name of the column that identifies the time in the source table + e.g. 'time', 'date', 'start_time' etc. Eventually passed to flowmachine.JoinToLocation. Examples -------- - >>> usc = UniqueSubscriberCounts('2016-01-01', '2016-01-04', level = 'admin3', hours = (5,17)) + >>> usc = UniqueSubscriberCounts('2016-01-01', '2016-01-04', spatial_unit=AdminSpatialUnit(level=3), hours=(5,17)) >>> usc.head(4) name unique_subscriber_counts 0 Arghakhanchi 313 @@ -97,14 +76,7 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): """ def __init__( - self, - start, - stop, - level="cell", - hours="all", - table="all", - column_name=None, - **kwargs + self, start, stop, spatial_unit=None, hours="all", table="all", time_col="time" ): """ @@ -112,28 +84,23 @@ def __init__( self.start = start self.stop = stop - self.level = level + self.spatial_unit = spatial_unit self.hours = hours self.table = table - self.column_name = column_name - self._kwargs = kwargs self.ul = subscriber_locations( start=self.start, stop=self.stop, - level=self.level, + spatial_unit=self.spatial_unit, hours=self.hours, table=self.table, - column_name=self.column_name, - **kwargs + time_col=time_col, ) super().__init__() @property def column_names(self) -> List[str]: - return get_columns_for_level(self.level, self.column_name) + [ - "unique_subscriber_counts" - ] + return self.spatial_unit.location_columns + ["unique_subscriber_counts"] def _make_query(self): """ @@ -141,7 +108,7 @@ def _make_query(self): metaclass Query(). """ - relevant_columns = ",".join(get_columns_for_level(self.level, self.column_name)) + relevant_columns = ",".join(self.spatial_unit.location_columns) sql = """ SELECT {rc}, COUNT(unique_subscribers) AS unique_subscriber_counts FROM (SELECT diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index d7b8a52fdc..84d6d2685f 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -125,7 +125,7 @@ def subscriber_locations( subscriber_identifier="msisdn", ignore_nulls=True, subscriber_subset=None, - radius=None, + time_col="time", ): """ Class representing all the locations for which a subscriber has been found. @@ -160,6 +160,10 @@ def subscriber_locations( these lines with null cells should still be present, although they contain no information on the subscribers location, they still tell us that the subscriber made a call at that time. + time_col : str, default 'time' + The name of the column that identifies the time in the source table + e.g. 'time', 'date', 'start_time' etc. + Passed to flowmachine.JoinToLocation. 
Notes ----- @@ -197,5 +201,7 @@ def subscriber_locations( if spatial_unit is None: return subscriber_cells else: - return JoinToLocation(subscriber_cells, spatial_unit=spatial_unit) + return JoinToLocation( + subscriber_cells, spatial_unit=spatial_unit, time_col=time_col + ) diff --git a/flowmachine/tests/test_unique_subscriber_counts.py b/flowmachine/tests/test_unique_subscriber_counts.py index ea447ba5c5..e91f9f6920 100644 --- a/flowmachine/tests/test_unique_subscriber_counts.py +++ b/flowmachine/tests/test_unique_subscriber_counts.py @@ -13,28 +13,22 @@ @pytest.mark.usefixtures("skip_datecheck") -def test_unique_subscriber_counts_column_names(exemplar_level_param): +def test_unique_subscriber_counts_column_names(exemplar_spatial_unit_param): """ Test that column_names property of UniqueSubscriberCounts matches head(0) """ - usc = UniqueSubscriberCounts("2016-01-01", "2016-01-04", **exemplar_level_param) + usc = UniqueSubscriberCounts( + "2016-01-01", "2016-01-04", spatial_unit=exemplar_spatial_unit_param + ) assert usc.head(0).columns.tolist() == usc.column_names -def test_returns_errors(): - """ - Test level exists - """ - with pytest.raises(BadLevelError): - UniqueSubscriberCounts("2016-01-01", "2016-01-02", level="BAD_LEVEL") - - def test_correct_counts(get_dataframe): """ UniqueLocationCounts returns correct counts. """ usc = UniqueSubscriberCounts( - "2016-01-01", "2016-01-02", level="cell", hours=(5, 17) + "2016-01-01", "2016-01-02", spatial_unit=None, hours=(5, 17) ) df = get_dataframe(usc) dful = get_dataframe( From c4f80867f404013bddef9a30912e2234d0323823 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 20 Mar 2019 12:46:22 +0000 Subject: [PATCH 030/138] Remove 'time_col' argument from subscriber_locations --- .../features/location/unique_subscriber_counts.py | 9 +-------- .../features/utilities/subscriber_locations.py | 7 +------ 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py index 4c97ceb3ae..4fecf60784 100644 --- a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py @@ -60,10 +60,6 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): these lines with null cells should still be present, although they contain no information on the subscribers location, they still tell us that the subscriber made a call at that time. - time_col : str, default 'time' - The name of the column that identifies the time in the source table - e.g. 'time', 'date', 'start_time' etc. - Eventually passed to flowmachine.JoinToLocation. 
Examples -------- @@ -75,9 +71,7 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): 2 Bajhang 285 """ - def __init__( - self, start, stop, spatial_unit=None, hours="all", table="all", time_col="time" - ): + def __init__(self, start, stop, spatial_unit=None, hours="all", table="all"): """ """ @@ -93,7 +87,6 @@ def __init__( spatial_unit=self.spatial_unit, hours=self.hours, table=self.table, - time_col=time_col, ) super().__init__() diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index 84d6d2685f..589c809fe7 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -125,7 +125,6 @@ def subscriber_locations( subscriber_identifier="msisdn", ignore_nulls=True, subscriber_subset=None, - time_col="time", ): """ Class representing all the locations for which a subscriber has been found. @@ -160,10 +159,6 @@ def subscriber_locations( these lines with null cells should still be present, although they contain no information on the subscribers location, they still tell us that the subscriber made a call at that time. - time_col : str, default 'time' - The name of the column that identifies the time in the source table - e.g. 'time', 'date', 'start_time' etc. - Passed to flowmachine.JoinToLocation. Notes ----- @@ -202,6 +197,6 @@ def subscriber_locations( return subscriber_cells else: return JoinToLocation( - subscriber_cells, spatial_unit=spatial_unit, time_col=time_col + subscriber_cells, spatial_unit=spatial_unit, time_col="time" ) From cc657e4fc5ff0a2fb52c84ee60f0b7e2112a05da Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 20 Mar 2019 17:08:46 +0000 Subject: [PATCH 031/138] Define CellSpatialUnit class, with location_columns property --- .../flowmachine/core/join_to_location.py | 8 ++-- flowmachine/flowmachine/core/spatial_unit.py | 28 +++++++++++-- .../location/unique_subscriber_counts.py | 11 ++--- .../features/spatial/distance_matrix.py | 17 ++++---- .../features/subscriber/daily_location.py | 18 ++++---- .../features/subscriber/last_location.py | 9 ++-- .../subscriber/most_frequent_location.py | 9 ++-- .../utilities/subscriber_locations.py | 13 +++--- flowmachine/flowmachine/models/pwo.py | 4 +- flowmachine/tests/conftest.py | 14 +------ .../test_sql_strings_and_results.py | 8 ++-- flowmachine/tests/test_async.py | 13 +++--- flowmachine/tests/test_calldays.py | 6 ++- flowmachine/tests/test_daily_location.py | 8 ++-- flowmachine/tests/test_inoutflows.py | 4 +- flowmachine/tests/test_join_to_location.py | 42 ++++++++++++++----- flowmachine/tests/test_spatial_unit.py | 10 +++-- .../tests/test_unique_location_counts.py | 3 +- .../tests/test_unique_subscriber_counts.py | 7 ++-- .../test_daily_location_results.py | 5 ++- 20 files changed, 140 insertions(+), 97 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index 2d28a32d40..c1f32c108e 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -11,7 +11,7 @@ from typing import List from .query import Query -from .spatial_unit import BaseSpatialUnit +from .spatial_unit import CellSpatialUnit class JoinToLocation(Query): @@ -47,8 +47,10 @@ class JoinToLocation(Query): """ def __init__(self, left, *, spatial_unit, time_col="time"): - if not isinstance(spatial_unit, BaseSpatialUnit): - raise 
TypeError("spatial_unit must be a SpatialUnit object") + if isinstance(spatial_unit, CellSpatialUnit): + raise ValueError( + "CellSpatialUnit is not a valid spatial unit type for JoinToLocation" + ) self.spatial_unit = spatial_unit self.left = left self.time_col = time_col diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 4d4eb232e5..7f052cde10 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -6,6 +6,8 @@ Classes that map cells (or towers or sites) to a spatial unit. The available spatial units are: + CellSpatialUnit: + The identifier as found in the CDR. VersionedCellSpatialUnit: The identifier as found in the CDR combined with the version from the cells table. @@ -32,10 +34,28 @@ from .grid import Grid +class CellSpatialUnit: + """ + This class represents the case where no join of cell ID to other data is + required. As such, this class does not inherit from Query, is not a valid + parameter to JoinToLocation, and only exists to provide the + location_columns property and for consistency with the other spatial units. + """ + + _loc_cols = ("location_id",) + + @property + def location_columns(self) -> List[str]: + """ + List of the location-related column names. + """ + return list(self._loc_cols) + + class BaseSpatialUnit(Query, metaclass=ABCMeta): """ - Base class for all spatial units. Selects columns from the location table, - and optionally joins to data in another table. + Base class for all spatial units except CellSpatialUnit. Selects columns + from the location table, and optionally joins to data in another table. Parameters ---------- @@ -87,7 +107,9 @@ def __init__( super().__init__() - # TODO: Need a method to check whether the required data can be found in the DB + # TODO: Currently most spatial units require a FlowDB connection at init time. + # It would be useful to remove this requirement wherever possible, and instead + # implement a method to check whether the required data can be found in the DB. @property def location_columns(self) -> List[str]: diff --git a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py index 4fecf60784..dabfd9a699 100644 --- a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py @@ -17,6 +17,7 @@ """ from ...core.query import Query from ...core.mixins import GeoDataMixin +from ...core.spatial_unit import CellSpatialUnit from ..utilities.subscriber_locations import subscriber_locations @@ -34,11 +35,9 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, - default None + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default CellSpatialUnit() Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. Use None for no - location join (i.e. just the cell identifier in the CDR itself). + docstring of spatial_unit.py for more information. hours : tuple of ints, default 'all' subset the result within certain hours, e.g. 
(4,17) This will subset the query only with these hours, but @@ -71,7 +70,9 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): 2 Bajhang 285 """ - def __init__(self, start, stop, spatial_unit=None, hours="all", table="all"): + def __init__( + self, start, stop, spatial_unit=CellSpatialUnit(), hours="all", table="all" + ): """ """ diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 34bd5efda0..3ce1b0e6f7 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -9,10 +9,9 @@ """ from typing import List -from flowmachine.utils import get_columns_for_level from ...core.query import Query from ...core.mixins import GraphMixin -from ...core.spatial_unit import VersionedCellSpatialUnit +from ...core.spatial_unit import VersionedSiteSpatialUnit, VersionedCellSpatialUnit class DistanceMatrix(GraphMixin, Query): @@ -22,17 +21,16 @@ class DistanceMatrix(GraphMixin, Query): computation of distance travelled, area of influence, and other features. - This calls the SpatialUnit.distance_matrix_query method. - Note: this method is only implemented for the VersionedCellSpatialUnit and - VersionedSiteSpatialUnit at this time. + This is a wrapper around the SpatialUnit.distance_matrix_query method. Distance is returned in km. Parameters ---------- - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, default None + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default VersionedCellSpatialUnit() Locations to compute distances for. - If None, defaults to VersionedCellSpatialUnit(). + Note: only VersionedCellSpatialUnit and VersionedSiteSpatialUnit are + supported at this time. return_geometry : bool If True, geometries are returned in query @@ -47,6 +45,11 @@ def __init__(self, spatial_unit=None, return_geometry=False): self.spatial_unit = VersionedCellSpatialUnit() else: self.spatial_unit = spatial_unit + if type(self.spatial_unit) not in { + VersionedSiteSpatialUnit, + VersionedCellSpatialUnit, + }: + raise ValueError("Only point locations are supported at this time.") self.return_geometry = return_geometry super().__init__() diff --git a/flowmachine/flowmachine/features/subscriber/daily_location.py b/flowmachine/flowmachine/features/subscriber/daily_location.py index 9a86580013..106e03790d 100644 --- a/flowmachine/flowmachine/features/subscriber/daily_location.py +++ b/flowmachine/flowmachine/features/subscriber/daily_location.py @@ -21,7 +21,7 @@ def locate_subscribers( start, stop, - spatial_unit="default", + spatial_unit=None, hours="all", method="last", table="all", @@ -43,11 +43,10 @@ def locate_subscribers( start, stop : str iso format date range for the the time frame, e.g. 2016-01-01 or 2016-01-01 14:03:01 - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default AdminSpatialUnit(level=3) Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. Use None for no - location join (i.e. just the cell identifier in the CDR itself). + docstring of spatial_unit.py for more information. hours : tuple of ints, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -96,7 +95,7 @@ def locate_subscribers( . . 
""" - if spatial_unit == "default": + if spatial_unit is None: spatial_unit = AdminSpatialUnit(level=3) if method == "last": @@ -135,7 +134,7 @@ def daily_location( date, stop=None, *, - spatial_unit="default", + spatial_unit=None, hours="all", method="last", table="all", @@ -155,11 +154,10 @@ def daily_location( stop : str optionally specify a stop datetime in iso format date for the day in question, e.g. 2016-01-02 06:00:00 - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default AdminSpatialUnit(level=3) Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. Use None for no - location join (i.e. just the cell identifier in the CDR itself). + docstring of spatial_unit.py for more information. hours : tuple of ints, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -191,7 +189,7 @@ def daily_location( * Use 24 hr format! """ - if spatial_unit == "default": + if spatial_unit is None: spatial_unit = AdminSpatialUnit(level=3) if stop is None: # 'cast' the date object as a date diff --git a/flowmachine/flowmachine/features/subscriber/last_location.py b/flowmachine/flowmachine/features/subscriber/last_location.py index ea009e4149..b60bde680e 100644 --- a/flowmachine/flowmachine/features/subscriber/last_location.py +++ b/flowmachine/flowmachine/features/subscriber/last_location.py @@ -30,11 +30,10 @@ class LastLocation(BaseLocation, Query): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default AdminSpatialUnit(level=3) Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. Use None for no - location join (i.e. just the cell identifier in the CDR itself). + docstring of spatial_unit.py for more information. hours : tuple of ints, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -71,7 +70,7 @@ def __init__( self, start, stop, - spatial_unit="default", + spatial_unit=None, hours="all", table="all", subscriber_identifier="msisdn", @@ -83,7 +82,7 @@ def __init__( self.start = start self.stop = stop - if spatial_unit == "default": + if spatial_unit is None: self.spatial_unit = AdminSpatialUnit(level=3) else: self.spatial_unit = spatial_unit diff --git a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py index 8ddd784d41..5485816589 100644 --- a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py +++ b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py @@ -28,11 +28,10 @@ class MostFrequentLocation(BaseLocation, Query): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default AdminSpatialUnit(level=3) Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. Use None for no - location join (i.e. just the cell identifier in the CDR itself). + docstring of spatial_unit.py for more information. hours : tuple of int, default 'all' Subset the result within certain hours, e.g. 
(4,17) This will subset the query only with these hours, but @@ -69,7 +68,7 @@ def __init__( self, start, stop, - spatial_unit="default", + spatial_unit=None, hours="all", table="all", subscriber_identifier="msisdn", @@ -85,7 +84,7 @@ def __init__( self.start = start self.stop = stop - if spatial_unit == "default": + if spatial_unit is None: self.spatial_unit = AdminSpatialUnit(level=3) else: self.spatial_unit = spatial_unit diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index 589c809fe7..439bb36503 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -18,6 +18,7 @@ from ...core.query import Query from ...core.join_to_location import JoinToLocation +from ...core.spatial_unit import CellSpatialUnit logger = logging.getLogger("flowmachine").getChild(__name__) @@ -43,7 +44,7 @@ def __init__( self.table = table self.subscriber_identifier = subscriber_identifier self.ignore_nulls = ignore_nulls - self.spatial_unit = None + self.spatial_unit = CellSpatialUnit() self.tables = table cols = [self.subscriber_identifier, "datetime", "location_id"] @@ -119,7 +120,7 @@ def subscriber_locations( start, stop, *, - spatial_unit=None, + spatial_unit=CellSpatialUnit(), hours="all", table="all", subscriber_identifier="msisdn", @@ -137,11 +138,9 @@ def subscriber_locations( e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit or None, - default None + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default CellSpatialUnit() Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. Use None for no - location join (i.e. just the cell identifier in the CDR itself). + docstring of spatial_unit.py for more information. hours : tuple of ints, default 'all' subset the result within certain hours, e.g. 
(4,17) This will subset the query only with these hours, but @@ -193,7 +192,7 @@ def subscriber_locations( ignore_nulls=ignore_nulls, ) - if spatial_unit is None: + if isinstance(spatial_unit, CellSpatialUnit): return subscriber_cells else: return JoinToLocation( diff --git a/flowmachine/flowmachine/models/pwo.py b/flowmachine/flowmachine/models/pwo.py index 621b2ae580..d663333c49 100644 --- a/flowmachine/flowmachine/models/pwo.py +++ b/flowmachine/flowmachine/models/pwo.py @@ -237,7 +237,7 @@ class PopulationWeightedOpportunities(Model): """ def __init__( - self, start, stop, method="home-location", spatial_unit="default", **kwargs + self, start, stop, method="home-location", spatial_unit=None, **kwargs ): warnings.warn( @@ -250,7 +250,7 @@ def __init__( self.start = start self.stop = stop self.method = method - if spatial_unit == "default": + if spatial_unit is None: self.spatial_unit = VersionedSiteSpatialUnit() else: self.spatial_unit = spatial_unit diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 51cb44bc1a..13281c56e3 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -70,6 +70,7 @@ def exemplar_level_param(request): (AdminSpatialUnit, {"level": 2, "column_name": "admin2name"}), (VersionedSiteSpatialUnit, {}), (VersionedCellSpatialUnit, {}), + (CellSpatialUnit, {}), (LatLonSpatialUnit, {}), (GridSpatialUnit, {"size": 5}), ( @@ -84,19 +85,8 @@ def exemplar_level_param(request): "geom_col": "geom_point", }, ), - (lambda: None, {}), - ], - ids=[ - "admin2", - "admin2_column-name", - "versioned-site", - "versioned-cell", - "lat-lon", - "grid", - "polygon", - "polygon_geom-col", - "None", ], + ids=lambda x: x[0].__name__, ) def exemplar_spatial_unit_param(request): """ diff --git a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py index 79467e62cd..4afeaaa1b2 100644 --- a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py +++ b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py @@ -7,7 +7,7 @@ from approvaltests.approvals import verify from flowmachine.core import CustomQuery from flowmachine.features import daily_location -from flowmachine.core.spatial_unit import AdminSpatialUnit +from flowmachine.core.spatial_unit import AdminSpatialUnit, CellSpatialUnit def test_daily_location_1_sql(diff_reporter): @@ -82,7 +82,7 @@ def test_daily_location_3_sql(diff_reporter): ) dl = daily_location( "2016-01-05", - spatial_unit=None, + spatial_unit=CellSpatialUnit(), hours=(23, 5), method="last", # subscriber_identifier="imei", @@ -103,7 +103,7 @@ def test_daily_location_3_df(get_dataframe, diff_reporter): ) dl = daily_location( "2016-01-05", - spatial_unit=None, + spatial_unit=CellSpatialUnit(), hours=(23, 5), method="last", # subscriber_identifier="imei", @@ -160,7 +160,7 @@ def test_daily_location_5_sql(diff_reporter): ) dl = daily_location( "2016-01-05", - spatial_unit=None, + spatial_unit=CellSpatialUnit(), hours=(23, 5), method="last", # subscriber_identifier="imei", diff --git a/flowmachine/tests/test_async.py b/flowmachine/tests/test_async.py index 0d7190a906..cc7a8513f3 100644 --- a/flowmachine/tests/test_async.py +++ b/flowmachine/tests/test_async.py @@ -7,6 +7,7 @@ from threading import Thread import pandas as pd +from flowmachine.core.spatial_unit import CellSpatialUnit from flowmachine.utils import rlock @@ -34,7 +35,7 @@ def test_double_store(): Storing a query twice doesn't raise an error. 
""" - dl = daily_location("2016-01-01", spatial_unit=None) + dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) dl.store().result() dl.store().result() @@ -45,12 +46,12 @@ def test_store_async(): """ schema = "cache" - dl = daily_location("2016-01-01", spatial_unit=None) + dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) table_name = dl.fully_qualified_table_name.split(".")[1] store_future = dl.store() store_future.result() assert dl.connection.has_table(table_name, schema=schema) - dl = daily_location("2016-01-01", spatial_unit=None) + dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) assert table_name in dl.get_query() @@ -58,7 +59,7 @@ def test_get_query_blocks_on_store(): """ If a store is running get_query should block. """ - dl = daily_location("2016-01-01", spatial_unit=None) + dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) dl.store().result() timer = [] @@ -79,8 +80,8 @@ def test_blocks_on_store_cascades(): If a store is running on a query that is used in a another query, that query should wait. """ - dl = daily_location("2016-01-01", spatial_unit=None) - dl2 = daily_location("2016-01-02", spatial_unit=None) + dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) + dl2 = daily_location("2016-01-02", spatial_unit=CellSpatialUnit()) store_future = dl.store() store_future.result() hl = ModalLocation(dl, dl2) diff --git a/flowmachine/tests/test_calldays.py b/flowmachine/tests/test_calldays.py index 030d037573..640ac766cd 100644 --- a/flowmachine/tests/test_calldays.py +++ b/flowmachine/tests/test_calldays.py @@ -11,7 +11,7 @@ import pytest -from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit +from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit, CellSpatialUnit from flowmachine.features import CallDays, subscriber_locations import numpy as np @@ -70,6 +70,8 @@ def test_locations_are_only_repeated_once_per_subscriber(get_dataframe): Test that each location occurs only once per subscriber. """ - cd = CallDays(subscriber_locations("2016-01-01", "2016-01-03", spatial_unit=None)) + cd = CallDays( + subscriber_locations("2016-01-01", "2016-01-03", spatial_unit=CellSpatialUnit()) + ) df = get_dataframe(cd) assert not np.any(df.groupby(["subscriber", "location_id"]).count() > 1) diff --git a/flowmachine/tests/test_daily_location.py b/flowmachine/tests/test_daily_location.py index e177140d2f..fcc957ada6 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -5,7 +5,7 @@ import pytest from flowmachine.core.errors import MissingDateError -from flowmachine.core.spatial_unit import AdminSpatialUnit +from flowmachine.core.spatial_unit import AdminSpatialUnit, CellSpatialUnit from flowmachine.features import daily_location, MostFrequentLocation @@ -57,9 +57,9 @@ def test_hours(get_length): # Lower level test test that subsetdates handles this correctly # we're just testing that it is passed on in this case. 
- dl1 = daily_location("2016-01-01", spatial_unit=None) - dl2 = daily_location("2016-01-01", spatial_unit=None, hours=(19, 23)) - dl3 = daily_location("2016-01-01", spatial_unit=None, hours=(19, 20)) + dl1 = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) + dl2 = daily_location("2016-01-01", spatial_unit=CellSpatialUnit(), hours=(19, 23)) + dl3 = daily_location("2016-01-01", spatial_unit=CellSpatialUnit(), hours=(19, 20)) assert get_length(dl1) > get_length(dl2) > get_length(dl3) diff --git a/flowmachine/tests/test_inoutflows.py b/flowmachine/tests/test_inoutflows.py index a8741759ac..e0a3a24787 100644 --- a/flowmachine/tests/test_inoutflows.py +++ b/flowmachine/tests/test_inoutflows.py @@ -17,8 +17,8 @@ def test_inoutflow_with_double_column_location(): more than one column. """ - dl1 = daily_location("2016-01-01", spatial_unit=VersionedSiteSpatialUnit) - dl2 = daily_location("2016-01-02", spatial_unit=VersionedSiteSpatialUnit) + dl1 = daily_location("2016-01-01", spatial_unit=VersionedSiteSpatialUnit()) + dl2 = daily_location("2016-01-02", spatial_unit=VersionedSiteSpatialUnit()) flow = Flows(dl1, dl2) expected_columns = ["site_id_to", "version_to", "lon_to", "lat_to", "total"] diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 94f9bcefd6..15350efcab 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -11,6 +11,7 @@ from flowmachine.features import subscriber_locations from flowmachine.core import JoinToLocation from flowmachine.core.spatial_unit import ( + CellSpatialUnit, AdminSpatialUnit, VersionedSiteSpatialUnit, VersionedCellSpatialUnit, @@ -22,15 +23,26 @@ def test_join_to_location_column_names(exemplar_spatial_unit_param): """ Test that JoinToLocation's column_names property is accurate.""" - if exemplar_spatial_unit_param is None: - pytest.skip( - "JoinToLocation does not accept spatial_unit=None (i.e. no location join)" - ) - table = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) + if isinstance(spatial_unit, CellSpatialUnit): + pytest.skip("JoinToLocation does not accept CellSpatialUnit objects") + table = subscriber_locations( + "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + ) joined = JoinToLocation(table, spatial_unit=exemplar_spatial_unit_param) assert joined.head(0).columns.tolist() == joined.column_names +def test_join_to_location_raises_value_error(): + """ + Test that JoinToLocation raises a ValueError if spatial_unit==CellSpatialUnit(). + """ + with pytest.raises(ValueError): + table = subscriber_locations( + "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + ) + joined = JoinToLocation(table, spatial_unit=CellSpatialUnit()) + + moving_sites = [ "N0tfEoYN", "yPANTB8f", @@ -49,7 +61,9 @@ def test_join_with_versioned_cells(get_dataframe, get_length): """ Test that flowmachine.JoinToLocation can fetch the cell version. 
""" - ul = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) + ul = subscriber_locations( + "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + ) df = get_dataframe(JoinToLocation(ul, spatial_unit=VersionedCellSpatialUnit())) # As our database is complete we should not drop any rows assert len(df) == get_length(ul) @@ -72,7 +86,9 @@ def test_join_with_lat_lon(get_dataframe): """ Test that flowmachine.JoinToLocation can get the lat-lon values of the cell """ - ul = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) + ul = subscriber_locations( + "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + ) df = get_dataframe(JoinToLocation(ul, spatial_unit=LatLonSpatialUnit())) expected_cols = sorted(["subscriber", "time", "location_id", "lat", "lon"]) @@ -96,7 +112,9 @@ def test_join_with_polygon(get_dataframe, get_length): Test that flowmachine.JoinToLocation can get the (arbitrary) polygon of each cell. """ - ul = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) + ul = subscriber_locations( + "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + ) j = JoinToLocation( ul, spatial_unit=PolygonSpatialUnit( @@ -116,7 +134,9 @@ def test_join_to_admin(get_dataframe, get_length): """ Test that flowmachine.JoinToLocation can join to a admin region. """ - ul = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) + ul = subscriber_locations( + "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + ) df = get_dataframe(JoinToLocation(ul, spatial_unit=AdminSpatialUnit(level=3))) assert len(df) == get_length(ul) expected_cols = sorted(["subscriber", "time", "location_id", "pcod"]) @@ -127,6 +147,8 @@ def test_join_to_grid(get_dataframe, get_length): """ Test that we can join to a grid square """ - ul = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=None) + ul = subscriber_locations( + "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + ) df = get_dataframe(JoinToLocation(ul, spatial_unit=GridSpatialUnit(size=50))) assert len(df) == get_length(ul) diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 90ca34093d..4b0c65ce9e 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -19,8 +19,10 @@ def test_spatial_unit_column_names(exemplar_spatial_unit_param): """ Test that the SpatialUnit classes have accurate column_names properties. """ - if exemplar_spatial_unit_param is None: - pytest.skip("None is not a SpatialUnit object") + if isinstance(spatial_unit, CellSpatialUnit): + pytest.skip( + "CellSpatialUnit does not have a column_names property (not a Query)" + ) su = exemplar_spatial_unit_param assert su.head(0).columns.tolist() == su.column_names @@ -104,8 +106,8 @@ def test_geo_augment_columns(exemplar_spatial_unit_param): """ Test that the columns returned by the geo_augment method are correct. 
""" - if exemplar_spatial_unit_param is None: - pytest.skip("None is not a SpatialUnit object") + if isinstance(spatial_unit, CellSpatialUnit): + pytest.skip("CellSpatialUnit does not have a geo_augment method") su = exemplar_spatial_unit_param sql, cols = su.geo_augment(su) cq = CustomQuery(sql, cols) diff --git a/flowmachine/tests/test_unique_location_counts.py b/flowmachine/tests/test_unique_location_counts.py index a448f2bfcc..7afa597a07 100644 --- a/flowmachine/tests/test_unique_location_counts.py +++ b/flowmachine/tests/test_unique_location_counts.py @@ -5,6 +5,7 @@ import pytest from flowmachine.core.errors import BadLevelError +from flowmachine.core.spatial_unit import CellSpatialUnit from flowmachine.features import UniqueLocationCounts, subscriber_locations @@ -32,7 +33,7 @@ def test_correct_counts(get_dataframe): df = get_dataframe(ulc) dful = get_dataframe( subscriber_locations( - "2016-01-01", "2016-01-02", spatial_unit=None, hours=(5, 17) + "2016-01-01", "2016-01-02", spatial_unit=CellSpatialUnit(), hours=(5, 17) ) ) assert [ diff --git a/flowmachine/tests/test_unique_subscriber_counts.py b/flowmachine/tests/test_unique_subscriber_counts.py index e91f9f6920..3ccfedaa1d 100644 --- a/flowmachine/tests/test_unique_subscriber_counts.py +++ b/flowmachine/tests/test_unique_subscriber_counts.py @@ -7,9 +7,10 @@ """ import pytest + +from flowmachine.core.spatial_unit import CellSpatialUnit from flowmachine.features import UniqueSubscriberCounts from flowmachine.features.utilities import subscriber_locations -from flowmachine.core.errors import BadLevelError @pytest.mark.usefixtures("skip_datecheck") @@ -28,12 +29,12 @@ def test_correct_counts(get_dataframe): UniqueLocationCounts returns correct counts. """ usc = UniqueSubscriberCounts( - "2016-01-01", "2016-01-02", spatial_unit=None, hours=(5, 17) + "2016-01-01", "2016-01-02", spatial_unit=CellSpatialUnit(), hours=(5, 17) ) df = get_dataframe(usc) dful = get_dataframe( subscriber_locations( - "2016-01-01", "2016-01-02", spatial_unit=None, hours=(5, 17) + "2016-01-01", "2016-01-02", spatial_unit=CellSpatialUnit(), hours=(5, 17) ) ) assert [ diff --git a/integration_tests/tests/flowmachine_tests/test_daily_location_results.py b/integration_tests/tests/flowmachine_tests/test_daily_location_results.py index edf98b87a4..8f48658759 100644 --- a/integration_tests/tests/flowmachine_tests/test_daily_location_results.py +++ b/integration_tests/tests/flowmachine_tests/test_daily_location_results.py @@ -6,6 +6,7 @@ from approvaltests.approvals import verify from flowmachine.core import CustomQuery +from flowmachine.core.spatial_unit import CellSpatialUnit from flowmachine.features import daily_location @@ -19,7 +20,7 @@ def test_daily_location_1_sql(diff_reporter): ) dl = daily_location( "2016-01-05", - spatial_unit=None, + spatial_unit=CellSpatialUnit(), hours=(23, 5), method="last", subscriber_subset=subset_query, @@ -40,7 +41,7 @@ def test_daily_location_1_df(get_dataframe, diff_reporter): ) dl = daily_location( "2016-01-05", - spatial_unit=None, + spatial_unit=CellSpatialUnit(), hours=(23, 5), method="last", subscriber_subset=subset_query, From 699c44f00604a8c536ee67fc44b302b6d6834c69 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 20 Mar 2019 17:41:47 +0000 Subject: [PATCH 032/138] Update TotalNetworkObjects and AggregateNetworkObjects --- .../flowmachine/core/join_to_location.py | 1 + .../features/network/total_network_objects.py | 162 ++++++++---------- .../tests/test_total_network_objects.py | 11 +- 3 files changed, 77 
insertions(+), 97 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index c1f32c108e..de8815902f 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -48,6 +48,7 @@ class JoinToLocation(Query): def __init__(self, left, *, spatial_unit, time_col="time"): if isinstance(spatial_unit, CellSpatialUnit): + # Nothing to join in this case raise ValueError( "CellSpatialUnit is not a valid spatial unit type for JoinToLocation" ) diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index 6d82bf3c84..11e895542c 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -14,8 +14,13 @@ from ...core.mixins import GeoDataMixin from ...core import JoinToLocation -from flowmachine.utils import get_columns_for_level from ...core.query import Query +from ...core.spatial_unit import ( + CellSpatialUnit, + VersionedSiteSpatialUnit, + VersionedCellSpatialUnit, + AdminSpatialUnit, +) from ..utilities import EventsTablesUnion valid_stats = {"avg", "max", "min", "median", "mode", "stddev", "variance"} @@ -39,11 +44,12 @@ class TotalNetworkObjects(GeoDataMixin, Query): Either 'calls', 'sms', or other table under `events.*`. If no specific table is provided this will collect statistics from all tables. - network_object : {'cell', 'versioned-cell', 'versioned-site'} - Objects to track, defaults to 'cells', the unversioned lowest + network_object : {Cell,VersionedCell,VersionedSite}SpatialUnit, default CellSpatialUnit() + Objects to track, defaults to CellSpatialUnit(), the unversioned lowest level of infrastructure available. - level : {'adminN', 'grid', 'polygon'} - Level to facet on, defaults to 'admin0' + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, + default AdminSpatialUnit(level=0) + Spatial unit to facet on. 
Other Parameters ---------------- @@ -67,17 +73,12 @@ def __init__( *, table="all", period="day", - network_object="cell", - level="admin0", - column_name=None, - size=None, - polygon_table=None, - geom_col="geom", + network_object=CellSpatialUnit(), + spatial_unit=None, hours="all", subscriber_subset=None, subscriber_identifier="msisdn", ): - self.table = table.lower() self.start = ( self.connection.min_date(table=table).strftime("%Y-%m-%d") if start is None @@ -88,55 +89,51 @@ def __init__( if stop is None else stop ) + + self.table = table.lower() if self.table != "all" and not self.table.startswith("events"): self.table = "events.{}".format(self.table) - self.network_object = network_object.lower() - if self.network_object == "cell": - events = EventsTablesUnion( - self.start, - self.stop, - tables=self.table, - columns=["location_id", "datetime"], - hours=hours, - subscriber_subset=subscriber_subset, - subscriber_identifier=subscriber_identifier, + + allowed_network_object_types = [ + CellSpatialUnit, + VersionedCellSpatialUnit, + VersionedSiteSpatialUnit, + ] + + self.network_object = network_object + if type(self.network_object) not in allowed_network_object_types: + raise ValueError( + "{} is not a valid network object type.".format(type(network_object)) ) - elif self.network_object in {"versioned-cell", "versioned-site"}: - events = EventsTablesUnion( - self.start, - self.stop, - tables=self.table, - columns=["location_id", "datetime"], - hours=hours, - subscriber_subset=subscriber_subset, - subscriber_identifier=subscriber_identifier, + + if spatial_unit is None: + self.spatial_unit = AdminSpatialUnit(level=0) + else: + self.spatial_unit = spatial_unit + if type(self.spatial_unit) in allowed_network_object_types: + # No sense in aggregating network object to network object + raise ValueError( + "{} is not a valid spatial unit type for TotalNetworkObjects".format( + type(self.spatial_unit) + ) ) + + events = EventsTablesUnion( + self.start, + self.stop, + tables=self.table, + columns=["location_id", "datetime"], + hours=hours, + subscriber_subset=subscriber_subset, + subscriber_identifier=subscriber_identifier, + ) + if not isinstance(self.network_object, CellSpatialUnit): events = JoinToLocation( - events, - level=self.network_object, - time_col="datetime", - column_name=column_name, - size=size, - polygon_table=polygon_table, - geom_col=geom_col, + events, spatial_unit=self.network_object, time_col="datetime" ) - else: - raise ValueError("{} is not a valid network object.".format(network_object)) - self.level = level.lower() - if self.level in { - "cell", - "versioned-cell", - "versioned-site", - }: # No sense in aggregating network object to - raise ValueError("{} is not a valid level".format(level)) # network object + self.joined = JoinToLocation( - events, - level=level, - time_col="datetime", - column_name=column_name, - size=size, - polygon_table=polygon_table, - geom_col=geom_col, + events, spatial_unit=self.spatial_unit, time_col="datetime" ) self.period = period.lower() if self.period not in valid_periods: @@ -145,10 +142,6 @@ def __init__( # FIXME: we are only storing these here so that they can be accessed by # AggregateNetworkObjects.from_total_network_objects() below. This # should be refactored soon. 
- self.column_name = column_name - self.size = size - self.polygon_table = polygon_table - self.geom_col = geom_col self.hours = hours self.subscriber_subset = subscriber_subset self.subscriber_identifier = subscriber_identifier @@ -157,16 +150,11 @@ def __init__( @property def column_names(self) -> List[str]: - return get_columns_for_level(self.level, self.joined.column_name) + [ - "total", - "datetime", - ] + return self.spatial_unit.location_columns + ["total", "datetime"] def _make_query(self): - cols = ",".join(get_columns_for_level(self.network_object)) - group_cols = ",".join( - get_columns_for_level(self.level, self.joined.column_name) - ) + cols = ",".join(self.network_object.location_columns) + group_cols = ",".join(self.spatial_unit.column_names) sql = """ SELECT {group_cols}, COUNT(*) as total, datetime FROM @@ -225,11 +213,12 @@ class AggregateNetworkObjects(GeoDataMixin, Query): statistics from all tables. statistic : {'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'} Statistic to calculate, defaults to 'avg'. - object : {'cell', 'versioned-cell', 'versioned-site'} - Objects to track, defaults to 'cells', the unversioned lowest + network_object : {Cell,VersionedCell,VersionedSite}SpatialUnit, default CellSpatialUnit() + Objects to track, defaults to CellSpatialUnit(), the unversioned lowest level of infrastructure available. - level : {'adminN', 'grid', 'polygon'} - Level to facet at, defaults to 'admin0' + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, + default AdminSpatialUnit(level=0) + Spatial unit to facet on. Other Parameters ---------------- @@ -257,12 +246,8 @@ def __init__( table="all", period="day", by=None, - network_object="cell", - level="admin0", - column_name=None, - size=None, - polygon_table=None, - geom_col="geom", + network_object=CellSpatialUnit(), + spatial_unit=None, hours="all", subscriber_subset=None, subscriber_identifier="msisdn", @@ -273,11 +258,7 @@ def __init__( table=table, period=period, network_object=network_object, - level=level, - column_name=column_name, - size=size, - polygon_table=polygon_table, - geom_col=geom_col, + spatial_unit=spatial_unit, hours=hours, subscriber_subset=subscriber_subset, subscriber_identifier=subscriber_identifier, @@ -339,11 +320,7 @@ def from_total_network_objects(cls, total_objs, statistic, by): network_object=total_objs.network_object, statistic=statistic, by=by, - level=total_objs.level, - column_name=total_objs.column_name, - size=total_objs.size, - polygon_table=total_objs.polygon_table, - geom_col=total_objs.geom_col, + spatial_unit=total_objs.spatial_unit, hours=total_objs.hours, subscriber_subset=total_objs.subscriber_subset, subscriber_identifier=total_objs.subscriber_identifier, @@ -351,16 +328,13 @@ def from_total_network_objects(cls, total_objs, statistic, by): @property def column_names(self) -> List[str]: - return get_columns_for_level( - self.total_objs.level, self.total_objs.joined.column_name - ) + [self.statistic, "datetime"] + return self.total_objs.spatial_unit.location_columns + [ + self.statistic, + "datetime", + ] def _make_query(self): - group_cols = ",".join( - get_columns_for_level( - self.total_objs.level, self.total_objs.joined.column_name - ) - ) + group_cols = ",".join(self.total_objs.spatial_unit.location_columns) sql = """ SELECT {group_cols}, {stat}(z.total) as {stat}, date_trunc('{by}', z.datetime) as datetime FROM diff --git a/flowmachine/tests/test_total_network_objects.py b/flowmachine/tests/test_total_network_objects.py index 9601767330..bdbc144148 
100644 --- a/flowmachine/tests/test_total_network_objects.py +++ b/flowmachine/tests/test_total_network_objects.py @@ -10,6 +10,7 @@ import pytest +from flowmachine.core.spatial_unit import CellSpatialUnit, VersionedSiteSpatialUnit import flowmachine.features.network as network @@ -33,7 +34,11 @@ def test_count_returns_correct_values(get_dataframe): @pytest.mark.parametrize( "bad_arg, bad_val", - [("period", "BAD_PERIOD"), ("level", "cell"), ("network_object", "BAD_OBJECT")], + [ + ("period", "BAD_PERIOD"), + ("spatial_unit", CellSpatialUnit()), + ("network_object", "BAD_OBJECT"), + ], ) def test_bad_params(bad_arg, bad_val): """Test value errors are raised for bad params""" @@ -57,7 +62,7 @@ def test_median_returns_correct_values(get_dataframe): """ instance = network.TotalNetworkObjects( - table="calls", period="hour", network_object="versioned-site" + table="calls", period="hour", network_object=VersionedSiteSpatialUnit() ).aggregate(by="day", statistic="median") # @@ -77,7 +82,7 @@ def test_mean_returns_correct_values(get_dataframe): start="2016-01-01", stop="2016-12-30", period="hour", - network_object="versioned-site", + network_object=VersionedSiteSpatialUnit(), ).aggregate(by="day") # From f7763f5ac2a91ba7e283281d6f2bc3c4e317e5ef Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 20 Mar 2019 17:46:07 +0000 Subject: [PATCH 033/138] Fix tests for JoinToLocation and spatial_unit.py --- flowmachine/tests/conftest.py | 1 + flowmachine/tests/test_join_to_location.py | 2 +- flowmachine/tests/test_spatial_unit.py | 5 +++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 13281c56e3..023e244b85 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -18,6 +18,7 @@ from flowmachine.core import Query from flowmachine.core.cache import reset_cache from flowmachine.core.spatial_unit import ( + CellSpatialUnit, LatLonSpatialUnit, VersionedCellSpatialUnit, VersionedSiteSpatialUnit, diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 15350efcab..59afa9fcef 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -23,7 +23,7 @@ def test_join_to_location_column_names(exemplar_spatial_unit_param): """ Test that JoinToLocation's column_names property is accurate.""" - if isinstance(spatial_unit, CellSpatialUnit): + if isinstance(exemplar_spatial_unit_param, CellSpatialUnit): pytest.skip("JoinToLocation does not accept CellSpatialUnit objects") table = subscriber_locations( "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 4b0c65ce9e..febea3f643 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -5,6 +5,7 @@ from flowmachine.core import CustomQuery from flowmachine.core.spatial_unit import ( BaseSpatialUnit, + CellSpatialUnit, LatLonSpatialUnit, VersionedCellSpatialUnit, VersionedSiteSpatialUnit, @@ -19,7 +20,7 @@ def test_spatial_unit_column_names(exemplar_spatial_unit_param): """ Test that the SpatialUnit classes have accurate column_names properties. 
""" - if isinstance(spatial_unit, CellSpatialUnit): + if isinstance(exemplar_spatial_unit_param, CellSpatialUnit): pytest.skip( "CellSpatialUnit does not have a column_names property (not a Query)" ) @@ -106,7 +107,7 @@ def test_geo_augment_columns(exemplar_spatial_unit_param): """ Test that the columns returned by the geo_augment method are correct. """ - if isinstance(spatial_unit, CellSpatialUnit): + if isinstance(exemplar_spatial_unit_param, CellSpatialUnit): pytest.skip("CellSpatialUnit does not have a geo_augment method") su = exemplar_spatial_unit_param sql, cols = su.geo_augment(su) From 8611cc1da517d05eb4fc34fbef6281fefc0653c5 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 29 May 2019 17:40:44 +0100 Subject: [PATCH 034/138] Add comment to explain reason for using GeoTable --- flowmachine/flowmachine/core/spatial_unit.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 7f052cde10..d04ae0d85d 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -396,6 +396,8 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): if isinstance(polygon_table, Query): self.polygon_table = polygon_table else: + # Creating a GeoTable object here means that we don't have to handle + # admin tables and Grid objects differently in self.geo_augment self.polygon_table = GeoTable(name=polygon_table, geom_column=geom_col) self.geom_col = geom_col From 3f679e04622ff5b21d6875ad8dbabedd695aa307 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 29 May 2019 17:41:16 +0100 Subject: [PATCH 035/138] Add 'location_joined_query' helper function --- .../flowmachine/core/join_to_location.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index 04a7581658..fc7f3ce9c8 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -104,3 +104,32 @@ def _make_query(self): """ return sql + + +def location_joined_query(left, *, spatial_unit, time_col="time"): + """ + Helper function which returns JoinToLocation(left_query, spatial_unit, time_col) + unless type(spatial_unit)==CellSpatialUnit, in which case this returns left_query. + + Parameters + ---------- + left : flowmachine.Query + This represents a table that can be joined to the cell information + table. This must have a date column (called time) and a location column + call 'location_id'. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit + A query which maps cell identifiers in the CDR to a different spatial + unit (e.g. versioned site or admin region) + time_col : str, default 'time' + The name of the column that identifies the time in the source table + e.g. 'time', 'date', 'start_time' etc. 
+ + Returns + ------- + flowmachine.Query + Either a JoinToLocation object, or the input parameter 'left' + """ + if isinstance(spatial_unit, CellSpatialUnit): + return left + else: + return JoinToLocation(left, spatial_unit=spatial_unit, time_col=time_col) From 6790a01cac61bca9ed7d265b467a5eddac125f12 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 30 May 2019 10:17:17 +0100 Subject: [PATCH 036/138] Add test for location_joined_query --- flowmachine/flowmachine/core/__init__.py | 3 ++- flowmachine/tests/test_join_to_location.py | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/core/__init__.py b/flowmachine/flowmachine/core/__init__.py index dd1380c1fc..ca63077409 100644 --- a/flowmachine/flowmachine/core/__init__.py +++ b/flowmachine/flowmachine/core/__init__.py @@ -13,7 +13,7 @@ from .geotable import GeoTable from .init import connect from .logging import init_logging, set_log_level -from .join_to_location import JoinToLocation +from .join_to_location import JoinToLocation, location_joined_query from .custom_query import CustomQuery from .grid import Grid @@ -26,6 +26,7 @@ "Connection", "connect", "JoinToLocation", + "location_joined_query", "CustomQuery", "Grid", ] diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 59afa9fcef..2e7122b84c 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -9,7 +9,7 @@ import numpy as np from flowmachine.features import subscriber_locations -from flowmachine.core import JoinToLocation +from flowmachine.core import JoinToLocation, location_joined_query from flowmachine.core.spatial_unit import ( CellSpatialUnit, AdminSpatialUnit, @@ -152,3 +152,19 @@ def test_join_to_grid(get_dataframe, get_length): ) df = get_dataframe(JoinToLocation(ul, spatial_unit=GridSpatialUnit(size=50))) assert len(df) == get_length(ul) + + +def test_location_joined_query_return_type(exemplar_spatial_unit_param): + """ + Test that location_joined_query(query, spatial_unit) returns a + JoinToLocation object when spatial_unit != CellSpatialUnit(), and returns + query when spatial_unit == CellSpatialUnit(). 
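
A short usage sketch of the dispatch behaviour this test exercises (dates are illustrative; the imports are the ones used in the test files in this series):

from flowmachine.core import JoinToLocation, location_joined_query
from flowmachine.core.spatial_unit import AdminSpatialUnit, CellSpatialUnit
from flowmachine.features import subscriber_locations

locs = subscriber_locations("2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit())
# A CellSpatialUnit needs no join, so the original query is returned unchanged...
assert location_joined_query(locs, spatial_unit=CellSpatialUnit()) is locs
# ...while any other spatial unit is wrapped in a JoinToLocation.
joined = location_joined_query(locs, spatial_unit=AdminSpatialUnit(level=3))
assert isinstance(joined, JoinToLocation)
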
+ """ + table = subscriber_locations( + "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + ) + joined = location_joined_query(table, spatial_unit=exemplar_spatial_unit_param) + if isinstance(exemplar_spatial_unit_param, CellSpatialUnit): + assert joined is table + else: + assert isinstance(joined, JoinToLocation) From a666bab54de817b91da3260bba7a843879994a57 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 30 May 2019 15:15:32 +0100 Subject: [PATCH 037/138] Don't pass "radius" parameter to subscriber_locations --- .../features/subscriber/first_location.py | 2 - .../features/subscriber/last_location.py | 2 - .../subscriber/most_frequent_location.py | 2 - .../subscriber/subscriber_location_cluster.py | 38 +++++++++++-------- .../subscriber/unique_location_counts.py | 2 - 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/first_location.py b/flowmachine/flowmachine/features/subscriber/first_location.py index 5eb8103ffb..b9cda63143 100644 --- a/flowmachine/flowmachine/features/subscriber/first_location.py +++ b/flowmachine/flowmachine/features/subscriber/first_location.py @@ -67,7 +67,6 @@ def __init__( subscriber_subset=None, polygon_table=None, size=None, - radius=None, ): """ @@ -95,7 +94,6 @@ def __init__( subscriber_subset=subscriber_subset, polygon_table=polygon_table, size=size, - radius=radius, ) self.table = self.ul.table diff --git a/flowmachine/flowmachine/features/subscriber/last_location.py b/flowmachine/flowmachine/features/subscriber/last_location.py index b60bde680e..13ef401a2f 100644 --- a/flowmachine/flowmachine/features/subscriber/last_location.py +++ b/flowmachine/flowmachine/features/subscriber/last_location.py @@ -77,7 +77,6 @@ def __init__( *, ignore_nulls=True, subscriber_subset=None, - radius=None, ): self.start = start @@ -98,7 +97,6 @@ def __init__( subscriber_identifier=self.subscriber_identifier, ignore_nulls=ignore_nulls, subscriber_subset=subscriber_subset, - radius=radius, ) super().__init__() diff --git a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py index 5485816589..0f4877beb3 100644 --- a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py +++ b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py @@ -75,7 +75,6 @@ def __init__( *, ignore_nulls=True, subscriber_subset=None, - radius=None, ): """ @@ -100,7 +99,6 @@ def __init__( subscriber_identifier=self.subscriber_identifier, ignore_nulls=ignore_nulls, subscriber_subset=subscriber_subset, - radius=radius, ) super().__init__() diff --git a/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py b/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py index 1f47e67fc1..deafc5d47c 100644 --- a/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py +++ b/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py @@ -37,6 +37,8 @@ def subscriber_location_cluster( hours="all", table="all", subscriber_identifier="msisdn", + subscriber_subset=None, + ignore_nulls=True, **kwargs, ): """ @@ -49,6 +51,11 @@ def subscriber_location_cluster( Parameters ---------- + method : str + 'hartigan': + Uses the Hartigan Clustering algorithm which clusters locations + based on a ranked listed of call days. This method requires + that `radius` be specified and optionally the `call_threshold`. 
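
For context, a hedged sketch of calling the Hartigan path described above (parameter values are illustrative, the module path is taken from the file being edited here, and the exact call signature is an assumption):

# Assumed call pattern: radius is required for method="hartigan";
# call_threshold and buffer are optional and are popped from **kwargs.
from flowmachine.features.subscriber.subscriber_location_cluster import (
    subscriber_location_cluster,
)

clusters = subscriber_location_cluster(
    method="hartigan",
    start="2016-01-01",
    stop="2016-01-07",
    radius=1.0,
    call_threshold=2,
)
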
start : str ISO format date range for the beginning of the time frame, e.g. 2016-01-01 or 2016-01-01 14:03:01 @@ -59,21 +66,22 @@ def subscriber_location_cluster( This will subset the query only with these hours, but across all specified days. Or set to 'all' to include all hours. + table : str, default 'all' + Schema qualified name of the table which the analysis is based + upon. If 'all' it will pull together all of the tables specified as + flowmachine.yml under 'location_tables'. subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn' Either msisdn, or imei, the column that identifies the subscriber. subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. - method : str - 'hartigan': - Uses the Hartigan Clustering algorithm which clusters locations - based on a ranked listed of call days. This method requires - that `radius` be specified and optionally the `call_threshold`. - table : str, default 'all' - Schema qualified name of the table which the analysis is based - upon. If 'all' it will pull together all of the tables specified as - flowmachine.yml under 'location_tables'. + ignore_nulls : bool, default True + ignores those values that are null. Sometime data appears for which + the cell is null. If set to true this will ignore those lines. If false + these lines with null cells should still be present, although they contain + no information on the subscribers location, they still tell us that the subscriber made + a call at that time. Other Parameters ---------------- @@ -113,7 +121,9 @@ def subscriber_location_cluster( raise ValueError("Unidentified clustering method: {}".format(method)) if method == "hartigan": - if "radius" not in kwargs: + try: + radius = kwargs.pop("radius") + except KeyError: raise ValueError("`radius` must be defined when using method: `hartigan`") call_threshold = kwargs.pop("call_threshold", 0) buffer = kwargs.pop("buffer", 0) @@ -126,15 +136,13 @@ def subscriber_location_cluster( level="versioned-site", table=table, subscriber_identifier=subscriber_identifier, - **kwargs, + ignore_nulls=ignore_nulls, + subscriber_subset=subscriber_subset, ) ) return HartiganCluster( - calldays=cd, - radius=kwargs["radius"], - buffer=buffer, - call_threshold=call_threshold, + calldays=cd, radius=radius, buffer=buffer, call_threshold=call_threshold ) diff --git a/flowmachine/flowmachine/features/subscriber/unique_location_counts.py b/flowmachine/flowmachine/features/subscriber/unique_location_counts.py index f0696fc8d5..6db14ffe25 100644 --- a/flowmachine/flowmachine/features/subscriber/unique_location_counts.py +++ b/flowmachine/flowmachine/features/subscriber/unique_location_counts.py @@ -101,7 +101,6 @@ def __init__( subscriber_subset=None, polygon_table=None, size=None, - radius=None, ): self.ul = subscriber_locations( @@ -116,7 +115,6 @@ def __init__( subscriber_subset=subscriber_subset, polygon_table=polygon_table, size=size, - radius=radius, ) super().__init__() From 08363b3a80fd943e53b3a5ecf44b7661d8d5e4cd Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 30 May 2019 15:18:36 +0100 Subject: [PATCH 038/138] Remove "radius" argument from daily_location --- .../flowmachine/features/subscriber/daily_location.py | 5 ----- 1 file changed, 5 deletions(-) diff --git 
a/flowmachine/flowmachine/features/subscriber/daily_location.py b/flowmachine/flowmachine/features/subscriber/daily_location.py index d84394234f..82656fded9 100644 --- a/flowmachine/flowmachine/features/subscriber/daily_location.py +++ b/flowmachine/flowmachine/features/subscriber/daily_location.py @@ -29,7 +29,6 @@ def locate_subscribers( *, ignore_nulls=True, subscriber_subset=None, - radius=None, ): """ Return a class representing the location of an individual. This can be called @@ -108,7 +107,6 @@ def locate_subscribers( subscriber_identifier=subscriber_identifier, ignore_nulls=ignore_nulls, subscriber_subset=subscriber_subset, - radius=radius, ) elif method == "most-common": return MostFrequentLocation( @@ -120,7 +118,6 @@ def locate_subscribers( subscriber_identifier=subscriber_identifier, ignore_nulls=ignore_nulls, subscriber_subset=subscriber_subset, - radius=radius, ) # elif self.method == 'first': # _obj = FirstLocation(start, stop, spatial_unit, hours) @@ -141,7 +138,6 @@ def daily_location( subscriber_identifier="msisdn", ignore_nulls=True, subscriber_subset=None, - radius=None, ): """ Return a query for locating all subscribers on a single day of data. @@ -214,5 +210,4 @@ def daily_location( subscriber_identifier=subscriber_identifier, ignore_nulls=ignore_nulls, subscriber_subset=subscriber_subset, - radius=radius, ) From 89e3cec24e31c925c73d878fb81059acbe0d7097 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 30 May 2019 16:14:25 +0100 Subject: [PATCH 039/138] Fix TotalNetworkObjects --- .../features/network/total_network_objects.py | 2 +- .../tests/test_total_network_objects.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index 881fadb06b..5c6ba62b11 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -148,7 +148,7 @@ def column_names(self) -> List[str]: def _make_query(self): cols = self.network_object.location_columns - group_cols = self.spatial_unit.column_names + group_cols = self.spatial_unit.location_columns for column in group_cols: if column in cols: cols.remove(column) diff --git a/flowmachine/tests/test_total_network_objects.py b/flowmachine/tests/test_total_network_objects.py index 24d74e272c..027c52b836 100644 --- a/flowmachine/tests/test_total_network_objects.py +++ b/flowmachine/tests/test_total_network_objects.py @@ -10,8 +10,12 @@ import pytest -from flowmachine.core.spatial_unit import CellSpatialUnit, VersionedSiteSpatialUnit -import flowmachine.features.network as network +from flowmachine.core.spatial_unit import ( + CellSpatialUnit, + VersionedCellSpatialUnit, + VersionedSiteSpatialUnit, + LatLonSpatialUnit, +) from flowmachine.features import TotalNetworkObjects, AggregateNetworkObjects @@ -22,8 +26,8 @@ def test_tno_at_lat_lng(get_dataframe): tno = TotalNetworkObjects( start="2016-01-01", stop="2016-01-07", - network_object="versioned-cell", - level="lat-lon", + network_object=VersionedCellSpatialUnit(), + spatial_unit=LatLonSpatialUnit(), ) assert tno.get_dataframe().sum().value == 330 @@ -45,8 +49,8 @@ def test_aggregate_returns_correct_values(stat, expected, get_dataframe): AggregateNetworkObjects returns correct values. 
""" - instance = network.AggregateNetworkObjects( - total_network_objects=network.TotalNetworkObjects( + instance = AggregateNetworkObjects( + total_network_objects=TotalNetworkObjects( start="2016-01-01", stop="2016-12-30", table="calls", total_by="hour" ), statistic=stat, @@ -66,7 +70,7 @@ def test_count_returns_correct_values(get_dataframe): TotalNetworkObjects returns correct values. """ - instance = network.TotalNetworkObjects( + instance = TotalNetworkObjects( start="2016-01-01", stop="2016-12-30", table="calls", total_by="hour" ) df = get_dataframe(instance) From ce7ee87c02c368ac8f73257a881cd19296d716a5 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 30 May 2019 17:10:40 +0100 Subject: [PATCH 040/138] Use location_joined_query in TotalNetworkObjects and subscriber_locations --- .../flowmachine/core/join_to_location.py | 4 +++ .../features/network/total_network_objects.py | 34 ++++++++----------- .../utilities/subscriber_locations.py | 11 +++--- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index fc7f3ce9c8..a7770a64d8 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -7,6 +7,10 @@ the joining of any query with cell/site information to another spatial level, such as a grid or an admin region. +No join is required if the spatial unit is CellSpatialUnit, +so we also define the helper function location_joined_query +to return a JoinToLocation object if a join is required, or +the original query object otherwise. """ from typing import List diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index 5c6ba62b11..05b492edf5 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -14,7 +14,7 @@ from typing import List from ...core.mixins import GeoDataMixin -from ...core import JoinToLocation +from ...core import location_joined_query from ...core.query import Query from ...core.spatial_unit import ( CellSpatialUnit, @@ -54,7 +54,7 @@ class TotalNetworkObjects(GeoDataMixin, Query): Other Parameters ---------------- - Passed to JoinToLocation + Passed to EventsTablesUnion Examples -------- @@ -119,21 +119,21 @@ def __init__( ) ) - events = EventsTablesUnion( - self.start, - self.stop, - tables=self.table, - columns=["location_id", "datetime"], - hours=hours, - subscriber_subset=subscriber_subset, - subscriber_identifier=subscriber_identifier, + events = location_joined_query( + EventsTablesUnion( + self.start, + self.stop, + tables=self.table, + columns=["location_id", "datetime"], + hours=hours, + subscriber_subset=subscriber_subset, + subscriber_identifier=subscriber_identifier, + ), + spatial_unit=self.network_object, + time_col="datetime", ) - if not isinstance(self.network_object, CellSpatialUnit): - events = JoinToLocation( - events, spatial_unit=self.network_object, time_col="datetime" - ) - self.joined = JoinToLocation( + self.joined = location_joined_query( events, spatial_unit=self.spatial_unit, time_col="datetime" ) self.total_by = total_by.lower() @@ -183,10 +183,6 @@ class AggregateNetworkObjects(GeoDataMixin, Query): A period definition to calculate statistics over, defaults to the one greater than total_network_objects.total_by. 
- Other Parameters - ---------------- - Passed to JoinToLocation - Examples -------- >>> t = AggregateNetworkObjects(total_network_objects=TotalNetworkObjects()) diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index cdb8b6848d..b0b5c49f1d 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -17,7 +17,7 @@ from .spatial_aggregates import SpatialAggregate, JoinedSpatialAggregate from ...core.query import Query -from ...core.join_to_location import JoinToLocation +from ...core.join_to_location import location_joined_query from ...core.spatial_unit import CellSpatialUnit import structlog @@ -194,10 +194,7 @@ def subscriber_locations( ignore_nulls=ignore_nulls, ) - if isinstance(spatial_unit, CellSpatialUnit): - return subscriber_cells - else: - return JoinToLocation( - subscriber_cells, spatial_unit=spatial_unit, time_col="time" - ) + return location_joined_query( + subscriber_cells, spatial_unit=spatial_unit, time_col="time" + ) From 5fe78e04f65977222c0a887434d47036499eb39c Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 30 May 2019 17:11:10 +0100 Subject: [PATCH 041/138] Use spatial_unit instead of level in SpatialAggregate and JoinedSpatialAggregate --- .../features/utilities/spatial_aggregates.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py index ab01555c6a..6fa8623b79 100644 --- a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py +++ b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py @@ -29,8 +29,7 @@ class SpatialAggregate(GeoDataMixin, Query): def __init__(self, *, locations): self.locations = locations - self.level = locations.level - self.column_name = locations.column_name + self.spatial_unit = locations.spatial_unit super().__init__() @property @@ -39,19 +38,17 @@ def column_names(self) -> List[str]: def _make_query(self): - aggregate_cols = self.locations.column_names[1:] + aggregate_cols = ",".join(self.locations.column_names[1:]) - sql = """ + sql = f""" SELECT - {agg_cols}, + {aggregate_cols}, count(*) AS total FROM - ({to_agg}) AS to_agg + ({self.locations.get_query()}) AS to_agg GROUP BY - {agg_cols} - """.format( - to_agg=self.locations.get_query(), agg_cols=",".join(aggregate_cols) - ) + {aggregate_cols} + """ return sql @@ -102,8 +99,7 @@ class JoinedSpatialAggregate(GeoDataMixin, Query): def __init__(self, *, metric, locations, method="mean"): self.metric = metric self.locations = locations - self.level = locations.level - self.column_name = locations.column_name + self.spatial_unit = locations.spatial_unit self.method = method.lower() if self.method not in self.allowed_methods: raise ValueError( From e01e208e2e92bd4719a6db91f076045ef2e35485 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 31 May 2019 12:55:41 +0100 Subject: [PATCH 042/138] Define __eq__ for spatial units --- flowmachine/flowmachine/core/spatial_unit.py | 38 ++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index d04ae0d85d..774f288f20 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -44,6 +44,9 @@ class 
CellSpatialUnit: _loc_cols = ("location_id",) + def __eq__(self, other): + return isinstance(other, CellSpatialUnit) + @property def location_columns(self) -> List[str]: """ @@ -111,6 +114,9 @@ def __init__( # It would be useful to remove this requirement wherever possible, and instead # implement a method to check whether the required data can be found in the DB. + def __eq__(self, other): + return self.md5 == other.md5 + @property def location_columns(self) -> List[str]: """ @@ -204,6 +210,9 @@ def __init__(self): location_column_names=["lat", "lon"], ) + def __eq__(self, other): + return isinstance(other, LatLonSpatialUnit) + def geo_augment(self, query): sql = f""" SELECT @@ -238,6 +247,9 @@ def __init__(self): location_info_table="infrastructure.cells", ) + def __eq__(self, other): + return isinstance(other, VersionedCellSpatialUnit) + def geo_augment(self, query): sql = f""" SELECT @@ -326,6 +338,9 @@ def __init__(self): join_clause=join_clause, ) + def __eq__(self, other): + return isinstance(other, VersionedSiteSpatialUnit) + def geo_augment(self, query): sql = f""" SELECT @@ -447,6 +462,14 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): join_clause=join_clause, ) + def __eq__(self, other): + return ( + isinstance(other, PolygonSpatialUnit) + and self.polygon_table.md5 == other.polygon_table.md5 + and self.polygon_column_names == other.polygon_column_names + and self.geom_col == other.geom_col + ) + def geo_augment(self, query): r_col_name, l_col_name = get_name_and_alias(self.polygon_column_names[0]) sql = f""" @@ -494,6 +517,15 @@ def __init__(self, *, level, column_name=None): super().__init__(polygon_column_names=col_name, polygon_table=table) + def __eq__(self, other): + if isinstance(other, AdminSpatialUnit): + return ( + self.level == other.level + and self.polygon_column_names == other.polygon_column_names + ) + else: + return super().__eq__(other) + def _get_standard_name(self): """ Returns the standard name of the column that identifies @@ -521,3 +553,9 @@ def __init__(self, *, size): polygon_table=Grid(self.size), geom_col="geom_square", ) + + def __eq__(self, other): + if isinstance(other, GridSpatialUnit): + return self.size == other.size + else: + return super().__eq__(other) From 13bdf4e4c10525c03ba224a0ed77a8bededf329a Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 31 May 2019 16:50:03 +0100 Subject: [PATCH 043/138] Refactor spatial_unit.geo_augment to take a sql string instead of Query --- .../flowmachine/core/mixins/geodata_mixin.py | 4 ++- flowmachine/flowmachine/core/spatial_unit.py | 36 ++++++++----------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/flowmachine/flowmachine/core/mixins/geodata_mixin.py b/flowmachine/flowmachine/core/mixins/geodata_mixin.py index bfc064d9e7..c2b0074e1e 100644 --- a/flowmachine/flowmachine/core/mixins/geodata_mixin.py +++ b/flowmachine/flowmachine/core/mixins/geodata_mixin.py @@ -83,7 +83,9 @@ def _geo_augmented_query(self): The columns this query contains """ loc_join = self._get_location_join() - return loc_join.spatial_unit.geo_augment(self) + sql = loc_join.spatial_unit.geo_augment(self.get_query()) + cols = list(set(self.column_names + ["gid", "geom"])) + return sql, cols def geojson_query(self, crs=None): """ diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 774f288f20..be2aff669c 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -129,23 
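
A side note on the __eq__ methods added in this patch: in Python, a class that defines __eq__ without also defining __hash__ has its __hash__ set to None, so its instances can no longer be used in sets or as dictionary keys. That is why matching __hash__ methods are added in a later patch in this series. A plain-Python illustration (not flowmachine code):

class WithoutHash:
    def __eq__(self, other):
        return isinstance(other, WithoutHash)

class WithHash:
    def __eq__(self, other):
        return isinstance(other, WithHash)

    def __hash__(self):
        # Equal instances must hash equal, so hash on the class name.
        return hash(self.__class__.__name__)

{WithHash()}       # fine: instances are hashable
# {WithoutHash()}  # would raise TypeError: unhashable type: 'WithoutHash'
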
+129,21 @@ def column_names(self) -> List[str]: return [get_name_and_alias(c)[1].split(".").pop() for c in self._cols] @abstractmethod - def geo_augment(self, query): + def geo_augment(self, sql_query): """ - Given a query object (which is assumed to be a JoinToLocation object, + Given a SQL string (which will usually be from a JoinToLocation object, joined to this spatial unit), return a version of the query augmented with a geom column and a gid column. Parameters ---------- - query : flowmachine.Query + sql_query : string The query to augment with geom and gid columns Returns ------- str A version of this query with geom and gid columns - list - The columns this query contains """ raise NotImplementedError @@ -213,16 +211,15 @@ def __init__(self): def __eq__(self, other): return isinstance(other, LatLonSpatialUnit) - def geo_augment(self, query): + def geo_augment(self, sql_query): sql = f""" SELECT row_number() over() AS gid, *, ST_SetSRID(ST_Point(lon, lat), 4326) AS geom - FROM ({query.get_query()}) AS L + FROM ({sql_query}) AS L """ - cols = list(set(query.column_names + ["gid", "geom"])) - return sql, cols + return sql class VersionedCellSpatialUnit(BaseSpatialUnit): @@ -250,19 +247,18 @@ def __init__(self): def __eq__(self, other): return isinstance(other, VersionedCellSpatialUnit) - def geo_augment(self, query): + def geo_augment(self, sql_query): sql = f""" SELECT row_number() OVER () AS gid, geom_point AS geom, U.* - FROM ({query.get_query()}) AS U + FROM ({sql_query}) AS U LEFT JOIN infrastructure.cells AS S ON U.location_id = S.id AND U.version = S.version """ - cols = list(set(query.column_names + ["gid", "geom"])) - return sql, cols + return sql def distance_matrix_query(self, return_geometry=False): return_geometry_statement = "" @@ -341,19 +337,18 @@ def __init__(self): def __eq__(self, other): return isinstance(other, VersionedSiteSpatialUnit) - def geo_augment(self, query): + def geo_augment(self, sql_query): sql = f""" SELECT row_number() OVER () AS gid, geom_point AS geom, U.* - FROM ({query.get_query()}) AS U + FROM ({sql_query}) AS U LEFT JOIN infrastructure.sites AS S ON U.site_id = S.id AND U.version = S.version """ - cols = list(set(query.column_names + ["gid", "geom"])) - return sql, cols + return sql def distance_matrix_query(self, return_geometry=False): return_geometry_statement = "" @@ -470,19 +465,18 @@ def __eq__(self, other): and self.geom_col == other.geom_col ) - def geo_augment(self, query): + def geo_augment(self, sql_query): r_col_name, l_col_name = get_name_and_alias(self.polygon_column_names[0]) sql = f""" SELECT row_number() OVER () as gid, {self.geom_col} AS geom, U.* - FROM ({query.get_query()}) AS U + FROM ({sql_query}) AS U LEFT JOIN ({self.polygon_table.get_query()}) AS G ON U.{l_col_name} = G.{r_col_name} """ - cols = list(set(query.column_names + ["gid", "geom"])) - return sql, cols + return sql class AdminSpatialUnit(PolygonSpatialUnit): From 5b700f92c4c5386472d7b4ffa2e663a115aac6e7 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 31 May 2019 18:00:07 +0100 Subject: [PATCH 044/138] Use SpatialUnit objects in Flows --- .../flowmachine/features/location/flows.py | 115 +++++++----------- 1 file changed, 44 insertions(+), 71 deletions(-) diff --git a/flowmachine/flowmachine/features/location/flows.py b/flowmachine/flowmachine/features/location/flows.py index 440fcf0701..4635490a4a 100644 --- a/flowmachine/flowmachine/features/location/flows.py +++ b/flowmachine/flowmachine/features/location/flows.py @@ -43,13 +43,12 @@ def 
__init__(self, loc1, loc2): """ - if loc1.level != loc2.level: + if loc1.spatial_unit != loc2.spatial_unit: raise ValueError( - "You cannot compute flows for locations on " + "different levels" + "You cannot compute flows for locations on different spatial units" ) - self.level = loc1.level - self.column_name = loc1.column_name + self.spatial_unit = loc1.spatial_unit self.joined = loc1.join( loc2, on_left="subscriber", left_append="_from", right_append="_to" ) @@ -82,12 +81,12 @@ def inflow(self): @property def index_cols(self): - cols = get_columns_for_level(self.level, self.column_name) + cols = self.spatial_unit.location_columns return [["{}_from".format(x) for x in cols], ["{}_to".format(x) for x in cols]] @property def column_names(self) -> List[str]: - cols = get_columns_for_level(self.level, self.column_name) + cols = self.spatial_unit.location_columns return ( [f"{col}_from" for col in cols] + [f"{col}_to" for col in cols] + ["count"] ) @@ -95,24 +94,22 @@ def column_names(self) -> List[str]: def _make_query(self): group_cols = ",".join(self.joined.column_names[1:]) - grouped = """ + grouped = f""" SELECT {group_cols}, count(*) FROM - ({joined}) AS joined + ({self.joined.get_query()}) AS joined GROUP BY {group_cols} ORDER BY {group_cols} DESC - """.format( - group_cols=group_cols, joined=self.joined.get_query() - ) + """ return grouped def _geo_augmented_query(self): """ - Returns one of each geom for non-point levels, with the + Returns one of each geom for non-point spatial units, with the flows in/out as properties. Returns @@ -120,56 +117,34 @@ def _geo_augmented_query(self): str A version of this query with geom and gid columns """ - loc_join = self._get_location_join() - level = loc_join.level - if level in ["lat-lon", "versioned-site"]: - return super()._geo_augmented_query() - else: - mapping = loc_join.right_query.mapping - col_name = mapping.column_name[0] - l_col_name = ( - "pcod" - if ("admin" in level) and (self.column_name is None) - else col_name - ) - geom_col = mapping.geom_col - poly_query = mapping.polygon_table - if isinstance(poly_query, Query): # Deal with grids - poly_query = poly_query.get_query() - else: - poly_query = "SELECT * FROM {}".format(poly_query) - - agg_qry = """ - WITH flows AS ({query}) - select {col_name}, json_strip_nulls(outflows) as outflows, json_strip_nulls(inflows) as inflows FROM - (SELECT {col_name}_from as {col_name}, json_object_agg({col_name}_to, count) AS outflows - FROM flows - GROUP BY {col_name}_from - ) x - FULL JOIN - (SELECT {col_name}_to as {col_name}, json_object_agg({col_name}_from, count) AS inflows - FROM flows - GROUP BY {col_name}_to - ) y - USING ({col_name}) - """.format( - query=self.get_query(), col_name=l_col_name - ) + loc_cols = self.spatial_unit.location_columns + loc_cols_string = ",".join(loc_cols) + loc_cols_from_string = ",".join([f"{col}_from" for col in loc_cols]) + loc_cols_to_string = ",".join([f"{col}_to" for col in loc_cols]) + loc_cols_from_aliased_string = ",".join( + [f"{col}_from AS {col}" for col in loc_cols] + ) + loc_cols_to_aliased_string = ",".join( + [f"{col}_to AS {col}" for col in loc_cols] + ) - joined_query = """ - SELECT row_number() over() as gid, {geom_col} as geom, u.* - FROM ({qur}) u - LEFT JOIN - ({poly_query}) g - ON u.{l_col_name}=g.{r_col_name} - """.format( - qur=agg_qry, - poly_query=poly_query, - geom_col=geom_col, - l_col_name=l_col_name, - r_col_name=col_name, - ) - return joined_query, [l_col_name, "outflows", "inflows", "geom", "gid"] + agg_qry = f""" + WITH flows AS 
({self.get_query()}) + select {loc_cols_string}, json_strip_nulls(outflows) as outflows, json_strip_nulls(inflows) as inflows FROM + (SELECT {loc_cols_from_aliased_string}, json_object_agg({loc_cols_to_string}, count) AS outflows + FROM flows + GROUP BY {loc_cols_from_string} + ) x + FULL JOIN + (SELECT {loc_cols_to_aliased_string}, json_object_agg({loc_cols_from_string}, count) AS inflows + FROM flows + GROUP BY {loc_cols_to_string} + ) y + USING ({loc_cols_string}) + """ + + joined_query = self.spatial_unit.geo_augment(agg_qry) + return joined_query, loc_cols + ["outflows", "inflows", "geom", "gid"] class BaseInOutFlow(GeoDataMixin, Query, metaclass=ABCMeta): @@ -183,13 +158,11 @@ class BaseInOutFlow(GeoDataMixin, Query, metaclass=ABCMeta): """ def __init__(self, flow): - self.flow = flow cols = self.flow.column_names self.loc_from = ",".join([c for c in cols if c.endswith("_from")]) self.loc_to = ",".join([c for c in cols if c.endswith("_to")]) - self.level = flow.level - self.column_name = flow.column_name + self.spatial_unit = flow.spatial_unit super().__init__() # Returns a query that groups by one column and sums the count @@ -210,7 +183,7 @@ class OutFlow(BaseInOutFlow): """ Class for an outflow. These are the total number of people coming from one locations, regardless of where they go to. Note that this is normally initialised - through the outflows method of a flows class. + through the outflows method of a Flows object. """ def _make_query(self): @@ -219,12 +192,12 @@ def _make_query(self): @property def index_cols(self): - cols = get_columns_for_level(self.level, self.column_name) - return [["{}_from".format(x) for x in cols]] + cols = self.spatial_unit.location_columns + return [[f"{x}_from" for x in cols]] @property def column_names(self) -> List[str]: - cols = get_columns_for_level(self.level, self.column_name) + cols = self.spatial_unit.location_columns return [f"{col}_from" for col in cols] + ["total"] @@ -241,10 +214,10 @@ def _make_query(self): @property def index_cols(self): - cols = get_columns_for_level(self.level, self.column_name) - return [["{}_to".format(x) for x in cols]] + cols = self.spatial_unit.location_columns + return [[f"{x}_to" for x in cols]] @property def column_names(self) -> List[str]: - cols = get_columns_for_level(self.level, self.column_name) + cols = self.spatial_unit.location_columns return [f"{col}_to" for col in cols] + ["total"] From f83f8e42b63fae55245edcf4de36b7fde5432038 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 09:29:27 +0100 Subject: [PATCH 045/138] Format subscriber_locations.py --- .../flowmachine/features/utilities/subscriber_locations.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index b0b5c49f1d..e0885181b3 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -197,4 +197,3 @@ def subscriber_locations( return location_joined_query( subscriber_cells, spatial_unit=spatial_unit, time_col="time" ) - From 6f7a56eabeae53f6dc045ab3b530bfe27577b279 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 11:37:03 +0100 Subject: [PATCH 046/138] Update Query.index_cols to use spatial_unit attribute instead of level --- flowmachine/flowmachine/core/query.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git 
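
A usage sketch of Flows now that it compares spatial_unit attributes rather than levels (dates are illustrative; the pattern mirrors test_flows.py later in this series):

from flowmachine.core.spatial_unit import AdminSpatialUnit
from flowmachine.features import daily_location
from flowmachine.features.location.flows import Flows

su = AdminSpatialUnit(level=3)  # replaced by admin_spatial_unit(level=3) later in this series
dl1 = daily_location("2016-01-01", spatial_unit=su, method="last")
dl2 = daily_location("2016-01-02", spatial_unit=su, method="last")
flow = Flows(dl1, dl2)  # raises ValueError if the two locations use different spatial units
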
a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index 33afc74630..0822863885 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -406,8 +406,8 @@ def union(self, other, all=True): Examples -------- - >>> dl1 = daily_location('2016-01-01', level='cell') - >>> dl2 = daily_location('2016-01-02', level='cell') + >>> dl1 = daily_location('2016-01-01', spatial_unit=CellSpatialUnit()) + >>> dl2 = daily_location('2016-01-02', spatial_unit=CellSpatialUnit()) >>> dl1.union(dl2).get_query() 'cell_msisdn_20160101 UNION ALL cell_msisdn_20160102' @@ -927,26 +927,17 @@ def index_cols(self): ------- ixen : list By default, returns the location columns if they are present - and self.level is defined, and the subscriber column. + and self.spatial_unit is defined, and the subscriber column. Examples -------- >>> daily_location("2016-01-01").index_cols [['name'], '"subscriber"'] """ - from flowmachine.utils import ( - get_columns_for_level, - ) # Local import to avoid circular import - cols = self.column_names ixen = [] try: - # Not all objects define the attribute column_name so we'll fall - # back to the default if it is not defined - try: - loc_cols = get_columns_for_level(self.level, self.column_name) - except AttributeError: - loc_cols = get_columns_for_level(self.level) + loc_cols = self.spatial_unit.location_columns if set(loc_cols).issubset(cols): ixen.append(loc_cols) except AttributeError: From cd186c340e0a4b7c104cfab6d4cc61763d18b8dc Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 11:37:49 +0100 Subject: [PATCH 047/138] Remove a few mentions of "level" --- flowmachine/flowmachine/features/location/flows.py | 3 +-- flowmachine/flowmachine/features/subscriber/daily_location.py | 4 ++-- flowmachine/flowmachine/features/subscriber/metaclasses.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/flowmachine/flowmachine/features/location/flows.py b/flowmachine/flowmachine/features/location/flows.py index 4635490a4a..261e8ef8b0 100644 --- a/flowmachine/flowmachine/features/location/flows.py +++ b/flowmachine/flowmachine/features/location/flows.py @@ -6,7 +6,7 @@ """ Definition of the flows class, which is the difference in locations between two daily or home location classes, -aggregated to a location level. +aggregated to a spatial unit. @@ -17,7 +17,6 @@ from ...core.query import Query from ...core.mixins import GeoDataMixin, GraphMixin -from flowmachine.utils import get_columns_for_level import structlog diff --git a/flowmachine/flowmachine/features/subscriber/daily_location.py b/flowmachine/flowmachine/features/subscriber/daily_location.py index 82656fded9..ba471cdf88 100644 --- a/flowmachine/flowmachine/features/subscriber/daily_location.py +++ b/flowmachine/flowmachine/features/subscriber/daily_location.py @@ -35,7 +35,7 @@ def locate_subscribers( with a number of different methods. Find the last/most-frequent location for every subscriber within the given time - frame. Specify a level. + frame. Specify a spatial unit. 
Parameters ---------- @@ -83,7 +83,7 @@ def locate_subscribers( >>> last_locs = locate_subscribers('2016-01-01 13:30:30', '2016-01-02 16:25:00' - level = 'cell' + spatial_unit = CellSpatialUnit method='last') >>> last_locs.head() subscriber | cell diff --git a/flowmachine/flowmachine/features/subscriber/metaclasses.py b/flowmachine/flowmachine/features/subscriber/metaclasses.py index 897682d719..91fb6ce067 100644 --- a/flowmachine/flowmachine/features/subscriber/metaclasses.py +++ b/flowmachine/flowmachine/features/subscriber/metaclasses.py @@ -41,7 +41,7 @@ def join_aggregate(self, locations, method="avg"): ------- JoinedSpatialAggregate Query object representing a version of this metric aggregated to - the location level. + the spatial unit. """ return JoinedSpatialAggregate(metric=self, locations=locations, method=method) From 40ada92c39fc04532e112f7a83e80f8f0c608184 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 12:46:53 +0100 Subject: [PATCH 048/138] Replace AdminSpatialUnit and GridSpatialUnit with helper functions --- flowmachine/flowmachine/core/spatial_unit.py | 120 +++++++----------- .../location/unique_subscriber_counts.py | 2 +- .../features/network/total_network_objects.py | 6 +- .../features/subscriber/daily_location.py | 10 +- .../features/subscriber/day_trajectories.py | 2 +- .../features/subscriber/last_location.py | 6 +- .../subscriber/most_frequent_location.py | 6 +- .../features/utilities/spatial_aggregates.py | 2 +- flowmachine/tests/conftest.py | 10 +- .../test_sql_strings_and_results.py | 8 +- flowmachine/tests/test_daily_location.py | 4 +- flowmachine/tests/test_day_trajectories.py | 6 +- flowmachine/tests/test_flows.py | 10 +- flowmachine/tests/test_geomixin.py | 18 +-- flowmachine/tests/test_join_to_location.py | 8 +- flowmachine/tests/test_joined_aggregate.py | 12 +- flowmachine/tests/test_location_visits.py | 10 +- .../tests/test_most_frequent_locations.py | 4 +- flowmachine/tests/test_radius_of_gyration.py | 4 +- flowmachine/tests/test_spatial_aggregate.py | 4 +- flowmachine/tests/test_spatial_unit.py | 12 +- 21 files changed, 122 insertions(+), 142 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index be2aff669c..383ef0c20b 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -8,6 +8,8 @@ The available spatial units are: CellSpatialUnit: The identifier as found in the CDR. + LatLonSpatialUnit: + Latitude and longitude of cell/site locations. VersionedCellSpatialUnit: The identifier as found in the CDR combined with the version from the cells table. @@ -19,12 +21,14 @@ return after the join, and polygon_table, the table where the polygons reside (with the schema), and additionally geom_col which is the column with the geometry information (will default to 'geom'). - AdminSpatialUnit: + admin_spatial_unit: An admin region of interest, such as admin3. Must live in the database in the standard location. - GridSpatialUnit: + Special case of PolygonSpatialUnit. + grid_spatial_unit: A square in a regular grid, in addition pass size to determine the size of the polygon. + Special case of PolygonSpatialUnit. """ from typing import List from abc import ABCMeta, abstractmethod @@ -47,6 +51,11 @@ class CellSpatialUnit: def __eq__(self, other): return isinstance(other, CellSpatialUnit) + def __hash__(self): + # We may never need CellSpatialUnits to be hashable, but I'll define + # this just in case. 
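
For reference, a quick usage sketch of the two helper functions described in the updated module docstring (argument values are illustrative and mirror the ones used in the test suite):

from flowmachine.core.spatial_unit import admin_spatial_unit, grid_spatial_unit

admin3 = admin_spatial_unit(level=3)  # standard column, aliased to "pcod"
admin3_named = admin_spatial_unit(level=3, column_name="admin3name")  # keep the original column name
grid_5km = grid_spatial_unit(size=5)  # 5 km grid squares
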
+ return hash(self.__class__.__name__) + @property def location_columns(self) -> List[str]: """ @@ -117,6 +126,10 @@ def __init__( def __eq__(self, other): return self.md5 == other.md5 + def __hash__(self): + # Must define this because we explicitly define self.__eq__ + return self.md5 + @property def location_columns(self) -> List[str]: """ @@ -208,9 +221,6 @@ def __init__(self): location_column_names=["lat", "lon"], ) - def __eq__(self, other): - return isinstance(other, LatLonSpatialUnit) - def geo_augment(self, sql_query): sql = f""" SELECT @@ -244,9 +254,6 @@ def __init__(self): location_info_table="infrastructure.cells", ) - def __eq__(self, other): - return isinstance(other, VersionedCellSpatialUnit) - def geo_augment(self, sql_query): sql = f""" SELECT @@ -334,9 +341,6 @@ def __init__(self): join_clause=join_clause, ) - def __eq__(self, other): - return isinstance(other, VersionedSiteSpatialUnit) - def geo_augment(self, sql_query): sql = f""" SELECT @@ -407,7 +411,7 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): self.polygon_table = polygon_table else: # Creating a GeoTable object here means that we don't have to handle - # admin tables and Grid objects differently in self.geo_augment + # admin tables and Grid objects differently in join_clause and self.geo_augment self.polygon_table = GeoTable(name=polygon_table, geom_column=geom_col) self.geom_col = geom_col @@ -457,14 +461,6 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): join_clause=join_clause, ) - def __eq__(self, other): - return ( - isinstance(other, PolygonSpatialUnit) - and self.polygon_table.md5 == other.polygon_table.md5 - and self.polygon_column_names == other.polygon_column_names - and self.geom_col == other.geom_col - ) - def geo_augment(self, sql_query): r_col_name, l_col_name = get_name_and_alias(self.polygon_column_names[0]) sql = f""" @@ -479,11 +475,11 @@ def geo_augment(self, sql_query): return sql -class AdminSpatialUnit(PolygonSpatialUnit): +def admin_spatial_unit(*, level, column_name=None): """ - Class that maps all cells (aka sites) to an admin region. This is a thin - wrapper to the more general class PolygonSpatialUnit, which assumes that - you have the standard set-up. + Helper function to create a PolygonSpatialUnit object that maps all cells + (aka sites) to an admin region. This assumes that you have geography data + in the standard location in FlowDB. Parameters ---------- @@ -494,62 +490,42 @@ class AdminSpatialUnit(PolygonSpatialUnit): identifier of the admin region. By default this will be admin*pcod. But you may wish to use something else, such as admin3name. + + Returns + ------- + PolygonSpatialUnit + Query which maps cell/site IDs to admin regions """ + # If there is no column_name passed then we can use the default, which is + # of the form admin3pcod. If the user has asked for the standard + # column_name then we will alias this column as 'pcod', otherwise we won't + # alias it at all. + if (column_name is None) or (column_name == f"admin{level}pcod"): + col_name = f"admin{level}pcod AS pcod" + else: + col_name = column_name + table = f"geography.admin{level}" - def __init__(self, *, level, column_name=None): - self.level = level - # If there is no column_name passed then we can use - # the default, which is of the form admin3pcod. - # If the user has asked for the standard column_name - # then we will alias this column as 'pcod', otherwise - # we'll won't alias it at all. 
- if (column_name is None) or (column_name == self._get_standard_name()): - col_name = f"{self._get_standard_name()} AS pcod" - else: - col_name = column_name - table = f"geography.admin{self.level}" + return PolygonSpatialUnit(polygon_column_names=col_name, polygon_table=table) - super().__init__(polygon_column_names=col_name, polygon_table=table) - def __eq__(self, other): - if isinstance(other, AdminSpatialUnit): - return ( - self.level == other.level - and self.polygon_column_names == other.polygon_column_names - ) - else: - return super().__eq__(other) - - def _get_standard_name(self): - """ - Returns the standard name of the column that identifies - the name of the region. - """ - - return f"admin{self.level}pcod" - - -class GridSpatialUnit(PolygonSpatialUnit): +def grid_spatial_unit(*, size): """ - Query representing a mapping between all the sites in the database - and a grid of arbitrary size. + Helper function to create a PolygonSpatialUnit representing a mapping + between all the sites in the database and a grid of arbitrary size. Parameters ---------- size : float or int Size of the grid in kilometres + + Returns + ------- + PolygonSpatialUnit + Query which maps cell/site IDs to grid squares """ - - def __init__(self, *, size): - self.size = size - super().__init__( - polygon_column_names=["grid_id"], - polygon_table=Grid(self.size), - geom_col="geom_square", - ) - - def __eq__(self, other): - if isinstance(other, GridSpatialUnit): - return self.size == other.size - else: - return super().__eq__(other) + return PolygonSpatialUnit( + polygon_column_names=["grid_id"], + polygon_table=Grid(size), + geom_col="geom_square", + ) diff --git a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py index dabfd9a699..9677d506cc 100644 --- a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py @@ -62,7 +62,7 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): Examples -------- - >>> usc = UniqueSubscriberCounts('2016-01-01', '2016-01-04', spatial_unit=AdminSpatialUnit(level=3), hours=(5,17)) + >>> usc = UniqueSubscriberCounts('2016-01-01', '2016-01-04', spatial_unit=admin_spatial_unit(level=3), hours=(5,17)) >>> usc.head(4) name unique_subscriber_counts 0 Arghakhanchi 313 diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index 05b492edf5..e8705a15fd 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -20,7 +20,7 @@ CellSpatialUnit, VersionedSiteSpatialUnit, VersionedCellSpatialUnit, - AdminSpatialUnit, + admin_spatial_unit, ) from ..utilities import EventsTablesUnion @@ -49,7 +49,7 @@ class TotalNetworkObjects(GeoDataMixin, Query): Objects to track, defaults to CellSpatialUnit(), the unversioned lowest level of infrastructure available. spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default AdminSpatialUnit(level=0) + default admin_spatial_unit(level=0) Spatial unit to facet on. 
Other Parameters @@ -108,7 +108,7 @@ def __init__( ) if spatial_unit is None: - self.spatial_unit = AdminSpatialUnit(level=0) + self.spatial_unit = admin_spatial_unit(level=0) else: self.spatial_unit = spatial_unit if type(self.spatial_unit) in allowed_network_object_types: diff --git a/flowmachine/flowmachine/features/subscriber/daily_location.py b/flowmachine/flowmachine/features/subscriber/daily_location.py index ba471cdf88..8377df339b 100644 --- a/flowmachine/flowmachine/features/subscriber/daily_location.py +++ b/flowmachine/flowmachine/features/subscriber/daily_location.py @@ -13,7 +13,7 @@ """ import datetime -from ...core.spatial_unit import AdminSpatialUnit +from ...core.spatial_unit import admin_spatial_unit from .last_location import LastLocation from .most_frequent_location import MostFrequentLocation @@ -43,7 +43,7 @@ def locate_subscribers( iso format date range for the the time frame, e.g. 2016-01-01 or 2016-01-01 14:03:01 spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default AdminSpatialUnit(level=3) + default admin_spatial_unit(level=3) Spatial unit to which subscriber locations will be mapped. See the docstring of spatial_unit.py for more information. hours : tuple of ints, default 'all' @@ -95,7 +95,7 @@ def locate_subscribers( . """ if spatial_unit is None: - spatial_unit = AdminSpatialUnit(level=3) + spatial_unit = admin_spatial_unit(level=3) if method == "last": return LastLocation( @@ -151,7 +151,7 @@ def daily_location( optionally specify a stop datetime in iso format date for the day in question, e.g. 2016-01-02 06:00:00 spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default AdminSpatialUnit(level=3) + default admin_spatial_unit(level=3) Spatial unit to which subscriber locations will be mapped. See the docstring of spatial_unit.py for more information. 
hours : tuple of ints, default 'all' @@ -186,7 +186,7 @@ def daily_location( """ if spatial_unit is None: - spatial_unit = AdminSpatialUnit(level=3) + spatial_unit = admin_spatial_unit(level=3) # Temporary band-aid; marshmallow deserialises date strings # to date objects, so we convert it back here because the diff --git a/flowmachine/flowmachine/features/subscriber/day_trajectories.py b/flowmachine/flowmachine/features/subscriber/day_trajectories.py index 0cf034ccf2..560f552f1a 100644 --- a/flowmachine/flowmachine/features/subscriber/day_trajectories.py +++ b/flowmachine/flowmachine/features/subscriber/day_trajectories.py @@ -28,7 +28,7 @@ class DayTrajectories(MultiLocation, BaseLocation, Query): >>> dt = DayTrajectories( '2016-01-01', '2016-01-04', - spatial_unit = AdminSpatialUnit(level=3), + spatial_unit = admin_spatial_unit(level=3), method = 'last', hours = (5,17), ) diff --git a/flowmachine/flowmachine/features/subscriber/last_location.py b/flowmachine/flowmachine/features/subscriber/last_location.py index 13ef401a2f..c35195bd50 100644 --- a/flowmachine/flowmachine/features/subscriber/last_location.py +++ b/flowmachine/flowmachine/features/subscriber/last_location.py @@ -15,7 +15,7 @@ from flowmachine.core import Query from ..utilities.subscriber_locations import BaseLocation from ..utilities.subscriber_locations import subscriber_locations -from flowmachine.core.spatial_unit import AdminSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit class LastLocation(BaseLocation, Query): @@ -31,7 +31,7 @@ class LastLocation(BaseLocation, Query): stop : str As above spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default AdminSpatialUnit(level=3) + default admin_spatial_unit(level=3) Spatial unit to which subscriber locations will be mapped. See the docstring of spatial_unit.py for more information. hours : tuple of ints, default 'all' @@ -82,7 +82,7 @@ def __init__( self.start = start self.stop = stop if spatial_unit is None: - self.spatial_unit = AdminSpatialUnit(level=3) + self.spatial_unit = admin_spatial_unit(level=3) else: self.spatial_unit = spatial_unit self.hours = hours diff --git a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py index 0f4877beb3..4e1001cab3 100644 --- a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py +++ b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py @@ -13,7 +13,7 @@ from flowmachine.core import Query from ..utilities.subscriber_locations import BaseLocation, subscriber_locations -from flowmachine.core.spatial_unit import AdminSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit class MostFrequentLocation(BaseLocation, Query): @@ -29,7 +29,7 @@ class MostFrequentLocation(BaseLocation, Query): stop : str As above spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default AdminSpatialUnit(level=3) + default admin_spatial_unit(level=3) Spatial unit to which subscriber locations will be mapped. See the docstring of spatial_unit.py for more information. 
hours : tuple of int, default 'all' @@ -84,7 +84,7 @@ def __init__( self.start = start self.stop = stop if spatial_unit is None: - self.spatial_unit = AdminSpatialUnit(level=3) + self.spatial_unit = admin_spatial_unit(level=3) else: self.spatial_unit = spatial_unit self.hours = hours diff --git a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py index 6fa8623b79..c8041b8b8b 100644 --- a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py +++ b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py @@ -80,7 +80,7 @@ class JoinedSpatialAggregate(GeoDataMixin, Query): -------- >>> mfl = subscribers.MostFrequentLocation('2016-01-01', '2016-01-04', - spatial_unit=AdminSpatialUnit(level=3)) + spatial_unit=admin_spatial_unit(level=3)) >>> rog = subscribers.RadiusOfGyration('2016-01-01', '2016-01-04') >>> sm = JoinedSpatialAggregate(metric=rog, locations=mfl) diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 9bd2ed8cf2..2ea545b486 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -26,8 +26,8 @@ VersionedCellSpatialUnit, VersionedSiteSpatialUnit, PolygonSpatialUnit, - AdminSpatialUnit, - GridSpatialUnit, + admin_spatial_unit, + grid_spatial_unit, ) from flowmachine.features import EventTableSubset @@ -73,13 +73,13 @@ def exemplar_level_param(request): @pytest.fixture( params=[ - (AdminSpatialUnit, {"level": 2}), - (AdminSpatialUnit, {"level": 2, "column_name": "admin2name"}), + (admin_spatial_unit, {"level": 2}), + (admin_spatial_unit, {"level": 2, "column_name": "admin2name"}), (VersionedSiteSpatialUnit, {}), (VersionedCellSpatialUnit, {}), (CellSpatialUnit, {}), (LatLonSpatialUnit, {}), - (GridSpatialUnit, {"size": 5}), + (grid_spatial_unit, {"size": 5}), ( PolygonSpatialUnit, {"polygon_column_names": "admin3pcod", "polygon_table": "geography.admin3"}, diff --git a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py index 4afeaaa1b2..d20849bfe7 100644 --- a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py +++ b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py @@ -7,7 +7,7 @@ from approvaltests.approvals import verify from flowmachine.core import CustomQuery from flowmachine.features import daily_location -from flowmachine.core.spatial_unit import AdminSpatialUnit, CellSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit, CellSpatialUnit def test_daily_location_1_sql(diff_reporter): @@ -34,7 +34,7 @@ def test_daily_location_2_sql(diff_reporter): """ dl = daily_location( "2016-01-04", - spatial_unit=AdminSpatialUnit(level=2, column_name="admin2pcod"), + spatial_unit=admin_spatial_unit(level=2, column_name="admin2pcod"), hours=(3, 9), method="most-common", subscriber_identifier="imei", @@ -56,7 +56,7 @@ def test_daily_location_2_df(get_dataframe, diff_reporter): """ dl = daily_location( "2016-01-04", - spatial_unit=AdminSpatialUnit(level=2), + spatial_unit=admin_spatial_unit(level=2), hours=(3, 9), method="most-common", # subscriber_identifier="imei", @@ -187,7 +187,7 @@ def test_daily_location_5_df(get_dataframe, diff_reporter): dl = daily_location( "2016-01-02", - spatial_unit=AdminSpatialUnit(level=3), + spatial_unit=admin_spatial_unit(level=3), hours=(4, 9), method="most-common", # subscriber_identifier="imei", diff --git a/flowmachine/tests/test_daily_location.py 
b/flowmachine/tests/test_daily_location.py index fcc957ada6..6b9f732982 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -5,7 +5,7 @@ import pytest from flowmachine.core.errors import MissingDateError -from flowmachine.core.spatial_unit import AdminSpatialUnit, CellSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit, CellSpatialUnit from flowmachine.features import daily_location, MostFrequentLocation @@ -43,7 +43,7 @@ def test_works_with_admin_names(get_dataframe): """ dl = daily_location( - "2016-01-05", spatial_unit=AdminSpatialUnit(level=3, column_name="admin3name") + "2016-01-05", spatial_unit=admin_spatial_unit(level=3, column_name="admin3name") ) df = get_dataframe(dl) assert "Lamjung" == df.admin3name[0] diff --git a/flowmachine/tests/test_day_trajectories.py b/flowmachine/tests/test_day_trajectories.py index 0514743fe4..74844d7be4 100644 --- a/flowmachine/tests/test_day_trajectories.py +++ b/flowmachine/tests/test_day_trajectories.py @@ -4,7 +4,7 @@ from flowmachine.features import DayTrajectories, daily_location -from flowmachine.core.spatial_unit import AdminSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit def test_column_names_day_trajectories(exemplar_spatial_unit_param): @@ -21,12 +21,12 @@ def test_day_trajectories(get_dataframe): """ traj = DayTrajectories( daily_location( - "2016-01-01", spatial_unit=AdminSpatialUnit(level=3), method="last" + "2016-01-01", spatial_unit=admin_spatial_unit(level=3), method="last" ) ) df = get_dataframe(traj).drop("date", axis=1) dldf = daily_location( - "2016-01-01", spatial_unit=AdminSpatialUnit(level=3), method="last" + "2016-01-01", spatial_unit=admin_spatial_unit(level=3), method="last" ).get_dataframe() assert [df["subscriber"][0], df["pcod"][0]] == [ dldf["subscriber"][0], diff --git a/flowmachine/tests/test_flows.py b/flowmachine/tests/test_flows.py index 9808dc3f6d..94f5a1cb3e 100644 --- a/flowmachine/tests/test_flows.py +++ b/flowmachine/tests/test_flows.py @@ -6,7 +6,7 @@ import pytest -from flowmachine.core.spatial_unit import AdminSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit from flowmachine.features import daily_location from flowmachine.features.location.flows import * from flowmachine.features.subscriber.daily_location import locate_subscribers @@ -29,8 +29,8 @@ def test_flows_raise_error(): """ Flows() raises error if location levels are different. """ - dl1 = daily_location("2016-01-01", spatial_unit=AdminSpatialUnit(level=3)) - dl2 = daily_location("2016-01-01", spatial_unit=AdminSpatialUnit(level=3)) + dl1 = daily_location("2016-01-01", spatial_unit=admin_spatial_unit(level=3)) + dl2 = daily_location("2016-01-01", spatial_unit=admin_spatial_unit(level=3)) with pytest.raises(ValueError): Flows(dl1, dl2) @@ -48,7 +48,7 @@ def test_calculates_flows(get_dataframe): """ Flows() are correctly calculated """ - spatial_unit = AdminSpatialUnit(level=3) + spatial_unit = admin_spatial_unit(level=3) dl1 = locate_subscribers( "2016-01-01", "2016-01-02", spatial_unit=spatial_unit, method="last" ) @@ -81,7 +81,7 @@ def test_flows_geojson_correct(): """ Test that flows outputs expected geojson. 
""" - spatial_unit = AdminSpatialUnit(level=3) + spatial_unit = admin_spatial_unit(level=3) dl1 = locate_subscribers( "2016-01-01", "2016-01-02", spatial_unit=spatial_unit, method="last" ) diff --git a/flowmachine/tests/test_geomixin.py b/flowmachine/tests/test_geomixin.py index 311dfb591d..e09d140f9e 100644 --- a/flowmachine/tests/test_geomixin.py +++ b/flowmachine/tests/test_geomixin.py @@ -20,8 +20,8 @@ LatLonSpatialUnit, VersionedCellSpatialUnit, VersionedSiteSpatialUnit, - AdminSpatialUnit, - GridSpatialUnit, + admin_spatial_unit, + grid_spatial_unit, ) from flowmachine.features import daily_location, Flows from flowmachine.utils import proj4string @@ -83,7 +83,7 @@ def test_valid_geojson(): test_geojson = [ daily_location("2016-01-01", "2016-01-02").aggregate(), daily_location( - "2016-01-01", "2016-01-02", spatial_unit=GridSpatialUnit(size=100) + "2016-01-01", "2016-01-02", spatial_unit=grid_spatial_unit(size=100) ).aggregate(), daily_location( "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() @@ -95,12 +95,12 @@ def test_valid_geojson(): "2016-01-01", "2016-01-02", spatial_unit=VersionedCellSpatialUnit() ).aggregate(), daily_location( - "2016-01-01", "2016-01-02", spatial_unit=AdminSpatialUnit(level=2) + "2016-01-01", "2016-01-02", spatial_unit=admin_spatial_unit(level=2) ).aggregate(), daily_location( "2016-01-01", "2016-01-02", - spatial_unit=AdminSpatialUnit(level=2, column_name="admin2name"), + spatial_unit=admin_spatial_unit(level=2, column_name="admin2name"), ).aggregate(), ] for o in test_geojson: @@ -113,7 +113,7 @@ def test_correct_geojson(): """ js = ( daily_location( - "2016-01-01", "2016-01-02", spatial_unit=AdminSpatialUnit(level=2) + "2016-01-01", "2016-01-02", spatial_unit=admin_spatial_unit(level=2) ) .aggregate() .to_geojson() @@ -139,7 +139,7 @@ def test_geojson_file_output(tmpdir): js_file = tmpdir / "geojson_test.json" daily_location( - "2016-01-01", "2016-01-02", spatial_unit=AdminSpatialUnit(level=2) + "2016-01-01", "2016-01-02", spatial_unit=admin_spatial_unit(level=2) ).aggregate().to_geojson_file(js_file) with open(js_file) as fin: js = json.load(fin) @@ -162,10 +162,10 @@ def test_flows_geojson(get_dataframe): """ dl = daily_location( - "2016-01-01", spatial_unit=AdminSpatialUnit(level=2, column_name="admin2name") + "2016-01-01", spatial_unit=admin_spatial_unit(level=2, column_name="admin2name") ) dl2 = daily_location( - "2016-01-02", spatial_unit=AdminSpatialUnit(level=2, column_name="admin2name") + "2016-01-02", spatial_unit=admin_spatial_unit(level=2, column_name="admin2name") ) fl = Flows(dl, dl2) js = fl.to_geojson() diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 2e7122b84c..67385369a8 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -12,11 +12,11 @@ from flowmachine.core import JoinToLocation, location_joined_query from flowmachine.core.spatial_unit import ( CellSpatialUnit, - AdminSpatialUnit, + admin_spatial_unit, VersionedSiteSpatialUnit, VersionedCellSpatialUnit, LatLonSpatialUnit, - GridSpatialUnit, + grid_spatial_unit, PolygonSpatialUnit, ) @@ -137,7 +137,7 @@ def test_join_to_admin(get_dataframe, get_length): ul = subscriber_locations( "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() ) - df = get_dataframe(JoinToLocation(ul, spatial_unit=AdminSpatialUnit(level=3))) + df = get_dataframe(JoinToLocation(ul, spatial_unit=admin_spatial_unit(level=3))) assert len(df) == get_length(ul) expected_cols = 
sorted(["subscriber", "time", "location_id", "pcod"]) assert sorted(df.columns) == expected_cols @@ -150,7 +150,7 @@ def test_join_to_grid(get_dataframe, get_length): ul = subscriber_locations( "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() ) - df = get_dataframe(JoinToLocation(ul, spatial_unit=GridSpatialUnit(size=50))) + df = get_dataframe(JoinToLocation(ul, spatial_unit=grid_spatial_unit(size=50))) assert len(df) == get_length(ul) diff --git a/flowmachine/tests/test_joined_aggregate.py b/flowmachine/tests/test_joined_aggregate.py index d1f2421c55..2db820aed6 100644 --- a/flowmachine/tests/test_joined_aggregate.py +++ b/flowmachine/tests/test_joined_aggregate.py @@ -6,7 +6,7 @@ import pytest -from flowmachine.core.spatial_unit import AdminSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit from flowmachine.features import ( MostFrequentLocation, RadiusOfGyration, @@ -19,7 +19,7 @@ def test_joined_aggregate(get_dataframe): Test join aggregate. """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-04", spatial_unit=AdminSpatialUnit(level=3) + "2016-01-01", "2016-01-04", spatial_unit=admin_spatial_unit(level=3) ) joined = mfl.join_aggregate(RadiusOfGyration("2016-01-01", "2016-01-04")) assert ( @@ -33,7 +33,7 @@ def test_joined_modal_aggregate(get_dataframe): Test join with modal aggregate. """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-04", spatial_unit=AdminSpatialUnit(level=3) + "2016-01-01", "2016-01-04", spatial_unit=admin_spatial_unit(level=3) ) rog = SubscriberDegree("2016-01-01", "2016-01-04") joined = mfl.join_aggregate(rog, method="mode") @@ -56,7 +56,7 @@ def test_joined_median_aggregate(get_dataframe): Test join with median aggregate. """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-04", spatial_unit=AdminSpatialUnit(level=3) + "2016-01-01", "2016-01-04", spatial_unit=admin_spatial_unit(level=3) ) rog = RadiusOfGyration("2016-01-01", "2016-01-04") joined = mfl.join_aggregate(rog, method="median") @@ -79,7 +79,7 @@ def test_joined_agg_date_mismatch(): Test that join aggregate with mismatched dates raises a warning. """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-04", spatial_unit=AdminSpatialUnit(level=3) + "2016-01-01", "2016-01-04", spatial_unit=admin_spatial_unit(level=3) ) with pytest.warns(UserWarning): mfl.join_aggregate(RadiusOfGyration("2016-01-02", "2016-01-04")) @@ -93,7 +93,7 @@ def test_joined_agg_hours_mismatch(): Test that join aggregate with mismatched hours doesn't warn. 
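A sketch, not patch content, of the joined-aggregate pattern these tests exercise, under the same connection assumption; the dates and the median method are taken from the tests themselves.

    from flowmachine.core.spatial_unit import admin_spatial_unit
    from flowmachine.features import MostFrequentLocation, RadiusOfGyration

    mfl = MostFrequentLocation(
        "2016-01-01", "2016-01-04", spatial_unit=admin_spatial_unit(level=3)
    )
    rog = RadiusOfGyration("2016-01-01", "2016-01-04")
    # Aggregate the per-subscriber metric up to the admin3 regions given by mfl
    joined = mfl.join_aggregate(rog, method="median")
    df = joined.get_dataframe()  # one aggregated value per admin3 pcod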
""" mfl = MostFrequentLocation( - "2016-01-01 10:00", "2016-01-04", spatial_unit=AdminSpatialUnit(level=3) + "2016-01-01 10:00", "2016-01-04", spatial_unit=admin_spatial_unit(level=3) ) with warnings.catch_warnings(record=True) as w: mfl.join_aggregate(RadiusOfGyration("2016-01-01", "2016-01-04")) diff --git a/flowmachine/tests/test_location_visits.py b/flowmachine/tests/test_location_visits.py index f78e07247f..4c7fcba701 100644 --- a/flowmachine/tests/test_location_visits.py +++ b/flowmachine/tests/test_location_visits.py @@ -4,7 +4,7 @@ from flowmachine.features import LocationVisits, daily_location, DayTrajectories from flowmachine.utils import list_of_dates -from flowmachine.core.spatial_unit import AdminSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit def test_column_names_location_visits(exemplar_spatial_unit_param): @@ -29,7 +29,9 @@ def test_dl_count_sum_equal_or_less_than_period(get_dataframe): lv = LocationVisits( DayTrajectories( *[ - daily_location(d, spatial_unit=AdminSpatialUnit(level=3), method="last") + daily_location( + d, spatial_unit=admin_spatial_unit(level=3), method="last" + ) for d in list_of_dates(start_date, stop_date) ] ) @@ -43,7 +45,9 @@ def test_dl_count_sum_equal_or_less_than_period(get_dataframe): lv = LocationVisits( DayTrajectories( *[ - daily_location(d, spatial_unit=AdminSpatialUnit(level=3), method="last") + daily_location( + d, spatial_unit=admin_spatial_unit(level=3), method="last" + ) for d in list_of_dates(start_date, stop_date) ] ) diff --git a/flowmachine/tests/test_most_frequent_locations.py b/flowmachine/tests/test_most_frequent_locations.py index 79300a8639..fad6a5a14f 100644 --- a/flowmachine/tests/test_most_frequent_locations.py +++ b/flowmachine/tests/test_most_frequent_locations.py @@ -5,7 +5,7 @@ import pytest from flowmachine.core.spatial_unit import ( - AdminSpatialUnit, + admin_spatial_unit, VersionedSiteSpatialUnit, LatLonSpatialUnit, ) @@ -63,7 +63,7 @@ def test_most_fequent_admin(get_dataframe): mfl = locate_subscribers( "2016-01-01", "2016-01-02", - spatial_unit=AdminSpatialUnit(level=3), + spatial_unit=admin_spatial_unit(level=3), method="most-common", ) df = get_dataframe(mfl) diff --git a/flowmachine/tests/test_radius_of_gyration.py b/flowmachine/tests/test_radius_of_gyration.py index 0b590c70f8..e253cad0d7 100644 --- a/flowmachine/tests/test_radius_of_gyration.py +++ b/flowmachine/tests/test_radius_of_gyration.py @@ -3,7 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. import pytest -from flowmachine.core.spatial_unit import AdminSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.features.subscriber import * @@ -58,7 +58,7 @@ def test_can_be_joined(get_dataframe): """ RoG = RadiusOfGyration("2016-01-01", "2016-01-02") dl = locate_subscribers( - "2016-01-01", "2016-01-02", spatial_unit=AdminSpatialUnit(level=3) + "2016-01-01", "2016-01-02", spatial_unit=admin_spatial_unit(level=3) ) rog_JA = RoG.join_aggregate(dl) df = get_dataframe(rog_JA) diff --git a/flowmachine/tests/test_spatial_aggregate.py b/flowmachine/tests/test_spatial_aggregate.py index 5616a2da8e..51e613ef04 100644 --- a/flowmachine/tests/test_spatial_aggregate.py +++ b/flowmachine/tests/test_spatial_aggregate.py @@ -2,7 +2,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-from flowmachine.core.spatial_unit import AdminSpatialUnit, LatLonSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit, LatLonSpatialUnit from flowmachine.features import ModalLocation, daily_location from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.utils import list_of_dates @@ -15,7 +15,7 @@ def test_can_be_aggregated_admin3(get_dataframe): mfl = locate_subscribers( "2016-01-01", "2016-01-02", - spatial_unit=AdminSpatialUnit(level=3), + spatial_unit=admin_spatial_unit(level=3), method="most-common", ) agg = mfl.aggregate() diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index febea3f643..bbbec00695 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -10,8 +10,8 @@ VersionedCellSpatialUnit, VersionedSiteSpatialUnit, PolygonSpatialUnit, - AdminSpatialUnit, - GridSpatialUnit, + admin_spatial_unit, + grid_spatial_unit, ) import pytest @@ -52,10 +52,10 @@ def test_spatial_unit_column_names(exemplar_spatial_unit_param): }, ["id"], ), - (AdminSpatialUnit, {"level": 3}, ["pcod"]), - (AdminSpatialUnit, {"level": 3, "column_name": "admin3pcod"}, ["pcod"]), - (AdminSpatialUnit, {"level": 3, "column_name": "admin3name"}, ["admin3name"]), - (GridSpatialUnit, {"size": 5}, ["grid_id"]), + (admin_spatial_unit, {"level": 3}, ["pcod"]), + (admin_spatial_unit, {"level": 3, "column_name": "admin3pcod"}, ["pcod"]), + (admin_spatial_unit, {"level": 3, "column_name": "admin3name"}, ["admin3name"]), + (grid_spatial_unit, {"size": 5}, ["grid_id"]), ], ) def test_spatial_unit_location_columns(spatial_unit, kwargs, loc_cols): From 086da03d39165e609b597e0245e7f296cd884150 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 12:52:33 +0100 Subject: [PATCH 049/138] Return an integer from BaseSpatialUnit.__hash__ --- flowmachine/flowmachine/core/spatial_unit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 383ef0c20b..57a4f79822 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -128,7 +128,7 @@ def __eq__(self, other): def __hash__(self): # Must define this because we explicitly define self.__eq__ - return self.md5 + return hash(self.md5) @property def location_columns(self) -> List[str]: From 0cd1e5b334e16a0e01bb2ad389770b061723d5f4 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 13:24:26 +0100 Subject: [PATCH 050/138] Fix flows test --- flowmachine/tests/test_flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/tests/test_flows.py b/flowmachine/tests/test_flows.py index 94f5a1cb3e..a5d7bce107 100644 --- a/flowmachine/tests/test_flows.py +++ b/flowmachine/tests/test_flows.py @@ -30,7 +30,7 @@ def test_flows_raise_error(): Flows() raises error if location levels are different. 
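For reference (not part of the patch), the spatial-aggregate step changed in test_spatial_aggregate.py above, sketched under the same connection assumption.

    from flowmachine.core.spatial_unit import admin_spatial_unit
    from flowmachine.features.subscriber.daily_location import locate_subscribers

    mfl = locate_subscribers(
        "2016-01-01",
        "2016-01-02",
        spatial_unit=admin_spatial_unit(level=3),
        method="most-common",
    )
    agg = mfl.aggregate()  # expected to yield one subscriber count per admin3 region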
""" dl1 = daily_location("2016-01-01", spatial_unit=admin_spatial_unit(level=3)) - dl2 = daily_location("2016-01-01", spatial_unit=admin_spatial_unit(level=3)) + dl2 = daily_location("2016-01-01", spatial_unit=admin_spatial_unit(level=2)) with pytest.raises(ValueError): Flows(dl1, dl2) From 167b4b7515a993f4efda86df2843b862a4a0d091 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 17:09:35 +0100 Subject: [PATCH 051/138] Handle AttributeError in BaseSpatialUnit.__eq__ --- flowmachine/flowmachine/core/spatial_unit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 57a4f79822..449e1b47f6 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -124,7 +124,10 @@ def __init__( # implement a method to check whether the required data can be found in the DB. def __eq__(self, other): - return self.md5 == other.md5 + try: + return self.md5 == other.md5 + except AttributeError: + return False def __hash__(self): # Must define this because we explicitly define self.__eq__ From ae4c4f48c0d4097184263284da5bc039f85a584d Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 17:10:10 +0100 Subject: [PATCH 052/138] Fix docstring --- flowmachine/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 2ea545b486..c231dfc9cc 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -102,7 +102,7 @@ def exemplar_spatial_unit_param(request): Yields ------ - flowmachine.core.spatial_unit.*SpatialUnit or None + flowmachine.core.spatial_unit.*SpatialUnit """ yield request.param[0](**request.param[1]) From 2600bb9229bcd1f33b23d3ceae3b3de770820a2e Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 17:11:32 +0100 Subject: [PATCH 053/138] Test Flows to_geojson_string --- flowmachine/tests/test_geomixin.py | 47 +++++++++++++----------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/flowmachine/tests/test_geomixin.py b/flowmachine/tests/test_geomixin.py index e09d140f9e..28847e98f5 100644 --- a/flowmachine/tests/test_geomixin.py +++ b/flowmachine/tests/test_geomixin.py @@ -17,6 +17,7 @@ from flowmachine.core import Query from flowmachine.core.mixins import GeoDataMixin from flowmachine.core.spatial_unit import ( + CellSpatialUnit, LatLonSpatialUnit, VersionedCellSpatialUnit, VersionedSiteSpatialUnit, @@ -75,36 +76,30 @@ def _geo_augmented_query(self): ManyRows().to_geojson() # This will error if the geojson couldn't be constructed -def test_valid_geojson(): +def test_valid_geojson(exemplar_spatial_unit_param): """ Check that valid geojson is returned. 
""" - test_geojson = [ - daily_location("2016-01-01", "2016-01-02").aggregate(), - daily_location( - "2016-01-01", "2016-01-02", spatial_unit=grid_spatial_unit(size=100) - ).aggregate(), - daily_location( - "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() - ).aggregate(), - daily_location( - "2016-01-01", "2016-01-02", spatial_unit=VersionedSiteSpatialUnit() - ).aggregate(), - daily_location( - "2016-01-01", "2016-01-02", spatial_unit=VersionedCellSpatialUnit() - ).aggregate(), - daily_location( - "2016-01-01", "2016-01-02", spatial_unit=admin_spatial_unit(level=2) - ).aggregate(), - daily_location( - "2016-01-01", - "2016-01-02", - spatial_unit=admin_spatial_unit(level=2, column_name="admin2name"), - ).aggregate(), - ] - for o in test_geojson: - assert geojson.loads(o.to_geojson_string()).is_valid + if CellSpatialUnit() == exemplar_spatial_unit_param: + pytest.skip("Query with spatial_unit=CellSpatialUnit has no geometry.") + dl = daily_location( + "2016-01-01", "2016-01-02", spatial_unit=exemplar_spatial_unit_param + ).aggregate() + assert geojson.loads(dl.to_geojson_string()).is_valid + + +def test_valid_flows_geojson(exemplar_spatial_unit_param): + """ + Check that valid geojson is returned for Flows. + + """ + if CellSpatialUnit() == exemplar_spatial_unit_param: + pytest.skip("Query with spatial_unit=CellSpatialUnit has no geometry.") + dl = daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param) + dl2 = daily_location("2016-01-02", spatial_unit=exemplar_spatial_unit_param) + fl = Flows(dl, dl2) + assert geojson.loads(fl.to_geojson_string()).is_valid def test_correct_geojson(): From c2413e0986ad81ac2ad04c09a0cc908097798dab Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 17:11:53 +0100 Subject: [PATCH 054/138] Test __eq__ method of spatial units --- flowmachine/tests/test_spatial_unit.py | 94 ++++++++++++++++++++++---- 1 file changed, 81 insertions(+), 13 deletions(-) diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index bbbec00695..12a884b2a3 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -20,7 +20,7 @@ def test_spatial_unit_column_names(exemplar_spatial_unit_param): """ Test that the SpatialUnit classes have accurate column_names properties. """ - if isinstance(exemplar_spatial_unit_param, CellSpatialUnit): + if CellSpatialUnit() == exemplar_spatial_unit_param: pytest.skip( "CellSpatialUnit does not have a column_names property (not a Query)" ) @@ -103,18 +103,6 @@ def geo_augment(self, query): ) -def test_geo_augment_columns(exemplar_spatial_unit_param): - """ - Test that the columns returned by the geo_augment method are correct. 
- """ - if isinstance(exemplar_spatial_unit_param, CellSpatialUnit): - pytest.skip("CellSpatialUnit does not have a geo_augment method") - su = exemplar_spatial_unit_param - sql, cols = su.geo_augment(su) - cq = CustomQuery(sql, cols) - assert cq.head(0).columns.tolist() == cols - - @pytest.mark.parametrize( "spatial_unit", [VersionedCellSpatialUnit, VersionedSiteSpatialUnit] ) @@ -129,3 +117,83 @@ def test_distance_matrix_columns(spatial_unit, return_geometry): cols = su.distance_matrix_columns(return_geometry=return_geometry) cq = CustomQuery(sql, cols) assert cq.head(0).columns.tolist() == cols + + +@pytest.mark.parametrize( + "spatial_unit, kwargs", + [ + (admin_spatial_unit, {"level": 2}), + (admin_spatial_unit, {"level": 2, "column_name": "admin2name"}), + (VersionedSiteSpatialUnit, {}), + (VersionedCellSpatialUnit, {}), + (CellSpatialUnit, {}), + (LatLonSpatialUnit, {}), + (grid_spatial_unit, {"size": 5}), + ( + PolygonSpatialUnit, + {"polygon_column_names": "admin3pcod", "polygon_table": "geography.admin3"}, + ), + ( + PolygonSpatialUnit, + { + "polygon_column_names": "id", + "polygon_table": "infrastructure.sites", + "geom_col": "geom_point", + }, + ), + ], +) +def test_spatial_unit_equals_itself(spatial_unit, kwargs): + """ + Test that instances of the SpatialUnit classes are equal to themselves. + """ + su1 = spatial_unit(**kwargs) + su2 = spatial_unit(**kwargs) + assert su1 == su2 + assert hash(su1) == hash(su2) + + +def test_cell_spatial_unit_not_equal_to_other_spatial_unit(): + """ + Test that a CellSpatialUnit is not equal to a VersionedCellSpatialUnit. + """ + su1 = CellSpatialUnit() + su2 = VersionedCellSpatialUnit() + assert su1 != su2 + assert su2 != su1 + + +def test_different_spatial_units_are_not_equal(): + """ + Test that two different spatial units are not equal. + """ + su1 = VersionedCellSpatialUnit() + su2 = VersionedSiteSpatialUnit() + assert su1 != su2 + + +def test_different_level_admin_spatial_units_are_not_equal(): + """ + Test that two admin spatial units with different levels are not equal. + """ + su1 = admin_spatial_unit(level=1) + su2 = admin_spatial_unit(level=3) + assert su1 != su2 + + +def test_different_column_name_admin_spatial_units_are_not_equal(): + """ + Test that two admin spatial units with different column_names are not equal. + """ + su1 = admin_spatial_unit(level=3, column_name="admin3pcod") + su2 = admin_spatial_unit(level=3, column_name="admin3name") + assert su1 != su2 + + +def test_different_grid_spatial_units_are_not_equal(): + """ + Test that two grid spatial units with different sizes are not equal. 
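The __eq__/__hash__ contract these patches adopt, shown here (not patch content) on a self-contained toy class rather than the real query objects, so it runs without a database connection.

    class Cacheable:
        """Toy stand-in for a query object identified by an md5 digest."""

        def __init__(self, md5):
            self.md5 = md5

        def __eq__(self, other):
            # Comparing with an object that has no md5 attribute is False, not an error
            try:
                return self.md5 == other.md5
            except AttributeError:
                return False

        def __hash__(self):
            # __hash__ must return an int, so hash the digest string
            return hash(self.md5)

    assert Cacheable("abc") == Cacheable("abc")
    assert Cacheable("abc") != "not a query"
    assert len({Cacheable("abc"), Cacheable("abc")}) == 1  # equal objects collapse in a set

The real spatial units follow the same pattern, which is what the hash(su1) == hash(su2) assertions above check.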
+ """ + su1 = grid_spatial_unit(size=5) + su2 = grid_spatial_unit(size=50) + assert su1 != su2 From a18f6e297de9c938130d6fac691b313b1e2994ae Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 3 Jun 2019 17:13:17 +0100 Subject: [PATCH 055/138] Fix Flows._geo_augmented_query for multiple location columns --- flowmachine/flowmachine/features/location/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/features/location/flows.py b/flowmachine/flowmachine/features/location/flows.py index 261e8ef8b0..7b33fb7408 100644 --- a/flowmachine/flowmachine/features/location/flows.py +++ b/flowmachine/flowmachine/features/location/flows.py @@ -130,12 +130,12 @@ def _geo_augmented_query(self): agg_qry = f""" WITH flows AS ({self.get_query()}) select {loc_cols_string}, json_strip_nulls(outflows) as outflows, json_strip_nulls(inflows) as inflows FROM - (SELECT {loc_cols_from_aliased_string}, json_object_agg({loc_cols_to_string}, count) AS outflows + (SELECT {loc_cols_from_aliased_string}, json_object_agg({loc_cols[0]}_to, count) AS outflows FROM flows GROUP BY {loc_cols_from_string} ) x FULL JOIN - (SELECT {loc_cols_to_aliased_string}, json_object_agg({loc_cols_from_string}, count) AS inflows + (SELECT {loc_cols_to_aliased_string}, json_object_agg({loc_cols[0]}_from, count) AS inflows FROM flows GROUP BY {loc_cols_to_string} ) y From 60cdb6a63a470e301071edf4717276ea1829717d Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 09:55:59 +0100 Subject: [PATCH 056/138] Remove distance_matrix_query method --- flowmachine/flowmachine/core/spatial_unit.py | 98 ------------------- .../features/spatial/distance_matrix.py | 78 +++++++++++++-- .../tests/test_spatial_distancematrix.py | 21 +++- flowmachine/tests/test_spatial_unit.py | 16 --- 4 files changed, 86 insertions(+), 127 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 449e1b47f6..7ae5c27748 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -163,38 +163,6 @@ def geo_augment(self, sql_query): """ raise NotImplementedError - def distance_matrix_query(self, return_geometry): - """ - A query that calculates the complete distance matrix between all - elements of this spatial unit. Distance is returned in km. - - Parameters - ---------- - return_geometry : bool - If True, geometries are returned in query - (represented as WKB in a dataframe) - - Returns - ------- - str - SQL query string - - """ - raise NotImplementedError( - f"Spatial units of type {type(self).__name__} do not support distance_matrix_query at this time." 
- ) - - def distance_matrix_columns(self, return_geometry=False): - """ - List of columns for self.distance_matrix_query - """ - col_names = [f"{c}_from" for c in self.location_columns] - col_names += [f"{c}_to" for c in self.location_columns] - col_names += ["distance"] - if return_geometry: - col_names += ["geom_origin", "geom_destination"] - return col_names - def _make_query(self): columns = ", ".join(self._cols) sql = f""" @@ -270,39 +238,6 @@ def geo_augment(self, sql_query): """ return sql - def distance_matrix_query(self, return_geometry=False): - return_geometry_statement = "" - if return_geometry: - return_geometry_statement = """ - , - A.geom_point AS geom_origin, - B.geom_point AS geom_destination - """ - - sql = f""" - - SELECT - A.id AS location_id_from, - A.version AS version_from, - B.id AS location_id_to, - B.version AS version_to, - ST_X(A.geom_point::geometry) AS lon_from, - ST_Y(A.geom_point::geometry) AS lat_from, - ST_X(B.geom_point::geometry) AS lon_to, - ST_Y(B.geom_point::geometry) AS lat_to, - ST_Distance( - A.geom_point::geography, - B.geom_point::geography - ) / 1000 AS distance - {return_geometry_statement} - FROM infrastructure.cells AS A - CROSS JOIN infrastructure.cells AS B - ORDER BY distance DESC - - """ - - return sql - class VersionedSiteSpatialUnit(BaseSpatialUnit): """ @@ -357,39 +292,6 @@ def geo_augment(self, sql_query): """ return sql - def distance_matrix_query(self, return_geometry=False): - return_geometry_statement = "" - if return_geometry: - return_geometry_statement = """ - , - A.geom_point AS geom_origin, - B.geom_point AS geom_destination - """ - - sql = f""" - - SELECT - A.id AS site_id_from, - A.version AS version_from, - B.id AS site_id_to, - B.version AS version_to, - ST_X(A.geom_point::geometry) AS lon_from, - ST_Y(A.geom_point::geometry) AS lat_from, - ST_X(B.geom_point::geometry) AS lon_to, - ST_Y(B.geom_point::geometry) AS lat_to, - ST_Distance( - A.geom_point::geography, - B.geom_point::geography - ) / 1000 AS distance - {return_geometry_statement} - FROM infrastructure.sites AS A - CROSS JOIN infrastructure.sites AS B - ORDER BY distance DESC - - """ - - return sql - class PolygonSpatialUnit(BaseSpatialUnit): """ diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 3ce1b0e6f7..781ec346f2 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -9,6 +9,7 @@ """ from typing import List +from flowmachine.utils import get_name_and_alias from ...core.query import Query from ...core.mixins import GraphMixin from ...core.spatial_unit import VersionedSiteSpatialUnit, VersionedCellSpatialUnit @@ -45,22 +46,81 @@ def __init__(self, spatial_unit=None, return_geometry=False): self.spatial_unit = VersionedCellSpatialUnit() else: self.spatial_unit = spatial_unit - if type(self.spatial_unit) not in { - VersionedSiteSpatialUnit, - VersionedCellSpatialUnit, - }: + + self.location_id_cols = set(self.spatial_unit.location_columns) + try: + self.location_id_cols.remove("lat") + self.location_id_cols.remove("lon") + except KeyError: raise ValueError("Only point locations are supported at this time.") + self.return_geometry = return_geometry super().__init__() @property def column_names(self) -> List[str]: - return self.spatial_unit.distance_matrix_columns( - return_geometry=self.return_geometry - ) + col_names = [f"{c}_from" for c in self.location_id_cols] + col_names += 
[f"{c}_to" for c in self.location_id_cols] + col_names += ["lon_from", "lat_from", "lon_to", "lat_to", "distance"] + if self.return_geometry: + col_names += ["geom_origin", "geom_destination"] + return col_names def _make_query(self): - return self.spatial_unit.distance_matrix_query( - return_geometry=self.return_geometry + # FIXME: Accessing a "private" attribute of self.spatial_unit here + names_for_loc_id_col_aliases = [ + c + for c in self.spatial_unit._cols + if get_name_and_alias(c)[1] in self.location_id_cols + ] + + cols_A = ",".join( + [ + f"A.{get_name_and_alias(c)[0].split('.')[-1]} AS {get_name_and_alias(c)[1]}_from" + for c in names_for_loc_id_col_aliases + ] + ) + if cols_A != "": + cols_A += "," + cols_B = ",".join( + [ + f"B.{get_name_and_alias(c)[0].split('.')[-1]} AS {get_name_and_alias(c)[1]}_to" + for c in names_for_loc_id_col_aliases + ] ) + if cols_B != "": + cols_B += "," + + locinfo_table = get_name_and_alias(self.spatial_unit.location_info_table)[0] + + if self.return_geometry: + return_geometry_statement = """ + , + A.geom_point AS geom_origin, + B.geom_point AS geom_destination + """ + else: + return_geometry_statement = "" + + sql = f""" + + SELECT + {cols_A} + {cols_B} + ST_X(A.geom_point::geometry) AS lon_from, + ST_Y(A.geom_point::geometry) AS lat_from, + ST_X(B.geom_point::geometry) AS lon_to, + ST_Y(B.geom_point::geometry) AS lat_to, + ST_Distance( + A.geom_point::geography, + B.geom_point::geography + ) / 1000 AS distance + {return_geometry_statement} + FROM {locinfo_table} AS A + CROSS JOIN {locinfo_table} AS B + ORDER BY distance DESC + + """ + + return sql diff --git a/flowmachine/tests/test_spatial_distancematrix.py b/flowmachine/tests/test_spatial_distancematrix.py index faad0b8bac..0badd6da51 100644 --- a/flowmachine/tests/test_spatial_distancematrix.py +++ b/flowmachine/tests/test_spatial_distancematrix.py @@ -6,9 +6,14 @@ Tests for the DistanceMatrix() class. """ +import pytest from flowmachine.features.spatial import DistanceMatrix -from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit +from flowmachine.core.spatial_unit import ( + VersionedCellSpatialUnit, + VersionedSiteSpatialUnit, + LatLonSpatialUnit, +) def test_some_results(get_dataframe): @@ -23,9 +28,17 @@ def test_some_results(get_dataframe): assert round(set_df.loc["8wPojr"]["distance"].values[4]) == 758 -def test_result_has_correct_length(get_length): +@pytest.mark.parametrize( + "spatial_unit_type, length", + [ + (VersionedCellSpatialUnit, 62), + (VersionedSiteSpatialUnit, 35), + (LatLonSpatialUnit, 62), + ], +) +def test_result_has_correct_length(spatial_unit_type, length, get_length): """ DistanceMatrix() has the correct length. """ - c = DistanceMatrix(spatial_unit=VersionedSiteSpatialUnit()) - assert get_length(c) == 35 ** 2 + c = DistanceMatrix(spatial_unit=spatial_unit_type()) + assert get_length(c) == length ** 2 diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 12a884b2a3..a1d6faa85a 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -103,22 +103,6 @@ def geo_augment(self, query): ) -@pytest.mark.parametrize( - "spatial_unit", [VersionedCellSpatialUnit, VersionedSiteSpatialUnit] -) -@pytest.mark.parametrize("return_geometry", [True, False]) -def test_distance_matrix_columns(spatial_unit, return_geometry): - """ - Test that the columns returned by the distance_matrix_columns method match - the columns of the distance_matrix_query. 
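A sketch (not part of the patch) of the reworked DistanceMatrix in use, under the same connection assumption; the 35**2 and 62**2 row counts asserted above come from the synthetic test dataset.

    from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit
    from flowmachine.features.spatial import DistanceMatrix

    # Pairwise distances in km between all versioned sites (ST_Distance on geography)
    dm = DistanceMatrix(spatial_unit=VersionedSiteSpatialUnit())
    df = dm.get_dataframe()
    assert {"site_id_from", "site_id_to", "distance"} <= set(df.columns)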
- """ - su = spatial_unit() - sql = su.distance_matrix_query(return_geometry=return_geometry) - cols = su.distance_matrix_columns(return_geometry=return_geometry) - cq = CustomQuery(sql, cols) - assert cq.head(0).columns.tolist() == cols - - @pytest.mark.parametrize( "spatial_unit, kwargs", [ From feba6aac86bfa54a9a256554b23ce8630e84c1db Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 12:11:37 +0100 Subject: [PATCH 057/138] Rename 'geom_col', and use a Table instead of GeoTable --- flowmachine/flowmachine/core/spatial_unit.py | 23 +++++++++++--------- flowmachine/tests/test_join_to_location.py | 2 +- flowmachine/tests/test_spatial_unit.py | 8 +++---- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 7ae5c27748..dd9e0b954d 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -19,7 +19,7 @@ A custom set of polygons that live in the database. Takes the parameters polygon_column_names, which is the columns you want to return after the join, and polygon_table, the table where the polygons - reside (with the schema), and additionally geom_col which is the column + reside (with the schema), and additionally geom_column which is the column with the geometry information (will default to 'geom'). admin_spatial_unit: An admin region of interest, such as admin3. Must live in the database @@ -34,7 +34,7 @@ from abc import ABCMeta, abstractmethod from flowmachine.utils import get_name_and_alias -from . import Query, GeoTable +from . import Query, Table from .grid import Grid @@ -143,6 +143,9 @@ def location_columns(self) -> List[str]: @property def column_names(self) -> List[str]: return [get_name_and_alias(c)[1].split(".").pop() for c in self._cols] + + def get_geom_sql(self): + @abstractmethod def geo_augment(self, sql_query): @@ -307,19 +310,19 @@ class PolygonSpatialUnit(BaseSpatialUnit): name of the table containing the geography information. Can be either the name of a table, with the schema, a flowmachine.Query object, or a string representing a query. - geom_col : str, default 'geom' + geom_column : str, default 'geom' column that defines the geography. 
""" - def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): + def __init__(self, *, polygon_column_names, polygon_table, geom_column="geom"): if isinstance(polygon_table, Query): self.polygon_table = polygon_table else: - # Creating a GeoTable object here means that we don't have to handle + # Creating a Table object here means that we don't have to handle # admin tables and Grid objects differently in join_clause and self.geo_augment - self.polygon_table = GeoTable(name=polygon_table, geom_column=geom_col) + self.polygon_table = Table(name=polygon_table) - self.geom_col = geom_col + self.geom_column = geom_column location_info_table = self.connection.location_table @@ -338,7 +341,7 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_col="geom"): ({self.polygon_table.get_query()}) AS {joined_alias} ON ST_within( {locinfo_alias}.geom_point::geometry, - ST_SetSRID({joined_alias}.{self.geom_col}, 4326)::geometry + ST_SetSRID({joined_alias}.{self.geom_column}, 4326)::geometry ) """ @@ -371,7 +374,7 @@ def geo_augment(self, sql_query): sql = f""" SELECT row_number() OVER () as gid, - {self.geom_col} AS geom, + {self.geom_column} AS geom, U.* FROM ({sql_query}) AS U LEFT JOIN ({self.polygon_table.get_query()}) AS G @@ -432,5 +435,5 @@ def grid_spatial_unit(*, size): return PolygonSpatialUnit( polygon_column_names=["grid_id"], polygon_table=Grid(size), - geom_col="geom_square", + geom_column="geom_square", ) diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 67385369a8..db01b37614 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -120,7 +120,7 @@ def test_join_with_polygon(get_dataframe, get_length): spatial_unit=PolygonSpatialUnit( polygon_column_names="admin3pcod", polygon_table="geography.admin3", - geom_col="geom", + geom_column="geom", ), ) df = get_dataframe(j) diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index a1d6faa85a..6be9aac11c 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -39,7 +39,7 @@ def test_spatial_unit_column_names(exemplar_spatial_unit_param): { "polygon_column_names": "id", "polygon_table": "infrastructure.sites", - "geom_col": "geom_point", + "geom_column": "geom_point", }, ["id"], ), @@ -48,7 +48,7 @@ def test_spatial_unit_column_names(exemplar_spatial_unit_param): { "polygon_column_names": ["id"], "polygon_table": "infrastructure.sites", - "geom_col": "geom_point", + "geom_column": "geom_point", }, ["id"], ), @@ -75,7 +75,7 @@ def test_polygon_spatial_unit_column_list(): psu = PolygonSpatialUnit( polygon_column_names=passed_cols, polygon_table="infrastructure.sites", - geom_col="geom_point", + geom_column="geom_point", ) loc_cols = psu.location_columns assert passed_cols == loc_cols @@ -122,7 +122,7 @@ def geo_augment(self, query): { "polygon_column_names": "id", "polygon_table": "infrastructure.sites", - "geom_col": "geom_point", + "geom_column": "geom_point", }, ), ], From 59c86b86860e269fcd8ae9627eaec737e5916ddb Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 12:13:06 +0100 Subject: [PATCH 058/138] Don't leave bits of incomplete functions lying around --- flowmachine/flowmachine/core/spatial_unit.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index dd9e0b954d..38b92b2bae 100644 --- 
a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -143,9 +143,6 @@ def location_columns(self) -> List[str]: @property def column_names(self) -> List[str]: return [get_name_and_alias(c)[1].split(".").pop() for c in self._cols] - - def get_geom_sql(self): - @abstractmethod def geo_augment(self, sql_query): From 38205499b8c9ce23249f6a82e7febe2f15b0fe8f Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 15:14:27 +0100 Subject: [PATCH 059/138] Add 'get_geom_query' method --- flowmachine/flowmachine/core/spatial_unit.py | 55 ++++++++++++++++---- flowmachine/tests/conftest.py | 2 +- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 38b92b2bae..fc92288be1 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -80,6 +80,8 @@ class BaseSpatialUnit(Query, metaclass=ABCMeta): location_info_table : str, optional Fully qualified name of the location info table to select from. Defaults to self.connection.location_table + geom_column : str, default "geom" + Name of the column that defines the geometry in location_info_table. join_clause : str, optional Optionally provide a SQL join clause to join data from the location info table to spatial regions in another table. @@ -91,6 +93,7 @@ def __init__( selected_column_names, location_column_names, location_info_table=None, + geom_column="geom", join_clause="", ): if type(selected_column_names) is str: @@ -115,6 +118,8 @@ def __init__( else: self.location_info_table = self.connection.location_table + self._geom_column = geom_column + self._join_clause = join_clause super().__init__() @@ -136,7 +141,7 @@ def __hash__(self): @property def location_columns(self) -> List[str]: """ - List of the location-related column names. + List of names of the columns which identify the locations. """ return self._loc_cols @@ -144,6 +149,20 @@ def location_columns(self) -> List[str]: def column_names(self) -> List[str]: return [get_name_and_alias(c)[1].split(".").pop() for c in self._cols] + def get_geom_query(self): + """ + Returns a SQL query which can be used to map locations (identified by + the values in self.location_columns) to their geometries (in a column + named "geom"). + """ + columns = [ + c for c in self._cols if get_name_and_alias(c)[1] in self.location_columns + ] + [f"{self._geom_column} AS geom"] + + sql = f"SELECT {','.join(columns)} FROM {self.location_info_table}" + + return sql, self.location_columns + ["geom"] + @abstractmethod def geo_augment(self, sql_query): """ @@ -190,6 +209,7 @@ def __init__(self): "ST_Y(geom_point::geometry) AS lat", ], location_column_names=["lat", "lon"], + geom_column="geom_point", ) def geo_augment(self, sql_query): @@ -223,6 +243,7 @@ def __init__(self): ], location_column_names=["location_id", "version", "lon", "lat"], location_info_table="infrastructure.cells", + geom_column="geom_point", ) def geo_augment(self, sql_query): @@ -276,6 +297,7 @@ def __init__(self): ], location_column_names=["site_id", "version", "lon", "lat"], location_info_table=f"infrastructure.sites AS {sites_alias}", + geom_column="geom_point", join_clause=join_clause, ) @@ -308,7 +330,7 @@ class PolygonSpatialUnit(BaseSpatialUnit): Can be either the name of a table, with the schema, a flowmachine.Query object, or a string representing a query. geom_column : str, default 'geom' - column that defines the geography. 
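Illustrative only (not patch content): how a caller might consume the new get_geom_query hook, which returns a SQL string plus the column names it exposes; same connection assumption, and the expected column list is inferred from the admin3 tests elsewhere in this series.

    from flowmachine.core.spatial_unit import admin_spatial_unit

    su = admin_spatial_unit(level=3)
    geom_sql, geom_cols = su.get_geom_query()
    # For the default admin3 unit the location column is aliased to "pcod",
    # so geom_cols is expected to be ["pcod", "geom"]
    print(geom_cols)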
+ Name of the column in polygon_table that defines the geography. """ def __init__(self, *, polygon_column_names, polygon_table, geom_column="geom"): @@ -319,8 +341,6 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_column="geom"): # admin tables and Grid objects differently in join_clause and self.geo_augment self.polygon_table = Table(name=polygon_table) - self.geom_column = geom_column - location_info_table = self.connection.location_table locinfo_alias = "locinfo" @@ -338,7 +358,7 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_column="geom"): ({self.polygon_table.get_query()}) AS {joined_alias} ON ST_within( {locinfo_alias}.geom_point::geometry, - ST_SetSRID({joined_alias}.{self.geom_column}, 4326)::geometry + ST_SetSRID({joined_alias}.{geom_column}, 4326)::geometry ) """ @@ -349,29 +369,42 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_column="geom"): f"{locinfo_alias}.date_of_last_service AS date_of_last_service", ] if type(polygon_column_names) is str: - self.polygon_column_names = [polygon_column_names] + self._polygon_column_names = [polygon_column_names] else: - self.polygon_column_names = polygon_column_names + self._polygon_column_names = polygon_column_names all_column_names = locinfo_column_names + [ - f"{joined_alias}.{c}" for c in self.polygon_column_names + f"{joined_alias}.{c}" for c in self._polygon_column_names ] location_column_names = [ - get_name_and_alias(c)[1] for c in self.polygon_column_names + get_name_and_alias(c)[1] for c in self._polygon_column_names ] super().__init__( selected_column_names=all_column_names, location_column_names=location_column_names, location_info_table=f"{location_info_table} AS {locinfo_alias}", + geom_column=geom_column, join_clause=join_clause, ) + def get_geom_query(self): + """ + Returns a SQL query which can be used to map locations (identified by + the values in self.location_columns) to their geometries (in a column + named "geom"). 
+ """ + columns = self._polygon_column_names + [f"{self._geom_column} AS geom"] + + sql = f"SELECT {','.join(columns)} FROM ({self.polygon_table.get_query()}) AS polygon" + + return sql, self.location_columns + ["geom"] + def geo_augment(self, sql_query): - r_col_name, l_col_name = get_name_and_alias(self.polygon_column_names[0]) + r_col_name, l_col_name = get_name_and_alias(self._polygon_column_names[0]) sql = f""" SELECT row_number() OVER () as gid, - {self.geom_column} AS geom, + {self._geom_column} AS geom, U.* FROM ({sql_query}) AS U LEFT JOIN ({self.polygon_table.get_query()}) AS G diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index c231dfc9cc..91b61d55fe 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -89,7 +89,7 @@ def exemplar_level_param(request): { "polygon_column_names": "id", "polygon_table": "infrastructure.sites", - "geom_col": "geom_point", + "geom_column": "geom_point", }, ), ], From e45663cc1ba91616b338ddaf66f3b52668a3ccd0 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 15:14:56 +0100 Subject: [PATCH 060/138] Use 'get_geom_query' method in DistanceMatrix --- .../features/spatial/distance_matrix.py | 68 ++++++------------- 1 file changed, 20 insertions(+), 48 deletions(-) diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 781ec346f2..37f003f9dd 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -9,7 +9,6 @@ """ from typing import List -from flowmachine.utils import get_name_and_alias from ...core.query import Query from ...core.mixins import GraphMixin from ...core.spatial_unit import VersionedSiteSpatialUnit, VersionedCellSpatialUnit @@ -22,17 +21,14 @@ class DistanceMatrix(GraphMixin, Query): computation of distance travelled, area of influence, and other features. - This is a wrapper around the SpatialUnit.distance_matrix_query method. - Distance is returned in km. Parameters ---------- spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default VersionedCellSpatialUnit() Locations to compute distances for. - Note: only VersionedCellSpatialUnit and VersionedSiteSpatialUnit are - supported at this time. - + Note: only point locations (i.e. spatial units with "lat" and "lon" + included in location_columns) are supported at this time. return_geometry : bool If True, geometries are returned in query (represented as WKB in a dataframe). 
This @@ -47,11 +43,10 @@ def __init__(self, spatial_unit=None, return_geometry=False): else: self.spatial_unit = spatial_unit - self.location_id_cols = set(self.spatial_unit.location_columns) - try: - self.location_id_cols.remove("lat") - self.location_id_cols.remove("lon") - except KeyError: + if not ( + "lat" in self.spatial_unit.location_columns + and "lon" in self.spatial_unit.location_columns + ): raise ValueError("Only point locations are supported at this time.") self.return_geometry = return_geometry @@ -60,67 +55,44 @@ def __init__(self, spatial_unit=None, return_geometry=False): @property def column_names(self) -> List[str]: - col_names = [f"{c}_from" for c in self.location_id_cols] - col_names += [f"{c}_to" for c in self.location_id_cols] - col_names += ["lon_from", "lat_from", "lon_to", "lat_to", "distance"] + col_names = [f"{c}_from" for c in self.spatial_unit.location_columns] + col_names += [f"{c}_to" for c in self.spatial_unit.location_columns] + col_names += ["distance"] if self.return_geometry: col_names += ["geom_origin", "geom_destination"] return col_names def _make_query(self): - # FIXME: Accessing a "private" attribute of self.spatial_unit here - names_for_loc_id_col_aliases = [ - c - for c in self.spatial_unit._cols - if get_name_and_alias(c)[1] in self.location_id_cols - ] - cols_A = ",".join( - [ - f"A.{get_name_and_alias(c)[0].split('.')[-1]} AS {get_name_and_alias(c)[1]}_from" - for c in names_for_loc_id_col_aliases - ] + [f"A.{c} AS {c}_from" for c in self.spatial_unit.location_columns] ) - if cols_A != "": - cols_A += "," cols_B = ",".join( - [ - f"B.{get_name_and_alias(c)[0].split('.')[-1]} AS {get_name_and_alias(c)[1]}_to" - for c in names_for_loc_id_col_aliases - ] + [f"B.{c} AS {c}_to" for c in self.spatial_unit.location_columns] ) - if cols_B != "": - cols_B += "," - locinfo_table = get_name_and_alias(self.spatial_unit.location_info_table)[0] + geom_query, _ = self.spatial_unit.get_geom_query() if self.return_geometry: return_geometry_statement = """ , - A.geom_point AS geom_origin, - B.geom_point AS geom_destination + A.geom AS geom_origin, + B.geom AS geom_destination """ else: return_geometry_statement = "" sql = f""" - SELECT - {cols_A} - {cols_B} - ST_X(A.geom_point::geometry) AS lon_from, - ST_Y(A.geom_point::geometry) AS lat_from, - ST_X(B.geom_point::geometry) AS lon_to, - ST_Y(B.geom_point::geometry) AS lat_to, + {cols_A}, + {cols_B}, ST_Distance( - A.geom_point::geography, - B.geom_point::geography + A.geom::geography, + B.geom::geography ) / 1000 AS distance {return_geometry_statement} - FROM {locinfo_table} AS A - CROSS JOIN {locinfo_table} AS B + FROM ({geom_query}) AS A + CROSS JOIN ({geom_query}) AS B ORDER BY distance DESC - """ return sql From 41d2868cefb9887cfa0f990759b75741653acdcd Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 16:03:27 +0100 Subject: [PATCH 061/138] Use 'get_geom_query' in GeoDataMixin --- .../flowmachine/core/mixins/geodata_mixin.py | 13 ++++++++++++- flowmachine/flowmachine/core/spatial_unit.py | 4 +++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/core/mixins/geodata_mixin.py b/flowmachine/flowmachine/core/mixins/geodata_mixin.py index c2b0074e1e..b0f16cff00 100644 --- a/flowmachine/flowmachine/core/mixins/geodata_mixin.py +++ b/flowmachine/flowmachine/core/mixins/geodata_mixin.py @@ -83,7 +83,18 @@ def _geo_augmented_query(self): The columns this query contains """ loc_join = self._get_location_join() - sql = 
loc_join.spatial_unit.geo_augment(self.get_query()) + spatial_unit = loc_join.spatial_unit + join_columns_string = ",".join(spatial_unit.location_columns) + geom_query, _ = spatial_unit.get_geom_query() + + sql = f""" + SELECT + row_number() over() AS gid, + * + FROM ({self.get_query()}) AS Q + LEFT JOIN ({geom_query}) AS G + USING ({join_columns_string}) + """ cols = list(set(self.column_names + ["gid", "geom"])) return sql, cols diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index fc92288be1..71c30bf836 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -395,7 +395,9 @@ def get_geom_query(self): """ columns = self._polygon_column_names + [f"{self._geom_column} AS geom"] - sql = f"SELECT {','.join(columns)} FROM ({self.polygon_table.get_query()}) AS polygon" + sql = f""" + SELECT {','.join(columns)} FROM ({self.polygon_table.get_query()}) AS polygon + """ return sql, self.location_columns + ["geom"] From 15739bea8cb87c9efb3fe89e22ebbb4174640059 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 17:18:05 +0100 Subject: [PATCH 062/138] Remove _get_location_join --- .../flowmachine/core/mixins/geodata_mixin.py | 49 ++++--------------- 1 file changed, 9 insertions(+), 40 deletions(-) diff --git a/flowmachine/flowmachine/core/mixins/geodata_mixin.py b/flowmachine/flowmachine/core/mixins/geodata_mixin.py index b0f16cff00..8586a1cc0d 100644 --- a/flowmachine/flowmachine/core/mixins/geodata_mixin.py +++ b/flowmachine/flowmachine/core/mixins/geodata_mixin.py @@ -33,42 +33,6 @@ class GeoDataMixin: type queries. """ - def _get_location_join(self): - """ - Utility method which searches the query tree for location - information. - - Returns - ------- - JoinToLocation - The first JoinToLocation object encountered in the query tree. - - """ - open = set(self.dependencies) - closed = set() - while True: - try: - qur = open.pop() - closed.add(qur) - - # - # This will check if the passed query - # is an instance of the JoinToLocation class. - # We don't check for the instance directly - # because of import issues. This isn't an - # ideal solution. - # - # - Luis Capelo, June 22, 2017 - # - if "JoinToLocation" in str(getattr(qur, "__class__", lambda: None)): - return qur - - open.update(qur.dependencies - closed) - - except KeyError: - logger.warning("No JoinToLocation object found.") - break - def _geo_augmented_query(self): """ Creates a version of this query augmented with a geom column, @@ -82,10 +46,15 @@ def _geo_augmented_query(self): list The columns this query contains """ - loc_join = self._get_location_join() - spatial_unit = loc_join.spatial_unit - join_columns_string = ",".join(spatial_unit.location_columns) - geom_query, _ = spatial_unit.get_geom_query() + join_columns_string = ",".join(self.spatial_unit.location_columns) + + try: + geom_query, _ = self.spatial_unit.get_geom_query() + except AttributeError: + raise ValueError( + f"Query {self} with spatial_unit of type " + f"{type(self.spatial_unit)} has no geography information." 
+ ) sql = f""" SELECT From 2878fa554f7e6d0e17fad1b5ebf75671dbd4c673 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 17:19:35 +0100 Subject: [PATCH 063/138] Use get_geom_query in Flows._geo_augmented_query --- .../flowmachine/features/location/flows.py | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/flowmachine/flowmachine/features/location/flows.py b/flowmachine/flowmachine/features/location/flows.py index 7b33fb7408..e0b8640069 100644 --- a/flowmachine/flowmachine/features/location/flows.py +++ b/flowmachine/flowmachine/features/location/flows.py @@ -129,20 +129,46 @@ def _geo_augmented_query(self): agg_qry = f""" WITH flows AS ({self.get_query()}) - select {loc_cols_string}, json_strip_nulls(outflows) as outflows, json_strip_nulls(inflows) as inflows FROM - (SELECT {loc_cols_from_aliased_string}, json_object_agg({loc_cols[0]}_to, count) AS outflows - FROM flows - GROUP BY {loc_cols_from_string} + SELECT + {loc_cols_string}, + json_strip_nulls(outflows) as outflows, + json_strip_nulls(inflows) as inflows + FROM + ( + SELECT + {loc_cols_from_aliased_string}, + json_object_agg({loc_cols[0]}_to, count) AS outflows + FROM flows + GROUP BY {loc_cols_from_string} ) x FULL JOIN - (SELECT {loc_cols_to_aliased_string}, json_object_agg({loc_cols[0]}_from, count) AS inflows - FROM flows - GROUP BY {loc_cols_to_string} + ( + SELECT + {loc_cols_to_aliased_string}, + json_object_agg({loc_cols[0]}_from, count) AS inflows + FROM flows + GROUP BY {loc_cols_to_string} ) y USING ({loc_cols_string}) """ - joined_query = self.spatial_unit.geo_augment(agg_qry) + try: + geom_query, _ = self.spatial_unit.get_geom_query() + except AttributeError: + raise ValueError( + f"Query {self} with spatial_unit of type " + f"{type(self.spatial_unit)} has no geography information." 
+ ) + + joined_query = f""" + SELECT + row_number() over() AS gid, + * + FROM ({agg_qry}) AS Q + LEFT JOIN ({geom_query}) AS G + USING ({loc_cols_string}) + """ + return joined_query, loc_cols + ["outflows", "inflows", "geom", "gid"] From 21e2d7e7df72f9c51a4263055effd436eae6e823 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 17:20:15 +0100 Subject: [PATCH 064/138] Add spatial_unit attribute to AggregateNetworkObjects --- .../flowmachine/features/network/total_network_objects.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index e8705a15fd..b4fa7ec4e4 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -227,15 +227,16 @@ def __init__(self, *, total_network_objects, statistic="avg", aggregate_by=None) raise ValueError( "{} is not a valid aggregate_by value.".format(self.aggregate_by) ) + self.spatial_unit = self.total_objs.spatial_unit super().__init__() @property def column_names(self) -> List[str]: - return self.total_objs.spatial_unit.location_columns + ["value", "datetime"] + return self.spatial_unit.location_columns + ["value", "datetime"] def _make_query(self): - group_cols = ",".join(self.total_objs.spatial_unit.location_columns) + group_cols = ",".join(self.spatial_unit.location_columns) if self.statistic == "mode": av_call = f"pg_catalog.mode() WITHIN GROUP(ORDER BY z.value)" else: From bb2cca0cdb392e437cdc7253ecea27292c065821 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 17:23:09 +0100 Subject: [PATCH 065/138] Add tests for _geo_augmented_query raising an error, and move Flows geojson tests to test_flows.py --- flowmachine/tests/test_flows.py | 51 ++++++++++++++++++++++++++++++ flowmachine/tests/test_geomixin.py | 46 ++++++--------------------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/flowmachine/tests/test_flows.py b/flowmachine/tests/test_flows.py index a5d7bce107..323929a8f9 100644 --- a/flowmachine/tests/test_flows.py +++ b/flowmachine/tests/test_flows.py @@ -4,12 +4,14 @@ import json import os +import geojson import pytest from flowmachine.core.spatial_unit import admin_spatial_unit from flowmachine.features import daily_location from flowmachine.features.location.flows import * from flowmachine.features.subscriber.daily_location import locate_subscribers +from flowmachine.core.spatial_unit import CellSpatialUnit pytestmark = pytest.mark.usefixtures("skip_datecheck") @@ -95,3 +97,52 @@ def test_flows_geojson_correct(): with open(reference_file) as ref: ref_json = json.load(ref) assert ref_json == fl_json + + +def test_valid_flows_geojson(exemplar_spatial_unit_param): + """ + Check that valid geojson is returned for Flows. + + """ + if CellSpatialUnit() == exemplar_spatial_unit_param: + pytest.skip("Query with spatial_unit=CellSpatialUnit has no geometry.") + dl = daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param) + dl2 = daily_location("2016-01-02", spatial_unit=exemplar_spatial_unit_param) + fl = Flows(dl, dl2) + assert geojson.loads(fl.to_geojson_string()).is_valid + + +def test_flows_geo_augmented_query_raises_error(): + """ + Test that a ValueError is raised when attempting to get geojson for a flows + query with no geography data. 
+ """ + dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) + dl2 = daily_location("2016-01-02", spatial_unit=CellSpatialUnit()) + fl = Flows(dl, dl2) + with pytest.raises(ValueError): + fl.to_geojson_string() + + +def test_flows_geojson(get_dataframe): + """ + Test geojson works for flows with non-standard column names. + """ + + dl = daily_location( + "2016-01-01", spatial_unit=admin_spatial_unit(level=2, column_name="admin2name") + ) + dl2 = daily_location( + "2016-01-02", spatial_unit=admin_spatial_unit(level=2, column_name="admin2name") + ) + fl = Flows(dl, dl2) + js = fl.to_geojson() + df = get_dataframe(fl) + check_features = [js["features"][0], js["features"][5], js["features"][7]] + for feature in check_features: + outflows = feature["properties"]["outflows"] + df_src = df[ + df.admin2name_from == feature["properties"]["admin2name"] + ].set_index("admin2name_to") + for dest, tot in outflows.items(): + assert tot == df_src.loc[dest]["count"] diff --git a/flowmachine/tests/test_geomixin.py b/flowmachine/tests/test_geomixin.py index 28847e98f5..8bd9567ba6 100644 --- a/flowmachine/tests/test_geomixin.py +++ b/flowmachine/tests/test_geomixin.py @@ -19,12 +19,9 @@ from flowmachine.core.spatial_unit import ( CellSpatialUnit, LatLonSpatialUnit, - VersionedCellSpatialUnit, - VersionedSiteSpatialUnit, admin_spatial_unit, - grid_spatial_unit, ) -from flowmachine.features import daily_location, Flows +from flowmachine.features import daily_location from flowmachine.utils import proj4string @@ -89,17 +86,16 @@ def test_valid_geojson(exemplar_spatial_unit_param): assert geojson.loads(dl.to_geojson_string()).is_valid -def test_valid_flows_geojson(exemplar_spatial_unit_param): +def test_geo_augmented_query_raises_error(): """ - Check that valid geojson is returned for Flows. - + Test that a ValueError is raised when attempting to get geojson for a query + with no geography data. """ - if CellSpatialUnit() == exemplar_spatial_unit_param: - pytest.skip("Query with spatial_unit=CellSpatialUnit has no geometry.") - dl = daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param) - dl2 = daily_location("2016-01-02", spatial_unit=exemplar_spatial_unit_param) - fl = Flows(dl, dl2) - assert geojson.loads(fl.to_geojson_string()).is_valid + dl = daily_location( + "2016-01-01", "2016-01-02", spatial_unit=CellSpatialUnit() + ).aggregate() + with pytest.raises(ValueError): + dl.to_geojson_string() def test_correct_geojson(): @@ -151,30 +147,6 @@ def test_geojson_file_output(tmpdir): assert found -def test_flows_geojson(get_dataframe): - """ - Test geojson works for flows with non-standard column names. - """ - - dl = daily_location( - "2016-01-01", spatial_unit=admin_spatial_unit(level=2, column_name="admin2name") - ) - dl2 = daily_location( - "2016-01-02", spatial_unit=admin_spatial_unit(level=2, column_name="admin2name") - ) - fl = Flows(dl, dl2) - js = fl.to_geojson() - df = get_dataframe(fl) - check_features = [js["features"][0], js["features"][5], js["features"][7]] - for feature in check_features: - outflows = feature["properties"]["outflows"] - df_src = df[ - df.admin2name_from == feature["properties"]["admin2name"] - ].set_index("admin2name_to") - for dest, tot in outflows.items(): - assert tot == df_src.loc[dest]["count"] - - def test_reprojection(): """ Test that in db reprojection works. 
From 631b880d931d38adbbe4f810c5f03faa6fd457a9 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 17:50:14 +0100 Subject: [PATCH 066/138] Remove geo_augment method --- .../flowmachine/core/join_to_location.py | 2 +- flowmachine/flowmachine/core/spatial_unit.py | 81 ++----------------- .../features/network/total_network_objects.py | 2 +- .../features/spatial/distance_matrix.py | 2 +- flowmachine/tests/test_spatial_unit.py | 9 +-- 5 files changed, 11 insertions(+), 85 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index a7770a64d8..98294cbfdd 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -34,7 +34,7 @@ class JoinToLocation(Query): This represents a table that can be joined to the cell information table. This must have a date column (called time) and a location column call 'location_id'. - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit + spatial_unit : flowmachine.core.spatial_unit.SpatialUnit A query which maps cell identifiers in the CDR to a different spatial unit (e.g. versioned site or admin region) time_col : str, default 'time' diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 71c30bf836..23f6652c89 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -31,7 +31,6 @@ Special case of PolygonSpatialUnit. """ from typing import List -from abc import ABCMeta, abstractmethod from flowmachine.utils import get_name_and_alias from . import Query, Table @@ -64,7 +63,7 @@ def location_columns(self) -> List[str]: return list(self._loc_cols) -class BaseSpatialUnit(Query, metaclass=ABCMeta): +class SpatialUnit(Query): """ Base class for all spatial units except CellSpatialUnit. Selects columns from the location table, and optionally joins to data in another table. @@ -163,25 +162,6 @@ def get_geom_query(self): return sql, self.location_columns + ["geom"] - @abstractmethod - def geo_augment(self, sql_query): - """ - Given a SQL string (which will usually be from a JoinToLocation object, - joined to this spatial unit), return a version of the query augmented - with a geom column and a gid column. - - Parameters - ---------- - sql_query : string - The query to augment with geom and gid columns - - Returns - ------- - str - A version of this query with geom and gid columns - """ - raise NotImplementedError - def _make_query(self): columns = ", ".join(self._cols) sql = f""" @@ -194,7 +174,7 @@ def _make_query(self): return sql -class LatLonSpatialUnit(BaseSpatialUnit): +class LatLonSpatialUnit(SpatialUnit): """ Class that maps cell location_id to lat-lon coordinates. """ @@ -212,18 +192,8 @@ def __init__(self): geom_column="geom_point", ) - def geo_augment(self, sql_query): - sql = f""" - SELECT - row_number() over() AS gid, - *, - ST_SetSRID(ST_Point(lon, lat), 4326) AS geom - FROM ({sql_query}) AS L - """ - return sql - -class VersionedCellSpatialUnit(BaseSpatialUnit): +class VersionedCellSpatialUnit(SpatialUnit): """ Class that maps cell location_id to a cell version and lat-lon coordinates. 
""" @@ -246,21 +216,8 @@ def __init__(self): geom_column="geom_point", ) - def geo_augment(self, sql_query): - sql = f""" - SELECT - row_number() OVER () AS gid, - geom_point AS geom, - U.* - FROM ({sql_query}) AS U - LEFT JOIN infrastructure.cells AS S - ON U.location_id = S.id AND - U.version = S.version - """ - return sql - -class VersionedSiteSpatialUnit(BaseSpatialUnit): +class VersionedSiteSpatialUnit(SpatialUnit): """ Class that maps cell location_id to a site version and lat-lon coordinates. """ @@ -301,21 +258,8 @@ def __init__(self): join_clause=join_clause, ) - def geo_augment(self, sql_query): - sql = f""" - SELECT - row_number() OVER () AS gid, - geom_point AS geom, - U.* - FROM ({sql_query}) AS U - LEFT JOIN infrastructure.sites AS S - ON U.site_id = S.id AND - U.version = S.version - """ - return sql - -class PolygonSpatialUnit(BaseSpatialUnit): +class PolygonSpatialUnit(SpatialUnit): """ Class that provides a mapping from cell/site data in the location table to spatial regions defined by geography information in a table. @@ -338,7 +282,7 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_column="geom"): self.polygon_table = polygon_table else: # Creating a Table object here means that we don't have to handle - # admin tables and Grid objects differently in join_clause and self.geo_augment + # admin tables and Grid objects differently in join_clause and self.get_geom_query self.polygon_table = Table(name=polygon_table) location_info_table = self.connection.location_table @@ -401,19 +345,6 @@ def get_geom_query(self): return sql, self.location_columns + ["geom"] - def geo_augment(self, sql_query): - r_col_name, l_col_name = get_name_and_alias(self._polygon_column_names[0]) - sql = f""" - SELECT - row_number() OVER () as gid, - {self._geom_column} AS geom, - U.* - FROM ({sql_query}) AS U - LEFT JOIN ({self.polygon_table.get_query()}) AS G - ON U.{l_col_name} = G.{r_col_name} - """ - return sql - def admin_spatial_unit(*, level, column_name=None): """ diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index b4fa7ec4e4..eab464678f 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -48,7 +48,7 @@ class TotalNetworkObjects(GeoDataMixin, Query): network_object : {Cell,VersionedCell,VersionedSite}SpatialUnit, default CellSpatialUnit() Objects to track, defaults to CellSpatialUnit(), the unversioned lowest level of infrastructure available. - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, + spatial_unit : flowmachine.core.spatial_unit.SpatialUnit, default admin_spatial_unit(level=0) Spatial unit to facet on. diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 37f003f9dd..df60a13d9b 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -25,7 +25,7 @@ class DistanceMatrix(GraphMixin, Query): Parameters ---------- - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default VersionedCellSpatialUnit() + spatial_unit : flowmachine.core.spatial_unit.SpatialUnit, default VersionedCellSpatialUnit() Locations to compute distances for. Note: only point locations (i.e. spatial units with "lat" and "lon" included in location_columns) are supported at this time. 
diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 6be9aac11c..1aee6f8205 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -4,7 +4,7 @@ from flowmachine.core import CustomQuery from flowmachine.core.spatial_unit import ( - BaseSpatialUnit, + SpatialUnit, CellSpatialUnit, LatLonSpatialUnit, VersionedCellSpatialUnit, @@ -87,13 +87,8 @@ def test_missing_location_columns_raises_error(): Test that a ValueError is raised if the location_column_names passed to SpatialUnit are not a subset of column_names. """ - - class TestSpatialUnit(BaseSpatialUnit): - def geo_augment(self, query): - pass - with pytest.raises(ValueError, match="['NOT_A_COLUMN']"): - su = TestSpatialUnit( + su = SpatialUnit( selected_column_names=[ "id AS location_id", "date_of_first_service", From b5695b41fbeb3dc668f50bad1e0c933150d5eccf Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 4 Jun 2019 18:02:15 +0100 Subject: [PATCH 067/138] Add test for columns of get_geom_query() --- flowmachine/flowmachine/core/mixins/geodata_mixin.py | 2 +- flowmachine/flowmachine/core/spatial_unit.py | 4 ++-- flowmachine/flowmachine/features/location/flows.py | 2 +- .../flowmachine/features/spatial/distance_matrix.py | 2 +- flowmachine/tests/test_spatial_unit.py | 12 ++++++++++++ 5 files changed, 17 insertions(+), 5 deletions(-) diff --git a/flowmachine/flowmachine/core/mixins/geodata_mixin.py b/flowmachine/flowmachine/core/mixins/geodata_mixin.py index 8586a1cc0d..1701b1a4cf 100644 --- a/flowmachine/flowmachine/core/mixins/geodata_mixin.py +++ b/flowmachine/flowmachine/core/mixins/geodata_mixin.py @@ -49,7 +49,7 @@ def _geo_augmented_query(self): join_columns_string = ",".join(self.spatial_unit.location_columns) try: - geom_query, _ = self.spatial_unit.get_geom_query() + geom_query = self.spatial_unit.get_geom_query() except AttributeError: raise ValueError( f"Query {self} with spatial_unit of type " diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 23f6652c89..1cb30f2f51 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -160,7 +160,7 @@ def get_geom_query(self): sql = f"SELECT {','.join(columns)} FROM {self.location_info_table}" - return sql, self.location_columns + ["geom"] + return sql def _make_query(self): columns = ", ".join(self._cols) @@ -343,7 +343,7 @@ def get_geom_query(self): SELECT {','.join(columns)} FROM ({self.polygon_table.get_query()}) AS polygon """ - return sql, self.location_columns + ["geom"] + return sql def admin_spatial_unit(*, level, column_name=None): diff --git a/flowmachine/flowmachine/features/location/flows.py b/flowmachine/flowmachine/features/location/flows.py index e0b8640069..89acfcfb20 100644 --- a/flowmachine/flowmachine/features/location/flows.py +++ b/flowmachine/flowmachine/features/location/flows.py @@ -153,7 +153,7 @@ def _geo_augmented_query(self): """ try: - geom_query, _ = self.spatial_unit.get_geom_query() + geom_query = self.spatial_unit.get_geom_query() except AttributeError: raise ValueError( f"Query {self} with spatial_unit of type " diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index df60a13d9b..873294e9ca 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -70,7 +70,7 @@ def 
_make_query(self): [f"B.{c} AS {c}_to" for c in self.spatial_unit.location_columns] ) - geom_query, _ = self.spatial_unit.get_geom_query() + geom_query = self.spatial_unit.get_geom_query() if self.return_geometry: return_geometry_statement = """ diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 1aee6f8205..e66c8d854a 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -28,6 +28,18 @@ def test_spatial_unit_column_names(exemplar_spatial_unit_param): assert su.head(0).columns.tolist() == su.column_names +def test_get_geom_query_column_names(exemplar_spatial_unit_param): + """ + Test that the get_geom_query method returns a query with the correct columns. + """ + if CellSpatialUnit() == exemplar_spatial_unit_param: + pytest.skip("CellSpatialUnit does not have a get_geom_query method") + geom_query = exemplar_spatial_unit_param.get_geom_query() + cols = exemplar_spatial_unit_param.location_columns + ["geom"] + cq = CustomQuery(geom_query, cols) + assert cq.head(0).columns.tolist() == cols + + @pytest.mark.parametrize( "spatial_unit, kwargs, loc_cols", [ From c2c0bf8d9a54fa2b6e1392441d6a2a4417eb47fb Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 5 Jun 2019 10:26:39 +0100 Subject: [PATCH 068/138] Make internal attributes immutable --- flowmachine/flowmachine/core/spatial_unit.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 1cb30f2f51..0302df5174 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -96,14 +96,14 @@ def __init__( join_clause="", ): if type(selected_column_names) is str: - self._cols = [selected_column_names] + self._cols = (selected_column_names,) else: - self._cols = selected_column_names + self._cols = tuple(selected_column_names) if type(location_column_names) is str: - self._loc_cols = [location_column_names] + self._loc_cols = (location_column_names,) else: - self._loc_cols = location_column_names + self._loc_cols = tuple(location_column_names) # Check that _loc_cols is a subset of column_names missing_cols = [c for c in self._loc_cols if not (c in self.column_names)] @@ -142,7 +142,7 @@ def location_columns(self) -> List[str]: """ List of names of the columns which identify the locations. """ - return self._loc_cols + return list(self._loc_cols) @property def column_names(self) -> List[str]: @@ -313,9 +313,9 @@ def __init__(self, *, polygon_column_names, polygon_table, geom_column="geom"): f"{locinfo_alias}.date_of_last_service AS date_of_last_service", ] if type(polygon_column_names) is str: - self._polygon_column_names = [polygon_column_names] + self._polygon_column_names = (polygon_column_names,) else: - self._polygon_column_names = polygon_column_names + self._polygon_column_names = tuple(polygon_column_names) all_column_names = locinfo_column_names + [ f"{joined_alias}.{c}" for c in self._polygon_column_names ] @@ -337,7 +337,7 @@ def get_geom_query(self): the values in self.location_columns) to their geometries (in a column named "geom"). 
""" - columns = self._polygon_column_names + [f"{self._geom_column} AS geom"] + columns = list(self._polygon_column_names) + [f"{self._geom_column} AS geom"] sql = f""" SELECT {','.join(columns)} FROM ({self.polygon_table.get_query()}) AS polygon From cc335d32c52cd78c4a9e083fa3b7f3d20f2f234e Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 5 Jun 2019 10:27:03 +0100 Subject: [PATCH 069/138] Fix get_geom_query test --- flowmachine/tests/test_spatial_unit.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index e66c8d854a..4963c8e980 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -28,7 +28,9 @@ def test_spatial_unit_column_names(exemplar_spatial_unit_param): assert su.head(0).columns.tolist() == su.column_names -def test_get_geom_query_column_names(exemplar_spatial_unit_param): +def test_get_geom_query_column_names( + exemplar_spatial_unit_param, get_column_names_from_run +): """ Test that the get_geom_query method returns a query with the correct columns. """ @@ -37,7 +39,7 @@ def test_get_geom_query_column_names(exemplar_spatial_unit_param): geom_query = exemplar_spatial_unit_param.get_geom_query() cols = exemplar_spatial_unit_param.location_columns + ["geom"] cq = CustomQuery(geom_query, cols) - assert cq.head(0).columns.tolist() == cols + assert sorted(get_column_names_from_run(cq)) == sorted(cols) @pytest.mark.parametrize( From 3f42bd7b353cb68cc887e69fd120493f1f9a9040 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 5 Jun 2019 12:59:07 +0100 Subject: [PATCH 070/138] Remove VersionedCellSpatialUnit, VersionedSiteSpatialUnit and LatLonSpatialUnit classes --- .../flowmachine/core/join_to_location.py | 6 +- .../flowmachine/core/mixins/geodata_mixin.py | 4 +- flowmachine/flowmachine/core/spatial_unit.py | 173 ++++++++++-------- .../flowmachine/features/location/flows.py | 4 +- .../features/network/total_network_objects.py | 27 ++- .../features/spatial/distance_matrix.py | 6 +- .../features/subscriber/displacement.py | 30 +-- flowmachine/flowmachine/models/pwo.py | 8 +- flowmachine/tests/conftest.py | 12 +- flowmachine/tests/test_calldays.py | 6 +- flowmachine/tests/test_displacement.py | 8 +- flowmachine/tests/test_geomixin.py | 10 +- flowmachine/tests/test_indexes.py | 4 +- flowmachine/tests/test_inoutflows.py | 6 +- flowmachine/tests/test_join_to_location.py | 10 +- flowmachine/tests/test_last_location.py | 9 +- .../tests/test_meaningful_locations.py | 22 +-- .../tests/test_most_frequent_locations.py | 8 +- flowmachine/tests/test_spatial_aggregate.py | 4 +- .../tests/test_spatial_distancematrix.py | 14 +- flowmachine/tests/test_spatial_unit.py | 26 +-- .../tests/test_subscriber_location_cluster.py | 24 +-- .../tests/test_total_network_objects.py | 14 +- 23 files changed, 222 insertions(+), 213 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index 98294cbfdd..f0d3480919 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -51,7 +51,7 @@ class JoinToLocation(Query): """ def __init__(self, left, *, spatial_unit, time_col="time"): - if isinstance(spatial_unit, CellSpatialUnit): + if spatial_unit == CellSpatialUnit(): # Nothing to join in this case raise ValueError( "CellSpatialUnit is not a valid spatial unit type for JoinToLocation" @@ -113,7 +113,7 @@ def _make_query(self): def 
location_joined_query(left, *, spatial_unit, time_col="time"): """ Helper function which returns JoinToLocation(left_query, spatial_unit, time_col) - unless type(spatial_unit)==CellSpatialUnit, in which case this returns left_query. + unless spatial_unit == CellSpatialUnit(), in which case this returns left_query. Parameters ---------- @@ -133,7 +133,7 @@ def location_joined_query(left, *, spatial_unit, time_col="time"): flowmachine.Query Either a JoinToLocation object, or the input parameter 'left' """ - if isinstance(spatial_unit, CellSpatialUnit): + if spatial_unit == CellSpatialUnit(): return left else: return JoinToLocation(left, spatial_unit=spatial_unit, time_col=time_col) diff --git a/flowmachine/flowmachine/core/mixins/geodata_mixin.py b/flowmachine/flowmachine/core/mixins/geodata_mixin.py index 1701b1a4cf..974d32b54e 100644 --- a/flowmachine/flowmachine/core/mixins/geodata_mixin.py +++ b/flowmachine/flowmachine/core/mixins/geodata_mixin.py @@ -52,8 +52,8 @@ def _geo_augmented_query(self): geom_query = self.spatial_unit.get_geom_query() except AttributeError: raise ValueError( - f"Query {self} with spatial_unit of type " - f"{type(self.spatial_unit)} has no geography information." + f"Query {self} with spatial_unit {self.spatial_unit} has no " + "geography information." ) sql = f""" diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 0302df5174..fbd821bd7c 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -8,12 +8,12 @@ The available spatial units are: CellSpatialUnit: The identifier as found in the CDR. - LatLonSpatialUnit: + lat_lon_spatial_unit: Latitude and longitude of cell/site locations. - VersionedCellSpatialUnit: + versioned_cell_spatial_unit: The identifier as found in the CDR combined with the version from the cells table. - VersionedSiteSpatialUnit: + versioned_site_spatial_unit: The ID found in the sites table, coupled with the version number. PolygonSpatialUnit: A custom set of polygons that live in the database. Takes the @@ -37,6 +37,9 @@ from .grid import Grid +# class SpatialUnitMixin: + + class CellSpatialUnit: """ This class represents the case where no join of cell ID to other data is @@ -174,90 +177,98 @@ def _make_query(self): return sql -class LatLonSpatialUnit(SpatialUnit): - """ - Class that maps cell location_id to lat-lon coordinates. +def lat_lon_spatial_unit(): """ + Returns a SpatialUnit that maps cell location_id to lat-lon coordinates. - def __init__(self): - super().__init__( - selected_column_names=[ - "id AS location_id", - "date_of_first_service", - "date_of_last_service", - "ST_X(geom_point::geometry) AS lon", - "ST_Y(geom_point::geometry) AS lat", - ], - location_column_names=["lat", "lon"], - geom_column="geom_point", - ) - - -class VersionedCellSpatialUnit(SpatialUnit): - """ - Class that maps cell location_id to a cell version and lat-lon coordinates. 
+ Returns + ------- + flowmachine.core.spatial_unit.SpatialUnit """ - - def __init__(self): - if self.connection.location_table != "infrastructure.cells": - raise ValueError("Versioned cell spatial unit is unavailable.") - - super().__init__( - selected_column_names=[ - "id AS location_id", - "date_of_first_service", - "date_of_last_service", - "version", - "ST_X(geom_point::geometry) AS lon", - "ST_Y(geom_point::geometry) AS lat", - ], - location_column_names=["location_id", "version", "lon", "lat"], - location_info_table="infrastructure.cells", - geom_column="geom_point", - ) + return SpatialUnit( + selected_column_names=[ + "id AS location_id", + "date_of_first_service", + "date_of_last_service", + "ST_X(geom_point::geometry) AS lon", + "ST_Y(geom_point::geometry) AS lat", + ], + location_column_names=["lat", "lon"], + geom_column="geom_point", + ) -class VersionedSiteSpatialUnit(SpatialUnit): +def versioned_cell_spatial_unit(): """ - Class that maps cell location_id to a site version and lat-lon coordinates. + Returns a SpatialUnit that maps cell location_id to a cell version and + lat-lon coordinates. + + Returns + ------- + flowmachine.core.spatial_unit.SpatialUnit """ + if Query.connection.location_table != "infrastructure.cells": + raise ValueError("Versioned cell spatial unit is unavailable.") + + return SpatialUnit( + selected_column_names=[ + "id AS location_id", + "date_of_first_service", + "date_of_last_service", + "version", + "ST_X(geom_point::geometry) AS lon", + "ST_Y(geom_point::geometry) AS lat", + ], + location_column_names=["location_id", "version", "lon", "lat"], + location_info_table="infrastructure.cells", + geom_column="geom_point", + ) - def __init__(self): - location_table = self.connection.location_table - sites_alias = "s" - if location_table == "infrastructure.sites": - cells_alias = sites_alias - join_clause = "" - elif location_table == "infrastructure.cells": - cells_alias = "c" - join_clause = f""" - RIGHT JOIN - infrastructure.cells AS {cells_alias} - ON {sites_alias}.id = {cells_alias}.site_id - """ - else: - raise ValueError( - f"Expected location table to be 'infrastructure.cells' " - f"or 'infrastructure.sites', not '{location_table}''" - ) +def versioned_site_spatial_unit(): + """ + Returns a SpatialUnit that maps cell location_id to a site version and + lat-lon coordinates. 
- super().__init__( - selected_column_names=[ - f"{cells_alias}.id AS location_id", - f"{sites_alias}.id AS site_id", - f"{sites_alias}.date_of_first_service AS date_of_first_service", - f"{sites_alias}.date_of_last_service AS date_of_last_service", - f"{sites_alias}.version AS version", - f"ST_X({sites_alias}.geom_point::geometry) AS lon", - f"ST_Y({sites_alias}.geom_point::geometry) AS lat", - ], - location_column_names=["site_id", "version", "lon", "lat"], - location_info_table=f"infrastructure.sites AS {sites_alias}", - geom_column="geom_point", - join_clause=join_clause, + Returns + ------- + flowmachine.core.spatial_unit.SpatialUnit + """ + location_table = Query.connection.location_table + + sites_alias = "s" + if location_table == "infrastructure.sites": + cells_alias = sites_alias + join_clause = "" + elif location_table == "infrastructure.cells": + cells_alias = "c" + join_clause = f""" + RIGHT JOIN + infrastructure.cells AS {cells_alias} + ON {sites_alias}.id = {cells_alias}.site_id + """ + else: + raise ValueError( + f"Expected location table to be 'infrastructure.cells' " + f"or 'infrastructure.sites', not '{location_table}''" ) + return SpatialUnit( + selected_column_names=[ + f"{cells_alias}.id AS location_id", + f"{sites_alias}.id AS site_id", + f"{sites_alias}.date_of_first_service AS date_of_first_service", + f"{sites_alias}.date_of_last_service AS date_of_last_service", + f"{sites_alias}.version AS version", + f"ST_X({sites_alias}.geom_point::geometry) AS lon", + f"ST_Y({sites_alias}.geom_point::geometry) AS lat", + ], + location_column_names=["site_id", "version", "lon", "lat"], + location_info_table=f"infrastructure.sites AS {sites_alias}", + geom_column="geom_point", + join_clause=join_clause, + ) + class PolygonSpatialUnit(SpatialUnit): """ @@ -348,9 +359,9 @@ def get_geom_query(self): def admin_spatial_unit(*, level, column_name=None): """ - Helper function to create a PolygonSpatialUnit object that maps all cells - (aka sites) to an admin region. This assumes that you have geography data - in the standard location in FlowDB. + Returns a PolygonSpatialUnit object that maps all cells (aka sites) to an + admin region. This assumes that you have geography data in the standard + location in FlowDB. Parameters ---------- @@ -364,7 +375,7 @@ def admin_spatial_unit(*, level, column_name=None): Returns ------- - PolygonSpatialUnit + flowmachine.core.spatial_unit.PolygonSpatialUnit Query which maps cell/site IDs to admin regions """ # If there is no column_name passed then we can use the default, which is @@ -382,7 +393,7 @@ def admin_spatial_unit(*, level, column_name=None): def grid_spatial_unit(*, size): """ - Helper function to create a PolygonSpatialUnit representing a mapping + Returns a PolygonSpatialUnit representing a mapping between all the sites in the database and a grid of arbitrary size. 
Parameters @@ -392,7 +403,7 @@ def grid_spatial_unit(*, size): Returns ------- - PolygonSpatialUnit + flowmachine.core.spatial_unit.PolygonSpatialUnit Query which maps cell/site IDs to grid squares """ return PolygonSpatialUnit( diff --git a/flowmachine/flowmachine/features/location/flows.py b/flowmachine/flowmachine/features/location/flows.py index 89acfcfb20..d49a4c9569 100644 --- a/flowmachine/flowmachine/features/location/flows.py +++ b/flowmachine/flowmachine/features/location/flows.py @@ -156,8 +156,8 @@ def _geo_augmented_query(self): geom_query = self.spatial_unit.get_geom_query() except AttributeError: raise ValueError( - f"Query {self} with spatial_unit of type " - f"{type(self.spatial_unit)} has no geography information." + f"Query {self} with spatial_unit {self.spatial_unit} has no " + "geography information." ) joined_query = f""" diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index eab464678f..7611b8c344 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -16,12 +16,7 @@ from ...core.mixins import GeoDataMixin from ...core import location_joined_query from ...core.query import Query -from ...core.spatial_unit import ( - CellSpatialUnit, - VersionedSiteSpatialUnit, - VersionedCellSpatialUnit, - admin_spatial_unit, -) +from ...core.spatial_unit import CellSpatialUnit, admin_spatial_unit from ..utilities import EventsTablesUnion valid_stats = {"avg", "max", "min", "median", "mode", "stddev", "variance"} @@ -95,27 +90,27 @@ def __init__( if self.table != "all" and not self.table.startswith("events"): self.table = "events.{}".format(self.table) - allowed_network_object_types = [ - CellSpatialUnit, - VersionedCellSpatialUnit, - VersionedSiteSpatialUnit, - ] + def is_allowed_network_object(spatial_unit): + return ( + "location_id" in spatial_unit.location_columns + or "site_id" in spatial_unit.location_columns + ) self.network_object = network_object - if type(self.network_object) not in allowed_network_object_types: + if not is_allowed_network_object(self.network_object): raise ValueError( - "{} is not a valid network object type.".format(type(network_object)) + "{} is not a valid network object.".format(self.network_object) ) if spatial_unit is None: self.spatial_unit = admin_spatial_unit(level=0) else: self.spatial_unit = spatial_unit - if type(self.spatial_unit) in allowed_network_object_types: + if is_allowed_network_object(self.spatial_unit): # No sense in aggregating network object to network object raise ValueError( - "{} is not a valid spatial unit type for TotalNetworkObjects".format( - type(self.spatial_unit) + "{} is not a valid spatial unit for TotalNetworkObjects".format( + self.spatial_unit ) ) diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 873294e9ca..948ed7ed17 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -11,7 +11,7 @@ from ...core.query import Query from ...core.mixins import GraphMixin -from ...core.spatial_unit import VersionedSiteSpatialUnit, VersionedCellSpatialUnit +from ...core.spatial_unit import versioned_cell_spatial_unit class DistanceMatrix(GraphMixin, Query): @@ -25,7 +25,7 @@ class DistanceMatrix(GraphMixin, Query): Parameters ---------- - spatial_unit : 
flowmachine.core.spatial_unit.SpatialUnit, default VersionedCellSpatialUnit() + spatial_unit : flowmachine.core.spatial_unit.SpatialUnit, default versioned_cell_spatial_unit() Locations to compute distances for. Note: only point locations (i.e. spatial units with "lat" and "lon" included in location_columns) are supported at this time. @@ -39,7 +39,7 @@ class DistanceMatrix(GraphMixin, Query): def __init__(self, spatial_unit=None, return_geometry=False): if spatial_unit is None: - self.spatial_unit = VersionedCellSpatialUnit() + self.spatial_unit = versioned_cell_spatial_unit() else: self.spatial_unit = spatial_unit diff --git a/flowmachine/flowmachine/features/subscriber/displacement.py b/flowmachine/flowmachine/features/subscriber/displacement.py index c06e5ed69f..b6c627485d 100644 --- a/flowmachine/flowmachine/features/subscriber/displacement.py +++ b/flowmachine/flowmachine/features/subscriber/displacement.py @@ -18,9 +18,9 @@ from ..utilities.subscriber_locations import subscriber_locations from flowmachine.utils import parse_datestring, get_dist_string, list_of_dates from flowmachine.core.spatial_unit import ( - LatLonSpatialUnit, - VersionedCellSpatialUnit, - VersionedSiteSpatialUnit, + lat_lon_spatial_unit, + versioned_cell_spatial_unit, + versioned_site_spatial_unit, ) from dateutil.relativedelta import relativedelta @@ -84,33 +84,33 @@ def __init__( self.start = start - allowed_spatial_units = [ - LatLonSpatialUnit, - VersionedCellSpatialUnit, - VersionedSiteSpatialUnit, - ] + def is_allowed_spatial_unit(spatial_unit): + return ( + "lat" in spatial_unit.location_columns + and "lon" in spatial_unit.location_columns + ) + if modal_locations: - if ( - isinstance(modal_locations, ModalLocation) - and type(modal_locations.spatial_unit) in allowed_spatial_units + if isinstance(modal_locations, ModalLocation) and is_allowed_spatial_unit( + modal_locations.spatial_unit ): hl = modal_locations else: raise ValueError( "Argument 'modal_locations' should be an instance of " - "ModalLocation class with type(spatial_unit) in " - f"{su.__name__ for su in allowed_spatial_units}" + "ModalLocation class with 'lat' and 'lon' in " + "spatial_unit.location_columns" ) else: hl = ModalLocation( *[ - daily_location(date, spatial_unit=LatLonSpatialUnit(), **kwargs) + daily_location(date, spatial_unit=lat_lon_spatial_unit(), **kwargs) for date in list_of_dates(self.start, self.stop_hl) ] ) sl = subscriber_locations( - self.start, self.stop_sl, spatial_unit=LatLonSpatialUnit(), **kwargs + self.start, self.stop_sl, spatial_unit=lat_lon_spatial_unit(), **kwargs ) self.statistic = statistic.lower() diff --git a/flowmachine/flowmachine/models/pwo.py b/flowmachine/flowmachine/models/pwo.py index 44c7a97475..2280dd638a 100644 --- a/flowmachine/flowmachine/models/pwo.py +++ b/flowmachine/flowmachine/models/pwo.py @@ -35,7 +35,7 @@ from ..features import ModalLocation from ..core.query import Query from ..core.model import Model, model_result -from ..core.spatial_unit import VersionedSiteSpatialUnit +from ..core.spatial_unit import versioned_site_spatial_unit from ..features.spatial.distance_matrix import DistanceMatrix import structlog @@ -193,9 +193,9 @@ class PopulationWeightedOpportunities(Model): documentation for other available methods. spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default VersionedSiteSpatialUnit() + default versioned_site_spatial_unit() Note: DistanceMatrix only supports spatial units - VersionedCellSpatialUnit() and VersionedSiteSpatialUnit() at this time. 
+ with 'lat' and 'lon' columns at this time. **kwargs : arguments Used to pass custom arguments to the ModalLocation() objects. @@ -253,7 +253,7 @@ def __init__( self.stop = stop self.method = method if spatial_unit is None: - self.spatial_unit = VersionedSiteSpatialUnit() + self.spatial_unit = versioned_site_spatial_unit() else: self.spatial_unit = spatial_unit self.distance_matrix = DistanceMatrix( diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 91b61d55fe..6ead63b8cb 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -22,9 +22,9 @@ from flowmachine.core.cache import reset_cache from flowmachine.core.spatial_unit import ( CellSpatialUnit, - LatLonSpatialUnit, - VersionedCellSpatialUnit, - VersionedSiteSpatialUnit, + lat_lon_spatial_unit, + versioned_cell_spatial_unit, + versioned_site_spatial_unit, PolygonSpatialUnit, admin_spatial_unit, grid_spatial_unit, @@ -75,10 +75,10 @@ def exemplar_level_param(request): params=[ (admin_spatial_unit, {"level": 2}), (admin_spatial_unit, {"level": 2, "column_name": "admin2name"}), - (VersionedSiteSpatialUnit, {}), - (VersionedCellSpatialUnit, {}), + (versioned_site_spatial_unit, {}), + (versioned_cell_spatial_unit, {}), (CellSpatialUnit, {}), - (LatLonSpatialUnit, {}), + (lat_lon_spatial_unit, {}), (grid_spatial_unit, {"size": 5}), ( PolygonSpatialUnit, diff --git a/flowmachine/tests/test_calldays.py b/flowmachine/tests/test_calldays.py index 640ac766cd..42c2957f46 100644 --- a/flowmachine/tests/test_calldays.py +++ b/flowmachine/tests/test_calldays.py @@ -11,7 +11,7 @@ import pytest -from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit, CellSpatialUnit +from flowmachine.core.spatial_unit import versioned_site_spatial_unit, CellSpatialUnit from flowmachine.features import CallDays, subscriber_locations import numpy as np @@ -39,7 +39,7 @@ def test_call_days_returns_expected_counts_per_subscriber(get_dataframe): ) for (subscriber, start, end, calls) in test_values: cd = CallDays( - subscriber_locations(start, end, spatial_unit=VersionedSiteSpatialUnit()) + subscriber_locations(start, end, spatial_unit=versioned_site_spatial_unit()) ) df = get_dataframe(cd).query('subscriber == "{}"'.format(subscriber)) assert df.calldays.sum() == calls @@ -57,7 +57,7 @@ def test_call_days_returns_expected_counts_per_subscriber_tower(get_dataframe): ) for (subscriber, location, start, end, calls) in test_values: cd = CallDays( - subscriber_locations(start, end, spatial_unit=VersionedSiteSpatialUnit()) + subscriber_locations(start, end, spatial_unit=versioned_site_spatial_unit()) ) df = get_dataframe(cd).query( 'subscriber == "{}" & site_id == "{}"'.format(subscriber, location) diff --git a/flowmachine/tests/test_displacement.py b/flowmachine/tests/test_displacement.py index 71ebf62cc5..60213372f1 100644 --- a/flowmachine/tests/test_displacement.py +++ b/flowmachine/tests/test_displacement.py @@ -7,7 +7,7 @@ from numpy import isnan from flowmachine.utils import list_of_dates -from flowmachine.core.spatial_unit import LatLonSpatialUnit +from flowmachine.core.spatial_unit import lat_lon_spatial_unit @pytest.mark.parametrize( @@ -51,7 +51,7 @@ def test_pass_modal_location(get_dataframe): ml = ModalLocation( *[ - daily_location(d, spatial_unit=LatLonSpatialUnit()) + daily_location(d, spatial_unit=lat_lon_spatial_unit()) for d in list_of_dates("2016-01-01", "2016-01-06") ] ) @@ -90,7 +90,7 @@ def test_get_all_users_in_modal_location(get_dataframe): ml = ModalLocation( *[ - daily_location(d, 
spatial_unit=LatLonSpatialUnit(), hours=(12, 13)) + daily_location(d, spatial_unit=lat_lon_spatial_unit(), hours=(12, 13)) for d in list_of_dates(p1[0], p1[1]) ] ) @@ -114,7 +114,7 @@ def test_subscriber_with_home_loc_but_no_calls_is_nan(get_dataframe): ml = ModalLocation( *[ - daily_location(d, spatial_unit=LatLonSpatialUnit(), hours=(12, 13)) + daily_location(d, spatial_unit=lat_lon_spatial_unit(), hours=(12, 13)) for d in list_of_dates(p1[0], p1[1]) ] ) diff --git a/flowmachine/tests/test_geomixin.py b/flowmachine/tests/test_geomixin.py index 8bd9567ba6..d844068e66 100644 --- a/flowmachine/tests/test_geomixin.py +++ b/flowmachine/tests/test_geomixin.py @@ -18,7 +18,7 @@ from flowmachine.core.mixins import GeoDataMixin from flowmachine.core.spatial_unit import ( CellSpatialUnit, - LatLonSpatialUnit, + lat_lon_spatial_unit, admin_spatial_unit, ) from flowmachine.features import daily_location @@ -153,7 +153,7 @@ def test_reprojection(): """ dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert js["features"][0]["geometry"]["coordinates"] == [ @@ -168,7 +168,7 @@ def test_geojson_cache(): Test geojson is cached locally. """ dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert js == dl._geojson[proj4string(dl.connection, 2770)] @@ -177,7 +177,7 @@ def test_geojson_cache(): def test_geojson_cache_exluded_from_pickle(): """Test that cached geojson is not going to get pickled.""" dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert "_geojson" not in dl.__getstate__() # Check excluded from pickle @@ -186,7 +186,7 @@ def test_geojson_cache_exluded_from_pickle(): def test_geojson_caching_off(): """Test that switching off caching clears the cache, and doesn't add to it.""" dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 dl.turn_off_caching() # Check caching for geojson switches off diff --git a/flowmachine/tests/test_indexes.py b/flowmachine/tests/test_indexes.py index 8e01d9039d..8be9e7f101 100644 --- a/flowmachine/tests/test_indexes.py +++ b/flowmachine/tests/test_indexes.py @@ -2,7 +2,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-from flowmachine.core.spatial_unit import LatLonSpatialUnit +from flowmachine.core.spatial_unit import lat_lon_spatial_unit from flowmachine.features.subscriber import * @@ -15,7 +15,7 @@ def test_default_indexes(): '"subscriber"', ] assert daily_location( - "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() ).index_cols == [["lat", "lon"], '"subscriber"'] assert SubscriberDegree("2016-01-01", "2016-01-02").index_cols == ['"subscriber"'] diff --git a/flowmachine/tests/test_inoutflows.py b/flowmachine/tests/test_inoutflows.py index e0a3a24787..b63b0ded95 100644 --- a/flowmachine/tests/test_inoutflows.py +++ b/flowmachine/tests/test_inoutflows.py @@ -8,7 +8,7 @@ from flowmachine.features import Flows, daily_location -from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit +from flowmachine.core.spatial_unit import versioned_site_spatial_unit def test_inoutflow_with_double_column_location(): @@ -17,8 +17,8 @@ def test_inoutflow_with_double_column_location(): more than one column. """ - dl1 = daily_location("2016-01-01", spatial_unit=VersionedSiteSpatialUnit()) - dl2 = daily_location("2016-01-02", spatial_unit=VersionedSiteSpatialUnit()) + dl1 = daily_location("2016-01-01", spatial_unit=versioned_site_spatial_unit()) + dl2 = daily_location("2016-01-02", spatial_unit=versioned_site_spatial_unit()) flow = Flows(dl1, dl2) expected_columns = ["site_id_to", "version_to", "lon_to", "lat_to", "total"] diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index db01b37614..7baf9a14c9 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -13,9 +13,9 @@ from flowmachine.core.spatial_unit import ( CellSpatialUnit, admin_spatial_unit, - VersionedSiteSpatialUnit, - VersionedCellSpatialUnit, - LatLonSpatialUnit, + versioned_site_spatial_unit, + versioned_cell_spatial_unit, + lat_lon_spatial_unit, grid_spatial_unit, PolygonSpatialUnit, ) @@ -64,7 +64,7 @@ def test_join_with_versioned_cells(get_dataframe, get_length): ul = subscriber_locations( "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() ) - df = get_dataframe(JoinToLocation(ul, spatial_unit=VersionedCellSpatialUnit())) + df = get_dataframe(JoinToLocation(ul, spatial_unit=versioned_cell_spatial_unit())) # As our database is complete we should not drop any rows assert len(df) == get_length(ul) # These should all be version zero, these are the towers before the changeover date, or those that @@ -89,7 +89,7 @@ def test_join_with_lat_lon(get_dataframe): ul = subscriber_locations( "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() ) - df = get_dataframe(JoinToLocation(ul, spatial_unit=LatLonSpatialUnit())) + df = get_dataframe(JoinToLocation(ul, spatial_unit=lat_lon_spatial_unit())) expected_cols = sorted(["subscriber", "time", "location_id", "lat", "lon"]) assert sorted(df.columns) == expected_cols diff --git a/flowmachine/tests/test_last_location.py b/flowmachine/tests/test_last_location.py index 266a492db8..a2cea1351e 100644 --- a/flowmachine/tests/test_last_location.py +++ b/flowmachine/tests/test_last_location.py @@ -4,7 +4,10 @@ import pytest -from flowmachine.core.spatial_unit import LatLonSpatialUnit, VersionedSiteSpatialUnit +from flowmachine.core.spatial_unit import ( + lat_lon_spatial_unit, + versioned_site_spatial_unit, +) from flowmachine.features import LastLocation @@ -26,7 +29,7 @@ def test_last_loc_vsite(get_dataframe): """ last_loc = 
LastLocation( - "2016-01-01", "2016-01-02", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-02", spatial_unit=versioned_site_spatial_unit() ) df = get_dataframe(last_loc) @@ -41,7 +44,7 @@ def test_last_loc_lat_lon(get_dataframe): """ last_loc = LastLocation( - "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() ) df = get_dataframe(last_loc) df.set_index("subscriber", inplace=True) diff --git a/flowmachine/tests/test_meaningful_locations.py b/flowmachine/tests/test_meaningful_locations.py index 6f8a987001..018910bfcd 100644 --- a/flowmachine/tests/test_meaningful_locations.py +++ b/flowmachine/tests/test_meaningful_locations.py @@ -4,7 +4,7 @@ import pytest from flowmachine.core.errors import BadLevelError -from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit +from flowmachine.core.spatial_unit import versioned_site_spatial_unit from flowmachine.features import ( HartiganCluster, CallDays, @@ -37,7 +37,7 @@ def test_column_names_meaningful_locations(get_column_names_from_run): subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=VersionedSiteSpatialUnit(), + spatial_unit=versioned_site_spatial_unit(), ) ), radius=1, @@ -67,7 +67,7 @@ def test_column_names_meaningful_locations_aggregate( subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=VersionedSiteSpatialUnit(), + spatial_unit=versioned_site_spatial_unit(), ) ), radius=1, @@ -95,7 +95,7 @@ def test_meaningful_locations_aggregate_disallowed_level_raises(): subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=VersionedSiteSpatialUnit(), + spatial_unit=versioned_site_spatial_unit(), ) ), radius=1, @@ -124,7 +124,7 @@ def test_column_names_meaningful_locations_od( subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=VersionedSiteSpatialUnit(), + spatial_unit=versioned_site_spatial_unit(), ) ), radius=1, @@ -142,7 +142,7 @@ def test_column_names_meaningful_locations_od( subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=VersionedSiteSpatialUnit(), + spatial_unit=versioned_site_spatial_unit(), ) ), radius=1, @@ -178,7 +178,7 @@ def test_meaningful_locations_results( subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=VersionedSiteSpatialUnit(), + spatial_unit=versioned_site_spatial_unit(), ) ), radius=1, @@ -212,7 +212,7 @@ def test_meaningful_locations_aggregation_results(exemplar_level_param, get_data subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=VersionedSiteSpatialUnit(), + spatial_unit=versioned_site_spatial_unit(), ) ), radius=1, @@ -246,7 +246,7 @@ def test_meaningful_locations_od_raises_for_bad_level( subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=VersionedSiteSpatialUnit(), + spatial_unit=versioned_site_spatial_unit(), ) ), radius=1, @@ -276,7 +276,7 @@ def test_meaningful_locations_od_results(get_dataframe): subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=VersionedSiteSpatialUnit(), + spatial_unit=versioned_site_spatial_unit(), ) ), radius=1, @@ -294,7 +294,7 @@ def test_meaningful_locations_od_results(get_dataframe): subscriber_locations=subscriber_locations( start="2016-01-02", stop="2016-01-03", - 
spatial_unit=VersionedSiteSpatialUnit(), + spatial_unit=versioned_site_spatial_unit(), ) ), radius=1, diff --git a/flowmachine/tests/test_most_frequent_locations.py b/flowmachine/tests/test_most_frequent_locations.py index fad6a5a14f..da5eeeb248 100644 --- a/flowmachine/tests/test_most_frequent_locations.py +++ b/flowmachine/tests/test_most_frequent_locations.py @@ -6,8 +6,8 @@ from flowmachine.core.spatial_unit import ( admin_spatial_unit, - VersionedSiteSpatialUnit, - LatLonSpatialUnit, + versioned_site_spatial_unit, + lat_lon_spatial_unit, ) from flowmachine.features import MostFrequentLocation from flowmachine.features.subscriber.daily_location import locate_subscribers @@ -32,7 +32,7 @@ def test_vsites(get_dataframe): """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-02", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-02", spatial_unit=versioned_site_spatial_unit() ) df = get_dataframe(mfl) df.set_index("subscriber", inplace=True) @@ -47,7 +47,7 @@ def test_lat_lons(get_dataframe): """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-02", spatial_unit=LatLonSpatialUnit() + "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() ) df = get_dataframe(mfl) df.set_index("subscriber", inplace=True) diff --git a/flowmachine/tests/test_spatial_aggregate.py b/flowmachine/tests/test_spatial_aggregate.py index 51e613ef04..27c503f10a 100644 --- a/flowmachine/tests/test_spatial_aggregate.py +++ b/flowmachine/tests/test_spatial_aggregate.py @@ -2,7 +2,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -from flowmachine.core.spatial_unit import admin_spatial_unit, LatLonSpatialUnit +from flowmachine.core.spatial_unit import admin_spatial_unit, lat_lon_spatial_unit from flowmachine.features import ModalLocation, daily_location from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.utils import list_of_dates @@ -29,7 +29,7 @@ def test_can_be_aggregated_latlong(get_dataframe): """ hl = ModalLocation( *[ - daily_location(d, spatial_unit=LatLonSpatialUnit(), method="last") + daily_location(d, spatial_unit=lat_lon_spatial_unit(), method="last") for d in list_of_dates("2016-01-01", "2016-01-03") ] ) diff --git a/flowmachine/tests/test_spatial_distancematrix.py b/flowmachine/tests/test_spatial_distancematrix.py index 0badd6da51..573ed5f2ab 100644 --- a/flowmachine/tests/test_spatial_distancematrix.py +++ b/flowmachine/tests/test_spatial_distancematrix.py @@ -10,9 +10,9 @@ from flowmachine.features.spatial import DistanceMatrix from flowmachine.core.spatial_unit import ( - VersionedCellSpatialUnit, - VersionedSiteSpatialUnit, - LatLonSpatialUnit, + versioned_cell_spatial_unit, + versioned_site_spatial_unit, + lat_lon_spatial_unit, ) @@ -20,7 +20,7 @@ def test_some_results(get_dataframe): """ DistanceMatrix() returns a dataframe that contains hand-picked results. 
""" - c = DistanceMatrix(spatial_unit=VersionedSiteSpatialUnit()) + c = DistanceMatrix(spatial_unit=versioned_site_spatial_unit()) df = get_dataframe(c) set_df = df.set_index("site_id_from") assert round(set_df.loc["8wPojr"]["distance"].values[0]) == 789 @@ -31,9 +31,9 @@ def test_some_results(get_dataframe): @pytest.mark.parametrize( "spatial_unit_type, length", [ - (VersionedCellSpatialUnit, 62), - (VersionedSiteSpatialUnit, 35), - (LatLonSpatialUnit, 62), + (versioned_cell_spatial_unit, 62), + (versioned_site_spatial_unit, 35), + (lat_lon_spatial_unit, 62), ], ) def test_result_has_correct_length(spatial_unit_type, length, get_length): diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 4963c8e980..6fae355a69 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -6,9 +6,9 @@ from flowmachine.core.spatial_unit import ( SpatialUnit, CellSpatialUnit, - LatLonSpatialUnit, - VersionedCellSpatialUnit, - VersionedSiteSpatialUnit, + lat_lon_spatial_unit, + versioned_cell_spatial_unit, + versioned_site_spatial_unit, PolygonSpatialUnit, admin_spatial_unit, grid_spatial_unit, @@ -45,9 +45,9 @@ def test_get_geom_query_column_names( @pytest.mark.parametrize( "spatial_unit, kwargs, loc_cols", [ - (LatLonSpatialUnit, {}, ["lat", "lon"]), - (VersionedCellSpatialUnit, {}, ["location_id", "version", "lon", "lat"]), - (VersionedSiteSpatialUnit, {}, ["site_id", "version", "lon", "lat"]), + (lat_lon_spatial_unit, {}, ["lat", "lon"]), + (versioned_cell_spatial_unit, {}, ["location_id", "version", "lon", "lat"]), + (versioned_site_spatial_unit, {}, ["site_id", "version", "lon", "lat"]), ( PolygonSpatialUnit, { @@ -117,10 +117,10 @@ def test_missing_location_columns_raises_error(): [ (admin_spatial_unit, {"level": 2}), (admin_spatial_unit, {"level": 2, "column_name": "admin2name"}), - (VersionedSiteSpatialUnit, {}), - (VersionedCellSpatialUnit, {}), + (versioned_site_spatial_unit, {}), + (versioned_cell_spatial_unit, {}), (CellSpatialUnit, {}), - (LatLonSpatialUnit, {}), + (lat_lon_spatial_unit, {}), (grid_spatial_unit, {"size": 5}), ( PolygonSpatialUnit, @@ -148,10 +148,10 @@ def test_spatial_unit_equals_itself(spatial_unit, kwargs): def test_cell_spatial_unit_not_equal_to_other_spatial_unit(): """ - Test that a CellSpatialUnit is not equal to a VersionedCellSpatialUnit. + Test that a CellSpatialUnit is not equal to a versioned_cell_spatial_unit. """ su1 = CellSpatialUnit() - su2 = VersionedCellSpatialUnit() + su2 = versioned_cell_spatial_unit() assert su1 != su2 assert su2 != su1 @@ -160,8 +160,8 @@ def test_different_spatial_units_are_not_equal(): """ Test that two different spatial units are not equal. 
""" - su1 = VersionedCellSpatialUnit() - su2 = VersionedSiteSpatialUnit() + su1 = versioned_cell_spatial_unit() + su2 = versioned_site_spatial_unit() assert su1 != su2 diff --git a/flowmachine/tests/test_subscriber_location_cluster.py b/flowmachine/tests/test_subscriber_location_cluster.py index a00a8a377a..8daefe8f84 100644 --- a/flowmachine/tests/test_subscriber_location_cluster.py +++ b/flowmachine/tests/test_subscriber_location_cluster.py @@ -19,7 +19,7 @@ from flowmachine.core import Table, CustomQuery from flowmachine.core.query import Query from flowmachine.core.mixins import GeoDataMixin -from flowmachine.core.spatial_unit import VersionedSiteSpatialUnit +from flowmachine.core.spatial_unit import versioned_site_spatial_unit from flowmachine.features import ( CallDays, HartiganCluster, @@ -34,7 +34,7 @@ def test_hartigan_column_names(get_column_names_from_run): """Test that Hartigan has correct column_names property.""" cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() ) ) hartigan = HartiganCluster(calldays=cd, radius=50) @@ -46,7 +46,7 @@ def test_joined_hartigan_column_names(get_column_names_from_run): """Test that Hartigan has correct column_names property.""" cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() ) ) hartigan = HartiganCluster(calldays=cd, radius=50) @@ -66,7 +66,7 @@ def test_joined_hartigan_type_error(): """Test that joining hartigan to something which isn't query like raises a type error.""" cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() ) ) hartigan = HartiganCluster(calldays=cd, radius=50) @@ -107,7 +107,7 @@ def test_cluster_is_within_envelope(get_dataframe): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() ) ) @@ -125,7 +125,7 @@ def test_first_call_day_in_first_cluster(get_dataframe): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() ) ) cd_df = get_dataframe(cd) @@ -152,7 +152,7 @@ def test_bigger_radius_yields_fewer_clusters(get_dataframe): radius = [1, 2, 5, 10, 50] cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() ) ) @@ -172,7 +172,7 @@ def test_different_call_days_format(get_dataframe): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() ) ) har = get_dataframe(HartiganCluster(calldays=cd, radius=50)) @@ -198,7 +198,7 @@ def test_call_threshold_works(get_dataframe): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() ) ) @@ -217,7 +217,7 @@ def test_buffered_hartigan(): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", 
spatial_unit=versioned_site_spatial_unit() ) ) @@ -237,7 +237,7 @@ def test_all_options_hartigan(): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() ) ) @@ -255,7 +255,7 @@ def test_join_returns_the_same_clusters(): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=VersionedSiteSpatialUnit() + "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() ) ) diff --git a/flowmachine/tests/test_total_network_objects.py b/flowmachine/tests/test_total_network_objects.py index 027c52b836..fc3b93f6a0 100644 --- a/flowmachine/tests/test_total_network_objects.py +++ b/flowmachine/tests/test_total_network_objects.py @@ -12,9 +12,9 @@ from flowmachine.core.spatial_unit import ( CellSpatialUnit, - VersionedCellSpatialUnit, - VersionedSiteSpatialUnit, - LatLonSpatialUnit, + versioned_cell_spatial_unit, + versioned_site_spatial_unit, + lat_lon_spatial_unit, ) from flowmachine.features import TotalNetworkObjects, AggregateNetworkObjects @@ -26,8 +26,8 @@ def test_tno_at_lat_lng(get_dataframe): tno = TotalNetworkObjects( start="2016-01-01", stop="2016-01-07", - network_object=VersionedCellSpatialUnit(), - spatial_unit=LatLonSpatialUnit(), + network_object=versioned_cell_spatial_unit(), + spatial_unit=lat_lon_spatial_unit(), ) assert tno.get_dataframe().sum().value == 330 @@ -117,7 +117,7 @@ def test_median_returns_correct_values(get_dataframe): """ instance = AggregateNetworkObjects( total_network_objects=TotalNetworkObjects( - table="calls", total_by="hour", network_object=VersionedSiteSpatialUnit() + table="calls", total_by="hour", network_object=versioned_site_spatial_unit() ), aggregate_by="day", statistic="median", @@ -141,7 +141,7 @@ def test_mean_returns_correct_values(get_dataframe): start="2016-01-01", stop="2016-12-30", total_by="hour", - network_object=VersionedSiteSpatialUnit(), + network_object=versioned_site_spatial_unit(), ), aggregate_by="day", ) From a7ee3abb72293181f232853659bbf814bef8e46d Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 5 Jun 2019 13:24:29 +0100 Subject: [PATCH 071/138] Remove **kwargs from displacement.py --- .../features/subscriber/displacement.py | 57 ++++++++++++++++++- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/displacement.py b/flowmachine/flowmachine/features/subscriber/displacement.py index b6c627485d..51e617eec8 100644 --- a/flowmachine/flowmachine/features/subscriber/displacement.py +++ b/flowmachine/flowmachine/features/subscriber/displacement.py @@ -56,12 +56,36 @@ class Displacement(SubscriberFeature): unit : {'km', 'm'}, default 'km' Unit with which to express the answers, currently the choices are kilometres ('km') or metres ('m') + + Other parameters + ---------------- + hours : tuple of ints, default 'all' + Subset the result within certain hours, e.g. (4,17) + This will subset the query only with these hours, but + across all specified days. Or set to 'all' to include + all hours. + method : str, default 'last' + The method by which to calculate the location of the subscriber. + This can be either 'most-common' or 'last'. 'most-common' is + simply the modal location of the subscribers, whereas 'last' is + the location of the subscriber at the time of the final call in + the data. + table : str, default 'all' + schema qualified name of the table which the analysis is + based upon.
If 'all' it will use all tables that contain + location data, specified in flowmachine.yml. subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn' Either msisdn, or imei, the column that identifies the subscriber. subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. + ignore_nulls : bool, default True + Ignores those values that are null. Sometimes data appears for which + the cell is null. If set to True this will ignore those rows. If False + these rows with null cells will still be present; although they contain + no information on the subscriber's location, they still tell us that the subscriber made + a call at that time. Examples -------- @@ -74,7 +98,18 @@ class Displacement(SubscriberFeature): """ def __init__( - self, start, stop, modal_locations=None, statistic="avg", unit="km", **kwargs + self, + start, + stop, + modal_locations=None, + statistic="avg", + unit="km", + hours="all", + method="last", + table="all", + subscriber_identifier="msisdn", + ignore_nulls=True, + subscriber_subset=None, ): # need to subtract one day from hl end in order to be @@ -104,13 +139,29 @@ def is_allowed_spatial_unit(spatial_unit): else: hl = ModalLocation( *[ - daily_location(date, spatial_unit=lat_lon_spatial_unit(), **kwargs) + daily_location( + date, + spatial_unit=lat_lon_spatial_unit(), + hours=hours, + method=method, + table=table, + subscriber_identifier=subscriber_identifier, + ignore_nulls=ignore_nulls, + subscriber_subset=subscriber_subset, + ) for date in list_of_dates(self.start, self.stop_hl) ] ) sl = subscriber_locations( - self.start, self.stop_sl, spatial_unit=lat_lon_spatial_unit(), **kwargs + self.start, + self.stop_sl, + spatial_unit=lat_lon_spatial_unit(), + hours=hours, + table=table, + subscriber_identifier=subscriber_identifier, + ignore_nulls=ignore_nulls, + subscriber_subset=subscriber_subset, ) self.statistic = statistic.lower() From ebc73c5b32a4da27c851c33a90a9d870ab1300af Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 5 Jun 2019 16:24:20 +0100 Subject: [PATCH 072/138] Add a helper function to create spatial units. --- flowmachine/flowmachine/core/spatial_unit.py | 157 +++++++++++++++---- 1 file changed, 125 insertions(+), 32 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index fbd821bd7c..86c8b0dad2 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -3,32 +3,9 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. """ -Classes that map cells (or towers or sites) to a spatial unit. - -The available spatial units are: - CellSpatialUnit: - The identifier as found in the CDR. - lat_lon_spatial_unit: - Latitude and longitude of cell/site locations. - versioned_cell_spatial_unit: - The identifier as found in the CDR combined with the version from the - cells table. - versioned_site_spatial_unit: - The ID found in the sites table, coupled with the version number. - PolygonSpatialUnit: - A custom set of polygons that live in the database.
Takes the - parameters polygon_column_names, which is the columns you want to - return after the join, and polygon_table, the table where the polygons - reside (with the schema), and additionally geom_column which is the column - with the geometry information (will default to 'geom'). - admin_spatial_unit: - An admin region of interest, such as admin3. Must live in the database - in the standard location. - Special case of PolygonSpatialUnit. - grid_spatial_unit: - A square in a regular grid, in addition pass size to determine the size - of the polygon. - Special case of PolygonSpatialUnit. +Classes that map cell (or tower or site) IDs to a spatial unit. + +The helper function 'make_spatial_unit' can be used to create spatial unit objects. """ from typing import List @@ -282,8 +259,8 @@ class PolygonSpatialUnit(SpatialUnit): table in the database. Can also be a list of names. polygon_table : str or flowmachine.Query name of the table containing the geography information. - Can be either the name of a table, with the schema, a flowmachine.Query - object, or a string representing a query. + Can be either the name of a table, with the schema, or a + flowmachine.Query object. geom_column : str, default 'geom' Name of the column in polygon_table that defines the geography. """ @@ -357,7 +334,7 @@ def get_geom_query(self): return sql -def admin_spatial_unit(*, level, column_name=None): +def admin_spatial_unit(*, level, region_id_column_name=None): """ Returns a PolygonSpatialUnit object that maps all cells (aka sites) to an admin region. This assumes that you have geography data in the standard @@ -367,7 +344,7 @@ def admin_spatial_unit(*, level, column_name=None): ---------- level : int Admin level (e.g. 1 for admin1, 2 for admin2, etc.) - column_name : str, optional + region_id_column_name : str, optional Pass a string of the column to use as the identifier of the admin region. By default this will be admin*pcod. But you may wish @@ -382,10 +359,10 @@ def admin_spatial_unit(*, level, column_name=None): # of the form admin3pcod. If the user has asked for the standard # column_name then we will alias this column as 'pcod', otherwise we won't # alias it at all. - if (column_name is None) or (column_name == f"admin{level}pcod"): + if region_id_column_name is None or region_id_column_name == f"admin{level}pcod": col_name = f"admin{level}pcod AS pcod" else: - col_name = column_name + col_name = region_id_column_name table = f"geography.admin{level}" return PolygonSpatialUnit(polygon_column_names=col_name, polygon_table=table) @@ -411,3 +388,119 @@ def grid_spatial_unit(*, size): polygon_table=Grid(size), geom_column="geom_square", ) + + +def make_spatial_unit( + spatial_unit_type, + *, + level=None, + region_id_column_name=None, + size=None, + polygon_table=None, + geom_column="geom", +): + """ + Helper function to create an object representing a spatial unit. + + Parameters + ---------- + spatial_unit_type : str + Can be one of: + 'cell' + The identifier as found in the CDR. + 'lat-lon' + Latitude and longitude of cell/site locations. + 'versioned-cell' + The identifier as found in the CDR combined with the version + from the cells table. + 'versioned-site' + The ID found in the sites table, coupled with the version + number. + 'polygon' + A custom set of polygons that live in the database. 
In which + case you can pass the parameters 'region_id_column_name', which is the + column or columns you want to return after the join, and + 'polygon_table', the table where the polygons reside (with the + schema), and additionally 'geom_column' which is the column with + the geometry information (will default to 'geom'). + 'admin' + An admin region of interest, such as admin3. Must live in the + database in the standard location. In addition pass the 'level' + parameter, e.g. level=3 for admin3. Optionally also pass the + parameter 'region_id_column_name' to choose the column to use as the + identifier of the admin region (default is 'admin*pcod') + 'grid' + A square in a regular grid, in addition pass the 'size' + parameter to determine the size of the polygon. + level : int + Admin level (e.g. 1 for admin1, 2 for admin2, etc.). + Required when spatial_unit_type='admin'. + region_id_column_name : str or list + Name(s) of column(s) which identify the polygon regions. + Required when spatial_unit_type='polygon', + optional when spatial_unit_type='admin'. + size : float or int + Size of the grid in kilometres. + Required when spatial_unit_type='grid'. + polygon_table : str or flowmachine.Query + Name of the table containing the geography information. Can be either + the name of a table, with the schema, or a flowmachine.Query object. + Required when spatial_unit_type='polygon'. + geom_column : str, default 'geom' + Name of the column in polygon_table that defines the geography. + Required when spatial_unit_type='polygon'. + + Returns + ------- + flowmachine.core.spatial_unit.*SpatialUnit + An object representing a mapping from location identifiers to a spatial + unit. + """ + valid_spatial_unit_types = { + "cell", + "versioned-cell", + "versioned-site", + "lat-lon", + "admin", + "grid", + "polygon", + } + if spatial_unit_type not in valid_spatial_unit_types: + raise ValueError(f"Unrecognised spatial unit type: {spatial_unit_type}.") + + if spatial_unit_type == "cell": + return CellSpatialUnit() + elif spatial_unit_type == "versioned-cell": + return versioned_cell_spatial_unit() + elif spatial_unit_type == "versioned-site": + return versioned_site_spatial_unit() + elif spatial_unit_type == "lat-lon": + return lat_lon_spatial_unit() + elif spatial_unit_type == "admin": + if level is None: + raise ValueError( + "'level' parameter is required for spatial unit of type 'admin'." + ) + return admin_spatial_unit( + level=level, region_id_column_name=region_id_column_name + ) + elif spatial_unit_type == "grid": + if size is None: + raise ValueError( + "'size' parameter is required for spatial unit of type 'grid'." + ) + return grid_spatial_unit(size=size) + elif spatial_unit_type == "polygon": + if polygon_table is None: + raise ValueError( + "'polygon_table' parameter is required for spatial unit of type 'polygon'." + ) + if region_id_column_name is None: + raise ValueError( + "'region_id_column_name' parameter is required for spatial unit of type 'polygon'."
+ ) + return PolygonSpatialUnit( + polygon_column_names=region_id_column_name, + polygon_table=polygon_table, + geom_column=geom_column, + ) From 37ab5423d9dfe0af8ebf2c775676cea94b52d9e1 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 5 Jun 2019 17:06:16 +0100 Subject: [PATCH 073/138] Add tests for make_spatial_unit --- flowmachine/flowmachine/core/spatial_unit.py | 8 +- flowmachine/tests/conftest.py | 55 ++++---- .../test_sql_strings_and_results.py | 2 +- flowmachine/tests/test_daily_location.py | 3 +- flowmachine/tests/test_flows.py | 6 +- flowmachine/tests/test_spatial_unit.py | 126 +++++++++++------- 6 files changed, 113 insertions(+), 87 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 86c8b0dad2..02e14d071e 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -355,10 +355,10 @@ def admin_spatial_unit(*, level, region_id_column_name=None): flowmachine.core.spatial_unit.PolygonSpatialUnit Query which maps cell/site IDs to admin regions """ - # If there is no column_name passed then we can use the default, which is - # of the form admin3pcod. If the user has asked for the standard - # column_name then we will alias this column as 'pcod', otherwise we won't - # alias it at all. + # If there is no region_id_column_name passed then we can use the default, + # which is of the form admin3pcod. If the user has asked for the standard + # region_id_column_name then we will alias this column as 'pcod', otherwise + # we won't alias it at all. if region_id_column_name is None or region_id_column_name == f"admin{level}pcod": col_name = f"admin{level}pcod AS pcod" else: diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 6ead63b8cb..aa278abb4a 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -20,15 +20,7 @@ import flowmachine from flowmachine.core import Query from flowmachine.core.cache import reset_cache -from flowmachine.core.spatial_unit import ( - CellSpatialUnit, - lat_lon_spatial_unit, - versioned_cell_spatial_unit, - versioned_site_spatial_unit, - PolygonSpatialUnit, - admin_spatial_unit, - grid_spatial_unit, -) +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import EventTableSubset logger = logging.getLogger() @@ -73,27 +65,30 @@ def exemplar_level_param(request): @pytest.fixture( params=[ - (admin_spatial_unit, {"level": 2}), - (admin_spatial_unit, {"level": 2, "column_name": "admin2name"}), - (versioned_site_spatial_unit, {}), - (versioned_cell_spatial_unit, {}), - (CellSpatialUnit, {}), - (lat_lon_spatial_unit, {}), - (grid_spatial_unit, {"size": 5}), - ( - PolygonSpatialUnit, - {"polygon_column_names": "admin3pcod", "polygon_table": "geography.admin3"}, - ), - ( - PolygonSpatialUnit, - { - "polygon_column_names": "id", - "polygon_table": "infrastructure.sites", - "geom_column": "geom_point", - }, - ), + {"spatial_unit_type": "admin", "level": 2}, + { + "spatial_unit_type": "admin", + "level": 2, + "region_id_column_name": "admin2name", + }, + {"spatial_unit_type": "versioned-site"}, + {"spatial_unit_type": "versioned-cell"}, + {"spatial_unit_type": "cell"}, + {"spatial_unit_type": "lat-lon"}, + {"spatial_unit_type": "grid", "size": 5}, + { + "spatial_unit_type": "polygon", + "region_id_column_name": "admin3pcod", + "polygon_table": "geography.admin3", + }, + { + "spatial_unit_type": "polygon", + "region_id_column_name": "id", + "polygon_table": 
"infrastructure.sites", + "geom_column": "geom_point", + }, ], - ids=lambda x: x[0].__name__, + ids=lambda x: str(x), ) def exemplar_spatial_unit_param(request): """ @@ -105,7 +100,7 @@ def exemplar_spatial_unit_param(request): flowmachine.core.spatial_unit.*SpatialUnit """ - yield request.param[0](**request.param[1]) + yield make_spatial_unit(**request.param) def get_string_with_test_parameter_values(item): diff --git a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py index d20849bfe7..53a9880480 100644 --- a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py +++ b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py @@ -34,7 +34,7 @@ def test_daily_location_2_sql(diff_reporter): """ dl = daily_location( "2016-01-04", - spatial_unit=admin_spatial_unit(level=2, column_name="admin2pcod"), + spatial_unit=admin_spatial_unit(level=2, region_id_column_name="admin2pcod"), hours=(3, 9), method="most-common", subscriber_identifier="imei", diff --git a/flowmachine/tests/test_daily_location.py b/flowmachine/tests/test_daily_location.py index 6b9f732982..cba8e6c0de 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -43,7 +43,8 @@ def test_works_with_admin_names(get_dataframe): """ dl = daily_location( - "2016-01-05", spatial_unit=admin_spatial_unit(level=3, column_name="admin3name") + "2016-01-05", + spatial_unit=admin_spatial_unit(level=3, region_id_column_name="admin3name"), ) df = get_dataframe(dl) assert "Lamjung" == df.admin3name[0] diff --git a/flowmachine/tests/test_flows.py b/flowmachine/tests/test_flows.py index 323929a8f9..150c4e2eba 100644 --- a/flowmachine/tests/test_flows.py +++ b/flowmachine/tests/test_flows.py @@ -130,10 +130,12 @@ def test_flows_geojson(get_dataframe): """ dl = daily_location( - "2016-01-01", spatial_unit=admin_spatial_unit(level=2, column_name="admin2name") + "2016-01-01", + spatial_unit=admin_spatial_unit(level=2, region_id_column_name="admin2name"), ) dl2 = daily_location( - "2016-01-02", spatial_unit=admin_spatial_unit(level=2, column_name="admin2name") + "2016-01-02", + spatial_unit=admin_spatial_unit(level=2, region_id_column_name="admin2name"), ) fl = Flows(dl, dl2) js = fl.to_geojson() diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 6fae355a69..8580b81328 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -3,16 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
from flowmachine.core import CustomQuery -from flowmachine.core.spatial_unit import ( - SpatialUnit, - CellSpatialUnit, - lat_lon_spatial_unit, - versioned_cell_spatial_unit, - versioned_site_spatial_unit, - PolygonSpatialUnit, - admin_spatial_unit, - grid_spatial_unit, -) +from flowmachine.core.spatial_unit import * import pytest @@ -43,40 +34,57 @@ def test_get_geom_query_column_names( @pytest.mark.parametrize( - "spatial_unit, kwargs, loc_cols", + "make_spatial_unit_args, loc_cols", [ - (lat_lon_spatial_unit, {}, ["lat", "lon"]), - (versioned_cell_spatial_unit, {}, ["location_id", "version", "lon", "lat"]), - (versioned_site_spatial_unit, {}, ["site_id", "version", "lon", "lat"]), + ({"spatial_unit_type": "lat-lon"}, ["lat", "lon"]), + ( + {"spatial_unit_type": "versioned-cell"}, + ["location_id", "version", "lon", "lat"], + ), + ({"spatial_unit_type": "versioned-site"}, ["site_id", "version", "lon", "lat"]), ( - PolygonSpatialUnit, { - "polygon_column_names": "id", + "spatial_unit_type": "polygon", + "region_id_column_name": "id", "polygon_table": "infrastructure.sites", "geom_column": "geom_point", }, ["id"], ), ( - PolygonSpatialUnit, { - "polygon_column_names": ["id"], + "spatial_unit_type": "polygon", + "region_id_column_name": ["id"], "polygon_table": "infrastructure.sites", "geom_column": "geom_point", }, ["id"], ), - (admin_spatial_unit, {"level": 3}, ["pcod"]), - (admin_spatial_unit, {"level": 3, "column_name": "admin3pcod"}, ["pcod"]), - (admin_spatial_unit, {"level": 3, "column_name": "admin3name"}, ["admin3name"]), - (grid_spatial_unit, {"size": 5}, ["grid_id"]), + ({"spatial_unit_type": "admin", "level": 3}, ["pcod"]), + ( + { + "spatial_unit_type": "admin", + "level": 3, + "region_id_column_name": "admin3pcod", + }, + ["pcod"], + ), + ( + { + "spatial_unit_type": "admin", + "level": 3, + "region_id_column_name": "admin3name", + }, + ["admin3name"], + ), + ({"spatial_unit_type": "grid", "size": 5}, ["grid_id"]), ], ) -def test_spatial_unit_location_columns(spatial_unit, kwargs, loc_cols): +def test_spatial_unit_location_columns(make_spatial_unit_args, loc_cols): """ Test that the SpatialUnit classes have the correct location_columns properties. 
""" - su = spatial_unit(**kwargs) + su = make_spatial_unit(**make_spatial_unit_args) assert loc_cols == su.location_columns @@ -113,35 +121,40 @@ def test_missing_location_columns_raises_error(): @pytest.mark.parametrize( - "spatial_unit, kwargs", + "make_spatial_unit_args", [ - (admin_spatial_unit, {"level": 2}), - (admin_spatial_unit, {"level": 2, "column_name": "admin2name"}), - (versioned_site_spatial_unit, {}), - (versioned_cell_spatial_unit, {}), - (CellSpatialUnit, {}), - (lat_lon_spatial_unit, {}), - (grid_spatial_unit, {"size": 5}), - ( - PolygonSpatialUnit, - {"polygon_column_names": "admin3pcod", "polygon_table": "geography.admin3"}, - ), - ( - PolygonSpatialUnit, - { - "polygon_column_names": "id", - "polygon_table": "infrastructure.sites", - "geom_column": "geom_point", - }, - ), + {"spatial_unit_type": "admin", "level": 2}, + { + "spatial_unit_type": "admin", + "level": 2, + "region_id_column_name": "admin2name", + }, + {"spatial_unit_type": "versioned-site"}, + {"spatial_unit_type": "versioned-cell"}, + {"spatial_unit_type": "cell"}, + {"spatial_unit_type": "lat-lon"}, + {"spatial_unit_type": "grid", "size": 5}, + { + "spatial_unit_type": "polygon", + "region_id_column_name": "admin3pcod", + "polygon_table": "geography.admin3", + }, + { + "spatial_unit_type": "polygon", + "region_id_column_name": "id", + "polygon_table": "infrastructure.sites", + "geom_column": "geom_point", + }, ], ) -def test_spatial_unit_equals_itself(spatial_unit, kwargs): +def test_spatial_unit_equals_itself(make_spatial_unit_args): """ Test that instances of the SpatialUnit classes are equal to themselves. """ - su1 = spatial_unit(**kwargs) - su2 = spatial_unit(**kwargs) + # Can't use exemplar_spatial_unit_param here because we need to create two + # different but equal spatial units. + su1 = make_spatial_unit(**make_spatial_unit_args) + su2 = make_spatial_unit(**make_spatial_unit_args) assert su1 == su2 assert hash(su1) == hash(su2) @@ -178,8 +191,8 @@ def test_different_column_name_admin_spatial_units_are_not_equal(): """ Test that two admin spatial units with different column_names are not equal. 
""" - su1 = admin_spatial_unit(level=3, column_name="admin3pcod") - su2 = admin_spatial_unit(level=3, column_name="admin3name") + su1 = admin_spatial_unit(level=3, region_id_column_name="admin3pcod") + su2 = admin_spatial_unit(level=3, region_id_column_name="admin3name") assert su1 != su2 @@ -190,3 +203,18 @@ def test_different_grid_spatial_units_are_not_equal(): su1 = grid_spatial_unit(size=5) su2 = grid_spatial_unit(size=50) assert su1 != su2 + + +@pytest.mark.parametrize( + "make_spatial_unit_args", + [ + {"spatial_unit_type": "INVALID_SPATIAL_UNIT_TYPE"}, + {"spatial_unit_type": "admin"}, + {"spatial_unit_type": "grid"}, + {"spatial_unit_type": "polygon", "polygon_table": "geography.admin3"}, + {"spatial_unit_type": "polygon", "region_id_column_name": "DUMMY_COLUMN_NAME"}, + ], +) +def test_make_spatial_unit_raises_errors(make_spatial_unit_args): + with pytest.raises(ValueError): + su = make_spatial_unit(**make_spatial_unit_args) From fd9beac916841ca56c64497bd134171f257ce57c Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 5 Jun 2019 18:02:13 +0100 Subject: [PATCH 074/138] Use make_spatial_unit everywhere --- .../features/subscriber/displacement.py | 10 ++-- .../features/subscriber/last_location.py | 6 +-- .../subscriber/most_frequent_location.py | 6 +-- .../test_sql_strings_and_results.py | 16 +++--- flowmachine/tests/test_async.py | 14 ++--- flowmachine/tests/test_calldays.py | 14 +++-- flowmachine/tests/test_daily_location.py | 16 ++++-- flowmachine/tests/test_day_trajectories.py | 8 +-- flowmachine/tests/test_displacement.py | 8 +-- flowmachine/tests/test_flows.py | 27 +++++----- flowmachine/tests/test_geomixin.py | 24 ++++----- flowmachine/tests/test_indexes.py | 4 +- flowmachine/tests/test_inoutflows.py | 6 +-- flowmachine/tests/test_join_to_location.py | 51 +++++++++---------- flowmachine/tests/test_joined_aggregate.py | 14 ++--- flowmachine/tests/test_last_location.py | 9 ++-- flowmachine/tests/test_location_visits.py | 6 +-- .../tests/test_meaningful_locations.py | 22 ++++---- .../tests/test_most_frequent_locations.py | 12 ++--- flowmachine/tests/test_radius_of_gyration.py | 4 +- flowmachine/tests/test_spatial_aggregate.py | 6 +-- .../tests/test_spatial_distancematrix.py | 16 ++---- .../tests/test_subscriber_location_cluster.py | 24 ++++----- .../tests/test_subscriber_locations.py | 8 +-- .../tests/test_total_network_objects.py | 19 +++---- .../tests/test_unique_location_counts.py | 7 ++- .../tests/test_unique_subscriber_counts.py | 12 +++-- .../test_daily_location_results.py | 6 +-- 28 files changed, 190 insertions(+), 185 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/displacement.py b/flowmachine/flowmachine/features/subscriber/displacement.py index 51e617eec8..bf0a362448 100644 --- a/flowmachine/flowmachine/features/subscriber/displacement.py +++ b/flowmachine/flowmachine/features/subscriber/displacement.py @@ -17,11 +17,7 @@ from . 
import ModalLocation from ..utilities.subscriber_locations import subscriber_locations from flowmachine.utils import parse_datestring, get_dist_string, list_of_dates -from flowmachine.core.spatial_unit import ( - lat_lon_spatial_unit, - versioned_cell_spatial_unit, - versioned_site_spatial_unit, -) +from flowmachine.core.spatial_unit import make_spatial_unit from dateutil.relativedelta import relativedelta @@ -141,7 +137,7 @@ def is_allowed_spatial_unit(spatial_unit): *[ daily_location( date, - spatial_unit=lat_lon_spatial_unit(), + spatial_unit=make_spatial_unit("lat-lon"), hours=hours, method=method, table=table, @@ -156,7 +152,7 @@ def is_allowed_spatial_unit(spatial_unit): sl = subscriber_locations( self.start, self.stop_sl, - spatial_unit=lat_lon_spatial_unit(), + spatial_unit=make_spatial_unit("lat-lon"), hours=hours, table=table, subscriber_identifier=subscriber_identifier, diff --git a/flowmachine/flowmachine/features/subscriber/last_location.py b/flowmachine/flowmachine/features/subscriber/last_location.py index c35195bd50..555b01bb52 100644 --- a/flowmachine/flowmachine/features/subscriber/last_location.py +++ b/flowmachine/flowmachine/features/subscriber/last_location.py @@ -15,7 +15,7 @@ from flowmachine.core import Query from ..utilities.subscriber_locations import BaseLocation from ..utilities.subscriber_locations import subscriber_locations -from flowmachine.core.spatial_unit import admin_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit class LastLocation(BaseLocation, Query): @@ -31,7 +31,7 @@ class LastLocation(BaseLocation, Query): stop : str As above spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default admin_spatial_unit(level=3) + default admin3 Spatial unit to which subscriber locations will be mapped. See the docstring of spatial_unit.py for more information. hours : tuple of ints, default 'all' @@ -82,7 +82,7 @@ def __init__( self.start = start self.stop = stop if spatial_unit is None: - self.spatial_unit = admin_spatial_unit(level=3) + self.spatial_unit = make_spatial_unit("admin", level=3) else: self.spatial_unit = spatial_unit self.hours = hours diff --git a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py index 4e1001cab3..8b90a2da48 100644 --- a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py +++ b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py @@ -13,7 +13,7 @@ from flowmachine.core import Query from ..utilities.subscriber_locations import BaseLocation, subscriber_locations -from flowmachine.core.spatial_unit import admin_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit class MostFrequentLocation(BaseLocation, Query): @@ -29,7 +29,7 @@ class MostFrequentLocation(BaseLocation, Query): stop : str As above spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default admin_spatial_unit(level=3) + default admin3 Spatial unit to which subscriber locations will be mapped. See the docstring of spatial_unit.py for more information. 
hours : tuple of int, default 'all' @@ -84,7 +84,7 @@ def __init__( self.start = start self.stop = stop if spatial_unit is None: - self.spatial_unit = admin_spatial_unit(level=3) + self.spatial_unit = make_spatial_unit("admin", level=3) else: self.spatial_unit = spatial_unit self.hours = hours diff --git a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py index 53a9880480..f33236425a 100644 --- a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py +++ b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py @@ -7,7 +7,7 @@ from approvaltests.approvals import verify from flowmachine.core import CustomQuery from flowmachine.features import daily_location -from flowmachine.core.spatial_unit import admin_spatial_unit, CellSpatialUnit +from flowmachine.core.spatial_unit import make_spatial_unit def test_daily_location_1_sql(diff_reporter): @@ -34,7 +34,9 @@ def test_daily_location_2_sql(diff_reporter): """ dl = daily_location( "2016-01-04", - spatial_unit=admin_spatial_unit(level=2, region_id_column_name="admin2pcod"), + spatial_unit=make_spatial_unit( + "admin", level=2, region_id_column_name="admin2pcod" + ), hours=(3, 9), method="most-common", subscriber_identifier="imei", @@ -56,7 +58,7 @@ def test_daily_location_2_df(get_dataframe, diff_reporter): """ dl = daily_location( "2016-01-04", - spatial_unit=admin_spatial_unit(level=2), + spatial_unit=make_spatial_unit("admin", level=2), hours=(3, 9), method="most-common", # subscriber_identifier="imei", @@ -82,7 +84,7 @@ def test_daily_location_3_sql(diff_reporter): ) dl = daily_location( "2016-01-05", - spatial_unit=CellSpatialUnit(), + spatial_unit=make_spatial_unit("cell"), hours=(23, 5), method="last", # subscriber_identifier="imei", @@ -103,7 +105,7 @@ def test_daily_location_3_df(get_dataframe, diff_reporter): ) dl = daily_location( "2016-01-05", - spatial_unit=CellSpatialUnit(), + spatial_unit=make_spatial_unit("cell"), hours=(23, 5), method="last", # subscriber_identifier="imei", @@ -160,7 +162,7 @@ def test_daily_location_5_sql(diff_reporter): ) dl = daily_location( "2016-01-05", - spatial_unit=CellSpatialUnit(), + spatial_unit=make_spatial_unit("cell"), hours=(23, 5), method="last", # subscriber_identifier="imei", @@ -187,7 +189,7 @@ def test_daily_location_5_df(get_dataframe, diff_reporter): dl = daily_location( "2016-01-02", - spatial_unit=admin_spatial_unit(level=3), + spatial_unit=make_spatial_unit("admin", level=3), hours=(4, 9), method="most-common", # subscriber_identifier="imei", diff --git a/flowmachine/tests/test_async.py b/flowmachine/tests/test_async.py index a9e94ad03d..2fc9bf8053 100644 --- a/flowmachine/tests/test_async.py +++ b/flowmachine/tests/test_async.py @@ -8,7 +8,7 @@ from flowmachine.features.subscriber import * from threading import Thread import pandas as pd -from flowmachine.core.spatial_unit import CellSpatialUnit +from flowmachine.core.spatial_unit import make_spatial_unit def test_returns_future(): @@ -35,7 +35,7 @@ def test_double_store(): Storing a query twice doesn't raise an error. 
""" - dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) + dl = daily_location("2016-01-01", spatial_unit=make_spatial_unit("cell")) dl.store().result() dl.store().result() @@ -46,12 +46,12 @@ def test_store_async(): """ schema = "cache" - dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) + dl = daily_location("2016-01-01", spatial_unit=make_spatial_unit("cell")) table_name = dl.fully_qualified_table_name.split(".")[1] store_future = dl.store() store_future.result() assert dl.connection.has_table(table_name, schema=schema) - dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) + dl = daily_location("2016-01-01", spatial_unit=make_spatial_unit("cell")) assert table_name in dl.get_query() @@ -59,7 +59,7 @@ def test_get_query_blocks_on_store(): """ If a store is running get_query should block. """ - dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) + dl = daily_location("2016-01-01", spatial_unit=make_spatial_unit("cell")) dl.store().result() timer = [] @@ -83,8 +83,8 @@ def test_blocks_on_store_cascades(): If a store is running on a query that is used in a another query, that query should wait. """ - dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) - dl2 = daily_location("2016-01-02", spatial_unit=CellSpatialUnit()) + dl = daily_location("2016-01-01", spatial_unit=make_spatial_unit("cell")) + dl2 = daily_location("2016-01-02", spatial_unit=make_spatial_unit("cell")) store_future = dl.store() store_future.result() hl = ModalLocation(dl, dl2) diff --git a/flowmachine/tests/test_calldays.py b/flowmachine/tests/test_calldays.py index 42c2957f46..dc812fccf6 100644 --- a/flowmachine/tests/test_calldays.py +++ b/flowmachine/tests/test_calldays.py @@ -11,7 +11,7 @@ import pytest -from flowmachine.core.spatial_unit import versioned_site_spatial_unit, CellSpatialUnit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import CallDays, subscriber_locations import numpy as np @@ -39,7 +39,9 @@ def test_call_days_returns_expected_counts_per_subscriber(get_dataframe): ) for (subscriber, start, end, calls) in test_values: cd = CallDays( - subscriber_locations(start, end, spatial_unit=versioned_site_spatial_unit()) + subscriber_locations( + start, end, spatial_unit=make_spatial_unit("versioned-site") + ) ) df = get_dataframe(cd).query('subscriber == "{}"'.format(subscriber)) assert df.calldays.sum() == calls @@ -57,7 +59,9 @@ def test_call_days_returns_expected_counts_per_subscriber_tower(get_dataframe): ) for (subscriber, location, start, end, calls) in test_values: cd = CallDays( - subscriber_locations(start, end, spatial_unit=versioned_site_spatial_unit()) + subscriber_locations( + start, end, spatial_unit=make_spatial_unit("versioned-site") + ) ) df = get_dataframe(cd).query( 'subscriber == "{}" & site_id == "{}"'.format(subscriber, location) @@ -71,7 +75,9 @@ def test_locations_are_only_repeated_once_per_subscriber(get_dataframe): """ cd = CallDays( - subscriber_locations("2016-01-01", "2016-01-03", spatial_unit=CellSpatialUnit()) + subscriber_locations( + "2016-01-01", "2016-01-03", spatial_unit=make_spatial_unit("cell") + ) ) df = get_dataframe(cd) assert not np.any(df.groupby(["subscriber", "location_id"]).count() > 1) diff --git a/flowmachine/tests/test_daily_location.py b/flowmachine/tests/test_daily_location.py index cba8e6c0de..0b27d84af5 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -5,7 +5,7 @@ import pytest from 
flowmachine.core.errors import MissingDateError -from flowmachine.core.spatial_unit import admin_spatial_unit, CellSpatialUnit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import daily_location, MostFrequentLocation @@ -44,7 +44,9 @@ def test_works_with_admin_names(get_dataframe): dl = daily_location( "2016-01-05", - spatial_unit=admin_spatial_unit(level=3, region_id_column_name="admin3name"), + spatial_unit=make_spatial_unit( + "admin", level=3, region_id_column_name="admin3name" + ), ) df = get_dataframe(dl) assert "Lamjung" == df.admin3name[0] @@ -58,9 +60,13 @@ def test_hours(get_length): # Lower level test test that subsetdates handles this correctly # we're just testing that it is passed on in this case. - dl1 = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) - dl2 = daily_location("2016-01-01", spatial_unit=CellSpatialUnit(), hours=(19, 23)) - dl3 = daily_location("2016-01-01", spatial_unit=CellSpatialUnit(), hours=(19, 20)) + dl1 = daily_location("2016-01-01", spatial_unit=make_spatial_unit("cell")) + dl2 = daily_location( + "2016-01-01", spatial_unit=make_spatial_unit("cell"), hours=(19, 23) + ) + dl3 = daily_location( + "2016-01-01", spatial_unit=make_spatial_unit("cell"), hours=(19, 20) + ) assert get_length(dl1) > get_length(dl2) > get_length(dl3) diff --git a/flowmachine/tests/test_day_trajectories.py b/flowmachine/tests/test_day_trajectories.py index 74844d7be4..f92687d827 100644 --- a/flowmachine/tests/test_day_trajectories.py +++ b/flowmachine/tests/test_day_trajectories.py @@ -4,7 +4,7 @@ from flowmachine.features import DayTrajectories, daily_location -from flowmachine.core.spatial_unit import admin_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit def test_column_names_day_trajectories(exemplar_spatial_unit_param): @@ -21,12 +21,14 @@ def test_day_trajectories(get_dataframe): """ traj = DayTrajectories( daily_location( - "2016-01-01", spatial_unit=admin_spatial_unit(level=3), method="last" + "2016-01-01", + spatial_unit=make_spatial_unit("admin", level=3), + method="last", ) ) df = get_dataframe(traj).drop("date", axis=1) dldf = daily_location( - "2016-01-01", spatial_unit=admin_spatial_unit(level=3), method="last" + "2016-01-01", spatial_unit=make_spatial_unit("admin", level=3), method="last" ).get_dataframe() assert [df["subscriber"][0], df["pcod"][0]] == [ dldf["subscriber"][0], diff --git a/flowmachine/tests/test_displacement.py b/flowmachine/tests/test_displacement.py index 60213372f1..cb59bf29e3 100644 --- a/flowmachine/tests/test_displacement.py +++ b/flowmachine/tests/test_displacement.py @@ -7,7 +7,7 @@ from numpy import isnan from flowmachine.utils import list_of_dates -from flowmachine.core.spatial_unit import lat_lon_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit @pytest.mark.parametrize( @@ -51,7 +51,7 @@ def test_pass_modal_location(get_dataframe): ml = ModalLocation( *[ - daily_location(d, spatial_unit=lat_lon_spatial_unit()) + daily_location(d, spatial_unit=make_spatial_unit("lat-lon")) for d in list_of_dates("2016-01-01", "2016-01-06") ] ) @@ -90,7 +90,7 @@ def test_get_all_users_in_modal_location(get_dataframe): ml = ModalLocation( *[ - daily_location(d, spatial_unit=lat_lon_spatial_unit(), hours=(12, 13)) + daily_location(d, spatial_unit=make_spatial_unit("lat-lon"), hours=(12, 13)) for d in list_of_dates(p1[0], p1[1]) ] ) @@ -114,7 +114,7 @@ def test_subscriber_with_home_loc_but_no_calls_is_nan(get_dataframe): ml = ModalLocation( *[ - 
daily_location(d, spatial_unit=lat_lon_spatial_unit(), hours=(12, 13)) + daily_location(d, spatial_unit=make_spatial_unit("lat-lon"), hours=(12, 13)) for d in list_of_dates(p1[0], p1[1]) ] ) diff --git a/flowmachine/tests/test_flows.py b/flowmachine/tests/test_flows.py index 150c4e2eba..92d583359c 100644 --- a/flowmachine/tests/test_flows.py +++ b/flowmachine/tests/test_flows.py @@ -7,11 +7,10 @@ import geojson import pytest -from flowmachine.core.spatial_unit import admin_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import daily_location from flowmachine.features.location.flows import * from flowmachine.features.subscriber.daily_location import locate_subscribers -from flowmachine.core.spatial_unit import CellSpatialUnit pytestmark = pytest.mark.usefixtures("skip_datecheck") @@ -31,8 +30,8 @@ def test_flows_raise_error(): """ Flows() raises error if location levels are different. """ - dl1 = daily_location("2016-01-01", spatial_unit=admin_spatial_unit(level=3)) - dl2 = daily_location("2016-01-01", spatial_unit=admin_spatial_unit(level=2)) + dl1 = daily_location("2016-01-01", spatial_unit=make_spatial_unit("admin", level=3)) + dl2 = daily_location("2016-01-01", spatial_unit=make_spatial_unit("admin", level=2)) with pytest.raises(ValueError): Flows(dl1, dl2) @@ -50,7 +49,7 @@ def test_calculates_flows(get_dataframe): """ Flows() are correctly calculated """ - spatial_unit = admin_spatial_unit(level=3) + spatial_unit = make_spatial_unit("admin", level=3) dl1 = locate_subscribers( "2016-01-01", "2016-01-02", spatial_unit=spatial_unit, method="last" ) @@ -83,7 +82,7 @@ def test_flows_geojson_correct(): """ Test that flows outputs expected geojson. """ - spatial_unit = admin_spatial_unit(level=3) + spatial_unit = make_spatial_unit("admin", level=3) dl1 = locate_subscribers( "2016-01-01", "2016-01-02", spatial_unit=spatial_unit, method="last" ) @@ -104,8 +103,8 @@ def test_valid_flows_geojson(exemplar_spatial_unit_param): Check that valid geojson is returned for Flows. """ - if CellSpatialUnit() == exemplar_spatial_unit_param: - pytest.skip("Query with spatial_unit=CellSpatialUnit has no geometry.") + if make_spatial_unit("cell") == exemplar_spatial_unit_param: + pytest.skip("Query with spatial_unit=CellSpatialUnit() has no geometry.") dl = daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param) dl2 = daily_location("2016-01-02", spatial_unit=exemplar_spatial_unit_param) fl = Flows(dl, dl2) @@ -117,8 +116,8 @@ def test_flows_geo_augmented_query_raises_error(): Test that a ValueError is raised when attempting to get geojson for a flows query with no geography data. 
""" - dl = daily_location("2016-01-01", spatial_unit=CellSpatialUnit()) - dl2 = daily_location("2016-01-02", spatial_unit=CellSpatialUnit()) + dl = daily_location("2016-01-01", spatial_unit=make_spatial_unit("cell")) + dl2 = daily_location("2016-01-02", spatial_unit=make_spatial_unit("cell")) fl = Flows(dl, dl2) with pytest.raises(ValueError): fl.to_geojson_string() @@ -131,11 +130,15 @@ def test_flows_geojson(get_dataframe): dl = daily_location( "2016-01-01", - spatial_unit=admin_spatial_unit(level=2, region_id_column_name="admin2name"), + spatial_unit=make_spatial_unit( + "admin", level=2, region_id_column_name="admin2name" + ), ) dl2 = daily_location( "2016-01-02", - spatial_unit=admin_spatial_unit(level=2, region_id_column_name="admin2name"), + spatial_unit=make_spatial_unit( + "admin", level=2, region_id_column_name="admin2name" + ), ) fl = Flows(dl, dl2) js = fl.to_geojson() diff --git a/flowmachine/tests/test_geomixin.py b/flowmachine/tests/test_geomixin.py index d844068e66..ae6e0aeb59 100644 --- a/flowmachine/tests/test_geomixin.py +++ b/flowmachine/tests/test_geomixin.py @@ -16,11 +16,7 @@ from flowmachine.core import Query from flowmachine.core.mixins import GeoDataMixin -from flowmachine.core.spatial_unit import ( - CellSpatialUnit, - lat_lon_spatial_unit, - admin_spatial_unit, -) +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import daily_location from flowmachine.utils import proj4string @@ -78,8 +74,8 @@ def test_valid_geojson(exemplar_spatial_unit_param): Check that valid geojson is returned. """ - if CellSpatialUnit() == exemplar_spatial_unit_param: - pytest.skip("Query with spatial_unit=CellSpatialUnit has no geometry.") + if make_spatial_unit("cell") == exemplar_spatial_unit_param: + pytest.skip("Query with spatial_unit=CellSpatialUnit() has no geometry.") dl = daily_location( "2016-01-01", "2016-01-02", spatial_unit=exemplar_spatial_unit_param ).aggregate() @@ -92,7 +88,7 @@ def test_geo_augmented_query_raises_error(): with no geography data. """ dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=CellSpatialUnit() + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("cell") ).aggregate() with pytest.raises(ValueError): dl.to_geojson_string() @@ -104,7 +100,7 @@ def test_correct_geojson(): """ js = ( daily_location( - "2016-01-01", "2016-01-02", spatial_unit=admin_spatial_unit(level=2) + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("admin", level=2) ) .aggregate() .to_geojson() @@ -130,7 +126,7 @@ def test_geojson_file_output(tmpdir): js_file = tmpdir / "geojson_test.json" daily_location( - "2016-01-01", "2016-01-02", spatial_unit=admin_spatial_unit(level=2) + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("admin", level=2) ).aggregate().to_geojson_file(js_file) with open(js_file) as fin: js = json.load(fin) @@ -153,7 +149,7 @@ def test_reprojection(): """ dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert js["features"][0]["geometry"]["coordinates"] == [ @@ -168,7 +164,7 @@ def test_geojson_cache(): Test geojson is cached locally. 
""" dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert js == dl._geojson[proj4string(dl.connection, 2770)] @@ -177,7 +173,7 @@ def test_geojson_cache(): def test_geojson_cache_exluded_from_pickle(): """Test that cached geojson is not going to get pickled.""" dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert "_geojson" not in dl.__getstate__() # Check excluded from pickle @@ -186,7 +182,7 @@ def test_geojson_cache_exluded_from_pickle(): def test_geojson_caching_off(): """Test that switching off caching clears the cache, and doesn't add to it.""" dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 dl.turn_off_caching() # Check caching for geojson switches off diff --git a/flowmachine/tests/test_indexes.py b/flowmachine/tests/test_indexes.py index 8be9e7f101..684cc33549 100644 --- a/flowmachine/tests/test_indexes.py +++ b/flowmachine/tests/test_indexes.py @@ -2,7 +2,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -from flowmachine.core.spatial_unit import lat_lon_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features.subscriber import * @@ -15,7 +15,7 @@ def test_default_indexes(): '"subscriber"', ] assert daily_location( - "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") ).index_cols == [["lat", "lon"], '"subscriber"'] assert SubscriberDegree("2016-01-01", "2016-01-02").index_cols == ['"subscriber"'] diff --git a/flowmachine/tests/test_inoutflows.py b/flowmachine/tests/test_inoutflows.py index b63b0ded95..243dbf41a5 100644 --- a/flowmachine/tests/test_inoutflows.py +++ b/flowmachine/tests/test_inoutflows.py @@ -8,7 +8,7 @@ from flowmachine.features import Flows, daily_location -from flowmachine.core.spatial_unit import versioned_site_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit def test_inoutflow_with_double_column_location(): @@ -17,8 +17,8 @@ def test_inoutflow_with_double_column_location(): more than one column. 
""" - dl1 = daily_location("2016-01-01", spatial_unit=versioned_site_spatial_unit()) - dl2 = daily_location("2016-01-02", spatial_unit=versioned_site_spatial_unit()) + dl1 = daily_location("2016-01-01", spatial_unit=make_spatial_unit("versioned-site")) + dl2 = daily_location("2016-01-02", spatial_unit=make_spatial_unit("versioned-site")) flow = Flows(dl1, dl2) expected_columns = ["site_id_to", "version_to", "lon_to", "lat_to", "total"] diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 7baf9a14c9..b6e7203c4e 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -10,23 +10,15 @@ from flowmachine.features import subscriber_locations from flowmachine.core import JoinToLocation, location_joined_query -from flowmachine.core.spatial_unit import ( - CellSpatialUnit, - admin_spatial_unit, - versioned_site_spatial_unit, - versioned_cell_spatial_unit, - lat_lon_spatial_unit, - grid_spatial_unit, - PolygonSpatialUnit, -) +from flowmachine.core.spatial_unit import make_spatial_unit def test_join_to_location_column_names(exemplar_spatial_unit_param): """ Test that JoinToLocation's column_names property is accurate.""" - if isinstance(exemplar_spatial_unit_param, CellSpatialUnit): + if make_spatial_unit("cell") == exemplar_spatial_unit_param: pytest.skip("JoinToLocation does not accept CellSpatialUnit objects") table = subscriber_locations( - "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) joined = JoinToLocation(table, spatial_unit=exemplar_spatial_unit_param) assert joined.head(0).columns.tolist() == joined.column_names @@ -38,9 +30,9 @@ def test_join_to_location_raises_value_error(): """ with pytest.raises(ValueError): table = subscriber_locations( - "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) - joined = JoinToLocation(table, spatial_unit=CellSpatialUnit()) + joined = JoinToLocation(table, spatial_unit=make_spatial_unit("cell")) moving_sites = [ @@ -62,9 +54,11 @@ def test_join_with_versioned_cells(get_dataframe, get_length): Test that flowmachine.JoinToLocation can fetch the cell version. """ ul = subscriber_locations( - "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") + ) + df = get_dataframe( + JoinToLocation(ul, spatial_unit=make_spatial_unit("versioned-cell")) ) - df = get_dataframe(JoinToLocation(ul, spatial_unit=versioned_cell_spatial_unit())) # As our database is complete we should not drop any rows assert len(df) == get_length(ul) # These should all be version zero, these are the towers before the changeover date, or those that @@ -87,9 +81,9 @@ def test_join_with_lat_lon(get_dataframe): Test that flowmachine.JoinToLocation can get the lat-lon values of the cell """ ul = subscriber_locations( - "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) - df = get_dataframe(JoinToLocation(ul, spatial_unit=lat_lon_spatial_unit())) + df = get_dataframe(JoinToLocation(ul, spatial_unit=make_spatial_unit("lat-lon"))) expected_cols = sorted(["subscriber", "time", "location_id", "lat", "lon"]) assert sorted(df.columns) == expected_cols @@ -113,12 +107,13 @@ def test_join_with_polygon(get_dataframe, get_length): of each cell. 
""" ul = subscriber_locations( - "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) j = JoinToLocation( ul, - spatial_unit=PolygonSpatialUnit( - polygon_column_names="admin3pcod", + spatial_unit=make_spatial_unit( + "polygon", + region_id_column_name="admin3pcod", polygon_table="geography.admin3", geom_column="geom", ), @@ -135,9 +130,11 @@ def test_join_to_admin(get_dataframe, get_length): Test that flowmachine.JoinToLocation can join to a admin region. """ ul = subscriber_locations( - "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") + ) + df = get_dataframe( + JoinToLocation(ul, spatial_unit=make_spatial_unit("admin", level=3)) ) - df = get_dataframe(JoinToLocation(ul, spatial_unit=admin_spatial_unit(level=3))) assert len(df) == get_length(ul) expected_cols = sorted(["subscriber", "time", "location_id", "pcod"]) assert sorted(df.columns) == expected_cols @@ -148,9 +145,11 @@ def test_join_to_grid(get_dataframe, get_length): Test that we can join to a grid square """ ul = subscriber_locations( - "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") + ) + df = get_dataframe( + JoinToLocation(ul, spatial_unit=make_spatial_unit("grid", size=50)) ) - df = get_dataframe(JoinToLocation(ul, spatial_unit=grid_spatial_unit(size=50))) assert len(df) == get_length(ul) @@ -161,10 +160,10 @@ def test_location_joined_query_return_type(exemplar_spatial_unit_param): query when spatial_unit == CellSpatialUnit(). """ table = subscriber_locations( - "2016-01-05", "2016-01-07", spatial_unit=CellSpatialUnit() + "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) joined = location_joined_query(table, spatial_unit=exemplar_spatial_unit_param) - if isinstance(exemplar_spatial_unit_param, CellSpatialUnit): + if make_spatial_unit("cell") == exemplar_spatial_unit_param: assert joined is table else: assert isinstance(joined, JoinToLocation) diff --git a/flowmachine/tests/test_joined_aggregate.py b/flowmachine/tests/test_joined_aggregate.py index 2db820aed6..7b480052e5 100644 --- a/flowmachine/tests/test_joined_aggregate.py +++ b/flowmachine/tests/test_joined_aggregate.py @@ -6,7 +6,7 @@ import pytest -from flowmachine.core.spatial_unit import admin_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import ( MostFrequentLocation, RadiusOfGyration, @@ -19,7 +19,7 @@ def test_joined_aggregate(get_dataframe): Test join aggregate. """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-04", spatial_unit=admin_spatial_unit(level=3) + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("admin", level=3) ) joined = mfl.join_aggregate(RadiusOfGyration("2016-01-01", "2016-01-04")) assert ( @@ -33,7 +33,7 @@ def test_joined_modal_aggregate(get_dataframe): Test join with modal aggregate. """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-04", spatial_unit=admin_spatial_unit(level=3) + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("admin", level=3) ) rog = SubscriberDegree("2016-01-01", "2016-01-04") joined = mfl.join_aggregate(rog, method="mode") @@ -56,7 +56,7 @@ def test_joined_median_aggregate(get_dataframe): Test join with median aggregate. 
""" mfl = MostFrequentLocation( - "2016-01-01", "2016-01-04", spatial_unit=admin_spatial_unit(level=3) + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("admin", level=3) ) rog = RadiusOfGyration("2016-01-01", "2016-01-04") joined = mfl.join_aggregate(rog, method="median") @@ -79,7 +79,7 @@ def test_joined_agg_date_mismatch(): Test that join aggregate with mismatched dates raises a warning. """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-04", spatial_unit=admin_spatial_unit(level=3) + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("admin", level=3) ) with pytest.warns(UserWarning): mfl.join_aggregate(RadiusOfGyration("2016-01-02", "2016-01-04")) @@ -93,7 +93,9 @@ def test_joined_agg_hours_mismatch(): Test that join aggregate with mismatched hours doesn't warn. """ mfl = MostFrequentLocation( - "2016-01-01 10:00", "2016-01-04", spatial_unit=admin_spatial_unit(level=3) + "2016-01-01 10:00", + "2016-01-04", + spatial_unit=make_spatial_unit("admin", level=3), ) with warnings.catch_warnings(record=True) as w: mfl.join_aggregate(RadiusOfGyration("2016-01-01", "2016-01-04")) diff --git a/flowmachine/tests/test_last_location.py b/flowmachine/tests/test_last_location.py index a2cea1351e..e443ada76d 100644 --- a/flowmachine/tests/test_last_location.py +++ b/flowmachine/tests/test_last_location.py @@ -4,10 +4,7 @@ import pytest -from flowmachine.core.spatial_unit import ( - lat_lon_spatial_unit, - versioned_site_spatial_unit, -) +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import LastLocation @@ -29,7 +26,7 @@ def test_last_loc_vsite(get_dataframe): """ last_loc = LastLocation( - "2016-01-01", "2016-01-02", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("versioned-site") ) df = get_dataframe(last_loc) @@ -44,7 +41,7 @@ def test_last_loc_lat_lon(get_dataframe): """ last_loc = LastLocation( - "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") ) df = get_dataframe(last_loc) df.set_index("subscriber", inplace=True) diff --git a/flowmachine/tests/test_location_visits.py b/flowmachine/tests/test_location_visits.py index 4c7fcba701..7f08455088 100644 --- a/flowmachine/tests/test_location_visits.py +++ b/flowmachine/tests/test_location_visits.py @@ -4,7 +4,7 @@ from flowmachine.features import LocationVisits, daily_location, DayTrajectories from flowmachine.utils import list_of_dates -from flowmachine.core.spatial_unit import admin_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit def test_column_names_location_visits(exemplar_spatial_unit_param): @@ -30,7 +30,7 @@ def test_dl_count_sum_equal_or_less_than_period(get_dataframe): DayTrajectories( *[ daily_location( - d, spatial_unit=admin_spatial_unit(level=3), method="last" + d, spatial_unit=make_spatial_unit("admin", level=3), method="last" ) for d in list_of_dates(start_date, stop_date) ] @@ -46,7 +46,7 @@ def test_dl_count_sum_equal_or_less_than_period(get_dataframe): DayTrajectories( *[ daily_location( - d, spatial_unit=admin_spatial_unit(level=3), method="last" + d, spatial_unit=make_spatial_unit("admin", level=3), method="last" ) for d in list_of_dates(start_date, stop_date) ] diff --git a/flowmachine/tests/test_meaningful_locations.py b/flowmachine/tests/test_meaningful_locations.py index 018910bfcd..6e02dc3d9a 100644 --- a/flowmachine/tests/test_meaningful_locations.py +++ 
b/flowmachine/tests/test_meaningful_locations.py @@ -4,7 +4,7 @@ import pytest from flowmachine.core.errors import BadLevelError -from flowmachine.core.spatial_unit import versioned_site_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import ( HartiganCluster, CallDays, @@ -37,7 +37,7 @@ def test_column_names_meaningful_locations(get_column_names_from_run): subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=versioned_site_spatial_unit(), + spatial_unit=make_spatial_unit("versioned-site"), ) ), radius=1, @@ -67,7 +67,7 @@ def test_column_names_meaningful_locations_aggregate( subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=versioned_site_spatial_unit(), + spatial_unit=make_spatial_unit("versioned-site"), ) ), radius=1, @@ -95,7 +95,7 @@ def test_meaningful_locations_aggregate_disallowed_level_raises(): subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=versioned_site_spatial_unit(), + spatial_unit=make_spatial_unit("versioned-site"), ) ), radius=1, @@ -124,7 +124,7 @@ def test_column_names_meaningful_locations_od( subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=versioned_site_spatial_unit(), + spatial_unit=make_spatial_unit("versioned-site"), ) ), radius=1, @@ -142,7 +142,7 @@ def test_column_names_meaningful_locations_od( subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=versioned_site_spatial_unit(), + spatial_unit=make_spatial_unit("versioned-site"), ) ), radius=1, @@ -178,7 +178,7 @@ def test_meaningful_locations_results( subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=versioned_site_spatial_unit(), + spatial_unit=make_spatial_unit("versioned-site"), ) ), radius=1, @@ -212,7 +212,7 @@ def test_meaningful_locations_aggregation_results(exemplar_level_param, get_data subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=versioned_site_spatial_unit(), + spatial_unit=make_spatial_unit("versioned-site"), ) ), radius=1, @@ -246,7 +246,7 @@ def test_meaningful_locations_od_raises_for_bad_level( subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=versioned_site_spatial_unit(), + spatial_unit=make_spatial_unit("versioned-site"), ) ), radius=1, @@ -276,7 +276,7 @@ def test_meaningful_locations_od_results(get_dataframe): subscriber_locations=subscriber_locations( start="2016-01-01", stop="2016-01-02", - spatial_unit=versioned_site_spatial_unit(), + spatial_unit=make_spatial_unit("versioned-site"), ) ), radius=1, @@ -294,7 +294,7 @@ def test_meaningful_locations_od_results(get_dataframe): subscriber_locations=subscriber_locations( start="2016-01-02", stop="2016-01-03", - spatial_unit=versioned_site_spatial_unit(), + spatial_unit=make_spatial_unit("versioned-site"), ) ), radius=1, diff --git a/flowmachine/tests/test_most_frequent_locations.py b/flowmachine/tests/test_most_frequent_locations.py index da5eeeb248..551b495f30 100644 --- a/flowmachine/tests/test_most_frequent_locations.py +++ b/flowmachine/tests/test_most_frequent_locations.py @@ -4,11 +4,7 @@ import pytest -from flowmachine.core.spatial_unit import ( - admin_spatial_unit, - versioned_site_spatial_unit, - lat_lon_spatial_unit, -) +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import 
MostFrequentLocation from flowmachine.features.subscriber.daily_location import locate_subscribers @@ -32,7 +28,7 @@ def test_vsites(get_dataframe): """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-02", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("versioned-site") ) df = get_dataframe(mfl) df.set_index("subscriber", inplace=True) @@ -47,7 +43,7 @@ def test_lat_lons(get_dataframe): """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-02", spatial_unit=lat_lon_spatial_unit() + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") ) df = get_dataframe(mfl) df.set_index("subscriber", inplace=True) @@ -63,7 +59,7 @@ def test_most_fequent_admin(get_dataframe): mfl = locate_subscribers( "2016-01-01", "2016-01-02", - spatial_unit=admin_spatial_unit(level=3), + spatial_unit=make_spatial_unit("admin", level=3), method="most-common", ) df = get_dataframe(mfl) diff --git a/flowmachine/tests/test_radius_of_gyration.py b/flowmachine/tests/test_radius_of_gyration.py index e253cad0d7..cc98048f9e 100644 --- a/flowmachine/tests/test_radius_of_gyration.py +++ b/flowmachine/tests/test_radius_of_gyration.py @@ -3,7 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. import pytest -from flowmachine.core.spatial_unit import admin_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.features.subscriber import * @@ -58,7 +58,7 @@ def test_can_be_joined(get_dataframe): """ RoG = RadiusOfGyration("2016-01-01", "2016-01-02") dl = locate_subscribers( - "2016-01-01", "2016-01-02", spatial_unit=admin_spatial_unit(level=3) + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("admin", level=3) ) rog_JA = RoG.join_aggregate(dl) df = get_dataframe(rog_JA) diff --git a/flowmachine/tests/test_spatial_aggregate.py b/flowmachine/tests/test_spatial_aggregate.py index 27c503f10a..7ae6dc9a29 100644 --- a/flowmachine/tests/test_spatial_aggregate.py +++ b/flowmachine/tests/test_spatial_aggregate.py @@ -2,7 +2,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-from flowmachine.core.spatial_unit import admin_spatial_unit, lat_lon_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import ModalLocation, daily_location from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.utils import list_of_dates @@ -15,7 +15,7 @@ def test_can_be_aggregated_admin3(get_dataframe): mfl = locate_subscribers( "2016-01-01", "2016-01-02", - spatial_unit=admin_spatial_unit(level=3), + spatial_unit=make_spatial_unit("admin", level=3), method="most-common", ) agg = mfl.aggregate() @@ -29,7 +29,7 @@ def test_can_be_aggregated_latlong(get_dataframe): """ hl = ModalLocation( *[ - daily_location(d, spatial_unit=lat_lon_spatial_unit(), method="last") + daily_location(d, spatial_unit=make_spatial_unit("lat-lon"), method="last") for d in list_of_dates("2016-01-01", "2016-01-03") ] ) diff --git a/flowmachine/tests/test_spatial_distancematrix.py b/flowmachine/tests/test_spatial_distancematrix.py index 573ed5f2ab..44e0b2c6ae 100644 --- a/flowmachine/tests/test_spatial_distancematrix.py +++ b/flowmachine/tests/test_spatial_distancematrix.py @@ -9,18 +9,14 @@ import pytest from flowmachine.features.spatial import DistanceMatrix -from flowmachine.core.spatial_unit import ( - versioned_cell_spatial_unit, - versioned_site_spatial_unit, - lat_lon_spatial_unit, -) +from flowmachine.core.spatial_unit import make_spatial_unit def test_some_results(get_dataframe): """ DistanceMatrix() returns a dataframe that contains hand-picked results. """ - c = DistanceMatrix(spatial_unit=versioned_site_spatial_unit()) + c = DistanceMatrix(spatial_unit=make_spatial_unit("versioned-site")) df = get_dataframe(c) set_df = df.set_index("site_id_from") assert round(set_df.loc["8wPojr"]["distance"].values[0]) == 789 @@ -30,15 +26,11 @@ def test_some_results(get_dataframe): @pytest.mark.parametrize( "spatial_unit_type, length", - [ - (versioned_cell_spatial_unit, 62), - (versioned_site_spatial_unit, 35), - (lat_lon_spatial_unit, 62), - ], + [("versioned-cell", 62), ("versioned-site", 35), ("lat-lon", 62)], ) def test_result_has_correct_length(spatial_unit_type, length, get_length): """ DistanceMatrix() has the correct length. 
""" - c = DistanceMatrix(spatial_unit=spatial_unit_type()) + c = DistanceMatrix(spatial_unit=make_spatial_unit(spatial_unit_type)) assert get_length(c) == length ** 2 diff --git a/flowmachine/tests/test_subscriber_location_cluster.py b/flowmachine/tests/test_subscriber_location_cluster.py index 8daefe8f84..6eff214e2a 100644 --- a/flowmachine/tests/test_subscriber_location_cluster.py +++ b/flowmachine/tests/test_subscriber_location_cluster.py @@ -19,7 +19,7 @@ from flowmachine.core import Table, CustomQuery from flowmachine.core.query import Query from flowmachine.core.mixins import GeoDataMixin -from flowmachine.core.spatial_unit import versioned_site_spatial_unit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import ( CallDays, HartiganCluster, @@ -34,7 +34,7 @@ def test_hartigan_column_names(get_column_names_from_run): """Test that Hartigan has correct column_names property.""" cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) hartigan = HartiganCluster(calldays=cd, radius=50) @@ -46,7 +46,7 @@ def test_joined_hartigan_column_names(get_column_names_from_run): """Test that Hartigan has correct column_names property.""" cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) hartigan = HartiganCluster(calldays=cd, radius=50) @@ -66,7 +66,7 @@ def test_joined_hartigan_type_error(): """Test that joining hartigan to something which isn't query like raises a type error.""" cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) hartigan = HartiganCluster(calldays=cd, radius=50) @@ -107,7 +107,7 @@ def test_cluster_is_within_envelope(get_dataframe): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -125,7 +125,7 @@ def test_first_call_day_in_first_cluster(get_dataframe): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) cd_df = get_dataframe(cd) @@ -152,7 +152,7 @@ def test_bigger_radius_yields_fewer_clusters(get_dataframe): radius = [1, 2, 5, 10, 50] cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -172,7 +172,7 @@ def test_different_call_days_format(get_dataframe): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) har = get_dataframe(HartiganCluster(calldays=cd, radius=50)) @@ -198,7 +198,7 @@ def test_call_threshold_works(get_dataframe): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -217,7 +217,7 @@ def test_buffered_hartigan(): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", 
spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -237,7 +237,7 @@ def test_all_options_hartigan(): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -255,7 +255,7 @@ def test_join_returns_the_same_clusters(): """ cd = CallDays( subscriber_locations( - "2016-01-01", "2016-01-04", spatial_unit=versioned_site_spatial_unit() + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) diff --git a/flowmachine/tests/test_subscriber_locations.py b/flowmachine/tests/test_subscriber_locations.py index 3b759f9fa7..cdf742f49e 100644 --- a/flowmachine/tests/test_subscriber_locations.py +++ b/flowmachine/tests/test_subscriber_locations.py @@ -3,7 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. -from flowmachine.core.spatial_unit import PolygonSpatialUnit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features.utilities.subscriber_locations import subscriber_locations @@ -15,8 +15,10 @@ def test_can_get_pcods(get_dataframe): subscriber_pcod = subscriber_locations( "2016-01-01 13:30:30", "2016-01-02 16:25:00", - spatial_unit=PolygonSpatialUnit( - polygon_column_names="admin3pcod", polygon_table="geography.admin3" + spatial_unit=make_spatial_unit( + "polygon", + region_id_column_name="admin3pcod", + polygon_table="geography.admin3", ), ) df = get_dataframe(subscriber_pcod) diff --git a/flowmachine/tests/test_total_network_objects.py b/flowmachine/tests/test_total_network_objects.py index fc3b93f6a0..ae6fdedaa0 100644 --- a/flowmachine/tests/test_total_network_objects.py +++ b/flowmachine/tests/test_total_network_objects.py @@ -10,12 +10,7 @@ import pytest -from flowmachine.core.spatial_unit import ( - CellSpatialUnit, - versioned_cell_spatial_unit, - versioned_site_spatial_unit, - lat_lon_spatial_unit, -) +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import TotalNetworkObjects, AggregateNetworkObjects @@ -26,8 +21,8 @@ def test_tno_at_lat_lng(get_dataframe): tno = TotalNetworkObjects( start="2016-01-01", stop="2016-01-07", - network_object=versioned_cell_spatial_unit(), - spatial_unit=lat_lon_spatial_unit(), + network_object=make_spatial_unit("versioned-cell"), + spatial_unit=make_spatial_unit("lat-lon"), ) assert tno.get_dataframe().sum().value == 330 @@ -87,7 +82,7 @@ def test_count_returns_correct_values(get_dataframe): "bad_arg, bad_val", [ ("total_by", "BAD_TOTAL_BY"), - ("spatial_unit", CellSpatialUnit()), + ("spatial_unit", make_spatial_unit("cell")), ("network_object", "BAD_OBJECT"), ], ) @@ -117,7 +112,9 @@ def test_median_returns_correct_values(get_dataframe): """ instance = AggregateNetworkObjects( total_network_objects=TotalNetworkObjects( - table="calls", total_by="hour", network_object=versioned_site_spatial_unit() + table="calls", + total_by="hour", + network_object=make_spatial_unit("versioned-site"), ), aggregate_by="day", statistic="median", @@ -141,7 +138,7 @@ def test_mean_returns_correct_values(get_dataframe): start="2016-01-01", stop="2016-12-30", total_by="hour", - network_object=versioned_site_spatial_unit(), + network_object=make_spatial_unit("versioned-site"), ), aggregate_by="day", ) diff --git a/flowmachine/tests/test_unique_location_counts.py b/flowmachine/tests/test_unique_location_counts.py index 7afa597a07..2d5d3baf29 100644 --- 
a/flowmachine/tests/test_unique_location_counts.py +++ b/flowmachine/tests/test_unique_location_counts.py @@ -5,7 +5,7 @@ import pytest from flowmachine.core.errors import BadLevelError -from flowmachine.core.spatial_unit import CellSpatialUnit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import UniqueLocationCounts, subscriber_locations @@ -33,7 +33,10 @@ def test_correct_counts(get_dataframe): df = get_dataframe(ulc) dful = get_dataframe( subscriber_locations( - "2016-01-01", "2016-01-02", spatial_unit=CellSpatialUnit(), hours=(5, 17) + "2016-01-01", + "2016-01-02", + spatial_unit=make_spatial_unit("cell"), + hours=(5, 17), ) ) assert [ diff --git a/flowmachine/tests/test_unique_subscriber_counts.py b/flowmachine/tests/test_unique_subscriber_counts.py index 3ccfedaa1d..156e505084 100644 --- a/flowmachine/tests/test_unique_subscriber_counts.py +++ b/flowmachine/tests/test_unique_subscriber_counts.py @@ -8,7 +8,7 @@ import pytest -from flowmachine.core.spatial_unit import CellSpatialUnit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import UniqueSubscriberCounts from flowmachine.features.utilities import subscriber_locations @@ -29,12 +29,18 @@ def test_correct_counts(get_dataframe): UniqueLocationCounts returns correct counts. """ usc = UniqueSubscriberCounts( - "2016-01-01", "2016-01-02", spatial_unit=CellSpatialUnit(), hours=(5, 17) + "2016-01-01", + "2016-01-02", + spatial_unit=make_spatial_unit("cell"), + hours=(5, 17), ) df = get_dataframe(usc) dful = get_dataframe( subscriber_locations( - "2016-01-01", "2016-01-02", spatial_unit=CellSpatialUnit(), hours=(5, 17) + "2016-01-01", + "2016-01-02", + spatial_unit=make_spatial_unit("cell"), + hours=(5, 17), ) ) assert [ diff --git a/integration_tests/tests/flowmachine_tests/test_daily_location_results.py b/integration_tests/tests/flowmachine_tests/test_daily_location_results.py index 8f48658759..1412c7c952 100644 --- a/integration_tests/tests/flowmachine_tests/test_daily_location_results.py +++ b/integration_tests/tests/flowmachine_tests/test_daily_location_results.py @@ -6,7 +6,7 @@ from approvaltests.approvals import verify from flowmachine.core import CustomQuery -from flowmachine.core.spatial_unit import CellSpatialUnit +from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import daily_location @@ -20,7 +20,7 @@ def test_daily_location_1_sql(diff_reporter): ) dl = daily_location( "2016-01-05", - spatial_unit=CellSpatialUnit(), + spatial_unit=make_spatial_unit("cell"), hours=(23, 5), method="last", subscriber_subset=subset_query, @@ -41,7 +41,7 @@ def test_daily_location_1_df(get_dataframe, diff_reporter): ) dl = daily_location( "2016-01-05", - spatial_unit=CellSpatialUnit(), + spatial_unit=make_spatial_unit("cell"), hours=(23, 5), method="last", subscriber_subset=subset_query, From f761085ad4e9759e637b4f8d3b4657adcd96e8d8 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 6 Jun 2019 11:15:15 +0100 Subject: [PATCH 075/138] Add SpatialUnitMixin and InvalidSpatialUnitError --- .../flowmachine/core/errors/__init__.py | 9 +- .../core/errors/flowmachine_errors.py | 8 ++ flowmachine/flowmachine/core/spatial_unit.py | 113 +++++++++++++++--- flowmachine/tests/test_spatial_unit.py | 38 ++++++ 4 files changed, 150 insertions(+), 18 deletions(-) diff --git a/flowmachine/flowmachine/core/errors/__init__.py b/flowmachine/flowmachine/core/errors/__init__.py index 35db6f4e72..7dd502b620 100644 --- 
a/flowmachine/flowmachine/core/errors/__init__.py +++ b/flowmachine/flowmachine/core/errors/__init__.py @@ -10,7 +10,14 @@ NameTooLongError, NotConnectedError, BadLevelError, + InvalidSpatialUnitError, MissingDateError, ) -__all__ = ["NameTooLongError", "NotConnectedError", "BadLevelError", "MissingDateError"] +__all__ = [ + "NameTooLongError", + "NotConnectedError", + "BadLevelError", + "InvalidSpatialUnitError", + "MissingDateError", +] diff --git a/flowmachine/flowmachine/core/errors/flowmachine_errors.py b/flowmachine/flowmachine/core/errors/flowmachine_errors.py index d0a02a4d8a..38112718f5 100644 --- a/flowmachine/flowmachine/core/errors/flowmachine_errors.py +++ b/flowmachine/flowmachine/core/errors/flowmachine_errors.py @@ -106,6 +106,14 @@ def __init__(self, level, allowed_levels=None): Exception.__init__(self, msg) +class InvalidSpatialUnitError(ValueError): + """ + Raised when any class is given a spatial unit that is not valid. + """ + + pass + + class MissingDateError(Exception): """ Raised when instantiating a class that points to a date that does not exist diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 02e14d071e..8fe95e71a4 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -10,14 +10,107 @@ from typing import List from flowmachine.utils import get_name_and_alias +from flowmachine.core.errors import InvalidSpatialUnitError from . import Query, Table from .grid import Grid -# class SpatialUnitMixin: +class SpatialUnitMixin: + """ + Mixin for SpatialUnit classes, which provides a 'location_columns' property + and methods for verifying whether a spatial unit meets different criteria + (useful for checking whether a spatial unit is valid in a given query). + """ + @property + def location_columns(self) -> List[str]: + """ + Names of the columns that identify a location. + """ + return list(self._loc_cols) + + @property + def has_geometry(self): + """ + True if spatial unit has geometry information. + """ + return hasattr(self, "get_geom_query") -class CellSpatialUnit: + @property + def has_lat_lon_columns(self): + """ + True if spatial unit has lat/lon columns. + """ + return "lat" in self.location_columns and "lon" in self.location_columns + + @property + def is_network_object(self): + """ + True if spatial unit is a network object (cell or site). + """ + return ( + "location_id" in self.location_columns or "site_id" in self.location_columns + ) + + @property + def is_polygon(self): + """ + True if spatial unit's geographies are polygons. + """ + return isinstance(self, PolygonSpatialUnit) + + def verify_criterion(self, criterion, negate=False): + """ + Check whether this spatial unit meets a criterion, and raise an + InvalidSpatialUnitError if not. + + Parameters + ---------- + criterion : str + One of: + 'has_geometry' + 'has_lat_lon_columns' + 'is_network_object' + 'is_polygon' + negate : bool, default False + If True, negate the criterion check (i.e. raise an error if + criterion is met). 
+ + Raises + ------ + InvalidSpatialUnitError + if criterion is not met + ValueError + if criterion is not recognised + """ + criteria = { + "has_geometry": { + "property": self.has_geometry, + "message": f"{'has' if negate else 'does not have'} geometry information.", + }, + "has_lat_lon_columns": { + "property": self.has_lat_lon_columns, + "message": f"{'has' if negate else 'does not have'} latitude/longitude columns.", + }, + "is_network_object": { + "property": self.is_network_object, + "message": f"{'is' if negate else 'is not'} a network object.", + }, + "is_polygon": { + "property": self.is_polygon, + "message": f"{'is' if negate else 'is not'} a polygon spatial unit.", + }, + } + if criterion not in criteria.keys(): + raise ValueError(f"Unrecognised criterion '{criterion}'.") + if criteria[criterion]["property"] == negate: + raise InvalidSpatialUnitError( + f"Spatial unit {self} with location columns {self.location_columns} " + + criteria[criterion]["message"] + ) + + +class CellSpatialUnit(SpatialUnitMixin): """ This class represents the case where no join of cell ID to other data is required. As such, this class does not inherit from Query, is not a valid @@ -35,15 +128,8 @@ def __hash__(self): # this just in case. return hash(self.__class__.__name__) - @property - def location_columns(self) -> List[str]: - """ - List of the location-related column names. - """ - return list(self._loc_cols) - -class SpatialUnit(Query): +class SpatialUnit(SpatialUnitMixin, Query): """ Base class for all spatial units except CellSpatialUnit. Selects columns from the location table, and optionally joins to data in another table. @@ -117,13 +203,6 @@ def __hash__(self): # Must define this because we explicitly define self.__eq__ return hash(self.md5) - @property - def location_columns(self) -> List[str]: - """ - List of names of the columns which identify the locations. - """ - return list(self._loc_cols) - @property def column_names(self) -> List[str]: return [get_name_and_alias(c)[1].split(".").pop() for c in self._cols] diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 8580b81328..07ac5ff632 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -3,6 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. from flowmachine.core import CustomQuery +from flowmachine.core.errors import InvalidSpatialUnitError from flowmachine.core.spatial_unit import * import pytest @@ -216,5 +217,42 @@ def test_different_grid_spatial_units_are_not_equal(): ], ) def test_make_spatial_unit_raises_errors(make_spatial_unit_args): + """ + Test that make_spatial_unit raises a ValueError when bad arguments are passed. 
+ """ with pytest.raises(ValueError): su = make_spatial_unit(**make_spatial_unit_args) + + +@pytest.mark.parametrize( + "make_spatial_unit_args, criterion, negate", + [ + ({"spatial_unit_type": "cell"}, "has_geometry", False), + ({"spatial_unit_type": "versioned-cell"}, "has_geometry", True), + ({"spatial_unit_type": "admin", "level": 3}, "has_lat_lon_columns", False), + ({"spatial_unit_type": "lat-lon"}, "has_lat_lon_columns", True), + ({"spatial_unit_type": "admin", "level": 3}, "is_network_object", False), + ({"spatial_unit_type": "cell"}, "is_network_object", True), + ({"spatial_unit_type": "versioned-site"}, "is_network_object", True), + ({"spatial_unit_type": "lat-lon"}, "is_polygon", False), + ({"spatial_unit_type": "grid", "size": 10}, "is_polygon", True), + ], +) +def test_verify_criterion(make_spatial_unit_args, criterion, negate): + """ + Test that the verify_criterion method raises an InvalidSpatialUnitError + when the criterion is not met. + """ + su = make_spatial_unit(**make_spatial_unit_args) + with pytest.raises(InvalidSpatialUnitError): + su.verify_criterion(criterion, negate=negate) + + +def test_verify_criterion_raises_value_error(): + """ + Test that the verify_criterion method raises a ValueError if the criterion + is not recognised. + """ + su = CellSpatialUnit() + with pytest.raises(ValueError): + su.verify_criterion("BAD_CRITERION") From 04fe2f4563501eaebf0cdcb357dd6b113c915a0e Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 6 Jun 2019 12:56:59 +0100 Subject: [PATCH 076/138] Use spatial_unit.verify_criterion anywhere that needs it --- .../flowmachine/core/join_to_location.py | 15 +++----- .../flowmachine/core/mixins/geodata_mixin.py | 10 +---- flowmachine/flowmachine/core/spatial_unit.py | 12 +++--- .../flowmachine/features/location/flows.py | 15 +++----- .../location/unique_subscriber_counts.py | 13 +++++-- .../features/network/total_network_objects.py | 38 ++++++------------- .../features/spatial/distance_matrix.py | 16 +++----- .../features/subscriber/daily_location.py | 16 ++++---- .../features/subscriber/day_trajectories.py | 12 +++--- .../features/subscriber/displacement.py | 15 ++------ .../features/subscriber/last_location.py | 5 +-- .../features/subscriber/modal_location.py | 24 +++++------- .../subscriber/most_frequent_location.py | 5 +-- .../features/utilities/multilocation.py | 7 ---- .../utilities/subscriber_locations.py | 10 ++--- flowmachine/flowmachine/models/pwo.py | 11 +++--- flowmachine/tests/test_flows.py | 2 +- flowmachine/tests/test_geomixin.py | 2 +- flowmachine/tests/test_join_to_location.py | 5 ++- flowmachine/tests/test_spatial_unit.py | 6 +-- .../tests/test_total_network_objects.py | 32 +++++++++++----- 21 files changed, 115 insertions(+), 156 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index f0d3480919..8989958d12 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -51,11 +51,8 @@ class JoinToLocation(Query): """ def __init__(self, left, *, spatial_unit, time_col="time"): - if spatial_unit == CellSpatialUnit(): - # Nothing to join in this case - raise ValueError( - "CellSpatialUnit is not a valid spatial unit type for JoinToLocation" - ) + # No need to join if spatial_unit has no geography information (i.e. 
just cell ID) + spatial_unit.verify_criterion("has_geography") self.spatial_unit = spatial_unit self.left = left self.time_col = time_col @@ -113,7 +110,7 @@ def _make_query(self): def location_joined_query(left, *, spatial_unit, time_col="time"): """ Helper function which returns JoinToLocation(left_query, spatial_unit, time_col) - unless spatial_unit == CellSpatialUnit(), in which case this returns left_query. + if spatial_unit has geography information, otherwise returns left_query. Parameters ---------- @@ -133,7 +130,7 @@ def location_joined_query(left, *, spatial_unit, time_col="time"): flowmachine.Query Either a JoinToLocation object, or the input parameter 'left' """ - if spatial_unit == CellSpatialUnit(): - return left - else: + if spatial_unit.has_geography: return JoinToLocation(left, spatial_unit=spatial_unit, time_col=time_col) + else: + return left diff --git a/flowmachine/flowmachine/core/mixins/geodata_mixin.py b/flowmachine/flowmachine/core/mixins/geodata_mixin.py index 974d32b54e..ff1e1f86ad 100644 --- a/flowmachine/flowmachine/core/mixins/geodata_mixin.py +++ b/flowmachine/flowmachine/core/mixins/geodata_mixin.py @@ -48,20 +48,14 @@ def _geo_augmented_query(self): """ join_columns_string = ",".join(self.spatial_unit.location_columns) - try: - geom_query = self.spatial_unit.get_geom_query() - except AttributeError: - raise ValueError( - f"Query {self} with spatial_unit {self.spatial_unit} has no " - "geography information." - ) + self.spatial_unit.verify_criterion("has_geography") sql = f""" SELECT row_number() over() AS gid, * FROM ({self.get_query()}) AS Q - LEFT JOIN ({geom_query}) AS G + LEFT JOIN ({self.spatial_unit.get_geom_query()}) AS G USING ({join_columns_string}) """ cols = list(set(self.column_names + ["gid", "geom"])) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 8fe95e71a4..5dccf0de4d 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -30,9 +30,9 @@ def location_columns(self) -> List[str]: return list(self._loc_cols) @property - def has_geometry(self): + def has_geography(self): """ - True if spatial unit has geometry information. + True if spatial unit has geography information. 
""" return hasattr(self, "get_geom_query") @@ -68,7 +68,7 @@ def verify_criterion(self, criterion, negate=False): ---------- criterion : str One of: - 'has_geometry' + 'has_geography' 'has_lat_lon_columns' 'is_network_object' 'is_polygon' @@ -84,9 +84,9 @@ def verify_criterion(self, criterion, negate=False): if criterion is not recognised """ criteria = { - "has_geometry": { - "property": self.has_geometry, - "message": f"{'has' if negate else 'does not have'} geometry information.", + "has_geography": { + "property": self.has_geography, + "message": f"{'has' if negate else 'does not have'} geography information.", }, "has_lat_lon_columns": { "property": self.has_lat_lon_columns, diff --git a/flowmachine/flowmachine/features/location/flows.py b/flowmachine/flowmachine/features/location/flows.py index d49a4c9569..68e5ed9903 100644 --- a/flowmachine/flowmachine/features/location/flows.py +++ b/flowmachine/flowmachine/features/location/flows.py @@ -17,6 +17,7 @@ from ...core.query import Query from ...core.mixins import GeoDataMixin, GraphMixin +from ...core.errors import InvalidSpatialUnitError import structlog @@ -43,7 +44,7 @@ def __init__(self, loc1, loc2): """ if loc1.spatial_unit != loc2.spatial_unit: - raise ValueError( + raise InvalidSpatialUnitError( "You cannot compute flows for locations on different spatial units" ) @@ -116,6 +117,8 @@ def _geo_augmented_query(self): str A version of this query with geom and gid columns """ + self.spatial_unit.verify_criterion("has_geography") + loc_cols = self.spatial_unit.location_columns loc_cols_string = ",".join(loc_cols) loc_cols_from_string = ",".join([f"{col}_from" for col in loc_cols]) @@ -152,20 +155,12 @@ def _geo_augmented_query(self): USING ({loc_cols_string}) """ - try: - geom_query = self.spatial_unit.get_geom_query() - except AttributeError: - raise ValueError( - f"Query {self} with spatial_unit {self.spatial_unit} has no " - "geography information." - ) - joined_query = f""" SELECT row_number() over() AS gid, * FROM ({agg_qry}) AS Q - LEFT JOIN ({geom_query}) AS G + LEFT JOIN ({self.spatial_unit.get_geom_query()}) AS G USING ({loc_cols_string}) """ diff --git a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py index 9677d506cc..f740d0e6d4 100644 --- a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py @@ -17,7 +17,7 @@ """ from ...core.query import Query from ...core.mixins import GeoDataMixin -from ...core.spatial_unit import CellSpatialUnit +from ...core.spatial_unit import make_spatial_unit from ..utilities.subscriber_locations import subscriber_locations @@ -35,9 +35,9 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default CellSpatialUnit() + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. + docstring of make_spatial_unit for more information. hours : tuple of ints, default 'all' subset the result within certain hours, e.g. 
(4,17) This will subset the query only with these hours, but @@ -71,7 +71,12 @@ class UniqueSubscriberCounts(GeoDataMixin, Query): """ def __init__( - self, start, stop, spatial_unit=CellSpatialUnit(), hours="all", table="all" + self, + start, + stop, + spatial_unit=make_spatial_unit("cell"), + hours="all", + table="all", ): """ diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index 7611b8c344..f8826b2157 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -16,7 +16,7 @@ from ...core.mixins import GeoDataMixin from ...core import location_joined_query from ...core.query import Query -from ...core.spatial_unit import CellSpatialUnit, admin_spatial_unit +from ...core.spatial_unit import make_spatial_unit from ..utilities import EventsTablesUnion valid_stats = {"avg", "max", "min", "median", "mode", "stddev", "variance"} @@ -37,15 +37,15 @@ class TotalNetworkObjects(GeoDataMixin, Query): total_by : {'second', 'minute', 'hour', 'day', 'month', 'year'} A period definition to group data by. table : str - Either 'calls', 'sms', or other table under `events.*`. If - no specific table is provided this will collect - statistics from all tables. - network_object : {Cell,VersionedCell,VersionedSite}SpatialUnit, default CellSpatialUnit() + Either 'calls', 'sms', or other table under `events.*`. If no specific + table is provided this will collect statistics from all tables. + network_object : flowmachine.core.spatial_unit.*SpatialUnit, default cell Objects to track, defaults to CellSpatialUnit(), the unversioned lowest level of infrastructure available. - spatial_unit : flowmachine.core.spatial_unit.SpatialUnit, - default admin_spatial_unit(level=0) + Must have network_object.is_network_object == True. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin0 Spatial unit to facet on. + Must have spatial_unit.is_network_object == False. 
Other Parameters ---------------- @@ -69,7 +69,7 @@ def __init__( *, table="all", total_by="day", - network_object=CellSpatialUnit(), + network_object=make_spatial_unit("cell"), spatial_unit=None, hours="all", subscriber_subset=None, @@ -90,29 +90,15 @@ def __init__( if self.table != "all" and not self.table.startswith("events"): self.table = "events.{}".format(self.table) - def is_allowed_network_object(spatial_unit): - return ( - "location_id" in spatial_unit.location_columns - or "site_id" in spatial_unit.location_columns - ) - + network_object.verify_criterion("is_network_object") self.network_object = network_object - if not is_allowed_network_object(self.network_object): - raise ValueError( - "{} is not a valid network object.".format(self.network_object) - ) if spatial_unit is None: - self.spatial_unit = admin_spatial_unit(level=0) + self.spatial_unit = make_spatial_unit("admin", level=0) else: self.spatial_unit = spatial_unit - if is_allowed_network_object(self.spatial_unit): - # No sense in aggregating network object to network object - raise ValueError( - "{} is not a valid spatial unit for TotalNetworkObjects".format( - self.spatial_unit - ) - ) + # No sense in aggregating network object to network object + self.spatial_unit.verify_criterion("is_network_object", negate=True) events = location_joined_query( EventsTablesUnion( diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 948ed7ed17..78a402904f 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -11,7 +11,7 @@ from ...core.query import Query from ...core.mixins import GraphMixin -from ...core.spatial_unit import versioned_cell_spatial_unit +from ...core.spatial_unit import make_spatial_unit class DistanceMatrix(GraphMixin, Query): @@ -25,10 +25,10 @@ class DistanceMatrix(GraphMixin, Query): Parameters ---------- - spatial_unit : flowmachine.core.spatial_unit.SpatialUnit, default versioned_cell_spatial_unit() + spatial_unit : flowmachine.core.spatial_unit.SpatialUnit, default versioned-cell Locations to compute distances for. - Note: only point locations (i.e. spatial units with "lat" and "lon" - included in location_columns) are supported at this time. + Note: only point locations (i.e. spatial_unit.has_lat_lon_columns) are + supported at this time. return_geometry : bool If True, geometries are returned in query (represented as WKB in a dataframe). 
This @@ -39,15 +39,11 @@ class DistanceMatrix(GraphMixin, Query): def __init__(self, spatial_unit=None, return_geometry=False): if spatial_unit is None: - self.spatial_unit = versioned_cell_spatial_unit() + self.spatial_unit = make_spatial_unit("versioned-cell") else: self.spatial_unit = spatial_unit - if not ( - "lat" in self.spatial_unit.location_columns - and "lon" in self.spatial_unit.location_columns - ): - raise ValueError("Only point locations are supported at this time.") + self.spatial_unit.verify_criterion("has_lat_lon_columns") self.return_geometry = return_geometry diff --git a/flowmachine/flowmachine/features/subscriber/daily_location.py b/flowmachine/flowmachine/features/subscriber/daily_location.py index 8377df339b..a7e0160743 100644 --- a/flowmachine/flowmachine/features/subscriber/daily_location.py +++ b/flowmachine/flowmachine/features/subscriber/daily_location.py @@ -13,7 +13,7 @@ """ import datetime -from ...core.spatial_unit import admin_spatial_unit +from ...core.spatial_unit import make_spatial_unit from .last_location import LastLocation from .most_frequent_location import MostFrequentLocation @@ -42,10 +42,9 @@ def locate_subscribers( start, stop : str iso format date range for the the time frame, e.g. 2016-01-01 or 2016-01-01 14:03:01 - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default admin_spatial_unit(level=3) + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. + docstring of make_spatial_unit for more information. hours : tuple of ints, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -95,7 +94,7 @@ def locate_subscribers( . """ if spatial_unit is None: - spatial_unit = admin_spatial_unit(level=3) + spatial_unit = make_spatial_unit("admin", level=3) if method == "last": return LastLocation( @@ -150,10 +149,9 @@ def daily_location( stop : str optionally specify a stop datetime in iso format date for the day in question, e.g. 2016-01-02 06:00:00 - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default admin_spatial_unit(level=3) + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. + docstring of make_spatial_unit for more information. hours : tuple of ints, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -186,7 +184,7 @@ def daily_location( """ if spatial_unit is None: - spatial_unit = admin_spatial_unit(level=3) + spatial_unit = make_spatial_unit("admin", level=3) # Temporary band-aid; marshmallow deserialises date strings # to date objects, so we convert it back here because the diff --git a/flowmachine/flowmachine/features/subscriber/day_trajectories.py b/flowmachine/flowmachine/features/subscriber/day_trajectories.py index 560f552f1a..53d1255249 100644 --- a/flowmachine/flowmachine/features/subscriber/day_trajectories.py +++ b/flowmachine/flowmachine/features/subscriber/day_trajectories.py @@ -49,7 +49,7 @@ def _make_query(self): Default query method implemented in the metaclass Query(). """ - relevant_columns = self._get_relevant_columns() + location_columns_string = ", ".join(self.spatial_unit.location_columns) # This query represents the concatenated locations of the # subscribers. 
Similar to the first step when calculating @@ -58,15 +58,13 @@ def _make_query(self): lambda x, y: x.union(y), (self._append_date(dl) for dl in self._all_dls) ) - sql = """ + sql = f""" SELECT all_locs.subscriber, - {rc}, + {location_columns_string}, all_locs.date - FROM ({all_locs}) AS all_locs + FROM ({all_locs.get_query()}) AS all_locs ORDER BY all_locs.subscriber, all_locs.date - """.format( - all_locs=all_locs.get_query(), rc=relevant_columns - ) + """ return sql diff --git a/flowmachine/flowmachine/features/subscriber/displacement.py b/flowmachine/flowmachine/features/subscriber/displacement.py index bf0a362448..a4fe095329 100644 --- a/flowmachine/flowmachine/features/subscriber/displacement.py +++ b/flowmachine/flowmachine/features/subscriber/displacement.py @@ -115,23 +115,14 @@ def __init__( self.start = start - def is_allowed_spatial_unit(spatial_unit): - return ( - "lat" in spatial_unit.location_columns - and "lon" in spatial_unit.location_columns - ) - if modal_locations: - if isinstance(modal_locations, ModalLocation) and is_allowed_spatial_unit( - modal_locations.spatial_unit - ): + if isinstance(modal_locations, ModalLocation): hl = modal_locations else: raise ValueError( - "Argument 'modal_locations' should be an instance of " - "ModalLocation class with 'lat' and 'lon' in " - "spatial_unit.location_columns" + "Argument 'modal_locations' should be an instance of ModalLocation class" ) + hl.spatial_unit.verify_criterion("has_lat_lon_columns") else: hl = ModalLocation( *[ diff --git a/flowmachine/flowmachine/features/subscriber/last_location.py b/flowmachine/flowmachine/features/subscriber/last_location.py index 555b01bb52..faa07bab7f 100644 --- a/flowmachine/flowmachine/features/subscriber/last_location.py +++ b/flowmachine/flowmachine/features/subscriber/last_location.py @@ -30,10 +30,9 @@ class LastLocation(BaseLocation, Query): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default admin3 + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. + docstring ofmake_spatial_unit for more information. hours : tuple of ints, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but diff --git a/flowmachine/flowmachine/features/subscriber/modal_location.py b/flowmachine/flowmachine/features/subscriber/modal_location.py index 73a850d74c..3cbf24823c 100644 --- a/flowmachine/flowmachine/features/subscriber/modal_location.py +++ b/flowmachine/flowmachine/features/subscriber/modal_location.py @@ -36,7 +36,7 @@ def _make_query(self): Default query method implemented in the metaclass Query(). 
""" - relevant_columns = self._get_relevant_columns() + location_columns_string = ", ".join(self.spatial_unit.location_columns) # This query represents the concatenated locations of the # subscribers @@ -44,24 +44,20 @@ def _make_query(self): lambda x, y: x.union(y), (self._append_date(dl) for dl in self._all_dls) ) - times_visited = """ - SELECT all_locs.subscriber, {rc}, count(*) AS total, max(all_locs.date) as date - FROM ({all_locs}) AS all_locs - GROUP BY all_locs.subscriber, {rc} - """.format( - all_locs=all_locs.get_query(), rc=relevant_columns - ) + times_visited = f""" + SELECT all_locs.subscriber, {location_columns_string}, count(*) AS total, max(all_locs.date) as date + FROM ({all_locs.get_query()}) AS all_locs + GROUP BY all_locs.subscriber, {location_columns_string} + """ - sql = """ - SELECT ranked.subscriber, {rc} + sql = f""" + SELECT ranked.subscriber, {location_columns_string} FROM - (SELECT times_visited.subscriber, {rc}, + (SELECT times_visited.subscriber, {location_columns_string}, row_number() OVER (PARTITION BY times_visited.subscriber ORDER BY total DESC, times_visited.date DESC) AS rank FROM ({times_visited}) AS times_visited) AS ranked WHERE rank = 1 - """.format( - times_visited=times_visited, rc=relevant_columns - ) + """ return sql diff --git a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py index 8b90a2da48..daeb6a247b 100644 --- a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py +++ b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py @@ -28,10 +28,9 @@ class MostFrequentLocation(BaseLocation, Query): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default admin3 + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. + docstring of make_spatial_unit for more information. hours : tuple of int, default 'all' Subset the result within certain hours, e.g. 
(4,17) This will subset the query only with these hours, but diff --git a/flowmachine/flowmachine/features/utilities/multilocation.py b/flowmachine/flowmachine/features/utilities/multilocation.py index d41b915167..5adbd6cabe 100644 --- a/flowmachine/flowmachine/features/utilities/multilocation.py +++ b/flowmachine/flowmachine/features/utilities/multilocation.py @@ -81,10 +81,3 @@ def _append_date(self, dl): date_string = f"to_date('{dl.start}','YYYY-MM-DD') AS date" sql = f"SELECT *, {date_string} FROM ({dl.get_query()}) AS dl" return CustomQuery(sql, self.spatial_unit.location_columns + ["date"]) - - def _get_relevant_columns(self): - """ - Get a string of the location related columns - """ - - return ", ".join(self.spatial_unit.location_columns) diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index e0885181b3..608eab6293 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -18,7 +18,7 @@ from ...core.query import Query from ...core.join_to_location import location_joined_query -from ...core.spatial_unit import CellSpatialUnit +from ...core.spatial_unit import make_spatial_unit import structlog @@ -46,7 +46,7 @@ def __init__( self.table = table self.subscriber_identifier = subscriber_identifier self.ignore_nulls = ignore_nulls - self.spatial_unit = CellSpatialUnit() + self.spatial_unit = make_spatial_unit("cell") self.tables = table cols = [self.subscriber_identifier, "datetime", "location_id"] @@ -122,7 +122,7 @@ def subscriber_locations( start, stop, *, - spatial_unit=CellSpatialUnit(), + spatial_unit=make_spatial_unit("cell"), hours="all", table="all", subscriber_identifier="msisdn", @@ -140,9 +140,9 @@ def subscriber_locations( e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default CellSpatialUnit() + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell Spatial unit to which subscriber locations will be mapped. See the - docstring of spatial_unit.py for more information. + docstring of make_spatial_unit for more information. hours : tuple of ints, default 'all' subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but diff --git a/flowmachine/flowmachine/models/pwo.py b/flowmachine/flowmachine/models/pwo.py index 2280dd638a..4441a1e8eb 100644 --- a/flowmachine/flowmachine/models/pwo.py +++ b/flowmachine/flowmachine/models/pwo.py @@ -35,7 +35,7 @@ from ..features import ModalLocation from ..core.query import Query from ..core.model import Model, model_result -from ..core.spatial_unit import versioned_site_spatial_unit +from ..core.spatial_unit import make_spatial_unit from ..features.spatial.distance_matrix import DistanceMatrix import structlog @@ -192,10 +192,9 @@ class PopulationWeightedOpportunities(Model): default method used. Refer to the Population() documentation for other available methods. - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, - default versioned_site_spatial_unit() - Note: DistanceMatrix only supports spatial units - with 'lat' and 'lon' columns at this time. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default versioned-site + Note: DistanceMatrix only supports spatial units with 'lat' and 'lon' + columns at this time. 
**kwargs : arguments Used to pass custom arguments to the ModalLocation() objects. @@ -253,7 +252,7 @@ def __init__( self.stop = stop self.method = method if spatial_unit is None: - self.spatial_unit = versioned_site_spatial_unit() + self.spatial_unit = make_spatial_unit("versioned-site") else: self.spatial_unit = spatial_unit self.distance_matrix = DistanceMatrix( diff --git a/flowmachine/tests/test_flows.py b/flowmachine/tests/test_flows.py index 92d583359c..98f0aecc77 100644 --- a/flowmachine/tests/test_flows.py +++ b/flowmachine/tests/test_flows.py @@ -103,7 +103,7 @@ def test_valid_flows_geojson(exemplar_spatial_unit_param): Check that valid geojson is returned for Flows. """ - if make_spatial_unit("cell") == exemplar_spatial_unit_param: + if not exemplar_spatial_unit_param.has_geography: pytest.skip("Query with spatial_unit=CellSpatialUnit() has no geometry.") dl = daily_location("2016-01-01", spatial_unit=exemplar_spatial_unit_param) dl2 = daily_location("2016-01-02", spatial_unit=exemplar_spatial_unit_param) diff --git a/flowmachine/tests/test_geomixin.py b/flowmachine/tests/test_geomixin.py index ae6e0aeb59..6795c947ee 100644 --- a/flowmachine/tests/test_geomixin.py +++ b/flowmachine/tests/test_geomixin.py @@ -74,7 +74,7 @@ def test_valid_geojson(exemplar_spatial_unit_param): Check that valid geojson is returned. """ - if make_spatial_unit("cell") == exemplar_spatial_unit_param: + if not exemplar_spatial_unit_param.has_geography: pytest.skip("Query with spatial_unit=CellSpatialUnit() has no geometry.") dl = daily_location( "2016-01-01", "2016-01-02", spatial_unit=exemplar_spatial_unit_param diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index b6e7203c4e..aba83465de 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -15,7 +15,7 @@ def test_join_to_location_column_names(exemplar_spatial_unit_param): """ Test that JoinToLocation's column_names property is accurate.""" - if make_spatial_unit("cell") == exemplar_spatial_unit_param: + if not exemplar_spatial_unit_param.has_geography: pytest.skip("JoinToLocation does not accept CellSpatialUnit objects") table = subscriber_locations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") @@ -26,7 +26,8 @@ def test_join_to_location_column_names(exemplar_spatial_unit_param): def test_join_to_location_raises_value_error(): """ - Test that JoinToLocation raises a ValueError if spatial_unit==CellSpatialUnit(). + Test that JoinToLocation raises a ValueError if spatial_unit does not have + geography information. """ with pytest.raises(ValueError): table = subscriber_locations( diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 07ac5ff632..efd669b4c0 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -26,7 +26,7 @@ def test_get_geom_query_column_names( """ Test that the get_geom_query method returns a query with the correct columns. 
""" - if CellSpatialUnit() == exemplar_spatial_unit_param: + if not exemplar_spatial_unit_param.has_geography: pytest.skip("CellSpatialUnit does not have a get_geom_query method") geom_query = exemplar_spatial_unit_param.get_geom_query() cols = exemplar_spatial_unit_param.location_columns + ["geom"] @@ -227,8 +227,8 @@ def test_make_spatial_unit_raises_errors(make_spatial_unit_args): @pytest.mark.parametrize( "make_spatial_unit_args, criterion, negate", [ - ({"spatial_unit_type": "cell"}, "has_geometry", False), - ({"spatial_unit_type": "versioned-cell"}, "has_geometry", True), + ({"spatial_unit_type": "cell"}, "has_geography", False), + ({"spatial_unit_type": "versioned-cell"}, "has_geography", True), ({"spatial_unit_type": "admin", "level": 3}, "has_lat_lon_columns", False), ({"spatial_unit_type": "lat-lon"}, "has_lat_lon_columns", True), ({"spatial_unit_type": "admin", "level": 3}, "is_network_object", False), diff --git a/flowmachine/tests/test_total_network_objects.py b/flowmachine/tests/test_total_network_objects.py index ae6fdedaa0..5e15c196c4 100644 --- a/flowmachine/tests/test_total_network_objects.py +++ b/flowmachine/tests/test_total_network_objects.py @@ -11,6 +11,7 @@ import pytest from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core.errors import InvalidSpatialUnitError from flowmachine.features import TotalNetworkObjects, AggregateNetworkObjects @@ -78,19 +79,30 @@ def test_count_returns_correct_values(get_dataframe): assert df.value[34] == 31 +def test_bad_total_by(): + """Test value errors are raised for bad 'total_by' param""" + with pytest.raises(ValueError): + TotalNetworkObjects( + start="2016-01-01", + stop="2016-12-30", + table="calls", + total_by="BAD_TOTAL_BY", + ) + + @pytest.mark.parametrize( - "bad_arg, bad_val", - [ - ("total_by", "BAD_TOTAL_BY"), - ("spatial_unit", make_spatial_unit("cell")), - ("network_object", "BAD_OBJECT"), - ], + "bad_arg, spatial_unit_type", + [("spatial_unit", "cell"), ("network_object", "lat-lon")], ) -def test_bad_params(bad_arg, bad_val): - """Test value errors are raised for bad params""" - with pytest.raises(ValueError): +def test_bad_spatial_units(bad_arg, spatial_unit_type): + """ + Test InvalidSpatialUnitErrors are raised for bad 'network_object' or + 'spatial_unit' params. 
+ """ + su = make_spatial_unit(spatial_unit_type) + with pytest.raises(InvalidSpatialUnitError): TotalNetworkObjects( - start="2016-01-01", stop="2016-12-30", table="calls", **{bad_arg: bad_val} + start="2016-01-01", stop="2016-12-30", table="calls", **{bad_arg: su} ) From 2e959e5d61a27a427f801fac0d9f741620a8e397 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 6 Jun 2019 18:34:47 +0100 Subject: [PATCH 077/138] Update get_name_and_alias --- flowmachine/flowmachine/utils.py | 6 +++++- flowmachine/tests/test_utils.py | 30 +++++++++++++++++++----------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/flowmachine/flowmachine/utils.py b/flowmachine/flowmachine/utils.py index 111f162eec..73d1d73dc5 100644 --- a/flowmachine/flowmachine/utils.py +++ b/flowmachine/flowmachine/utils.py @@ -317,10 +317,14 @@ def get_name_and_alias(column_name): ('col', 'alias') >>> get_name_and_alias("col") ('col', 'col') + >>> get_name_and_alias("table.col") + ('table.col', 'col') + >>> get_name_and_alias("table.col as alias") + ('table.col', 'alias') """ column_name_split = re.split(" as ", column_name, flags=re.IGNORECASE) if len(column_name_split) == 1: - return column_name_split[0].strip(), column_name_split[0].strip() + return column_name_split[0].strip(), column_name_split[0].strip().split(".")[-1] else: return column_name_split[0].strip(), column_name_split[-1].strip() diff --git a/flowmachine/tests/test_utils.py b/flowmachine/tests/test_utils.py index a9224039e4..dd1c4afc37 100644 --- a/flowmachine/tests/test_utils.py +++ b/flowmachine/tests/test_utils.py @@ -169,11 +169,20 @@ def test_get_secrets_default(monkeypatch): @pytest.mark.parametrize( - "column_name, alias", - [("column", "column"), ("column AS alias", "alias"), ("column as alias", "alias")], + "column_name, name, alias", + [ + ("column", "column", "column"), + ("column AS alias", "column", "alias"), + ("column as alias", "column", "alias"), + ("table.column", "table.column", "column"), + ("table.column AS alias", "table.column", "alias"), + ], ) -def test_get_name_and_alias(column_name, alias): - assert ("column", alias) == get_name_and_alias(column_name) +def test_get_name_and_alias(column_name, name, alias): + """ + Test that get_name_and_alias correctly splits a column name into name and alias. + """ + assert (name, alias) == get_name_and_alias(column_name) def test_convert_dict_keys_to_strings(): @@ -224,10 +233,7 @@ def test_print_dependency_tree(): ) ) q = daily_location( - date="2016-01-02", - level="admin2", - method="most-common", - subscriber_subset=subscriber_subsetter, + date="2016-01-02", method="most-common", subscriber_subset=subscriber_subsetter ) expected_output = textwrap.dedent( @@ -244,8 +250,10 @@ def test_print_dependency_tree(): - - - - - - - + - + - + - + - """ ) @@ -276,7 +284,7 @@ def test_plot_dependency_graph(): """ Test that plot_dependency_graph() runs and returns the expected IPython.display objects. 
""" - query = daily_location(date="2016-01-02", level="admin2", method="most-common") + query = daily_location(date="2016-01-02") output_svg = plot_dependency_graph(query, format="svg") output_png = plot_dependency_graph(query, format="png", width=600, height=200) From 720d2170d57efc6fbfe115c903483f9fdc71ad98 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 6 Jun 2019 18:37:30 +0100 Subject: [PATCH 078/138] Rename 'location_columns' to 'location_id_columns' --- .../flowmachine/core/join_to_location.py | 4 ++-- .../flowmachine/core/mixins/geodata_mixin.py | 2 +- flowmachine/flowmachine/core/query.py | 2 +- .../flowmachine/features/location/flows.py | 14 +++++++------- .../location/unique_subscriber_counts.py | 4 ++-- .../features/network/total_network_objects.py | 10 +++++----- .../features/spatial/distance_matrix.py | 8 ++++---- .../features/subscriber/day_trajectories.py | 4 ++-- .../features/subscriber/last_location.py | 4 ++-- .../features/subscriber/modal_location.py | 4 ++-- .../subscriber/most_frequent_location.py | 4 ++-- .../features/utilities/multilocation.py | 2 +- flowmachine/flowmachine/models/pwo.py | 18 ++++++++++-------- flowmachine/tests/test_spatial_unit.py | 12 ++++++------ 14 files changed, 47 insertions(+), 45 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index 8989958d12..f33d7f6784 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -70,7 +70,7 @@ def __getattr__(self, name): @property def column_names(self) -> List[str]: - right_columns = self.spatial_unit.location_columns + right_columns = self.spatial_unit.location_id_columns left_columns = self.left.column_names for column in right_columns: if column in left_columns: @@ -78,7 +78,7 @@ def column_names(self) -> List[str]: return left_columns + right_columns def _make_query(self): - right_columns = self.spatial_unit.location_columns + right_columns = self.spatial_unit.location_id_columns left_columns = self.left.column_names for column in right_columns: if column in left_columns: diff --git a/flowmachine/flowmachine/core/mixins/geodata_mixin.py b/flowmachine/flowmachine/core/mixins/geodata_mixin.py index ff1e1f86ad..a958f2c037 100644 --- a/flowmachine/flowmachine/core/mixins/geodata_mixin.py +++ b/flowmachine/flowmachine/core/mixins/geodata_mixin.py @@ -46,7 +46,7 @@ def _geo_augmented_query(self): list The columns this query contains """ - join_columns_string = ",".join(self.spatial_unit.location_columns) + join_columns_string = ",".join(self.spatial_unit.location_id_columns) self.spatial_unit.verify_criterion("has_geography") diff --git a/flowmachine/flowmachine/core/query.py b/flowmachine/flowmachine/core/query.py index 0822863885..a2463fa409 100644 --- a/flowmachine/flowmachine/core/query.py +++ b/flowmachine/flowmachine/core/query.py @@ -937,7 +937,7 @@ def index_cols(self): cols = self.column_names ixen = [] try: - loc_cols = self.spatial_unit.location_columns + loc_cols = self.spatial_unit.location_id_columns if set(loc_cols).issubset(cols): ixen.append(loc_cols) except AttributeError: diff --git a/flowmachine/flowmachine/features/location/flows.py b/flowmachine/flowmachine/features/location/flows.py index 68e5ed9903..bd075aab67 100644 --- a/flowmachine/flowmachine/features/location/flows.py +++ b/flowmachine/flowmachine/features/location/flows.py @@ -81,12 +81,12 @@ def inflow(self): @property def index_cols(self): - cols = 
self.spatial_unit.location_columns + cols = self.spatial_unit.location_id_columns return [["{}_from".format(x) for x in cols], ["{}_to".format(x) for x in cols]] @property def column_names(self) -> List[str]: - cols = self.spatial_unit.location_columns + cols = self.spatial_unit.location_id_columns return ( [f"{col}_from" for col in cols] + [f"{col}_to" for col in cols] + ["count"] ) @@ -119,7 +119,7 @@ def _geo_augmented_query(self): """ self.spatial_unit.verify_criterion("has_geography") - loc_cols = self.spatial_unit.location_columns + loc_cols = self.spatial_unit.location_id_columns loc_cols_string = ",".join(loc_cols) loc_cols_from_string = ",".join([f"{col}_from" for col in loc_cols]) loc_cols_to_string = ",".join([f"{col}_to" for col in loc_cols]) @@ -212,12 +212,12 @@ def _make_query(self): @property def index_cols(self): - cols = self.spatial_unit.location_columns + cols = self.spatial_unit.location_id_columns return [[f"{x}_from" for x in cols]] @property def column_names(self) -> List[str]: - cols = self.spatial_unit.location_columns + cols = self.spatial_unit.location_id_columns return [f"{col}_from" for col in cols] + ["total"] @@ -234,10 +234,10 @@ def _make_query(self): @property def index_cols(self): - cols = self.spatial_unit.location_columns + cols = self.spatial_unit.location_id_columns return [[f"{x}_to" for x in cols]] @property def column_names(self) -> List[str]: - cols = self.spatial_unit.location_columns + cols = self.spatial_unit.location_id_columns return [f"{col}_to" for col in cols] + ["total"] diff --git a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py index f740d0e6d4..a89d08eb30 100644 --- a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py @@ -99,7 +99,7 @@ def __init__( @property def column_names(self) -> List[str]: - return self.spatial_unit.location_columns + ["unique_subscriber_counts"] + return self.spatial_unit.location_id_columns + ["unique_subscriber_counts"] def _make_query(self): """ @@ -107,7 +107,7 @@ def _make_query(self): metaclass Query(). 
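The renamed location_id_columns property is what drives the paired origin/destination column names in the Flows changes above. A standalone sketch of that naming rule (the helper name is mine; the versioned-site ID columns used for illustration are the ones defined later in this patch series):

def flows_column_names_sketch(location_id_columns):
    # Origin columns get a "_from" suffix, destination columns a "_to"
    # suffix, and the flow size comes back as "count".
    return (
        [f"{col}_from" for col in location_id_columns]
        + [f"{col}_to" for col in location_id_columns]
        + ["count"]
    )

print(flows_column_names_sketch(["site_id", "version", "lon", "lat"]))
# ['site_id_from', 'version_from', 'lon_from', 'lat_from',
#  'site_id_to', 'version_to', 'lon_to', 'lat_to', 'count']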
""" - relevant_columns = ",".join(self.spatial_unit.location_columns) + relevant_columns = ",".join(self.spatial_unit.location_id_columns) sql = """ SELECT {rc}, COUNT(unique_subscribers) AS unique_subscriber_counts FROM (SELECT diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index f8826b2157..b8957f1320 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -125,11 +125,11 @@ def __init__( @property def column_names(self) -> List[str]: - return self.spatial_unit.location_columns + ["value", "datetime"] + return self.spatial_unit.location_id_columns + ["value", "datetime"] def _make_query(self): - cols = self.network_object.location_columns - group_cols = self.spatial_unit.location_columns + cols = self.network_object.location_id_columns + group_cols = self.spatial_unit.location_id_columns for column in group_cols: if column in cols: cols.remove(column) @@ -214,10 +214,10 @@ def __init__(self, *, total_network_objects, statistic="avg", aggregate_by=None) @property def column_names(self) -> List[str]: - return self.spatial_unit.location_columns + ["value", "datetime"] + return self.spatial_unit.location_id_columns + ["value", "datetime"] def _make_query(self): - group_cols = ",".join(self.spatial_unit.location_columns) + group_cols = ",".join(self.spatial_unit.location_id_columns) if self.statistic == "mode": av_call = f"pg_catalog.mode() WITHIN GROUP(ORDER BY z.value)" else: diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 78a402904f..efe8b5f9bd 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -51,8 +51,8 @@ def __init__(self, spatial_unit=None, return_geometry=False): @property def column_names(self) -> List[str]: - col_names = [f"{c}_from" for c in self.spatial_unit.location_columns] - col_names += [f"{c}_to" for c in self.spatial_unit.location_columns] + col_names = [f"{c}_from" for c in self.spatial_unit.location_id_columns] + col_names += [f"{c}_to" for c in self.spatial_unit.location_id_columns] col_names += ["distance"] if self.return_geometry: col_names += ["geom_origin", "geom_destination"] @@ -60,10 +60,10 @@ def column_names(self) -> List[str]: def _make_query(self): cols_A = ",".join( - [f"A.{c} AS {c}_from" for c in self.spatial_unit.location_columns] + [f"A.{c} AS {c}_from" for c in self.spatial_unit.location_id_columns] ) cols_B = ",".join( - [f"B.{c} AS {c}_to" for c in self.spatial_unit.location_columns] + [f"B.{c} AS {c}_to" for c in self.spatial_unit.location_id_columns] ) geom_query = self.spatial_unit.get_geom_query() diff --git a/flowmachine/flowmachine/features/subscriber/day_trajectories.py b/flowmachine/flowmachine/features/subscriber/day_trajectories.py index 53d1255249..edfc67c98a 100644 --- a/flowmachine/flowmachine/features/subscriber/day_trajectories.py +++ b/flowmachine/flowmachine/features/subscriber/day_trajectories.py @@ -42,14 +42,14 @@ class DayTrajectories(MultiLocation, BaseLocation, Query): @property def column_names(self) -> List[str]: - return ["subscriber"] + self.spatial_unit.location_columns + ["date"] + return ["subscriber"] + self.spatial_unit.location_id_columns + ["date"] def _make_query(self): """ Default query method implemented in the metaclass Query(). 
""" - location_columns_string = ", ".join(self.spatial_unit.location_columns) + location_columns_string = ", ".join(self.spatial_unit.location_id_columns) # This query represents the concatenated locations of the # subscribers. Similar to the first step when calculating diff --git a/flowmachine/flowmachine/features/subscriber/last_location.py b/flowmachine/flowmachine/features/subscriber/last_location.py index faa07bab7f..f0c61d9be0 100644 --- a/flowmachine/flowmachine/features/subscriber/last_location.py +++ b/flowmachine/flowmachine/features/subscriber/last_location.py @@ -101,14 +101,14 @@ def __init__( @property def column_names(self) -> List[str]: - return ["subscriber"] + self.spatial_unit.location_columns + return ["subscriber"] + self.spatial_unit.location_id_columns def _make_query(self): """ Default query method implemented in the metaclass Query(). """ - relevant_columns = ",".join(self.spatial_unit.location_columns) + relevant_columns = ",".join(self.spatial_unit.location_id_columns) sql = """ SELECT final_time.subscriber, {rc} diff --git a/flowmachine/flowmachine/features/subscriber/modal_location.py b/flowmachine/flowmachine/features/subscriber/modal_location.py index 3cbf24823c..92f6b6a4a5 100644 --- a/flowmachine/flowmachine/features/subscriber/modal_location.py +++ b/flowmachine/flowmachine/features/subscriber/modal_location.py @@ -29,14 +29,14 @@ class ModalLocation(MultiLocation, BaseLocation, Query): @property def column_names(self) -> List[str]: - return ["subscriber"] + self.spatial_unit.location_columns + return ["subscriber"] + self.spatial_unit.location_id_columns def _make_query(self): """ Default query method implemented in the metaclass Query(). """ - location_columns_string = ", ".join(self.spatial_unit.location_columns) + location_columns_string = ", ".join(self.spatial_unit.location_id_columns) # This query represents the concatenated locations of the # subscribers diff --git a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py index daeb6a247b..872a5c88f6 100644 --- a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py +++ b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py @@ -104,7 +104,7 @@ def __init__( @property def column_names(self) -> List[str]: - return ["subscriber"] + self.spatial_unit.location_columns + return ["subscriber"] + self.spatial_unit.location_id_columns def _make_query(self): """ @@ -113,7 +113,7 @@ def _make_query(self): """ subscriber_query = "{} ORDER BY time".format(self.subscriber_locs.get_query()) - relevant_columns = ", ".join(self.spatial_unit.location_columns) + relevant_columns = ", ".join(self.spatial_unit.location_id_columns) # Create a table which has the total times each subscriber visited # each location diff --git a/flowmachine/flowmachine/features/utilities/multilocation.py b/flowmachine/flowmachine/features/utilities/multilocation.py index 5adbd6cabe..ad92f1bafb 100644 --- a/flowmachine/flowmachine/features/utilities/multilocation.py +++ b/flowmachine/flowmachine/features/utilities/multilocation.py @@ -80,4 +80,4 @@ def _append_date(self, dl): date_string = f"to_date('{dl.start}','YYYY-MM-DD') AS date" sql = f"SELECT *, {date_string} FROM ({dl.get_query()}) AS dl" - return CustomQuery(sql, self.spatial_unit.location_columns + ["date"]) + return CustomQuery(sql, self.spatial_unit.location_id_columns + ["date"]) diff --git a/flowmachine/flowmachine/models/pwo.py 
b/flowmachine/flowmachine/models/pwo.py index 4441a1e8eb..3e4e201946 100644 --- a/flowmachine/flowmachine/models/pwo.py +++ b/flowmachine/flowmachine/models/pwo.py @@ -74,7 +74,7 @@ def __get_location_buffer(self): (i..e an origin) and all its possible counterparts (i.e. destinations). """ - cols = self.spatial_unit.location_columns + cols = self.spatial_unit.location_id_columns from_cols = ", ".join("{c}_from".format(c=c) for c in cols) to_cols = ", ".join("{c}_to".format(c=c) for c in cols) @@ -99,7 +99,7 @@ def __get_location_buffer(self): @property def column_names(self) -> List[str]: - cols = self.spatial_unit.location_columns + cols = self.spatial_unit.location_id_columns return ( ["id"] @@ -114,7 +114,7 @@ def _make_query(self): that calculates the population that is covered by a buffer. """ - cols = self.spatial_unit.location_columns + cols = self.spatial_unit.location_id_columns from_cols = ", ".join("B.{c}_from".format(c=c) for c in cols) outer_from_cols = ", ".join("C.{c}_from".format(c=c) for c in cols) @@ -368,18 +368,20 @@ def run( ix = [ "{}_{}".format(c, d) for d in ("from", "to") - for c in self.spatial_unit.location_columns + for c in self.spatial_unit.location_id_columns ] population_buffer.set_index(ix, inplace=True) M = population_df["total"].sum() - N = len(population_df[self.spatial_unit.location_columns].drop_duplicates()) + N = len( + population_df[self.spatial_unit.location_id_columns].drop_duplicates() + ) beta = 1 / M locations = population_df[ - self.spatial_unit.location_columns + self.spatial_unit.location_id_columns ].values.tolist() - population_df.set_index(self.spatial_unit.location_columns, inplace=True) + population_df.set_index(self.spatial_unit.location_id_columns, inplace=True) if not departure_rate_vector: logger.warning( @@ -432,7 +434,7 @@ def run( ix = [ "{}_{}".format(c, d) for d in ("from", "to") - for c in self.spatial_unit.location_columns + for c in self.spatial_unit.location_id_columns ] ix += ["prediction", "probability"] res = pd.DataFrame(results, columns=ix) diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index efd669b4c0..f0e9bbbfe0 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -29,7 +29,7 @@ def test_get_geom_query_column_names( if not exemplar_spatial_unit_param.has_geography: pytest.skip("CellSpatialUnit does not have a get_geom_query method") geom_query = exemplar_spatial_unit_param.get_geom_query() - cols = exemplar_spatial_unit_param.location_columns + ["geom"] + cols = exemplar_spatial_unit_param.location_id_columns + ["geom"] cq = CustomQuery(geom_query, cols) assert sorted(get_column_names_from_run(cq)) == sorted(cols) @@ -81,18 +81,18 @@ def test_get_geom_query_column_names( ({"spatial_unit_type": "grid", "size": 5}, ["grid_id"]), ], ) -def test_spatial_unit_location_columns(make_spatial_unit_args, loc_cols): +def test_spatial_unit_location_id_columns(make_spatial_unit_args, loc_cols): """ - Test that the SpatialUnit classes have the correct location_columns properties. + Test that the SpatialUnit classes have the correct location_id_columns properties. """ su = make_spatial_unit(**make_spatial_unit_args) - assert loc_cols == su.location_columns + assert loc_cols == su.location_id_columns def test_polygon_spatial_unit_column_list(): """ Test that, when supplying polygon_column_names to PolygonSpatialUnit as a - list, location_columns returns it as a new list. + list, location_id_columns returns it as a new list. 
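For intuition about the quantities computed in PopulationWeightedOpportunities.run() above: M is the total population, N the number of distinct locations (identified by the spatial unit's location ID columns), and beta is 1/M. A toy pandas illustration with hypothetical numbers, using a grid spatial unit's single "grid_id" column:

import pandas as pd

population_df = pd.DataFrame({"grid_id": [1, 2, 3, 3], "total": [100, 250, 60, 40]})
location_id_columns = ["grid_id"]

M = population_df["total"].sum()                               # total population
N = len(population_df[location_id_columns].drop_duplicates())  # distinct locations
beta = 1 / M                                                   # constant used by the model

print(M, N, beta)  # 450 3 0.00222...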
""" passed_cols = ["id"] psu = PolygonSpatialUnit( @@ -100,7 +100,7 @@ def test_polygon_spatial_unit_column_list(): polygon_table="infrastructure.sites", geom_column="geom_point", ) - loc_cols = psu.location_columns + loc_cols = psu.location_id_columns assert passed_cols == loc_cols assert id(passed_cols) != id(loc_cols) From 29fa6db091e5bd146b6ad172f40cb23f5f74b042 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 7 Jun 2019 14:00:03 +0100 Subject: [PATCH 079/138] Refactor SpatialUnit classes --- .../flowmachine/core/join_to_location.py | 2 +- flowmachine/flowmachine/core/spatial_unit.py | 494 +++++++++--------- .../features/spatial/distance_matrix.py | 2 +- flowmachine/tests/conftest.py | 4 +- flowmachine/tests/test_join_to_location.py | 2 +- flowmachine/tests/test_spatial_unit.py | 35 +- .../tests/test_subscriber_locations.py | 4 +- 7 files changed, 275 insertions(+), 268 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index f33d7f6784..5b5931e03c 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -34,7 +34,7 @@ class JoinToLocation(Query): This represents a table that can be joined to the cell information table. This must have a date column (called time) and a location column call 'location_id'. - spatial_unit : flowmachine.core.spatial_unit.SpatialUnit + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit A query which maps cell identifiers in the CDR to a different spatial unit (e.g. versioned site or admin region) time_col : str, default 'time' diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 5dccf0de4d..68aac76734 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -8,26 +8,31 @@ The helper function 'make_spatial_unit' can be used to create spatial unit objects. """ from typing import List +from abc import ABCMeta, abstractmethod from flowmachine.utils import get_name_and_alias from flowmachine.core.errors import InvalidSpatialUnitError from . import Query, Table from .grid import Grid +# TODO: Currently most spatial units require a FlowDB connection at init time. +# It would be useful to remove this requirement wherever possible, and instead +# implement a method to check whether the required data can be found in the DB. + class SpatialUnitMixin: """ - Mixin for SpatialUnit classes, which provides a 'location_columns' property + Mixin for spatial unit classes, which provides a 'location_id_columns' property and methods for verifying whether a spatial unit meets different criteria (useful for checking whether a spatial unit is valid in a given query). """ @property - def location_columns(self) -> List[str]: + def location_id_columns(self) -> List[str]: """ Names of the columns that identify a location. """ - return list(self._loc_cols) + return list(self._locid_cols) @property def has_geography(self): @@ -41,7 +46,7 @@ def has_lat_lon_columns(self): """ True if spatial unit has lat/lon columns. """ - return "lat" in self.location_columns and "lon" in self.location_columns + return "lat" in self.location_id_columns and "lon" in self.location_id_columns @property def is_network_object(self): @@ -49,7 +54,8 @@ def is_network_object(self): True if spatial unit is a network object (cell or site). 
""" return ( - "location_id" in self.location_columns or "site_id" in self.location_columns + "location_id" in self.location_id_columns + or "site_id" in self.location_id_columns ) @property @@ -105,7 +111,7 @@ def verify_criterion(self, criterion, negate=False): raise ValueError(f"Unrecognised criterion '{criterion}'.") if criteria[criterion]["property"] == negate: raise InvalidSpatialUnitError( - f"Spatial unit {self} with location columns {self.location_columns} " + f"Spatial unit {self} with location columns {self.location_id_columns} " + criteria[criterion]["message"] ) @@ -115,84 +121,82 @@ class CellSpatialUnit(SpatialUnitMixin): This class represents the case where no join of cell ID to other data is required. As such, this class does not inherit from Query, is not a valid parameter to JoinToLocation, and only exists to provide the - location_columns property and for consistency with the other spatial units. + location_id_columns property and for consistency with the other spatial units. """ - _loc_cols = ("location_id",) + _locid_cols = ("location_id",) def __eq__(self, other): return isinstance(other, CellSpatialUnit) def __hash__(self): - # We may never need CellSpatialUnits to be hashable, but I'll define - # this just in case. + # We may never need CellSpatialUnits to be hashable, but we define this + # just in case. return hash(self.__class__.__name__) -class SpatialUnit(SpatialUnitMixin, Query): +class GeomSpatialUnit(SpatialUnitMixin, Query, metaclass=ABCMeta): """ - Base class for all spatial units except CellSpatialUnit. Selects columns - from the location table, and optionally joins to data in another table. + Base class for spatial units that map location IDs in + connection.location_table to geographic locations. + + Derived classes must implement the _join_clause method, to determine how to + join the location table to the table with geography data. Parameters ---------- - selected_column_names : str or list - The name(s) of the column(s) to fetch from the location - table in the database. - location_column_names : str or list - Name(s) of the location-related column(s). + geom_table_column_names : str or list + Name(s) of the column(s) to fetch from geom_table. + location_id_column_names : str or list + Name(s) of the column(s) which identify the locations. Must be a subset of the column_names for this query. - location_info_table : str, optional - Fully qualified name of the location info table to select from. - Defaults to self.connection.location_table + geom_table : str or flowmachine.Query, optional + Name of the table containing the geography information. + Can be either the name of a table, with the schema, or a + flowmachine.Query object. + Defaults to connection.location_table geom_column : str, default "geom" - Name of the column that defines the geometry in location_info_table. - join_clause : str, optional - Optionally provide a SQL join clause to join data from the - location info table to spatial regions in another table. + Name of the column in geom_table that defines the geometry. 
""" def __init__( self, *, - selected_column_names, - location_column_names, - location_info_table=None, + geom_table_column_names, + location_id_column_names, + geom_table=None, geom_column="geom", - join_clause="", ): - if type(selected_column_names) is str: - self._cols = (selected_column_names,) + if isinstance(geom_table_column_names, str): + self._geom_table_cols = (geom_table_column_names,) else: - self._cols = tuple(selected_column_names) + self._geom_table_cols = tuple(geom_table_column_names) - if type(location_column_names) is str: - self._loc_cols = (location_column_names,) + if isinstance(location_id_column_names, str): + self._locid_cols = (location_id_column_names,) else: - self._loc_cols = tuple(location_column_names) + self._locid_cols = tuple(location_id_column_names) - # Check that _loc_cols is a subset of column_names - missing_cols = [c for c in self._loc_cols if not (c in self.column_names)] + self._geom_col = geom_column + + # Check that _locid_cols is a subset of column_names + missing_cols = [c for c in self._locid_cols if not (c in self.column_names)] if missing_cols: raise ValueError( - f"Location columns {missing_cols} are not in returned columns." + f"Location ID columns {missing_cols} are not in returned columns." ) - if location_info_table: - self.location_info_table = location_info_table + if geom_table is None: + # Creating a Table object here means that we don't have to handle + # tables and Query objects differently in _make_query and get_geom_query + self.geom_table = Table(name=self.connection.location_table) + elif isinstance(geom_table, Query): + self.geom_table = geom_table else: - self.location_info_table = self.connection.location_table - - self._geom_column = geom_column - - self._join_clause = join_clause + self.geom_table = Table(name=geom_table) super().__init__() - # TODO: Currently most spatial units require a FlowDB connection at init time. - # It would be useful to remove this requirement wherever possible, and instead - # implement a method to check whether the required data can be found in the DB. - def __eq__(self, other): try: return self.md5 == other.md5 @@ -203,214 +207,234 @@ def __hash__(self): # Must define this because we explicitly define self.__eq__ return hash(self.md5) - @property - def column_names(self) -> List[str]: - return [get_name_and_alias(c)[1].split(".").pop() for c in self._cols] + def _get_aliased_geom_table_cols(self, table_alias): + return [f"{table_alias}.{c}" for c in self._geom_table_cols] - def get_geom_query(self): + @abstractmethod + def _join_clause(self, loc_table_alias, geom_table_alias): """ - Returns a SQL query which can be used to map locations (identified by - the values in self.location_columns) to their geometries (in a column - named "geom"). + Returns a SQL join clause to join the location table to the geography + table. The join clause is not used if self.geom_table and + self.connection.location_table are the same table. + + Parameters + ---------- + loc_table_alias : str + Table alias for the location table. + geom_table_alias : str + Table alias for the geography table. 
+ + Returns + ------- + str + SQL join clause """ - columns = [ - c for c in self._cols if get_name_and_alias(c)[1] in self.location_columns - ] + [f"{self._geom_column} AS geom"] + raise NotImplementedError - sql = f"SELECT {','.join(columns)} FROM {self.location_info_table}" + def _make_query(self): + loc_table_alias = "loc_table" - return sql + if hasattr(self.geom_table, "fully_qualified_table_name") and ( + self.geom_table.fully_qualified_table_name == self.connection.location_table + ): + # No need to join location_table to itself + geom_table_alias = loc_table_alias + join_clause = "" + else: + geom_table_alias = "geom_table" + join_clause = self._join_clause(loc_table_alias, geom_table_alias) + + geom_table_cols_string = ", ".join( + self._get_aliased_geom_table_cols(geom_table_alias) + ) - def _make_query(self): - columns = ", ".join(self._cols) sql = f""" SELECT - {columns} - FROM {self.location_info_table} - {self._join_clause} + {loc_table_alias}.id AS location_id, + {loc_table_alias}.date_of_first_service, + {loc_table_alias}.date_of_last_service, + {geom_table_cols_string} + FROM {self.connection.location_table} AS {loc_table_alias} + {join_clause} """ return sql + @property + def column_names(self) -> List[str]: + return ["location_id", "date_of_first_service", "date_of_last_service"] + [ + get_name_and_alias(c)[1] + for c in self._get_aliased_geom_table_cols("geom_table") + ] -def lat_lon_spatial_unit(): - """ - Returns a SpatialUnit that maps cell location_id to lat-lon coordinates. + def get_geom_query(self): + """ + Returns a SQL query which can be used to map locations (identified by + the values in self.location_id_columns) to their geometries (in a column + named "geom"). + """ + geom_table_alias = "geom_table" - Returns - ------- - flowmachine.core.spatial_unit.SpatialUnit - """ - return SpatialUnit( - selected_column_names=[ - "id AS location_id", - "date_of_first_service", - "date_of_last_service", - "ST_X(geom_point::geometry) AS lon", - "ST_Y(geom_point::geometry) AS lat", - ], - location_column_names=["lat", "lon"], - geom_column="geom_point", - ) + # List of column names whose aliases are in self.location_id_columns + columns = [ + c + for c in self._get_aliased_geom_table_cols(geom_table_alias) + if get_name_and_alias(c)[1] in self.location_id_columns + ] + [f"{self._geom_col} AS geom"] + # For versioned-cell spatial unit, the geometry table _is_ the location table. + # In this case 'location_id' is one of the location ID columns but + # isn't in self._geom_table_cols, so we specify it separately. + if "location_id" in self.location_id_columns: + columns = [f"{geom_table_alias}.id AS location_id"] + columns -def versioned_cell_spatial_unit(): - """ - Returns a SpatialUnit that maps cell location_id to a cell version and - lat-lon coordinates. 
+ sql = f"SELECT {','.join(columns)} FROM ({self.geom_table.get_query()}) AS {geom_table_alias}" - Returns - ------- - flowmachine.core.spatial_unit.SpatialUnit - """ - if Query.connection.location_table != "infrastructure.cells": - raise ValueError("Versioned cell spatial unit is unavailable.") - - return SpatialUnit( - selected_column_names=[ - "id AS location_id", - "date_of_first_service", - "date_of_last_service", - "version", - "ST_X(geom_point::geometry) AS lon", - "ST_Y(geom_point::geometry) AS lat", - ], - location_column_names=["location_id", "version", "lon", "lat"], - location_info_table="infrastructure.cells", - geom_column="geom_point", - ) + return sql -def versioned_site_spatial_unit(): +class LatLonSpatialUnit(GeomSpatialUnit): """ - Returns a SpatialUnit that maps cell location_id to a site version and - lat-lon coordinates. + Class that provides a mapping from cell/site IDs in the location table to + latitude and longitude. - Returns - ------- - flowmachine.core.spatial_unit.SpatialUnit + In addition to the requested geom_table_column_names, this query returns + latitude and longitude values in columns "lat" and "lon". + + Parameters + ---------- + geom_table_column_names : str or list + Name(s) of the column(s) to fetch from geom_table. + location_id_column_names : str or list + Name(s) of the column(s) which identify the locations. + Must be a subset of the column_names for this query. + geom_table : str or flowmachine.Query, optional + Name of the table containing the geography information. + Can be either the name of a table, with the schema, or a + flowmachine.Query object. + Defaults to connection.location_table + geom_column : str, default "geom_point" + Name of the column in geom_table that defines the point geometry from + which latitude and longitude will be extracted. + geom_table_join_on : str + Name of the column from geom_table to join on. + location_table_join_on : str + Name of the column from connection.location_table to join on. 
""" - location_table = Query.connection.location_table - - sites_alias = "s" - if location_table == "infrastructure.sites": - cells_alias = sites_alias - join_clause = "" - elif location_table == "infrastructure.cells": - cells_alias = "c" - join_clause = f""" - RIGHT JOIN - infrastructure.cells AS {cells_alias} - ON {sites_alias}.id = {cells_alias}.site_id - """ - else: - raise ValueError( - f"Expected location table to be 'infrastructure.cells' " - f"or 'infrastructure.sites', not '{location_table}''" - ) - return SpatialUnit( - selected_column_names=[ - f"{cells_alias}.id AS location_id", - f"{sites_alias}.id AS site_id", - f"{sites_alias}.date_of_first_service AS date_of_first_service", - f"{sites_alias}.date_of_last_service AS date_of_last_service", - f"{sites_alias}.version AS version", - f"ST_X({sites_alias}.geom_point::geometry) AS lon", - f"ST_Y({sites_alias}.geom_point::geometry) AS lat", - ], - location_column_names=["site_id", "version", "lon", "lat"], - location_info_table=f"infrastructure.sites AS {sites_alias}", + def __init__( + self, + *, + geom_table_column_names=(), + location_id_column_names=("lat", "lon"), + geom_table=None, geom_column="geom_point", - join_clause=join_clause, - ) + geom_table_join_on=None, + location_table_join_on=None, + ): + self._geom_on = geom_table_join_on + self._loc_on = location_table_join_on + super().__init__( + geom_table_column_names=geom_table_column_names, + location_id_column_names=location_id_column_names, + geom_table=geom_table, + geom_column=geom_column, + ) + + def _get_aliased_geom_table_cols(self, table_alias): + return super()._get_aliased_geom_table_cols(table_alias) + [ + f"ST_X({table_alias}.{self._geom_col}::geometry) AS lon", + f"ST_Y({table_alias}.{self._geom_col}::geometry) AS lat", + ] + + def _join_clause(self, loc_table_alias, geom_table_alias): + if self._loc_on is None or self._geom_on is None: + raise ValueError("No columns specified for join.") + return f""" + LEFT JOIN + ({self.geom_table.get_query()}) AS {geom_table_alias} + ON {loc_table_alias}.{self._loc_on} = {geom_table_alias}.{self._geom_on} + """ -class PolygonSpatialUnit(SpatialUnit): +class PolygonSpatialUnit(GeomSpatialUnit): """ Class that provides a mapping from cell/site data in the location table to spatial regions defined by geography information in a table. Parameters ---------- - polygon_column_names : str or list - The name of the column to fetch from the geometry - table in the database. Can also be a list of names. - polygon_table : str or flowmachine.Query - name of the table containing the geography information. + geom_table_column_names : str or list + Name(s) of the column(s) to fetch from geom_table. + This column or columns will be used to identify the polygons. + geom_table : str or flowmachine.Query + Name of the table containing the geography information. Can be either the name of a table, with the schema, or a flowmachine.Query object. geom_column : str, default 'geom' - Name of the column in polygon_table that defines the geography. + Name of the column in geom_table that defines the geography. 
""" - def __init__(self, *, polygon_column_names, polygon_table, geom_column="geom"): - if isinstance(polygon_table, Query): - self.polygon_table = polygon_table - else: - # Creating a Table object here means that we don't have to handle - # admin tables and Grid objects differently in join_clause and self.get_geom_query - self.polygon_table = Table(name=polygon_table) - - location_info_table = self.connection.location_table - - locinfo_alias = "locinfo" - if hasattr(self.polygon_table, "fully_qualified_table_name") and ( - location_info_table == self.polygon_table.fully_qualified_table_name - ): - # if the subscriber wants to select a geometry from the sites table - # there is no need to join the table with itself. - joined_alias = locinfo_alias - join_clause = "" + def __init__(self, *, geom_table_column_names, geom_table, geom_column="geom"): + if isinstance(geom_table_column_names, str): + location_id_column_names = get_name_and_alias(geom_table_column_names)[1] else: - joined_alias = "polygon" - join_clause = f""" - INNER JOIN - ({self.polygon_table.get_query()}) AS {joined_alias} - ON ST_within( - {locinfo_alias}.geom_point::geometry, - ST_SetSRID({joined_alias}.{geom_column}, 4326)::geometry - ) - """ - - locinfo_column_names = [ - f"{locinfo_alias}.id AS location_id", - f"{locinfo_alias}.version AS version", - f"{locinfo_alias}.date_of_first_service AS date_of_first_service", - f"{locinfo_alias}.date_of_last_service AS date_of_last_service", - ] - if type(polygon_column_names) is str: - self._polygon_column_names = (polygon_column_names,) - else: - self._polygon_column_names = tuple(polygon_column_names) - all_column_names = locinfo_column_names + [ - f"{joined_alias}.{c}" for c in self._polygon_column_names - ] - location_column_names = [ - get_name_and_alias(c)[1] for c in self._polygon_column_names - ] - + location_id_column_names = [ + get_name_and_alias(c)[1] for c in geom_table_column_names + ] super().__init__( - selected_column_names=all_column_names, - location_column_names=location_column_names, - location_info_table=f"{location_info_table} AS {locinfo_alias}", + geom_table_column_names=geom_table_column_names, + location_id_column_names=location_id_column_names, + geom_table=geom_table, geom_column=geom_column, - join_clause=join_clause, ) - def get_geom_query(self): - """ - Returns a SQL query which can be used to map locations (identified by - the values in self.location_columns) to their geometries (in a column - named "geom"). + def _join_clause(self, loc_table_alias, geom_table_alias): + return f""" + INNER JOIN + ({self.geom_table.get_query()}) AS {geom_table_alias} + ON ST_within( + {loc_table_alias}.geom_point::geometry, + ST_SetSRID({geom_table_alias}.{self._geom_col}, 4326)::geometry + ) """ - columns = list(self._polygon_column_names) + [f"{self._geom_column} AS geom"] - sql = f""" - SELECT {','.join(columns)} FROM ({self.polygon_table.get_query()}) AS polygon - """ - return sql +def versioned_cell_spatial_unit(): + """ + Returns a LatLonSpatialUnit that maps cell location_id to a cell version + and lat-lon coordinates. 
+ + Returns + ------- + flowmachine.core.spatial_unit.LatLonSpatialUnit + """ + if Query.connection.location_table != "infrastructure.cells": + raise InvalidSpatialUnitError("Versioned cell spatial unit is unavailable.") + + return LatLonSpatialUnit( + geom_table_column_names=["version"], + location_id_column_names=["location_id", "version", "lon", "lat"], + geom_table="infrastructure.cells", + ) + + +def versioned_site_spatial_unit(): + """ + Returns a LatLonSpatialUnit that maps cell location_id to a site version + and lat-lon coordinates. + + Returns + ------- + flowmachine.core.spatial_unit.LatLonSpatialUnit + """ + return LatLonSpatialUnit( + geom_table_column_names=["id AS site_id", "version"], + location_id_column_names=["site_id", "version", "lon", "lat"], + geom_table="infrastructure.sites", + geom_table_join_on="id", + location_table_join_on="site_id", + ) def admin_spatial_unit(*, level, region_id_column_name=None): @@ -444,13 +468,13 @@ def admin_spatial_unit(*, level, region_id_column_name=None): col_name = region_id_column_name table = f"geography.admin{level}" - return PolygonSpatialUnit(polygon_column_names=col_name, polygon_table=table) + return PolygonSpatialUnit(geom_table_column_names=col_name, geom_table=table) def grid_spatial_unit(*, size): """ - Returns a PolygonSpatialUnit representing a mapping - between all the sites in the database and a grid of arbitrary size. + Returns a PolygonSpatialUnit that maps all the sites in the database to a + grid of arbitrary size. Parameters ---------- @@ -463,8 +487,8 @@ def grid_spatial_unit(*, size): Query which maps cell/site IDs to grid squares """ return PolygonSpatialUnit( - polygon_column_names=["grid_id"], - polygon_table=Grid(size), + geom_table_column_names="grid_id", + geom_table=Grid(size), geom_column="geom_square", ) @@ -475,7 +499,7 @@ def make_spatial_unit( level=None, region_id_column_name=None, size=None, - polygon_table=None, + geom_table=None, geom_column="geom", ): """ @@ -497,9 +521,9 @@ def make_spatial_unit( number. 'polygon' A custom set of polygons that live in the database. In which - case you can pass the parameters 'column_name', which is the - column or columns you want to return after the join, and - 'polygon_table', the table where the polygons reside (with the + case you can pass the parameters 'region_id_column_name', which + is the column or columns you want to return after the join, and + 'geom_table', the table where the polygons reside (with the schema), and additionally geom_column which is the column with the geometry information (will default to 'geom'). 'admin' @@ -521,12 +545,12 @@ def make_spatial_unit( size : float or int Size of the grid in kilometres. Required when spatial_unit_type='grid'. - polygon_table : str or flowmachine.Query + geom_table : str or flowmachine.Query Name of the table containing the geography information. Can be either the name of a table, with the schema, or a flowmachine.Query object. Required when spatial_unit_type='polygon'. geom_column : str, default 'geom' - Name of the column in polygon_table that defines the geography. + Name of the column in geom_table that defines the geography. Required when spatial_unit_type='polygon'. Returns @@ -535,18 +559,6 @@ def make_spatial_unit( An object representing a mapping from location identifiers to a spatial unit. 
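For reference, a usage sketch covering the main spatial unit types described above. All but the cell unit need a configured FlowDB connection (via flowmachine.connect()), so treat this as illustrative rather than standalone; the expected location_id_columns are taken from the definitions and tests in this patch:

from flowmachine.core.spatial_unit import make_spatial_unit

cell = make_spatial_unit("cell")              # no geography; no join required
admin3 = make_spatial_unit("admin", level=3)  # admin-3 polygons
grid = make_spatial_unit("grid", size=5)      # 5 km grid squares
sites = make_spatial_unit("versioned-site")   # site ID, version, lon and lat

print(grid.location_id_columns)   # ['grid_id']
print(sites.location_id_columns)  # ['site_id', 'version', 'lon', 'lat']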
""" - valid_spatial_unit_types = { - "cell", - "versioned-cell", - "versioned-site", - "lat-lon", - "admin", - "grid", - "polygon", - } - if not (spatial_unit_type in valid_spatial_unit_types): - raise ValueError(f"Unrecognised spatial unit type: {spatial_unit_type}.") - if spatial_unit_type == "cell": return CellSpatialUnit() elif spatial_unit_type == "versioned-cell": @@ -554,7 +566,7 @@ def make_spatial_unit( elif spatial_unit_type == "versioned-site": return versioned_site_spatial_unit() elif spatial_unit_type == "lat-lon": - return lat_lon_spatial_unit() + return LatLonSpatialUnit() elif spatial_unit_type == "admin": if level is None: raise ValueError( @@ -570,16 +582,18 @@ def make_spatial_unit( ) return grid_spatial_unit(size=size) elif spatial_unit_type == "polygon": - if polygon_table is None: + if geom_table is None: raise ValueError( - "'polygon_table' parameter is required for spatial unit of type 'polygon'." + "'geom_table' parameter is required for spatial unit of type 'polygon'." ) if region_id_column_name is None: raise ValueError( "'region_id_column_name' parameter is required for spatial unit of type 'polygon'." ) return PolygonSpatialUnit( - polygon_column_names=region_id_column_name, - polygon_table=polygon_table, + geom_table_column_names=region_id_column_name, + geom_table=geom_table, geom_column=geom_column, ) + else: + raise ValueError(f"Unrecognised spatial unit type: {spatial_unit_type}.") diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index efe8b5f9bd..fda6b114f1 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -25,7 +25,7 @@ class DistanceMatrix(GraphMixin, Query): Parameters ---------- - spatial_unit : flowmachine.core.spatial_unit.SpatialUnit, default versioned-cell + spatial_unit : flowmachine.core.spatial_unit.LatLonSpatialUnit, default versioned-cell Locations to compute distances for. Note: only point locations (i.e. spatial_unit.has_lat_lon_columns) are supported at this time. 
diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index aa278abb4a..cc9380f161 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -79,12 +79,12 @@ def exemplar_level_param(request): { "spatial_unit_type": "polygon", "region_id_column_name": "admin3pcod", - "polygon_table": "geography.admin3", + "geom_table": "geography.admin3", }, { "spatial_unit_type": "polygon", "region_id_column_name": "id", - "polygon_table": "infrastructure.sites", + "geom_table": "infrastructure.sites", "geom_column": "geom_point", }, ], diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index aba83465de..bb92bd8149 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -115,7 +115,7 @@ def test_join_with_polygon(get_dataframe, get_length): spatial_unit=make_spatial_unit( "polygon", region_id_column_name="admin3pcod", - polygon_table="geography.admin3", + geom_table="geography.admin3", geom_column="geom", ), ) diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index f0e9bbbfe0..387f59a38b 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -10,9 +10,9 @@ def test_spatial_unit_column_names(exemplar_spatial_unit_param): """ - Test that the SpatialUnit classes have accurate column_names properties. + Test that the *SpatialUnit classes have accurate column_names properties. """ - if CellSpatialUnit() == exemplar_spatial_unit_param: + if isinstance(exemplar_spatial_unit_param, CellSpatialUnit): pytest.skip( "CellSpatialUnit does not have a column_names property (not a Query)" ) @@ -47,7 +47,7 @@ def test_get_geom_query_column_names( { "spatial_unit_type": "polygon", "region_id_column_name": "id", - "polygon_table": "infrastructure.sites", + "geom_table": "infrastructure.sites", "geom_column": "geom_point", }, ["id"], @@ -56,7 +56,7 @@ def test_get_geom_query_column_names( { "spatial_unit_type": "polygon", "region_id_column_name": ["id"], - "polygon_table": "infrastructure.sites", + "geom_table": "infrastructure.sites", "geom_column": "geom_point", }, ["id"], @@ -83,7 +83,7 @@ def test_get_geom_query_column_names( ) def test_spatial_unit_location_id_columns(make_spatial_unit_args, loc_cols): """ - Test that the SpatialUnit classes have the correct location_id_columns properties. + Test that the *SpatialUnit classes have the correct location_id_columns properties. """ su = make_spatial_unit(**make_spatial_unit_args) assert loc_cols == su.location_id_columns @@ -96,8 +96,8 @@ def test_polygon_spatial_unit_column_list(): """ passed_cols = ["id"] psu = PolygonSpatialUnit( - polygon_column_names=passed_cols, - polygon_table="infrastructure.sites", + geom_table_column_names=passed_cols, + geom_table="infrastructure.sites", geom_column="geom_point", ) loc_cols = psu.location_id_columns @@ -107,17 +107,12 @@ def test_polygon_spatial_unit_column_list(): def test_missing_location_columns_raises_error(): """ - Test that a ValueError is raised if the location_column_names passed to - SpatialUnit are not a subset of column_names. + Test that a ValueError is raised if the location_id_column_names passed to + GeomSpatialUnit are not a subset of column_names. 
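The ValueError exercised by this test comes from the subset check in GeomSpatialUnit.__init__. A standalone sketch of that check, with both column lists hard-coded for illustration (in the real class, column_names is computed from the geometry table):

column_names = ["location_id", "date_of_first_service", "date_of_last_service", "lon", "lat"]
location_id_column_names = ["location_id", "lat", "lon", "NOT_A_COLUMN"]

missing_cols = [c for c in location_id_column_names if c not in column_names]
try:
    if missing_cols:
        raise ValueError(
            f"Location ID columns {missing_cols} are not in returned columns."
        )
except ValueError as exc:
    print(exc)  # Location ID columns ['NOT_A_COLUMN'] are not in returned columns.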
""" with pytest.raises(ValueError, match="['NOT_A_COLUMN']"): - su = SpatialUnit( - selected_column_names=[ - "id AS location_id", - "date_of_first_service", - "date_of_last_service", - ], - location_column_names=["location_id", "NOT_A_COLUMN"], + su = LatLonSpatialUnit( + location_id_column_names=["location_id", "lat", "lon", "NOT_A_COLUMN"] ) @@ -138,19 +133,19 @@ def test_missing_location_columns_raises_error(): { "spatial_unit_type": "polygon", "region_id_column_name": "admin3pcod", - "polygon_table": "geography.admin3", + "geom_table": "geography.admin3", }, { "spatial_unit_type": "polygon", "region_id_column_name": "id", - "polygon_table": "infrastructure.sites", + "geom_table": "infrastructure.sites", "geom_column": "geom_point", }, ], ) def test_spatial_unit_equals_itself(make_spatial_unit_args): """ - Test that instances of the SpatialUnit classes are equal to themselves. + Test that instances of the *SpatialUnit classes are equal to themselves. """ # Can't use exemplar_spatial_unit_param here because we need to create two # different but equal spatial units. @@ -212,7 +207,7 @@ def test_different_grid_spatial_units_are_not_equal(): {"spatial_unit_type": "INVALID_SPATIAL_UNIT_TYPE"}, {"spatial_unit_type": "admin"}, {"spatial_unit_type": "grid"}, - {"spatial_unit_type": "polygon", "polygon_table": "geography.admin3"}, + {"spatial_unit_type": "polygon", "geom_table": "geography.admin3"}, {"spatial_unit_type": "polygon", "region_id_column_name": "DUMMY_COLUMN_NAME"}, ], ) diff --git a/flowmachine/tests/test_subscriber_locations.py b/flowmachine/tests/test_subscriber_locations.py index cdf742f49e..ea27f836e6 100644 --- a/flowmachine/tests/test_subscriber_locations.py +++ b/flowmachine/tests/test_subscriber_locations.py @@ -16,9 +16,7 @@ def test_can_get_pcods(get_dataframe): "2016-01-01 13:30:30", "2016-01-02 16:25:00", spatial_unit=make_spatial_unit( - "polygon", - region_id_column_name="admin3pcod", - polygon_table="geography.admin3", + "polygon", region_id_column_name="admin3pcod", geom_table="geography.admin3" ), ) df = get_dataframe(subscriber_pcod) From a466fe509ca8fb897346d56c4ed26a51fb47409a Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 7 Jun 2019 18:08:12 +0100 Subject: [PATCH 080/138] Import make_spatial_unit from flowmachine.core --- flowmachine/flowmachine/core/__init__.py | 2 ++ flowmachine/flowmachine/core/grid.py | 3 +-- flowmachine/flowmachine/core/join_to_location.py | 3 +-- .../flowmachine/features/location/unique_subscriber_counts.py | 2 +- .../flowmachine/features/network/total_network_objects.py | 3 +-- flowmachine/flowmachine/features/spatial/distance_matrix.py | 2 +- flowmachine/flowmachine/features/subscriber/daily_location.py | 2 +- flowmachine/flowmachine/features/subscriber/displacement.py | 2 +- flowmachine/flowmachine/features/subscriber/last_location.py | 3 +-- .../flowmachine/features/subscriber/most_frequent_location.py | 3 +-- .../flowmachine/features/utilities/spatial_aggregates.py | 2 ++ .../flowmachine/features/utilities/subscriber_locations.py | 3 +-- flowmachine/flowmachine/models/pwo.py | 2 +- flowmachine/tests/conftest.py | 3 +-- .../tests/functional_tests/test_sql_strings_and_results.py | 3 +-- flowmachine/tests/test_async.py | 2 +- flowmachine/tests/test_calldays.py | 2 +- flowmachine/tests/test_daily_location.py | 2 +- flowmachine/tests/test_day_trajectories.py | 2 +- flowmachine/tests/test_displacement.py | 2 +- flowmachine/tests/test_flows.py | 2 +- flowmachine/tests/test_geomixin.py | 2 +- flowmachine/tests/test_indexes.py | 2 
+- flowmachine/tests/test_inoutflows.py | 2 +- flowmachine/tests/test_join_to_location.py | 3 +-- flowmachine/tests/test_joined_aggregate.py | 2 +- flowmachine/tests/test_last_location.py | 2 +- flowmachine/tests/test_location_visits.py | 2 +- flowmachine/tests/test_meaningful_locations.py | 2 +- flowmachine/tests/test_most_frequent_locations.py | 2 +- flowmachine/tests/test_spatial_aggregate.py | 2 +- flowmachine/tests/test_spatial_distancematrix.py | 2 +- flowmachine/tests/test_subscriber_location_cluster.py | 3 +-- flowmachine/tests/test_subscriber_locations.py | 2 +- flowmachine/tests/test_total_network_objects.py | 2 +- flowmachine/tests/test_unique_location_counts.py | 2 +- flowmachine/tests/test_unique_subscriber_counts.py | 2 +- .../tests/flowmachine_tests/test_daily_location_results.py | 3 +-- 38 files changed, 40 insertions(+), 47 deletions(-) diff --git a/flowmachine/flowmachine/core/__init__.py b/flowmachine/flowmachine/core/__init__.py index ca63077409..f6a38c4749 100644 --- a/flowmachine/flowmachine/core/__init__.py +++ b/flowmachine/flowmachine/core/__init__.py @@ -13,6 +13,7 @@ from .geotable import GeoTable from .init import connect from .logging import init_logging, set_log_level +from .spatial_unit import make_spatial_unit from .join_to_location import JoinToLocation, location_joined_query from .custom_query import CustomQuery from .grid import Grid @@ -25,6 +26,7 @@ "GeoTable", "Connection", "connect", + "make_spatial_unit", "JoinToLocation", "location_joined_query", "CustomQuery", diff --git a/flowmachine/flowmachine/core/grid.py b/flowmachine/flowmachine/core/grid.py index 68f482101d..5e6b2ef44f 100644 --- a/flowmachine/flowmachine/core/grid.py +++ b/flowmachine/flowmachine/core/grid.py @@ -40,8 +40,7 @@ def __init__(self, size, geom="geography.admin0"): def _geo_augmented_query(self): """ - Returns one of each geom for non-point levels, with the - flows in/out as properties. + Returns a version of this query with geom and gid columns. Returns ------- diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index 5b5931e03c..ae9403e2e8 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -7,7 +7,7 @@ the joining of any query with cell/site information to another spatial level, such as a grid or an admin region. -No join is required if the spatial unit is CellSpatialUnit, +No join is required if spatial_unit.has_geography is False, so we also define the helper function location_joined_query to return a JoinToLocation object if a join is required, or the original query object otherwise. 
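The module docstring above describes the behaviour of the location_joined_query helper: it wraps a query in a JoinToLocation only when the requested spatial unit actually has geography to join on, and otherwise hands the original query back unchanged. A minimal sketch of that behaviour, mirroring the calls made by the patches later in this series, is shown here; it is illustrative only and not part of the patch, the dates and variable names are placeholders, the import paths follow the relative imports visible in the diffs, and running it presumes a configured flowmachine connection.

from flowmachine.core import location_joined_query, make_spatial_unit
from flowmachine.features.utilities import EventsTablesUnion

events = EventsTablesUnion(
    "2016-01-01",
    "2016-01-07",
    columns=["location_id", "datetime"],
    tables="all",
)

# An admin spatial unit has geography, so this returns a JoinToLocation
# wrapping `events`.
admin_events = location_joined_query(
    events,
    spatial_unit=make_spatial_unit("admin", level=3),
    time_col="datetime",
)

# A cell spatial unit has no geography to join on (has_geography is False),
# so the original `events` query is returned as-is.
cell_events = location_joined_query(
    events,
    spatial_unit=make_spatial_unit("cell"),
    time_col="datetime",
)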
@@ -15,7 +15,6 @@ from typing import List from .query import Query -from .spatial_unit import CellSpatialUnit class JoinToLocation(Query): diff --git a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py index a89d08eb30..c3c3dc1a33 100644 --- a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py @@ -17,7 +17,7 @@ """ from ...core.query import Query from ...core.mixins import GeoDataMixin -from ...core.spatial_unit import make_spatial_unit +from ...core import make_spatial_unit from ..utilities.subscriber_locations import subscriber_locations diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index b8957f1320..816bc6db21 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -14,9 +14,8 @@ from typing import List from ...core.mixins import GeoDataMixin -from ...core import location_joined_query +from ...core import location_joined_query, make_spatial_unit from ...core.query import Query -from ...core.spatial_unit import make_spatial_unit from ..utilities import EventsTablesUnion valid_stats = {"avg", "max", "min", "median", "mode", "stddev", "variance"} diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index fda6b114f1..93c0adc0b2 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -11,7 +11,7 @@ from ...core.query import Query from ...core.mixins import GraphMixin -from ...core.spatial_unit import make_spatial_unit +from ...core import make_spatial_unit class DistanceMatrix(GraphMixin, Query): diff --git a/flowmachine/flowmachine/features/subscriber/daily_location.py b/flowmachine/flowmachine/features/subscriber/daily_location.py index a7e0160743..64503c47a2 100644 --- a/flowmachine/flowmachine/features/subscriber/daily_location.py +++ b/flowmachine/flowmachine/features/subscriber/daily_location.py @@ -13,7 +13,7 @@ """ import datetime -from ...core.spatial_unit import make_spatial_unit +from ...core import make_spatial_unit from .last_location import LastLocation from .most_frequent_location import MostFrequentLocation diff --git a/flowmachine/flowmachine/features/subscriber/displacement.py b/flowmachine/flowmachine/features/subscriber/displacement.py index a4fe095329..79a8e7a3d3 100644 --- a/flowmachine/flowmachine/features/subscriber/displacement.py +++ b/flowmachine/flowmachine/features/subscriber/displacement.py @@ -17,7 +17,7 @@ from . 
import ModalLocation from ..utilities.subscriber_locations import subscriber_locations from flowmachine.utils import parse_datestring, get_dist_string, list_of_dates -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from dateutil.relativedelta import relativedelta diff --git a/flowmachine/flowmachine/features/subscriber/last_location.py b/flowmachine/flowmachine/features/subscriber/last_location.py index f0c61d9be0..00a50c65a9 100644 --- a/flowmachine/flowmachine/features/subscriber/last_location.py +++ b/flowmachine/flowmachine/features/subscriber/last_location.py @@ -12,10 +12,9 @@ """ from typing import List -from flowmachine.core import Query +from flowmachine.core import Query, make_spatial_unit from ..utilities.subscriber_locations import BaseLocation from ..utilities.subscriber_locations import subscriber_locations -from flowmachine.core.spatial_unit import make_spatial_unit class LastLocation(BaseLocation, Query): diff --git a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py index 872a5c88f6..12e8f441cd 100644 --- a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py +++ b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py @@ -11,9 +11,8 @@ """ from typing import List -from flowmachine.core import Query +from flowmachine.core import Query, make_spatial_unit from ..utilities.subscriber_locations import BaseLocation, subscriber_locations -from flowmachine.core.spatial_unit import make_spatial_unit class MostFrequentLocation(BaseLocation, Query): diff --git a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py index c8041b8b8b..56941c8ac3 100644 --- a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py +++ b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py @@ -29,6 +29,7 @@ class SpatialAggregate(GeoDataMixin, Query): def __init__(self, *, locations): self.locations = locations + # self.spatial_unit is used in self._geo_augmented_query self.spatial_unit = locations.spatial_unit super().__init__() @@ -99,6 +100,7 @@ class JoinedSpatialAggregate(GeoDataMixin, Query): def __init__(self, *, metric, locations, method="mean"): self.metric = metric self.locations = locations + # self.spatial_unit is used in self._geo_augmented_query self.spatial_unit = locations.spatial_unit self.method = method.lower() if self.method not in self.allowed_methods: diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index 608eab6293..7ead644d70 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -17,8 +17,7 @@ from .spatial_aggregates import SpatialAggregate, JoinedSpatialAggregate from ...core.query import Query -from ...core.join_to_location import location_joined_query -from ...core.spatial_unit import make_spatial_unit +from ...core import location_joined_query, make_spatial_unit import structlog diff --git a/flowmachine/flowmachine/models/pwo.py b/flowmachine/flowmachine/models/pwo.py index 3e4e201946..32347e2559 100644 --- a/flowmachine/flowmachine/models/pwo.py +++ b/flowmachine/flowmachine/models/pwo.py @@ -35,7 +35,7 @@ from ..features import ModalLocation from ..core.query import Query from ..core.model import 
Model, model_result -from ..core.spatial_unit import make_spatial_unit +from ..core import make_spatial_unit from ..features.spatial.distance_matrix import DistanceMatrix import structlog diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index cc9380f161..04bb11d214 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -18,9 +18,8 @@ ) import flowmachine -from flowmachine.core import Query +from flowmachine.core import Query, make_spatial_unit from flowmachine.core.cache import reset_cache -from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import EventTableSubset logger = logging.getLogger() diff --git a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py index f33236425a..dded120468 100644 --- a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py +++ b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py @@ -5,9 +5,8 @@ from flowmachine.utils import pretty_sql from approvaltests.approvals import verify -from flowmachine.core import CustomQuery +from flowmachine.core import CustomQuery, make_spatial_unit from flowmachine.features import daily_location -from flowmachine.core.spatial_unit import make_spatial_unit def test_daily_location_1_sql(diff_reporter): diff --git a/flowmachine/tests/test_async.py b/flowmachine/tests/test_async.py index 2fc9bf8053..d3ee5ee5cf 100644 --- a/flowmachine/tests/test_async.py +++ b/flowmachine/tests/test_async.py @@ -8,7 +8,7 @@ from flowmachine.features.subscriber import * from threading import Thread import pandas as pd -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit def test_returns_future(): diff --git a/flowmachine/tests/test_calldays.py b/flowmachine/tests/test_calldays.py index dc812fccf6..aa7ea18e26 100644 --- a/flowmachine/tests/test_calldays.py +++ b/flowmachine/tests/test_calldays.py @@ -11,7 +11,7 @@ import pytest -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import CallDays, subscriber_locations import numpy as np diff --git a/flowmachine/tests/test_daily_location.py b/flowmachine/tests/test_daily_location.py index 0b27d84af5..818e64ffb4 100644 --- a/flowmachine/tests/test_daily_location.py +++ b/flowmachine/tests/test_daily_location.py @@ -5,7 +5,7 @@ import pytest from flowmachine.core.errors import MissingDateError -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import daily_location, MostFrequentLocation diff --git a/flowmachine/tests/test_day_trajectories.py b/flowmachine/tests/test_day_trajectories.py index f92687d827..b48b057a4e 100644 --- a/flowmachine/tests/test_day_trajectories.py +++ b/flowmachine/tests/test_day_trajectories.py @@ -4,7 +4,7 @@ from flowmachine.features import DayTrajectories, daily_location -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit def test_column_names_day_trajectories(exemplar_spatial_unit_param): diff --git a/flowmachine/tests/test_displacement.py b/flowmachine/tests/test_displacement.py index cb59bf29e3..e0ecc7faa3 100644 --- a/flowmachine/tests/test_displacement.py +++ b/flowmachine/tests/test_displacement.py @@ -7,7 +7,7 @@ from numpy import isnan from flowmachine.utils import list_of_dates -from 
flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit @pytest.mark.parametrize( diff --git a/flowmachine/tests/test_flows.py b/flowmachine/tests/test_flows.py index 98f0aecc77..c1812eb117 100644 --- a/flowmachine/tests/test_flows.py +++ b/flowmachine/tests/test_flows.py @@ -7,7 +7,7 @@ import geojson import pytest -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import daily_location from flowmachine.features.location.flows import * from flowmachine.features.subscriber.daily_location import locate_subscribers diff --git a/flowmachine/tests/test_geomixin.py b/flowmachine/tests/test_geomixin.py index 6795c947ee..962d2e78ce 100644 --- a/flowmachine/tests/test_geomixin.py +++ b/flowmachine/tests/test_geomixin.py @@ -16,7 +16,7 @@ from flowmachine.core import Query from flowmachine.core.mixins import GeoDataMixin -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import daily_location from flowmachine.utils import proj4string diff --git a/flowmachine/tests/test_indexes.py b/flowmachine/tests/test_indexes.py index 684cc33549..bb3700f7d6 100644 --- a/flowmachine/tests/test_indexes.py +++ b/flowmachine/tests/test_indexes.py @@ -2,7 +2,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features.subscriber import * diff --git a/flowmachine/tests/test_inoutflows.py b/flowmachine/tests/test_inoutflows.py index 243dbf41a5..30916b4c33 100644 --- a/flowmachine/tests/test_inoutflows.py +++ b/flowmachine/tests/test_inoutflows.py @@ -8,7 +8,7 @@ from flowmachine.features import Flows, daily_location -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit def test_inoutflow_with_double_column_location(): diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index bb92bd8149..3c0c53e723 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -9,8 +9,7 @@ import numpy as np from flowmachine.features import subscriber_locations -from flowmachine.core import JoinToLocation, location_joined_query -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import JoinToLocation, location_joined_query, make_spatial_unit def test_join_to_location_column_names(exemplar_spatial_unit_param): diff --git a/flowmachine/tests/test_joined_aggregate.py b/flowmachine/tests/test_joined_aggregate.py index 7b480052e5..a126d56bb8 100644 --- a/flowmachine/tests/test_joined_aggregate.py +++ b/flowmachine/tests/test_joined_aggregate.py @@ -6,7 +6,7 @@ import pytest -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import ( MostFrequentLocation, RadiusOfGyration, diff --git a/flowmachine/tests/test_last_location.py b/flowmachine/tests/test_last_location.py index e443ada76d..7e7cfce884 100644 --- a/flowmachine/tests/test_last_location.py +++ b/flowmachine/tests/test_last_location.py @@ -4,7 +4,7 @@ import pytest -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import 
LastLocation diff --git a/flowmachine/tests/test_location_visits.py b/flowmachine/tests/test_location_visits.py index 7f08455088..e2d8c78dd6 100644 --- a/flowmachine/tests/test_location_visits.py +++ b/flowmachine/tests/test_location_visits.py @@ -4,7 +4,7 @@ from flowmachine.features import LocationVisits, daily_location, DayTrajectories from flowmachine.utils import list_of_dates -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit def test_column_names_location_visits(exemplar_spatial_unit_param): diff --git a/flowmachine/tests/test_meaningful_locations.py b/flowmachine/tests/test_meaningful_locations.py index 6e02dc3d9a..0f163a9dd4 100644 --- a/flowmachine/tests/test_meaningful_locations.py +++ b/flowmachine/tests/test_meaningful_locations.py @@ -4,7 +4,7 @@ import pytest from flowmachine.core.errors import BadLevelError -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import ( HartiganCluster, CallDays, diff --git a/flowmachine/tests/test_most_frequent_locations.py b/flowmachine/tests/test_most_frequent_locations.py index 551b495f30..4259f373c6 100644 --- a/flowmachine/tests/test_most_frequent_locations.py +++ b/flowmachine/tests/test_most_frequent_locations.py @@ -4,7 +4,7 @@ import pytest -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import MostFrequentLocation from flowmachine.features.subscriber.daily_location import locate_subscribers diff --git a/flowmachine/tests/test_spatial_aggregate.py b/flowmachine/tests/test_spatial_aggregate.py index 7ae6dc9a29..b1c8cfd2ab 100644 --- a/flowmachine/tests/test_spatial_aggregate.py +++ b/flowmachine/tests/test_spatial_aggregate.py @@ -2,7 +2,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import ModalLocation, daily_location from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.utils import list_of_dates diff --git a/flowmachine/tests/test_spatial_distancematrix.py b/flowmachine/tests/test_spatial_distancematrix.py index 44e0b2c6ae..17952bcf32 100644 --- a/flowmachine/tests/test_spatial_distancematrix.py +++ b/flowmachine/tests/test_spatial_distancematrix.py @@ -9,7 +9,7 @@ import pytest from flowmachine.features.spatial import DistanceMatrix -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit def test_some_results(get_dataframe): diff --git a/flowmachine/tests/test_subscriber_location_cluster.py b/flowmachine/tests/test_subscriber_location_cluster.py index 6eff214e2a..5aa55493f8 100644 --- a/flowmachine/tests/test_subscriber_location_cluster.py +++ b/flowmachine/tests/test_subscriber_location_cluster.py @@ -16,10 +16,9 @@ from geopandas import GeoSeries from shapely.geometry import box, MultiPoint -from flowmachine.core import Table, CustomQuery +from flowmachine.core import Table, CustomQuery, make_spatial_unit from flowmachine.core.query import Query from flowmachine.core.mixins import GeoDataMixin -from flowmachine.core.spatial_unit import make_spatial_unit from flowmachine.features import ( CallDays, HartiganCluster, diff --git a/flowmachine/tests/test_subscriber_locations.py b/flowmachine/tests/test_subscriber_locations.py index ea27f836e6..83ef6f513b 100644 --- a/flowmachine/tests/test_subscriber_locations.py +++ b/flowmachine/tests/test_subscriber_locations.py @@ -3,7 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features.utilities.subscriber_locations import subscriber_locations diff --git a/flowmachine/tests/test_total_network_objects.py b/flowmachine/tests/test_total_network_objects.py index 5e15c196c4..bf9e61a295 100644 --- a/flowmachine/tests/test_total_network_objects.py +++ b/flowmachine/tests/test_total_network_objects.py @@ -10,7 +10,7 @@ import pytest -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.core.errors import InvalidSpatialUnitError from flowmachine.features import TotalNetworkObjects, AggregateNetworkObjects diff --git a/flowmachine/tests/test_unique_location_counts.py b/flowmachine/tests/test_unique_location_counts.py index 2d5d3baf29..64b8562c9a 100644 --- a/flowmachine/tests/test_unique_location_counts.py +++ b/flowmachine/tests/test_unique_location_counts.py @@ -5,7 +5,7 @@ import pytest from flowmachine.core.errors import BadLevelError -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import UniqueLocationCounts, subscriber_locations diff --git a/flowmachine/tests/test_unique_subscriber_counts.py b/flowmachine/tests/test_unique_subscriber_counts.py index 156e505084..2666db8f8b 100644 --- a/flowmachine/tests/test_unique_subscriber_counts.py +++ b/flowmachine/tests/test_unique_subscriber_counts.py @@ -8,7 +8,7 @@ import pytest -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features import UniqueSubscriberCounts from flowmachine.features.utilities import subscriber_locations diff --git a/integration_tests/tests/flowmachine_tests/test_daily_location_results.py b/integration_tests/tests/flowmachine_tests/test_daily_location_results.py index 1412c7c952..b301de3cb7 100644 --- a/integration_tests/tests/flowmachine_tests/test_daily_location_results.py +++ b/integration_tests/tests/flowmachine_tests/test_daily_location_results.py @@ -5,8 +5,7 @@ from flowmachine.utils import pretty_sql from approvaltests.approvals import verify -from flowmachine.core import CustomQuery -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import CustomQuery, make_spatial_unit from flowmachine.features import daily_location From edf0464dfdd5e7c847b1bb57c8a77f5868387b9f Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 7 Jun 2019 18:08:34 +0100 Subject: [PATCH 081/138] Update RadiusOfGyration --- .../flowmachine/features/subscriber/radius_of_gyration.py | 3 ++- flowmachine/tests/test_radius_of_gyration.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py b/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py index 3191432a68..9e4b8cc62c 100644 --- a/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py +++ b/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py @@ -14,6 +14,7 @@ from .metaclasses import SubscriberFeature from ..utilities.subscriber_locations import subscriber_locations +from flowmachine.core import make_spatial_unit class RadiusOfGyration(SubscriberFeature): @@ -101,7 +102,7 @@ def __init__( self.ul = subscriber_locations( self.start, self.stop, - level="lat-lon", + spatial_unit=make_spatial_unit("lat-lon"), hours=hours, table=table, subscriber_subset=subscriber_subset, 
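The RadiusOfGyration hunk above shows the shape of the change this commit (and the following ones) applies: the old level-based keyword argument is dropped in favour of a spatial_unit object built with make_spatial_unit. A hedged sketch of the same call made directly is given here; it is illustrative rather than part of the patch, the dates are placeholders, the import path is the one used by the updated tests in this series, and running it assumes a configured flowmachine connection.

from flowmachine.core import make_spatial_unit
from flowmachine.features.utilities.subscriber_locations import subscriber_locations

# Previously: subscriber_locations("2016-01-01", "2016-01-02", level="lat-lon")
# With this patch series the spatial unit is passed explicitly instead:
ul = subscriber_locations(
    "2016-01-01",
    "2016-01-02",
    spatial_unit=make_spatial_unit("lat-lon"),
)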
diff --git a/flowmachine/tests/test_radius_of_gyration.py b/flowmachine/tests/test_radius_of_gyration.py index cc98048f9e..d0664a30b0 100644 --- a/flowmachine/tests/test_radius_of_gyration.py +++ b/flowmachine/tests/test_radius_of_gyration.py @@ -3,7 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. import pytest -from flowmachine.core.spatial_unit import make_spatial_unit +from flowmachine.core import make_spatial_unit from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.features.subscriber import * From b6444e671540f88d7a0606d24210eb6540dba90e Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 7 Jun 2019 18:09:01 +0100 Subject: [PATCH 082/138] Update LocationIntroversion --- .../location/location_introversion.py | 124 ++++-------------- .../tests/test_location_introversion.py | 41 +++--- 2 files changed, 46 insertions(+), 119 deletions(-) diff --git a/flowmachine/flowmachine/features/location/location_introversion.py b/flowmachine/flowmachine/features/location/location_introversion.py index 4106455023..5daa44da1b 100644 --- a/flowmachine/flowmachine/features/location/location_introversion.py +++ b/flowmachine/flowmachine/features/location/location_introversion.py @@ -23,7 +23,7 @@ from ...core.mixins import GeoDataMixin -from ...core import JoinToLocation +from ...core import location_joined_query, make_spatial_unit from ..utilities import EventsTablesUnion @@ -44,29 +44,9 @@ class LocationIntroversion(GeoDataMixin, Query): Specifies a table of cdr data on which to base the analysis. Table must exist in events schema. If 'ALL' then we use all tables specified in flowmachine.yml. - level : str - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. direction : str, default 'both'. Determines if query should filter only outgoing events ('out'), incoming events ('in'), or both ('both'). @@ -95,84 +75,41 @@ def __init__( stop, *, table="all", - level="cell", + spatial_unit=make_spatial_unit("cell"), direction="both", hours="all", subscriber_subset=None, subscriber_identifier="msisdn", - size=None, - polygon_table=None, - geom_col="geom", - column_name=None, ): - - self.query_columns = ["id", "outgoing", "location_id", "datetime"] self.start = start self.stop = stop self.table = table - self.level = level + self.spatial_unit = spatial_unit self.direction = direction - if self.level == "versioned-site": - raise NotImplementedError( - 'The level "versioned-site" is currently not' - + "supported in the `LocationIntroversion()` class." 
- ) - - self.unioned_query = EventsTablesUnion( - self.start, - self.stop, - columns=self.query_columns, - tables=self.table, - hours=hours, - subscriber_subset=subscriber_subset, - subscriber_identifier=subscriber_identifier, + self.unioned_query = location_joined_query( + EventsTablesUnion( + self.start, + self.stop, + columns=["id", "outgoing", "location_id", "datetime"], + tables=self.table, + hours=hours, + subscriber_subset=subscriber_subset, + subscriber_identifier=subscriber_identifier, + ), + spatial_unit=self.spatial_unit, + time_col="datetime", ) - self.level_columns = ["location_id"] - - if self.level not in ("cell"): - self.join_to_location = JoinToLocation( - self.unioned_query, - level=self.level, - time_col="datetime", - column_name=column_name, - size=size, - polygon_table=polygon_table, - geom_col=geom_col, - ) - cols = set(self.join_to_location.column_names) - if self.level != "lat-lon": - cols -= {"lat", "lon"} - self.level_columns = list(cols.difference(self.query_columns)) + super().__init__() @property def column_names(self) -> List[str]: - return self.level_columns + ["introversion", "extroversion"] + return self.spatial_unit.location_id_columns + ["introversion", "extroversion"] - def __build_query(self, location_columns, union): - """ - Private method for building feature query. This - is abstracted for readability. It's only called - by _make_query() - - Parameters - ---------- - location_columns: str - Relevant location column to make join to. - This - - union: str - SQL query representing an union table query. - This query either comes from the EventsTablesUnion() - or the JoinToLocation() class. - - Returns - ------- - sql: str - SQL string with query representation. + def _make_query(self): + location_columns = self.spatial_unit.location_id_columns - """ if self.direction == "both": sql_direction = "" elif self.direction == "in": @@ -185,7 +122,7 @@ def __build_query(self, location_columns, union): """ sql = f""" - WITH unioned_table AS ({union}) + WITH unioned_table AS ({self.unioned_query.get_query()}) SELECT *, 1-introversion as extroversion FROM (SELECT {', '.join(location_columns)}, sum(introverted::integer)/count(*)::float as introversion FROM ( SELECT @@ -202,18 +139,3 @@ def __build_query(self, location_columns, union): """ return sql - - def _make_query(self): - - if self.level == "cell": - sql = self.__build_query( - location_columns=self.level_columns, - union=self.unioned_query.get_query(), - ) - else: - sql = self.__build_query( - location_columns=self.level_columns, - union=self.join_to_location.get_query(), - ) - - return sql diff --git a/flowmachine/tests/test_location_introversion.py b/flowmachine/tests/test_location_introversion.py index d1dbd93e00..a4f26c57d2 100644 --- a/flowmachine/tests/test_location_introversion.py +++ b/flowmachine/tests/test_location_introversion.py @@ -9,17 +9,16 @@ import pytest +from flowmachine.core import make_spatial_unit from flowmachine.features.location import LocationIntroversion @pytest.mark.usefixtures("skip_datecheck") -def test_location_introversion_column_names(exemplar_level_param): +def test_location_introversion_column_names(exemplar_spatial_unit_param): """ Test that column_names property matches head(0)""" - if exemplar_level_param["level"] == "versioned-site": - pytest.skip( - 'The level "versioned-site" is currently not supported in the `LocationIntroversion()` class.' 
- ) - li = LocationIntroversion("2016-01-01", "2016-01-07", **exemplar_level_param) + li = LocationIntroversion( + "2016-01-01", "2016-01-07", spatial_unit=exemplar_spatial_unit_param + ) assert li.head(0).columns.tolist() == li.column_names @@ -27,7 +26,11 @@ def test_some_results(get_dataframe): """ LocationIntroversion() returns a dataframe that contains hand-picked results. """ - df = get_dataframe(LocationIntroversion("2016-01-01", "2016-01-07", level="admin3")) + df = get_dataframe( + LocationIntroversion( + "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("admin", level=3) + ) + ) set_df = df.set_index("pcod") assert round(set_df.loc["524 4 12 62"]["introversion"], 6) == pytest.approx( 0.108517 @@ -42,7 +45,9 @@ def test_some_results(get_dataframe): def test_lat_lng_introversion(get_dataframe): df = get_dataframe( - LocationIntroversion("2016-01-01", "2016-01-07", level="lat-lon") + LocationIntroversion( + "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("lat-lon") + ) ) assert pytest.approx(0.0681818181818182) == df.introversion.max() assert 1.0 == df.extroversion.max() @@ -55,7 +60,11 @@ def test_no_result_is_greater_than_one(get_dataframe): """ No results from LocationIntroversion()['introversion'] is greater than 1. """ - df = get_dataframe(LocationIntroversion("2016-01-01", "2016-01-07", level="admin3")) + df = get_dataframe( + LocationIntroversion( + "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("admin", level=3) + ) + ) results = df[df["introversion"] > 1] assert len(results) == 0 @@ -64,14 +73,10 @@ def test_introversion_plus_extroversion_equals_one(get_dataframe): """ LocationIntroversion()['introversion'] + ['extroversion'] equals 1. """ - df = get_dataframe(LocationIntroversion("2016-01-01", "2016-01-07", level="admin3")) + df = get_dataframe( + LocationIntroversion( + "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("versioned-site") + ) + ) df["addition"] = df["introversion"] + df["extroversion"] assert df["addition"].sum() == len(df) - - -def test_introversion_raises_notimplemented_error_with_versioned_site(): - """ - LocationIntroversion(level='versioned-site') raises a NotImplementedError. - """ - with pytest.raises(NotImplementedError): - LocationIntroversion("2016-01-01", "2016-01-07", level="versioned-site") From f04ec82d6e4009b75a82646b30a498d69d1f8598 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sat, 8 Jun 2019 15:55:38 +0100 Subject: [PATCH 083/138] Update PerLocationEventStats --- .../subscriber/per_location_event_stats.py | 83 ++++++------------- ...est_subscriber_per_location_event_stats.py | 36 ++++---- 2 files changed, 48 insertions(+), 71 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/per_location_event_stats.py b/flowmachine/flowmachine/features/subscriber/per_location_event_stats.py index ea1e07070e..564434d15b 100644 --- a/flowmachine/flowmachine/features/subscriber/per_location_event_stats.py +++ b/flowmachine/flowmachine/features/subscriber/per_location_event_stats.py @@ -6,8 +6,7 @@ from typing import List -from ...core import JoinToLocation -from flowmachine.utils import get_columns_for_level +from ...core import location_joined_query, make_spatial_unit from ..utilities.sets import EventsTablesUnion from .metaclasses import SubscriberFeature @@ -42,29 +41,9 @@ class PerLocationEventStats(SubscriberFeature): Can be a string of a single table (with the schema) or a list of these. 
The keyword all is to select all subscriber tables - level : str, default 'cell' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. Examples -------- @@ -88,25 +67,20 @@ def __init__( stop, statistic="avg", *, - level="cell", + spatial_unit=make_spatial_unit("cell"), hours="all", tables="all", subscriber_identifier="msisdn", direction="both", subscriber_subset=None, - column_name=None, - size=None, - polygon_table=None, - geom_col="geom", ): self.start = start self.stop = stop - self.level = level + self.spatial_unit = spatial_unit self.hours = hours self.tables = tables self.subscriber_identifier = subscriber_identifier self.direction = direction - self.column_name = column_name self.statistic = statistic if self.statistic not in valid_stats: @@ -117,36 +91,31 @@ def __init__( ) if self.direction in {"both"}: - column_list = [self.subscriber_identifier, "location_id"] + column_list = [self.subscriber_identifier, "location_id", "datetime"] elif self.direction in {"in", "out"}: - column_list = [self.subscriber_identifier, "location_id", "outgoing"] + column_list = [ + self.subscriber_identifier, + "location_id", + "outgoing", + "datetime", + ] else: raise ValueError("{} is not a valid direction.".format(self.direction)) - if self.level != "cell": - column_list.append("datetime") - - self.unioned_query = EventsTablesUnion( - self.start, - self.stop, - tables=self.tables, - columns=column_list, - hours=hours, - subscriber_identifier=subscriber_identifier, - subscriber_subset=subscriber_subset, + self.unioned_query = location_joined_query( + EventsTablesUnion( + self.start, + self.stop, + tables=self.tables, + columns=column_list, + hours=hours, + subscriber_identifier=subscriber_identifier, + subscriber_subset=subscriber_subset, + ), + spatial_unit=self.spatial_unit, + time_col="datetime", ) - if self.level != "cell": - self.unioned_query = JoinToLocation( - self.unioned_query, - level=self.level, - column_name=self.column_name, - time_col="datetime", - size=size, - polygon_table=polygon_table, - geom_col=geom_col, - ) - super().__init__() @property @@ -154,7 +123,7 @@ def column_names(self): return ["subscriber", "value"] def _make_query(self): - loc_cols = ", ".join(get_columns_for_level(self.level, self.column_name)) + loc_cols = ", ".join(self.spatial_unit.location_id_columns) where_clause = "" if self.direction != "both": diff --git a/flowmachine/tests/test_subscriber_per_location_event_stats.py b/flowmachine/tests/test_subscriber_per_location_event_stats.py index 5587eab7ff..c41f6b8491 100644 --- 
a/flowmachine/tests/test_subscriber_per_location_event_stats.py +++ b/flowmachine/tests/test_subscriber_per_location_event_stats.py @@ -2,30 +2,38 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +from flowmachine.core import make_spatial_unit from flowmachine.features.subscriber.per_location_event_stats import * import pytest @pytest.mark.parametrize( - "statistic,msisdn,want,level", + "kwargs,msisdn,want", [ - ("count", "Rzx9WE1QRqdEX2Gp", 16, {}), - ("sum", "LBlWd64rqnMGv7kY", 22, {}), - ("avg", "JZoaw2jzvK2QMKYX", 1.333_333, {}), - ("avg", "JZoaw2jzvK2QMKYX", 1.647_059, {"level": "admin3"}), - ("avg", "JZoaw2jzvK2QMKYX", 1.285_714_2, {"direction": "in"}), - ("max", "DELmRj9Vvl346G50", 4, {}), - ("min", "9vXy462Ej8V1kpWl", 1, {}), - ("stddev", "EkpjZe5z37W70QKA", 0.594_089, {}), - ("variance", "JNK7mk5G1Dy6M2Ya", 0.395_833, {}), + ({"statistic": "count"}, "Rzx9WE1QRqdEX2Gp", 16), + ({"statistic": "sum"}, "LBlWd64rqnMGv7kY", 22), + ({"statistic": "avg"}, "JZoaw2jzvK2QMKYX", 1.333_333), + ( + { + "statistic": "avg", + "spatial_unit": {"spatial_unit_type": "admin", "level": 3}, + }, + "JZoaw2jzvK2QMKYX", + 1.647_059, + ), + ({"statistic": "avg", "direction": "in"}, "JZoaw2jzvK2QMKYX", 1.285_714_2), + ({"statistic": "max"}, "DELmRj9Vvl346G50", 4), + ({"statistic": "min"}, "9vXy462Ej8V1kpWl", 1), + ({"statistic": "stddev"}, "EkpjZe5z37W70QKA", 0.594_089), + ({"statistic": "variance"}, "JNK7mk5G1Dy6M2Ya", 0.395_833), ], ) -def test_per_location_event_stats(get_dataframe, statistic, msisdn, want, level): +def test_per_location_event_stats(get_dataframe, kwargs, msisdn, want): """ Test hand-picked PerLocationEventStats. """ - query = PerLocationEventStats( - "2016-01-01", "2016-01-06", statistic=statistic, **level - ) + if "spatial_unit" in kwargs: + kwargs["spatial_unit"] = make_spatial_unit(**kwargs["spatial_unit"]) + query = PerLocationEventStats("2016-01-01", "2016-01-06", **kwargs) df = get_dataframe(query).set_index("subscriber") assert df.value[msisdn] == pytest.approx(want) From f48b0e60e65db9337ba29d8a3ee2eab4b8e6dace Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sat, 8 Jun 2019 16:58:56 +0100 Subject: [PATCH 084/138] Update TotalLocationEvents --- .../features/location/total_events.py | 249 +++++------------- .../test_total_cell_events_column_names.py | 25 -- .../tests/test_total_location_events.py | 46 +++- 3 files changed, 96 insertions(+), 224 deletions(-) delete mode 100644 flowmachine/tests/test_total_cell_events_column_names.py diff --git a/flowmachine/flowmachine/features/location/total_events.py b/flowmachine/flowmachine/features/location/total_events.py index 1adf5029ea..75f7680192 100644 --- a/flowmachine/flowmachine/features/location/total_events.py +++ b/flowmachine/flowmachine/features/location/total_events.py @@ -13,22 +13,48 @@ """ -from ...core import JoinToLocation +from ...core import location_joined_query, make_spatial_unit from ..utilities import EventsTablesUnion from ...core import Query from ...core.mixins import GeoDataMixin -from flowmachine.utils import get_columns_for_level +class TotalLocationEvents(GeoDataMixin, Query): + """ + Calculates the total number of events on an hourly basis + per location (such as a tower or admin region), + and per interaction type. 
+ + Parameters + ---------- + start : str + ISO format date string to at which to start the analysis + stop : str + As above for the end of the analysis + table : str, default 'all' + Specifies a table of cdr data on which to base the analysis. Table must + exist in events schema. If 'all' then we use all tables specified in + flowmachine.yml. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. + interval : ['hour', 'day', 'min'] + Records activity on an hourly, daily, or by minute level. + direction : str, default 'both' + Look only at incoming or outgoing events. Can be either + 'out', 'in' or 'both'. + """ + + allowed_intervals = {"day", "hour", "min"} -class _TotalCellEvents(Query): def __init__( self, start: str, stop: str, *, table: Union[None, List[str]] = None, + spatial_unit=make_spatial_unit("cell"), interval: str = "hour", direction: str = "both", hours="all", @@ -38,13 +64,14 @@ def __init__( self.start = start self.stop = stop self.table = table + self.spatial_unit = spatial_unit self.interval = interval self.direction = direction - if self.interval not in TotalLocationEvents.allowed_intervals: + if self.interval not in self.allowed_intervals: raise ValueError( "'Interval must be one of: {} got: {}".format( - TotalLocationEvents.allowed_intervals, self.interval + self.allowed_intervals, self.interval ) ) @@ -54,41 +81,40 @@ def __init__( if self.interval == "min": self.time_cols.append("extract(minute FROM datetime) AS min") - self.cols = ["location_id", "datetime"] + events_tables_union_cols = ["location_id", "datetime"] # if we need to filter on outgoing/incoming calls, we will also fetch this # column. Don't fetch it if it is not needed for both efficiency and the # possibility that we might want to do pass another data type which does not # have this information. 
if self.direction != "both": - self.cols += ["outgoing"] + events_tables_union_cols += ["outgoing"] if self.direction not in ["in", "out", "both"]: raise ValueError("Unrecognised direction: {}".format(self.direction)) - # list of columns that we want to group by, these are all the time - # columns, plus the cell column - self.groups = [x.split(" AS ")[0] for x in self.time_cols + ["location_id"]] - self.unioned = EventsTablesUnion( - self.start, - self.stop, - tables=self.table, - columns=self.cols, - hours=hours, - subscriber_subset=subscriber_subset, - subscriber_identifier=subscriber_identifier, + self.unioned = location_joined_query( + EventsTablesUnion( + self.start, + self.stop, + tables=self.table, + columns=events_tables_union_cols, + hours=hours, + subscriber_subset=subscriber_subset, + subscriber_identifier=subscriber_identifier, + ), + spatial_unit=self.spatial_unit, + time_col="datetime", ) - super().__init__() @property def column_names(self) -> List[str]: return ( - ["location_id"] + [x.split(" AS ")[1] for x in self.time_cols] + ["total"] + self.spatial_unit.location_id_columns + + [x.split(" AS ")[1] for x in self.time_cols] + + ["total"] ) def _make_query(self): - # Firstly get the result at the level of the cell, - # we'll do some further aggregation if necessary later - # Set a filter clause based on the direction of the event if self.direction == "both": filter_clause = "" @@ -99,175 +125,24 @@ def _make_query(self): else: raise ValueError("Unrecognised direction: {}".format(self.direction)) + # list of columns that we want to group by, these are all the time + # columns, plus the location columns + groups = [ + x.split(" AS ")[0] for x in self.time_cols + ] + self.spatial_unit.location_id_columns + # We now need to group this table by the relevant columns in order to # get a count per region - sql = """ - + sql = f""" SELECT - location_id, - {time_cols}, + {', '.join(self.spatial_unit.location_id_columns)}, + {', '.join(self.time_cols)}, count(*) AS total FROM - ({union}) unioned - {filter} + ({self.unioned.get_query()}) unioned + {filter_clause} GROUP BY - {groups} - - """.format( - time_cols=", ".join(self.time_cols), - union=self.unioned.get_query(), - groups=", ".join(self.groups), - filter=filter_clause, - ) - - return sql - - -class TotalLocationEvents(GeoDataMixin, Query): - """ - Calculates the total number of events on an hourly basis - per location (such as a tower or admin region), - and per interaction type. - - Parameters - ---------- - start : str - ISO format date string to at which to start the analysis - stop : str - As above for the end of the analysis - table : str, default 'all' - Specifies a table of cdr data on which to base the analysis. Table must - exist in events schema. If 'all' then we use all tables specified in - flowmachine.yml. - level : str - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. 
Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. - interval : ['hour', 'day', 'min'] - Records activity on an hourly, daily, or by minute level. - direction : str, default 'both' - Look only at incoming or outgoing events. Can be either - 'out', 'in' or 'both'. - column_name : str, optional - Option, none-standard, name of the column that identifies the - spatial level, i.e. could pass admin3pcod to use the admin 3 pcode - as opposed to the name of the region. - - """ - - allowed_intervals = {"day", "hour", "min"} - - def __init__( - self, - start: str, - stop: str, - *, - table: Union[None, List[str]] = None, - level: str = "cell", - interval: str = "hour", - direction: str = "both", - column_name: Union[str, None] = None, - hours="all", - subscriber_subset=None, - subscriber_identifier="msisdn", - size=None, - polygon_table=None, - geom_col="geom", - ): - self.start = start - self.stop = stop - self.table = table - self.level = level - self.interval = interval - self.direction = direction - self.column_name = column_name - - if self.interval not in self.allowed_intervals: - raise ValueError( - "'Interval must be one of: {} got: {}".format( - self.allowed_intervals, self.interval - ) - ) - self._obj = _TotalCellEvents( - start, - stop, - table=table, - interval=interval, - direction=direction, - hours=hours, - subscriber_subset=subscriber_subset, - subscriber_identifier=subscriber_identifier, - ) - if level != "cell": - self._obj = JoinToLocation( - self._obj, - level=self.level, - time_col="date", - column_name=column_name, - size=size, - polygon_table=polygon_table, - geom_col=geom_col, - ) - super().__init__() - - @property - def column_names(self) -> List[str]: - cols = get_columns_for_level(self.level, self.column_name) + ["date"] - if self.interval == "hour" or self.interval == "min": - cols += ["hour"] - if self.interval == "min": - cols += ["min"] - cols += ["total"] - return cols - - def _make_query(self): - # Grouped now represents a query with activities on the level of the cell, - # if that is what the user has asked for then we are done, otherwise - # we need to do the appropriate join and do a further group by. - if self.level == "cell": - sql = self._obj.get_query() - cols = self._obj.groups - # Otherwise we're after lat-lon, or an admin region. - # in either case we need to join with the cell data - else: - cols = ", ".join(get_columns_for_level(self.level, self.column_name)) - cols += ", " + self._obj.time_col - if self.interval == "hour" or self.interval == "min": - cols += ",hour" - if self.interval == "min": - cols += ",min" - - sql = """ - - SELECT - {cols}, - sum(total) AS total - FROM - ({j}) AS j - GROUP BY - {cols} - ORDER BY {cols} - """.format( - cols=cols, j=self._obj.get_query() - ) + {', '.join(groups)} + """ return sql diff --git a/flowmachine/tests/test_total_cell_events_column_names.py b/flowmachine/tests/test_total_cell_events_column_names.py deleted file mode 100644 index e1d83c75f9..0000000000 --- a/flowmachine/tests/test_total_cell_events_column_names.py +++ /dev/null @@ -1,25 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -""" -Test column_names property of _TotalCellEvents -""" - -import pytest - -from flowmachine.features.location.total_events import ( - TotalLocationEvents, - _TotalCellEvents, -) - - -@pytest.mark.usefixtures("skip_datecheck") -@pytest.mark.parametrize("interval", TotalLocationEvents.allowed_intervals) -@pytest.mark.parametrize("direction", ["in", "out", "both"]) -def test_total_cell_events_column_names(interval, direction): - """ Test that column_names property of _TotalCellEvents matches head(0)""" - tce = _TotalCellEvents( - "2016-01-01", "2016-01-04", interval=interval, direction=direction - ) - assert tce.head(0).columns.tolist() == tce.column_names diff --git a/flowmachine/tests/test_total_location_events.py b/flowmachine/tests/test_total_location_events.py index 6b23fbec57..10f433dc32 100644 --- a/flowmachine/tests/test_total_location_events.py +++ b/flowmachine/tests/test_total_location_events.py @@ -7,20 +7,23 @@ """ import pytest +from flowmachine.core import make_spatial_unit from flowmachine.features import TotalLocationEvents @pytest.mark.usefixtures("skip_datecheck") @pytest.mark.parametrize("interval", TotalLocationEvents.allowed_intervals) @pytest.mark.parametrize("direction", ["in", "out", "both"]) -def test_total_location_events_column_names(exemplar_level_param, interval, direction): +def test_total_location_events_column_names( + exemplar_spatial_unit_param, interval, direction +): """ Test that column_names property of TotalLocationEvents matches head(0)""" tle = TotalLocationEvents( "2016-01-01", "2016-01-04", - **exemplar_level_param, + spatial_unit=exemplar_spatial_unit_param, interval=interval, - direction=direction + direction=direction, ) assert tle.head(0).columns.tolist() == tle.column_names @@ -30,15 +33,19 @@ def test_events_at_cell_level(get_dataframe): TotalLocationEvents() returns data at the level of the cell. """ - te = TotalLocationEvents("2016-01-01", "2016-01-04", level="versioned-site") + te = TotalLocationEvents( + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("cell") + ) df = get_dataframe(te) # Test one of the values df.date = df.date.astype(str) val = list( - df[(df.date == "2016-01-03") & (df.site_id == "zArRjg") & (df.hour == 17)].total + df[ + (df.date == "2016-01-03") & (df.location_id == "1Gc6RSfZ") & (df.hour == 17) + ].total )[0] - assert val == 3 + assert val == 4 def test_ignore_texts(get_dataframe): @@ -46,7 +53,10 @@ def test_ignore_texts(get_dataframe): TotalLocationEvents() can get the total activity at cell level excluding texts. """ te = TotalLocationEvents( - "2016-01-01", "2016-01-04", level="versioned-site", table="events.calls" + "2016-01-01", + "2016-01-04", + spatial_unit=make_spatial_unit("versioned-site"), + table="events.calls", ) df = get_dataframe(te) @@ -63,7 +73,10 @@ def test_only_incoming(get_dataframe): TotalLocationEvents() can get activity, ignoring outgoing calls. """ te = TotalLocationEvents( - "2016-01-01", "2016-01-04", level="versioned-site", direction="in" + "2016-01-01", + "2016-01-04", + spatial_unit=make_spatial_unit("versioned-site"), + direction="in", ) df = get_dataframe(te) # Test one of the values @@ -79,7 +92,10 @@ def test_events_daily(get_dataframe): TotalLocationEvents() can get activity on a daily level. 
""" te = TotalLocationEvents( - "2016-01-01", "2016-01-04", level="versioned-site", interval="day" + "2016-01-01", + "2016-01-04", + spatial_unit=make_spatial_unit("versioned-site"), + interval="day", ) df = get_dataframe(te) @@ -94,7 +110,10 @@ def test_events_min(get_dataframe): TotalLocationEvents() can get events on a min-by-min basis. """ te = TotalLocationEvents( - "2016-01-01", "2016-01-04", level="versioned-site", interval="min" + "2016-01-01", + "2016-01-04", + spatial_unit=make_spatial_unit("versioned-site"), + interval="min", ) df = get_dataframe(te) @@ -117,7 +136,7 @@ def test_bad_direction_raises_error(): TotalLocationEvents( "2016-01-01", "2016-01-04", - level="versioned-site", + spatial_unit=make_spatial_unit("versioned-site"), interval="min", direction="BAD_DIRECTION", ) @@ -127,5 +146,8 @@ def test_bad_interval_raises_error(): """Total location events raises an error for a bad interval.""" with pytest.raises(ValueError): TotalLocationEvents( - "2016-01-01", "2016-01-04", level="versioned-site", interval="BAD_INTERVAL" + "2016-01-01", + "2016-01-04", + spatial_unit=make_spatial_unit("versioned-site"), + interval="BAD_INTERVAL", ) From 26ff2ed41eab7de1e715abe4696c2e04788b954e Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sat, 8 Jun 2019 17:43:20 +0100 Subject: [PATCH 085/138] Updated EventScore --- .../flowmachine/features/subscriber/scores.py | 62 +++++-------------- .../subscriber/subscriber_location_cluster.py | 2 +- flowmachine/tests/test_label_event_score.py | 28 ++++++--- .../tests/test_meaningful_locations.py | 40 +++++++++--- flowmachine/tests/test_scores.py | 23 ++++--- .../tests/test_subscriber_location_cluster.py | 12 +++- 6 files changed, 92 insertions(+), 75 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/scores.py b/flowmachine/flowmachine/features/subscriber/scores.py index 1b28fdef2d..d4433608f9 100644 --- a/flowmachine/flowmachine/features/subscriber/scores.py +++ b/flowmachine/flowmachine/features/subscriber/scores.py @@ -13,9 +13,7 @@ from typing import List from ..utilities import EventsTablesUnion -from ...core import Query -from ...core import JoinToLocation -from flowmachine.utils import get_columns_for_level +from ...core import Query, location_joined_query, make_spatial_unit class EventScore(Query): @@ -27,7 +25,7 @@ class EventScore(Query): based on its signature. Such type of analysis reduces the dimensionality of the problem by projecting a given event pattern onto the real line. - This class returns a table with scores averaged across the requested level + This class returns a table with scores averaged across the requested spatial unit per subscriber. Parameters @@ -42,29 +40,9 @@ class EventScore(Query): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - level : str, default 'admin3' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. 
Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. hours : tuple of ints, default 'all' Subset the result within certain hours, e.g. (4,17) This will subset the query only with these hours, but @@ -79,15 +57,11 @@ class EventScore(Query): If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. - column_name : str, optional - Option, none-standard, name of the column that identifies the - spatial level, i.e. could pass admin3pcod to use the admin 3 pcode - as opposed to the name of the region. Examples -------- >>> es = EventScore(start='2016-01-01', stop='2016-01-05', - level='versioned-site') + spatial_unit=make_spatial_unit('versioned-site')) >>> es.head() subscriber location_id version score_hour score_dow 3EgqzplqPYDyGRVK DbWg4K 0 0.0 -1.0 @@ -104,7 +78,7 @@ def __init__( *, start: str, stop: str, - level: str = "admin3", + spatial_unit=None, hours: Union[str, Tuple[int, int]] = "all", table: Union[str, List[str]] = "all", score_hour: List[float] = [ @@ -143,10 +117,7 @@ def __init__( "sunday": -1, }, subscriber_identifier: str = "msisdn", - column_name: Union[str, List[str]] = None, subscriber_subset=None, - polygon_table=None, - size=None, ): if set(score_dow.keys()) != { "monday", @@ -162,7 +133,7 @@ def __init__( ) if len(score_hour) != 24: raise ValueError( - f"Hour of day score dictionary must have 24 hours. Got {len(score_hour)}" + f"Hour of day score list must have 24 hours. 
Got {len(score_hour)}" ) if not all([-1 <= float(x) <= 1 for x in score_hour]): raise ValueError(f"Hour of day scores must be floats between -1 and 1.") @@ -170,13 +141,15 @@ def __init__( raise ValueError(f"Day of week scores must be floats between -1 and 1.") self.score_hour = score_hour self.score_dow = score_dow - self.level = level + if spatial_unit is None: + self.spatial_unit = make_spatial_unit("admin", level=3) + else: + self.spatial_unit = spatial_unit self.start = start self.stop = stop self.hours = hours self.subscriber_identifier = subscriber_identifier - self.column_name = column_name - self.sds = JoinToLocation( + self.sds = location_joined_query( EventsTablesUnion( start=start, stop=stop, @@ -186,11 +159,8 @@ def __init__( subscriber_subset=subscriber_subset, subscriber_identifier=self.subscriber_identifier, ), - level=self.level, + spatial_unit=self.spatial_unit, time_col="datetime", - column_name=self.column_name, - polygon_table=polygon_table, - size=size, ) super().__init__() @@ -212,7 +182,7 @@ def _make_query(self): {" ".join(f"WHEN dow='{dow}' THEN {score}" for dow, score in self.score_dow.items())} END)""" - location_cols = get_columns_for_level(self.level, self.column_name) + location_cols = self.spatial_unit.location_id_columns query = f""" SELECT subscriber, {", ".join(location_cols)}, datetime, @@ -241,6 +211,6 @@ def _make_query(self): def column_names(self) -> List[str]: return ( ["subscriber"] - + get_columns_for_level(self.level, self.column_name) + + self.spatial_unit.location_id_columns + ["score_hour", "score_dow"] ) diff --git a/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py b/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py index deafc5d47c..c7adf1f845 100644 --- a/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py +++ b/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py @@ -291,7 +291,7 @@ def join_to_cluster_components(self, query): -------- >>> es = EventScore(start='2016-01-01', stop='2016-01-05', - level='versioned-site') + spatial_unit=make_spatial_unit('versioned-site')) >>> cd = CallDays(start='2016-01-01', stop='2016-01-04', level='versioned-site') diff --git a/flowmachine/tests/test_label_event_score.py b/flowmachine/tests/test_label_event_score.py index 2bf58dda26..aeee76f385 100644 --- a/flowmachine/tests/test_label_event_score.py +++ b/flowmachine/tests/test_label_event_score.py @@ -4,18 +4,18 @@ import pytest -from flowmachine.core import JoinToLocation +from flowmachine.core import make_spatial_unit from flowmachine.features import EventScore from flowmachine.features.subscriber.label_event_score import LabelEventScore @pytest.mark.usefixtures("skip_datecheck") def test_labelled_event_score_column_names( - exemplar_level_param, get_column_names_from_run + exemplar_spatial_unit_param, get_column_names_from_run ): - if exemplar_level_param["level"] not in JoinToLocation.allowed_levels: - pytest.skip(f'{exemplar_level_param["level"]} not valid for this test') - es = EventScore(start="2016-01-01", stop="2016-01-05", **exemplar_level_param) + es = EventScore( + start="2016-01-01", stop="2016-01-05", spatial_unit=exemplar_spatial_unit_param + ) labelled = LabelEventScore(scores=es, required="evening") assert get_column_names_from_run(labelled) == labelled.column_names @@ -24,7 +24,11 @@ def test_locations_are_labelled_correctly(get_dataframe): """ Test whether locations are labelled corrected. 
""" - es = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site") + es = EventScore( + start="2016-01-01", + stop="2016-01-05", + spatial_unit=make_spatial_unit("versioned-site"), + ) ls = LabelEventScore( scores=es, @@ -43,7 +47,11 @@ def test_whether_passing_reserved_label_fails(): """ Test whether passing the reserved label 'unknown' fails. """ - es = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site") + es = EventScore( + start="2016-01-01", + stop="2016-01-05", + spatial_unit=make_spatial_unit("versioned-site"), + ) with pytest.raises(ValueError): ls = LabelEventScore( @@ -69,7 +77,11 @@ def test_whether_required_label_relabels(get_dataframe): """ Test whether required label relabel the location of subscribers who did not originally have the required label. """ - es = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site") + es = EventScore( + start="2016-01-01", + stop="2016-01-05", + spatial_unit=make_spatial_unit("versioned-site"), + ) ls = LabelEventScore( scores=es, diff --git a/flowmachine/tests/test_meaningful_locations.py b/flowmachine/tests/test_meaningful_locations.py index 0f163a9dd4..1a23021115 100644 --- a/flowmachine/tests/test_meaningful_locations.py +++ b/flowmachine/tests/test_meaningful_locations.py @@ -43,7 +43,9 @@ def test_column_names_meaningful_locations(get_column_names_from_run): radius=1, ), scores=EventScore( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=make_spatial_unit("versioned-site"), ), labels=labels, label="evening", @@ -73,7 +75,9 @@ def test_column_names_meaningful_locations_aggregate( radius=1, ), scores=EventScore( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=make_spatial_unit("versioned-site"), ), labels=labels, label="evening", @@ -101,7 +105,9 @@ def test_meaningful_locations_aggregate_disallowed_level_raises(): radius=1, ), scores=EventScore( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=make_spatial_unit("versioned-site"), ), labels=labels, label="evening", @@ -130,7 +136,9 @@ def test_column_names_meaningful_locations_od( radius=1, ), scores=EventScore( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=make_spatial_unit("versioned-site"), ), labels=labels, label="evening", @@ -148,7 +156,9 @@ def test_column_names_meaningful_locations_od( radius=1, ), scores=EventScore( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=make_spatial_unit("versioned-site"), ), labels=labels, label="unknown", @@ -184,7 +194,9 @@ def test_meaningful_locations_results( radius=1, ), scores=EventScore( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=make_spatial_unit("versioned-site"), ), labels=labels, label=label, @@ -218,7 +230,9 @@ def test_meaningful_locations_aggregation_results(exemplar_level_param, get_data radius=1, ), scores=EventScore( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=make_spatial_unit("versioned-site"), ), labels=labels, label="evening", @@ -252,7 +266,9 @@ def test_meaningful_locations_od_raises_for_bad_level( radius=1, ), scores=EventScore( - start="2016-01-01", 
stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=make_spatial_unit("versioned-site"), ), labels=labels, label="evening", @@ -282,7 +298,9 @@ def test_meaningful_locations_od_results(get_dataframe): radius=1, ), scores=EventScore( - start="2016-01-01", stop="2016-01-02", level="versioned-site" + start="2016-01-01", + stop="2016-01-02", + spatial_unit=make_spatial_unit("versioned-site"), ), labels=labels, label="unknown", @@ -300,7 +318,9 @@ def test_meaningful_locations_od_results(get_dataframe): radius=1, ), scores=EventScore( - start="2016-01-02", stop="2016-01-03", level="versioned-site" + start="2016-01-02", + stop="2016-01-03", + spatial_unit=make_spatial_unit("versioned-site"), ), labels=labels, label="unknown", diff --git a/flowmachine/tests/test_scores.py b/flowmachine/tests/test_scores.py index 5dffab1c19..1ba8fe800e 100644 --- a/flowmachine/tests/test_scores.py +++ b/flowmachine/tests/test_scores.py @@ -9,15 +9,15 @@ import pytest -from flowmachine.core import JoinToLocation +from flowmachine.core import make_spatial_unit from flowmachine.features import EventScore @pytest.mark.usefixtures("skip_datecheck") -def test_event_score_column_names(exemplar_level_param): - if exemplar_level_param["level"] not in JoinToLocation.allowed_levels: - pytest.skip(f'{exemplar_level_param["level"]} not valid for this test') - es = EventScore(start="2016-01-01", stop="2016-01-05", **exemplar_level_param) +def test_event_score_column_names(exemplar_spatial_unit_param): + es = EventScore( + start="2016-01-01", stop="2016-01-05", spatial_unit=exemplar_spatial_unit_param + ) assert es.head(0).columns.tolist() == es.column_names @@ -25,7 +25,11 @@ def test_whether_scores_are_within_score_bounds(get_dataframe): """ Test whether the scores are within the bounds of maximum and minimum scores. 
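# A hedged sketch (not part of the patch) of supplying custom scoring parameters
# to the updated EventScore, as exercised by the bounds tests above: score_hour
# must be a list of exactly 24 values and score_dow a dict covering all seven
# lowercase day names, every value between -1 and 1, otherwise a ValueError is
# raised. Assumes a configured flowmachine connection; dates are illustrative.
from flowmachine.core import make_spatial_unit
from flowmachine.features import EventScore

es = EventScore(
    start="2016-01-01",
    stop="2016-01-05",
    spatial_unit=make_spatial_unit("versioned-site"),
    score_hour=[0.0] * 24,  # 24 hourly scores, each in [-1, 1]
    score_dow={
        day: 0.0  # one score per day of week, each in [-1, 1]
        for day in (
            "monday", "tuesday", "wednesday", "thursday",
            "friday", "saturday", "sunday",
        )
    },
)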
""" - es = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site") + es = EventScore( + start="2016-01-01", + stop="2016-01-05", + spatial_unit=make_spatial_unit("versioned-site"), + ) df = get_dataframe(es) max_score = df[["score_hour", "score_dow"]].max() min_score = df[["score_hour", "score_dow"]].min() @@ -57,7 +61,10 @@ def test_out_of_bounds_score_raises(scorer, out_of_bounds_val, flowmachine_conne scorers[scorer][scorers[scorer].popitem()[0]] = out_of_bounds_val with pytest.raises(ValueError): es = EventScore( - start="2016-01-01", stop="2016-01-05", level="versioned-site", **scorers + start="2016-01-01", + stop="2016-01-05", + spatial_unit=make_spatial_unit("versioned-site"), + **scorers ) @@ -81,7 +88,7 @@ def test_whether_zero_score_returns_only_zero(get_dataframe): }, 0, ), - level="versioned-site", + spatial_unit=make_spatial_unit("versioned-site"), ) df = get_dataframe(es) valid = df[["score_hour", "score_dow"]] == 0 diff --git a/flowmachine/tests/test_subscriber_location_cluster.py b/flowmachine/tests/test_subscriber_location_cluster.py index 5aa55493f8..65b1a53add 100644 --- a/flowmachine/tests/test_subscriber_location_cluster.py +++ b/flowmachine/tests/test_subscriber_location_cluster.py @@ -49,7 +49,11 @@ def test_joined_hartigan_column_names(get_column_names_from_run): ) ) hartigan = HartiganCluster(calldays=cd, radius=50) - es = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site") + es = EventScore( + start="2016-01-01", + stop="2016-01-05", + spatial_unit=make_spatial_unit("versioned-site"), + ) joined = hartigan.join_to_cluster_components(es) assert get_column_names_from_run(joined) == joined.column_names @@ -260,7 +264,11 @@ def test_join_returns_the_same_clusters(): hartigan = HartiganCluster(calldays=cd, radius=50) har_df = hartigan.to_geopandas() - es = EventScore(start="2016-01-01", stop="2016-01-04", level="versioned-site") + es = EventScore( + start="2016-01-01", + stop="2016-01-04", + spatial_unit=make_spatial_unit("versioned-site"), + ) joined = ( hartigan.join_to_cluster_components(es) From 5e90de08499d8a6db32b3512c51d9ac69bcaa89a Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sat, 8 Jun 2019 20:06:45 +0100 Subject: [PATCH 086/138] Update PerLocationSubscriberCallDurations, PairedPerLocationSubscriberCallDurations and SubscriberLocationSubset --- .../subscriber/subscriber_call_durations.py | 175 +++++------------- .../flowmachine/features/utilities/sets.py | 68 +++---- flowmachine/tests/conftest.py | 2 +- .../tests/test_subscriber_call_durations.py | 44 +++-- .../tests/test_subscriber_location_subset.py | 19 +- 5 files changed, 107 insertions(+), 201 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py b/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py index 688c8d1399..1a5cb64798 100644 --- a/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py +++ b/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py @@ -12,8 +12,7 @@ import warnings from typing import List -from ...core import JoinToLocation -from flowmachine.utils import get_columns_for_level +from ...core import location_joined_query, make_spatial_unit from ..utilities import EventsTablesUnion from .metaclasses import SubscriberFeature @@ -64,16 +63,12 @@ def __init__( self, start, stop, + *, subscriber_identifier="msisdn", direction="out", statistic="sum", - *, hours="all", subscriber_subset=None, - level=None, - size=None, - column_name=None, - 
polygon_table=None, ): self.start = start self.stop = stop @@ -141,31 +136,9 @@ class PerLocationSubscriberCallDurations(SubscriberFeature): subscriber_identifier (typically, msisdn), to limit results to. direction : {'in', 'out', 'both'}, default 'out' Whether to consider calls made, received, or both. Defaults to 'out'. - level : str, default 'admin3' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. - column_name : str - Optionally specify a non-default column name. Required if level is 'polygon'. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. statistic : {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'sum' Defaults to sum, aggregation statistic over the durations. @@ -188,24 +161,22 @@ def __init__( self, start, stop, + *, subscriber_identifier="msisdn", direction="out", - level="admin3", statistic="sum", - column_name=None, - *, + spatial_unit=None, hours="all", subscriber_subset=None, - size=None, - polygon_table=None, - geom_col="geom", ): self.start = start self.stop = stop self.subscriber_identifier = subscriber_identifier self.direction = direction - self.level = level - self.column_name = column_name + if spatial_unit is None: + self.spatial_unit = make_spatial_unit("admin", level=3) + else: + self.spatial_unit = spatial_unit self.statistic = statistic.lower() if self.statistic not in valid_stats: raise ValueError( @@ -222,48 +193,33 @@ def __init__( "outgoing", "duration", "location_id", + "datetime", ] - self.unioned_query = EventsTablesUnion( - self.start, - self.stop, - tables="events.calls", - columns=column_list, - hours=hours, - subscriber_subset=subscriber_subset, - subscriber_identifier=subscriber_identifier, - ) - if self.level != "cell": - etu = EventsTablesUnion( + self.unioned_query = location_joined_query( + EventsTablesUnion( self.start, self.stop, tables="events.calls", - columns=column_list + ["datetime"], + columns=column_list, hours=hours, subscriber_subset=subscriber_subset, subscriber_identifier=self.subscriber_identifier, - ) - - self.unioned_query = JoinToLocation( - etu, - level=self.level, - column_name=self.column_name, - time_col="datetime", - size=size, - polygon_table=polygon_table, - geom_col=geom_col, - ) + ), + spatial_unit=self.spatial_unit, + time_col="datetime", + ) super().__init__() @property def column_names(self) -> List[str]: return ( ["subscriber"] - + get_columns_for_level(self.level, self.column_name) + + self.spatial_unit.location_id_columns + [f"duration_{self.statistic}"] ) def _make_query(self): - loc_cols = ", ".join(get_columns_for_level(self.level, 
self.column_name)) + loc_cols = ", ".join(self.spatial_unit.location_id_columns) where_clause = "" if self.direction != "both": where_clause = "WHERE {}outgoing".format( @@ -319,10 +275,6 @@ def __init__( start, stop, *, - level=None, - size=None, - column_name=None, - polygon_table=None, subscriber_identifier="msisdn", statistic="sum", hours="all", @@ -390,31 +342,9 @@ class PairedPerLocationSubscriberCallDurations(SubscriberFeature): If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. - level : str, default 'admin3' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. - column_name : str - Optionally specify a non-default column name. Required if level is 'polygon'. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. statistic : {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'sum' Defaults to 'sum', aggregation statistic over the durations. 
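# A minimal sketch (not part of the patch) of the location_joined_query pattern
# that these call-duration classes now use internally: build an
# EventsTablesUnion and map its rows onto a spatial unit in one step, instead
# of the old EventsTablesUnion + JoinToLocation chain. Assumes a configured
# flowmachine connection; dates follow the test dataset used in these tests.
from flowmachine.core import location_joined_query, make_spatial_unit
from flowmachine.features import EventsTablesUnion

events = EventsTablesUnion(
    "2016-01-01",
    "2016-01-07",
    tables="events.calls",
    columns=["msisdn", "datetime", "location_id"],
)
located_events = location_joined_query(
    events,
    spatial_unit=make_spatial_unit("admin", level=3),
    time_col="datetime",
)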
@@ -444,22 +374,20 @@ def __init__( self, start, stop, + *, subscriber_identifier="msisdn", - level="admin3", - column_name=None, statistic="sum", - *, + spatial_unit=None, hours="all", subscriber_subset=None, - size=None, - polygon_table=None, - geom_col="geom", ): self.start = start self.stop = stop self.subscriber_identifier = subscriber_identifier - self.level = level - self.column_name = column_name + if spatial_unit is None: + self.spatial_unit = make_spatial_unit("admin", level=3) + else: + self.spatial_unit = spatial_unit self.statistic = statistic.lower() if self.statistic not in valid_stats: raise ValueError( @@ -475,35 +403,22 @@ def __init__( "outgoing", "duration", "location_id", + "datetime", ] - unioned_query = EventsTablesUnion( - self.start, - self.stop, - tables="events.calls", - columns=column_list, - hours=hours, - subscriber_subset=subscriber_subset, - subscriber_identifier=self.subscriber_identifier, - ) - if self.level != "cell": - etu = EventsTablesUnion( + unioned_query = location_joined_query( + EventsTablesUnion( self.start, self.stop, tables="events.calls", - columns=column_list + ["datetime"], + columns=column_list, hours=hours, subscriber_subset=subscriber_subset, subscriber_identifier=self.subscriber_identifier, - ) - unioned_query = JoinToLocation( - etu, - level=self.level, - column_name=self.column_name, - time_col="datetime", - size=size, - polygon_table=polygon_table, - geom_col=geom_col, - ) + ), + spatial_unit=self.spatial_unit, + time_col="datetime", + ) + self.joined = unioned_query.subset("outgoing", "t").join( unioned_query.subset("outgoing", "f"), on_left="id", @@ -518,19 +433,15 @@ def __init__( def column_names(self) -> List[str]: return ( ["subscriber", "msisdn_counterpart"] - + get_columns_for_level(self.level, self.column_name) - + [ - f"{x}_counterpart" - for x in get_columns_for_level(self.level, self.column_name) - ] + + self.spatial_unit.location_id_columns + + [f"{x}_counterpart" for x in self.spatial_unit.location_id_columns] + [f"duration_{self.statistic}"] ) def _make_query(self): - loc_cols = get_columns_for_level(self.level, self.column_name) + loc_cols = self.spatial_unit.location_id_columns loc_cols += [ - "{}_counterpart".format(c) - for c in get_columns_for_level(self.level, self.column_name) + "{}_counterpart".format(c) for c in self.spatial_unit.location_id_columns ] loc_cols = ", ".join(loc_cols) diff --git a/flowmachine/flowmachine/features/utilities/sets.py b/flowmachine/flowmachine/features/utilities/sets.py index f645119c31..53e3609f46 100644 --- a/flowmachine/flowmachine/features/utilities/sets.py +++ b/flowmachine/flowmachine/features/utilities/sets.py @@ -11,8 +11,7 @@ from .event_table_subset import EventTableSubset from .events_tables_union import EventsTablesUnion -from ...core import Query -from flowmachine.utils import get_columns_for_level +from ...core import Query, make_spatial_unit from numpy import inf @@ -137,44 +136,27 @@ class SubscriberLocationSubset(Query): Start time to filter query. stop : datetime Stop time to filter query. - geoms : flowmachine.Query - An object of type - level : str, default 'admin3' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. 
In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. min_calls : int minimum number of calls a user must have made within a - name_col : str - Name of column with name associated to geometry - geom_col : str - Name of column containing geometry direction : {'in', 'out', 'both'}, default 'both' Whether to consider calls made, received, or both. Defaults to 'both'. + hours : 2-tuple of floats, default 'all' + Restrict the analysis to only a certain set + of hours within each day. + subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn' + Either msisdn, or imei, the column that identifies the subscriber. + subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None + If provided, string or list of string which are msisdn or imeis to limit + results to; or, a query or table which has a column with a name matching + subscriber_identifier (typically, msisdn), to limit results to. Examples -------- >>> sls = SubscriberLocationSubset("2016-01-01", "2016-01-07", min_calls=3, - direction="both", level="admin3") + direction="both", spatial_unit=make_spatial_unit("admin", level=3)) >>> sls.head() subscriber name @@ -194,13 +176,9 @@ def __init__( min_calls, subscriber_identifier="msisdn", direction="both", - level="admin3", - column_name=None, + spatial_unit=None, hours="all", subscriber_subset=None, - size=None, - polygon_table=None, - geom_col="geom", ): from ...features import PerLocationSubscriberCallDurations @@ -210,22 +188,20 @@ def __init__( self.min_calls = min_calls self.subscriber_identifier = subscriber_identifier self.direction = direction - self.level = level - self.column_name = column_name + if spatial_unit is None: + self.spatial_unit = make_spatial_unit("admin", level=3) + else: + self.spatial_unit = spatial_unit self.pslds = PerLocationSubscriberCallDurations( start=self.start, stop=self.stop, subscriber_identifier=self.subscriber_identifier, direction=self.direction, - level=self.level, + spatial_unit=self.spatial_unit, statistic="count", - column_name=self.column_name, hours=hours, subscriber_subset=subscriber_subset, - size=size, - polygon_table=polygon_table, - geom_col=geom_col, ) self.pslds_subset = self.pslds.numeric_subset( @@ -236,11 +212,11 @@ def __init__( @property def column_names(self) -> List[str]: - return ["subscriber"] + get_columns_for_level(self.level, self.column_name) + return ["subscriber"] + self.spatial_unit.location_id_columns def _make_query(self): - loc_cols = ", ".join(get_columns_for_level(self.level, self.column_name)) + loc_cols = ", ".join(self.spatial_unit.location_id_columns) sql = f""" SELECT diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index 04bb11d214..e85922cbe0 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -82,7 +82,7 @@ def exemplar_level_param(request): }, { "spatial_unit_type": "polygon", - 
"region_id_column_name": "id", + "region_id_column_name": "id AS site_id", "geom_table": "infrastructure.sites", "geom_column": "geom_point", }, diff --git a/flowmachine/tests/test_subscriber_call_durations.py b/flowmachine/tests/test_subscriber_call_durations.py index dde1e78cfe..32f86181b4 100644 --- a/flowmachine/tests/test_subscriber_call_durations.py +++ b/flowmachine/tests/test_subscriber_call_durations.py @@ -2,26 +2,40 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -from flowmachine.features.subscriber.subscriber_call_durations import * import pytest +from flowmachine.core import make_spatial_unit +from flowmachine.features.subscriber.subscriber_call_durations import * + + +@pytest.mark.parametrize( + "query", [SubscriberCallDurations, PairedSubscriberCallDurations] +) +@pytest.mark.parametrize("stat", valid_stats) +def test_subscriber_call_durations_column_names(query, stat): + """ + Test that column_names property matches head(0) + """ + query_instance = query("2016-01-01", "2016-01-07", statistic=stat) + assert query_instance.head(0).columns.tolist() == query_instance.column_names + @pytest.mark.parametrize( "query", - [ - SubscriberCallDurations, - PairedPerLocationSubscriberCallDurations, - PerLocationSubscriberCallDurations, - PairedSubscriberCallDurations, - ], + [PairedPerLocationSubscriberCallDurations, PerLocationSubscriberCallDurations], ) @pytest.mark.parametrize("stat", valid_stats) -def test_subscriber_call_durations_column_names(query, exemplar_level_param, stat): +def test_per_location_subscriber_call_durations_column_names( + query, exemplar_spatial_unit_param, stat +): """ Test that column_names property matches head(0) """ query_instance = query( - "2016-01-01", "2016-01-07", **exemplar_level_param, statistic=stat + "2016-01-01", + "2016-01-07", + spatial_unit=exemplar_spatial_unit_param, + statistic=stat, ) assert query_instance.head(0).columns.tolist() == query_instance.column_names @@ -33,9 +47,9 @@ def test_polygon_tables(get_dataframe): per_location_durations = PerLocationSubscriberCallDurations( "2016-01-01", "2016-01-07", - level="polygon", - polygon_table="geography.admin3", - column_name="admin3name", + spatial_unit=make_spatial_unit( + "polygon", geom_table="geography.admin3", region_id_column_name="admin3name" + ), ) df = get_dataframe(per_location_durations) @@ -55,9 +69,9 @@ def test_polygon_tables(get_dataframe): paired_per_location_durations = PairedPerLocationSubscriberCallDurations( "2016-01-01", "2016-01-07", - level="polygon", - polygon_table="geography.admin3", - column_name="admin3name", + spatial_unit=make_spatial_unit( + "polygon", geom_table="geography.admin3", region_id_column_name="admin3name" + ), ) df = get_dataframe(paired_per_location_durations) diff --git a/flowmachine/tests/test_subscriber_location_subset.py b/flowmachine/tests/test_subscriber_location_subset.py index 1699d92719..313ceb6379 100644 --- a/flowmachine/tests/test_subscriber_location_subset.py +++ b/flowmachine/tests/test_subscriber_location_subset.py @@ -2,13 +2,16 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-from flowmachine.core import Table +from flowmachine.core import Table, make_spatial_unit from flowmachine.features import SubscriberLocationSubset, UniqueSubscribers -def test_subscriber_location_subset_column_names(exemplar_level_param): +def test_subscriber_location_subset_column_names(exemplar_spatial_unit_param): ss = SubscriberLocationSubset( - "2016-01-01", "2016-01-07", min_calls=1, **exemplar_level_param + "2016-01-01", + "2016-01-07", + min_calls=1, + spatial_unit=exemplar_spatial_unit_param, ) assert ss.head(0).columns.tolist() == ss.column_names @@ -22,7 +25,9 @@ def test_subscribers_make_atleast_one_call_in_admin0(): start, stop = "2016-01-01", "2016-01-07" - sls = SubscriberLocationSubset(start, stop, min_calls=1, level="admin0") + sls = SubscriberLocationSubset( + start, stop, min_calls=1, spatial_unit=make_spatial_unit("admin", level=0) + ) us = UniqueSubscribers(start, stop, table="events.calls") sls_subs = set(sls.get_dataframe()["subscriber"]) @@ -46,9 +51,9 @@ def test_subscribers_who_make_atleast_3_calls_in_central_development_region(): start, stop, min_calls=2, - level="polygon", - column_name="admin2pcod", - polygon_table=regions, + spatial_unit=make_spatial_unit( + "polygon", region_id_column_name="admin2pcod", geom_table=regions + ), ) df = sls.get_dataframe() From 310fedcd102a380347316fc77fd3b64c78350ed7 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sun, 9 Jun 2019 00:20:52 +0100 Subject: [PATCH 087/138] Update CallDays --- .../features/subscriber/call_days.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/call_days.py b/flowmachine/flowmachine/features/subscriber/call_days.py index 4730e1b09e..3012d31ea8 100644 --- a/flowmachine/flowmachine/features/subscriber/call_days.py +++ b/flowmachine/flowmachine/features/subscriber/call_days.py @@ -13,7 +13,6 @@ from typing import List, Union from ...core import JoinToLocation -from flowmachine.utils import get_columns_for_level from .metaclasses import SubscriberFeature from ..utilities.subscriber_locations import _SubscriberCells @@ -36,15 +35,12 @@ class CallDays(SubscriberFeature): def __init__(self, subscriber_locations: Union[JoinToLocation, _SubscriberCells]): self.ul = subscriber_locations + self.spatial_unit = self.ul.spatial_unit super().__init__() @property def column_names(self) -> List[str]: - return ( - ["subscriber"] - + get_columns_for_level(self.ul.level, self.ul.column_name) - + ["calldays"] - ) + return ["subscriber"] + self.spatial_unit.location_id_columns + ["calldays"] def _make_query(self): """ @@ -52,21 +48,19 @@ def _make_query(self): metaclass Query(). Returns a sorted calldays table. 
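# A rough sketch (not part of the patch) of how the updated CallDays composes
# with the classes changed in the neighbouring commits: SubscriberLocations
# feeds CallDays, which feeds HartiganCluster, whose clusters can be joined to
# an EventScore. Assumes a configured flowmachine connection; the radius and
# dates are illustrative only.
from flowmachine.core import make_spatial_unit
from flowmachine.features import (
    CallDays,
    EventScore,
    HartiganCluster,
    SubscriberLocations,
)

cd = CallDays(
    subscriber_locations=SubscriberLocations(
        "2016-01-01",
        "2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
)
clusters = HartiganCluster(calldays=cd, radius=50)
scores = EventScore(
    start="2016-01-01",
    stop="2016-01-04",
    spatial_unit=make_spatial_unit("versioned-site"),
)
joined = clusters.join_to_cluster_components(scores)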
""" - relevant_columns = ", ".join( - get_columns_for_level(self.ul.level, self.ul.column_name) - ) + location_columns = ", ".join(self.spatial_unit.location_id_columns) sql = f""" SELECT * FROM ( SELECT connections.subscriber, - {relevant_columns}, + {location_columns}, COUNT(*) AS calldays FROM ( - SELECT DISTINCT locations.subscriber, {relevant_columns}, locations.time::date + SELECT DISTINCT locations.subscriber, {location_columns}, locations.time::date FROM ({self.ul.get_query()}) AS locations ) AS connections - GROUP BY connections.subscriber, {relevant_columns} + GROUP BY connections.subscriber, {location_columns} ) calldays ORDER BY calldays.subscriber ASC, calldays.calldays DESC """ From 6318bf4cd8cc2706ef21cc543f3b998e2e6d3e9a Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sun, 9 Jun 2019 00:21:09 +0100 Subject: [PATCH 088/138] Update subscriber_location_cluster --- .../features/subscriber/subscriber_location_cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py b/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py index c7adf1f845..ffc8c1e78d 100644 --- a/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py +++ b/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py @@ -14,6 +14,7 @@ from typing import List, Union from ..utilities import subscriber_locations +from ...core import make_spatial_unit from ...core.query import Query from ...core.mixins import GeoDataMixin from .call_days import CallDays @@ -133,7 +134,7 @@ def subscriber_location_cluster( start=start, stop=stop, hours=hours, - level="versioned-site", + spatial_unit=make_spatial_unit("versioned-site"), table=table, subscriber_identifier=subscriber_identifier, ignore_nulls=ignore_nulls, From b0415375d45734cbb1cf3006d0da932bbc123ee6 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sun, 9 Jun 2019 01:04:04 +0100 Subject: [PATCH 089/138] Update MeaningfulLocationsAggregate and MeaningfulLocationsOD --- .../subscriber/meaningful_locations.py | 141 ++++-------------- .../tests/test_meaningful_locations.py | 52 ++++--- 2 files changed, 59 insertions(+), 134 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/meaningful_locations.py b/flowmachine/flowmachine/features/subscriber/meaningful_locations.py index 1ec634574b..904ddcaf31 100644 --- a/flowmachine/flowmachine/features/subscriber/meaningful_locations.py +++ b/flowmachine/flowmachine/features/subscriber/meaningful_locations.py @@ -4,10 +4,8 @@ from typing import Dict, Any, List, Union -from flowmachine.core.errors import BadLevelError -from ...core import GeoTable, Query, Grid +from ...core import GeoTable, Query, Grid, make_spatial_unit from . 
import LabelEventScore, HartiganCluster, EventScore -from flowmachine.utils import get_columns_for_level class MeaningfulLocations(Query): @@ -78,79 +76,37 @@ class MeaningfulLocationsAggregate(Query): ---------- meaningful_locations : MeaningfulLocations A per-subscriber meaningful locations object to aggregate - level : {"admin3", "admin2", "admin1", "grid", "polygon"}, default "admin3" + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 Spatial unit to aggregate to - column_name : str or list of str, default None - Optionally specify a non-default column name or names from the spatial unit table - polygon_table : str, default None - When using the "polygon" level, you must specify the fully qualified name of a table - containing polygons. - geom_column : str, default "geom" - When using the "polygon" level, you must specify the name of column containing geometry - size : int, default None - When using the "grid" level, you must specify the size of the grid to use in KM """ - allowed_levels = {"admin3", "admin2", "admin1", "grid", "polygon"} - def __init__( - self, - *, - meaningful_locations: MeaningfulLocations, - level: str = "admin3", - column_name: Union[str, None, List[str]] = None, - polygon_table: str = None, - geom_column: str = "geom", - size: int = None, + self, *, meaningful_locations: MeaningfulLocations, spatial_unit=None ) -> None: self.meaningful_locations = meaningful_locations - level_cols = get_columns_for_level(level, column_name) - self.column_name = column_name - self.level = level - if level.startswith("admin"): - if level_cols == ["pcod"]: - level_cols = [f"{level}pcod"] - self.aggregator = GeoTable( - f"geography.{level}", geom_column="geom", columns=["geom"] + level_cols - ) - elif level == "polygon": - self.aggregator = GeoTable( - polygon_table, - geom_column=geom_column, - columns=[geom_column] + level_cols, - ) - elif level == "grid": - self.aggregator = Grid(size=size) + if spatial_unit is None: + self.spatial_unit == make_spatial_unit("admin", level=3) else: - raise BadLevelError( - f"'{level}' is not an allowed level for meaningful locations, must be one of {MeaningfulLocationsOD.allowed_levels}'" - ) + self.spatial_unit = spatial_unit + self.spatial_unit.verify_criterion("is_polygon") super().__init__() @property def column_names(self) -> List[str]: - return ( - ["label"] + get_columns_for_level(self.level, self.column_name) + ["total"] - ) + return ["label"] + self.spatial_unit.location_id_columns + ["total"] def _make_query(self): - agg_query, agg_cols = self.aggregator._geo_augmented_query() - level_cols = get_columns_for_level(self.level, self.column_name) - level_cols_aliased = level_cols - if level_cols == ["pcod"]: - level_cols_aliased = [f"{self.level}pcod as pcod"] + location_cols = ", ".join(self.spatial_unit.location_id_columns) - level_cols = ", ".join(level_cols) - level_cols_aliased = ", ".join(level_cols_aliased) return f""" - SELECT label, {level_cols_aliased}, sum(1./n_clusters) as total FROM + SELECT label, {location_cols}, sum(1./n_clusters) as total FROM ({self.meaningful_locations.get_query()}) meaningful_locations LEFT JOIN - ({agg_query}) agg + ({self.spatial_unit.get_geom_query()}) agg ON ST_Contains(agg.geom::geometry, meaningful_locations.cluster::geometry) - GROUP BY label, {level_cols} + GROUP BY label, {location_cols} HAVING sum(1./n_clusters) > 15 - ORDER BY label, {level_cols} + ORDER BY label, {location_cols} """ @@ -165,31 +121,16 @@ class MeaningfulLocationsOD(Query): ---------- 
meaningful_locations_a, meaningful_locations_a : MeaningfulLocations Per-subscriber meaningful locations objects calculate an OD between - level : {"admin3", "admin2", "admin1", "grid", "polygon"}, default "admin3" + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 Spatial unit to aggregate to - column_name : str or list of str, default None - Optionally specify a non-default column name or names from the spatial unit table - polygon_table : str, default None - When using the "polygon" level, you must specify the fully qualified name of a table - containing polygons. - geom_column : str, default "geom" - When using the "polygon" level, you must specify the name of column containing geometry - size : int, default None - When using the "grid" level, you must specify the size of the grid to use in KM """ - allowed_levels = {"admin3", "admin2", "admin1", "grid", "polygon"} - def __init__( self, *, meaningful_locations_a: MeaningfulLocations, meaningful_locations_b: MeaningfulLocations, - level: str = "admin3", - column_name: Union[str, None, List[str]] = None, - polygon_table: str = None, - geom_column: str = "geom", - size: int = None, + spatial_unit=None, ) -> None: self.flow = meaningful_locations_a.join( meaningful_locations_b, @@ -197,60 +138,38 @@ def __init__( left_append="_from", right_append="_to", ) - level_cols = get_columns_for_level(level, column_name) - self.column_name = column_name - - self.level = level - if level.startswith("admin"): - if level_cols == ["pcod"]: - level_cols = [f"{level}pcod"] - self.aggregator = GeoTable( - f"geography.{level}", geom_column="geom", columns=["geom"] + level_cols - ) - elif level == "polygon": - self.aggregator = GeoTable( - polygon_table, - geom_column=geom_column, - columns=[geom_column] + level_cols, - ) - elif level == "grid": - self.aggregator = Grid(size=size) + if spatial_unit is None: + self.spatial_unit = make_spatial_unit("admin", level=3) else: - raise BadLevelError( - f"'{level}' is not an allowed level for meaningful locations, must be one of {MeaningfulLocationsOD.allowed_levels}'" - ) + self.spatial_unit = spatial_unit + self.spatial_unit.verify_criterion("is_polygon") super().__init__() @property def column_names(self) -> List[str]: return [ f"{col}_{direction}" - for col in ["label"] + get_columns_for_level(self.level, self.column_name) + for col in ["label"] + self.spatial_unit.location_id_columns for direction in ("from", "to") ] + ["total"] def _make_query(self): - agg_query, agg_cols = self.aggregator._geo_augmented_query() - level_cols = [ + agg_query = self.spatial_unit.get_geom_query() + location_cols = [ f"{col}_{direction}" - for col in get_columns_for_level(self.level, self.column_name) + for col in self.spatial_unit.location_id_columns for direction in ("from", "to") ] - level_cols_aliased = [ + location_cols_aliased = [ f"{direction}_q.{col} as {col}_{direction}" - for col in get_columns_for_level(self.level, self.column_name) + for col in self.spatial_unit.location_id_columns for direction in ("from", "to") ] - if level_cols == ["pcod_from", "pcod_to"]: - level_cols_aliased = [ - f"from_q.{self.level}pcod as pcod_from", - f"to_q.{self.level}pcod as pcod_to", - ] - level_cols = ", ".join(level_cols) - level_cols_aliased = ", ".join(level_cols_aliased) + location_cols = ", ".join(location_cols) + location_cols_aliased = ", ".join(location_cols_aliased) return f""" - SELECT label_from, label_to, {level_cols_aliased}, sum(1./(n_clusters_from*n_clusters_to)) as total FROM + SELECT label_from, 
label_to, {location_cols_aliased}, sum(1./(n_clusters_from*n_clusters_to)) as total FROM ({self.flow.get_query()}) meaningful_locations LEFT JOIN ({agg_query}) from_q @@ -258,7 +177,7 @@ def _make_query(self): LEFT JOIN ({agg_query}) to_q ON ST_Contains(to_q.geom::geometry, meaningful_locations.cluster_to::geometry) - GROUP BY label_from, label_to, {level_cols} + GROUP BY label_from, label_to, {location_cols} HAVING sum(1./(n_clusters_from*n_clusters_to)) > 15 - ORDER BY label_from, label_to, {level_cols} + ORDER BY label_from, label_to, {location_cols} """ diff --git a/flowmachine/tests/test_meaningful_locations.py b/flowmachine/tests/test_meaningful_locations.py index 1a23021115..874a60c72c 100644 --- a/flowmachine/tests/test_meaningful_locations.py +++ b/flowmachine/tests/test_meaningful_locations.py @@ -3,7 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. import pytest -from flowmachine.core.errors import BadLevelError +from flowmachine.core.errors import InvalidSpatialUnitError from flowmachine.core import make_spatial_unit from flowmachine.features import ( HartiganCluster, @@ -55,12 +55,12 @@ def test_column_names_meaningful_locations(get_column_names_from_run): def test_column_names_meaningful_locations_aggregate( - exemplar_level_param, get_column_names_from_run + exemplar_spatial_unit_param, get_column_names_from_run ): """ Test that column_names property matches head(0) for aggregated meaningful locations""" - if exemplar_level_param["level"] not in MeaningfulLocationsAggregate.allowed_levels: + if not exemplar_spatial_unit_param.is_polygon: pytest.xfail( - f'The level "{exemplar_level_param["level"]}" is not supported as an aggregation unit for MeaningfulLocations.' + f"The spatial unit {exemplar_spatial_unit_param} is not supported as an aggregation unit for MeaningfulLocations." ) mfl_agg = MeaningfulLocationsAggregate( meaningful_locations=MeaningfulLocations( @@ -82,16 +82,16 @@ def test_column_names_meaningful_locations_aggregate( labels=labels, label="evening", ), - **exemplar_level_param, + spatial_unit=exemplar_spatial_unit_param, ) assert get_column_names_from_run(mfl_agg) == mfl_agg.column_names -def test_meaningful_locations_aggregate_disallowed_level_raises(): - """ Test that a bad level raises a BadLevelError""" +def test_meaningful_locations_aggregate_disallowed_spatial_unit_raises(): + """ Test that a bad spatial unit raises an InvalidSpatialUnitError""" - with pytest.raises(BadLevelError): + with pytest.raises(InvalidSpatialUnitError): mfl_agg = MeaningfulLocationsAggregate( meaningful_locations=MeaningfulLocations( clusters=HartiganCluster( @@ -112,17 +112,17 @@ def test_meaningful_locations_aggregate_disallowed_level_raises(): labels=labels, label="evening", ), - level="NOT_A_LEVEL", + spatial_unit=make_spatial_unit("lat-lon"), ) def test_column_names_meaningful_locations_od( - exemplar_level_param, get_column_names_from_run + exemplar_spatial_unit_param, get_column_names_from_run ): """ Test that column_names property matches head(0) for an od matrix between meaningful locations""" - if exemplar_level_param["level"] not in MeaningfulLocationsAggregate.allowed_levels: + if not exemplar_spatial_unit_param.is_polygon: pytest.xfail( - f'The level "{exemplar_level_param["level"]}" is not supported as an aggregation unit for ODs between MeaningfulLocations.' + f"The spatial unit {exemplar_spatial_unit_param} is not supported as an aggregation unit for ODs between MeaningfulLocations." 
) mfl_a = MeaningfulLocations( clusters=HartiganCluster( @@ -166,7 +166,7 @@ def test_column_names_meaningful_locations_od( mfl_od = MeaningfulLocationsOD( meaningful_locations_a=mfl_a, meaningful_locations_b=mfl_b, - **exemplar_level_param, + spatial_unit=exemplar_spatial_unit_param, ) assert get_column_names_from_run(mfl_od) == mfl_od.column_names @@ -210,13 +210,15 @@ def test_meaningful_locations_results( assert all(count_clusters.n_clusters == count_clusters.cluster) -def test_meaningful_locations_aggregation_results(exemplar_level_param, get_dataframe): +def test_meaningful_locations_aggregation_results( + exemplar_spatial_unit_param, get_dataframe +): """ Test that aggregating MeaningfulLocations returns expected results and redacts values below 15. """ - if exemplar_level_param["level"] not in MeaningfulLocationsAggregate.allowed_levels: + if not exemplar_spatial_unit_param.is_polygon: pytest.xfail( - f'The level "{exemplar_level_param["level"]}" is not supported as an aggregation unit for MeaningfulLocations.' + f"The spatial unit {exemplar_spatial_unit_param} is not supported as an aggregation unit for MeaningfulLocations." ) mfl = MeaningfulLocations( clusters=HartiganCluster( @@ -238,7 +240,7 @@ def test_meaningful_locations_aggregation_results(exemplar_level_param, get_data label="evening", ) mfl_agg = MeaningfulLocationsAggregate( - meaningful_locations=mfl, **exemplar_level_param + meaningful_locations=mfl, spatial_unit=exemplar_spatial_unit_param ) mfl_df = get_dataframe(mfl) mfl_agg_df = get_dataframe(mfl_agg) @@ -248,11 +250,11 @@ def test_meaningful_locations_aggregation_results(exemplar_level_param, get_data assert mfl_agg_df.total.sum() < mfl_df.subscriber.nunique() -def test_meaningful_locations_od_raises_for_bad_level( - exemplar_level_param, get_dataframe +def test_meaningful_locations_od_raises_for_bad_spatial_unit( + exemplar_spatial_unit_param, get_dataframe ): """ - Test that od on meaningful locations raises a BadLevelError for a bad level. + Test that od on meaningful locations raises an InvalidSpatialUnitError for a bad spatial unit. 
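# A minimal sketch (not part of the patch) of the polygon check that
# MeaningfulLocationsAggregate and MeaningfulLocationsOD now perform via
# verify_criterion, replacing the old allowed_levels/BadLevelError check
# exercised by the tests above.
from flowmachine.core import make_spatial_unit
from flowmachine.core.errors import InvalidSpatialUnitError

try:
    make_spatial_unit("lat-lon").verify_criterion("is_polygon")
except InvalidSpatialUnitError:
    # lat-lon units are points, so they cannot serve as aggregation units here
    pass

# Admin units are polygon-based, so this check passes silently.
make_spatial_unit("admin", level=3).verify_criterion("is_polygon")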
""" mfl = MeaningfulLocations( clusters=HartiganCluster( @@ -274,9 +276,11 @@ def test_meaningful_locations_od_raises_for_bad_level( label="evening", ) - with pytest.raises(BadLevelError): + with pytest.raises(InvalidSpatialUnitError): mfl_od = MeaningfulLocationsOD( - meaningful_locations_a=mfl, meaningful_locations_b=mfl, level="NOT_A_LEVEL" + meaningful_locations_a=mfl, + meaningful_locations_b=mfl, + spatial_unit=make_spatial_unit("lat-lon"), ) @@ -326,7 +330,9 @@ def test_meaningful_locations_od_results(get_dataframe): label="unknown", ) mfl_od = MeaningfulLocationsOD( - meaningful_locations_a=mfl_a, meaningful_locations_b=mfl_b, level="admin1" + meaningful_locations_a=mfl_a, + meaningful_locations_b=mfl_b, + spatial_unit=make_spatial_unit("admin", level=1), ) mfl_od_df = get_dataframe(mfl_od) # Aggregate should not include any counts below 15 From 47f759f70e37661bfc15793dc925fba44edf015a Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sun, 9 Jun 2019 17:37:14 +0100 Subject: [PATCH 090/138] Combine _SubscriberCells and subscriber_locations --- .../query_schemas/meaningful_locations.py | 4 +- flowmachine/flowmachine/features/__init__.py | 2 +- .../location/unique_subscriber_counts.py | 4 +- .../features/subscriber/call_days.py | 7 +- .../features/subscriber/displacement.py | 4 +- .../features/subscriber/entropy.py | 4 +- .../features/subscriber/first_location.py | 4 +- .../features/subscriber/label_event_score.py | 2 +- .../features/subscriber/last_location.py | 4 +- .../subscriber/most_frequent_location.py | 4 +- .../features/subscriber/radius_of_gyration.py | 4 +- .../subscriber/subscriber_location_cluster.py | 10 +- .../subscriber/unique_location_counts.py | 4 +- .../features/utilities/__init__.py | 2 +- .../utilities/subscriber_locations.py | 169 ++++++++---------- flowmachine/tests/test_calldays.py | 10 +- flowmachine/tests/test_join_to_location.py | 18 +- .../tests/test_meaningful_locations.py | 22 +-- flowmachine/tests/test_subscriber_cells.py | 16 -- .../tests/test_subscriber_location_cluster.py | 24 +-- .../tests/test_subscriber_locations.py | 21 ++- .../tests/test_unique_location_counts.py | 4 +- .../tests/test_unique_subscriber_counts.py | 4 +- 23 files changed, 160 insertions(+), 187 deletions(-) delete mode 100644 flowmachine/tests/test_subscriber_cells.py diff --git a/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py b/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py index 420336dfbb..c3e7aaaeaf 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py +++ b/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py @@ -13,7 +13,7 @@ HartiganCluster, CallDays, EventScore, - subscriber_locations, + SubscriberLocations, ) from .base_exposed_query import BaseExposedQuery from .custom_fields import ( @@ -65,7 +65,7 @@ def _make_meaningful_locations_object( tower_day_of_week_scores, tower_hour_of_day_scores, ): - q_subscriber_locations = subscriber_locations( + q_subscriber_locations = SubscriberLocations( start=start_date, stop=end_date, level="versioned-site", # note this 'level' is not the same as the exposed parameter 'aggregation_unit' diff --git a/flowmachine/flowmachine/features/__init__.py b/flowmachine/flowmachine/features/__init__.py index c5953c8b92..f1e4041139 100644 --- a/flowmachine/flowmachine/features/__init__.py +++ b/flowmachine/flowmachine/features/__init__.py @@ -92,7 +92,7 @@ ut = [ "GroupValues", "feature_collection", - "subscriber_locations", + 
"SubscriberLocations", "EventTableSubset", "UniqueSubscribers", "EventsTablesUnion", diff --git a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py index c3c3dc1a33..350789ece6 100644 --- a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py @@ -19,7 +19,7 @@ from ...core.mixins import GeoDataMixin from ...core import make_spatial_unit -from ..utilities.subscriber_locations import subscriber_locations +from ..utilities.subscriber_locations import SubscriberLocations class UniqueSubscriberCounts(GeoDataMixin, Query): @@ -87,7 +87,7 @@ def __init__( self.spatial_unit = spatial_unit self.hours = hours self.table = table - self.ul = subscriber_locations( + self.ul = SubscriberLocations( start=self.start, stop=self.stop, spatial_unit=self.spatial_unit, diff --git a/flowmachine/flowmachine/features/subscriber/call_days.py b/flowmachine/flowmachine/features/subscriber/call_days.py index 3012d31ea8..900c5dc425 100644 --- a/flowmachine/flowmachine/features/subscriber/call_days.py +++ b/flowmachine/flowmachine/features/subscriber/call_days.py @@ -12,9 +12,8 @@ """ from typing import List, Union -from ...core import JoinToLocation from .metaclasses import SubscriberFeature -from ..utilities.subscriber_locations import _SubscriberCells +from ..utilities.subscriber_locations import SubscriberLocations class CallDays(SubscriberFeature): @@ -25,7 +24,7 @@ class CallDays(SubscriberFeature): Parameters ---------- - subscriber_locations : JoinToLocation, _SubscriberCells + subscriber_locations : SubscriberLocations Locations of subscribers' interactions See Also @@ -33,7 +32,7 @@ class CallDays(SubscriberFeature): flowmachine.features.subscriber_locations """ - def __init__(self, subscriber_locations: Union[JoinToLocation, _SubscriberCells]): + def __init__(self, subscriber_locations: SubscriberLocations): self.ul = subscriber_locations self.spatial_unit = self.ul.spatial_unit super().__init__() diff --git a/flowmachine/flowmachine/features/subscriber/displacement.py b/flowmachine/flowmachine/features/subscriber/displacement.py index 79a8e7a3d3..ba0e3bd266 100644 --- a/flowmachine/flowmachine/features/subscriber/displacement.py +++ b/flowmachine/flowmachine/features/subscriber/displacement.py @@ -15,7 +15,7 @@ from flowmachine.features.subscriber import daily_location from .metaclasses import SubscriberFeature from . 
import ModalLocation -from ..utilities.subscriber_locations import subscriber_locations +from ..utilities.subscriber_locations import SubscriberLocations from flowmachine.utils import parse_datestring, get_dist_string, list_of_dates from flowmachine.core import make_spatial_unit @@ -140,7 +140,7 @@ def __init__( ] ) - sl = subscriber_locations( + sl = SubscriberLocations( self.start, self.stop_sl, spatial_unit=make_spatial_unit("lat-lon"), diff --git a/flowmachine/flowmachine/features/subscriber/entropy.py b/flowmachine/flowmachine/features/subscriber/entropy.py index abe517d1d5..9239d75d6e 100644 --- a/flowmachine/flowmachine/features/subscriber/entropy.py +++ b/flowmachine/flowmachine/features/subscriber/entropy.py @@ -14,7 +14,7 @@ from .metaclasses import SubscriberFeature from .contact_balance import ContactBalance from ..utilities.sets import EventsTablesUnion -from ..utilities.subscriber_locations import subscriber_locations +from ..utilities.subscriber_locations import SubscriberLocations from flowmachine.utils import get_columns_for_level @@ -286,7 +286,7 @@ def __init__( ignore_nulls=True, ): - self.subscriber_locations = subscriber_locations( + self.subscriber_locations = SubscriberLocations( start=start, stop=stop, level=level, diff --git a/flowmachine/flowmachine/features/subscriber/first_location.py b/flowmachine/flowmachine/features/subscriber/first_location.py index b9cda63143..70423a53c5 100644 --- a/flowmachine/flowmachine/features/subscriber/first_location.py +++ b/flowmachine/flowmachine/features/subscriber/first_location.py @@ -14,7 +14,7 @@ from flowmachine.utils import get_columns_for_level from .metaclasses import SubscriberFeature -from ..utilities.subscriber_locations import subscriber_locations +from ..utilities.subscriber_locations import SubscriberLocations class FirstLocation(SubscriberFeature): @@ -82,7 +82,7 @@ def __init__( self.stop = stop self.location = location - self.ul = subscriber_locations( + self.ul = SubscriberLocations( self.start, self.stop, level=level, diff --git a/flowmachine/flowmachine/features/subscriber/label_event_score.py b/flowmachine/flowmachine/features/subscriber/label_event_score.py index 1a62fe92cc..c2123d9a26 100644 --- a/flowmachine/flowmachine/features/subscriber/label_event_score.py +++ b/flowmachine/flowmachine/features/subscriber/label_event_score.py @@ -38,7 +38,7 @@ class LabelEventScore(Query): Examples -------- - >>> es = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site") + >>> es = EventScore(start="2016-01-01", stop="2016-01-05", spatial_unit=make_spatial_unit("versioned-site")) >>> es.head() subscriber site_id version lon lat score_hour score_dow 0 ZYPxqVGLzlQy6l7n QeBRM8 0 82.914285 29.358975 1.0 -1.0 diff --git a/flowmachine/flowmachine/features/subscriber/last_location.py b/flowmachine/flowmachine/features/subscriber/last_location.py index 00a50c65a9..7542cf3d6b 100644 --- a/flowmachine/flowmachine/features/subscriber/last_location.py +++ b/flowmachine/flowmachine/features/subscriber/last_location.py @@ -14,7 +14,7 @@ from flowmachine.core import Query, make_spatial_unit from ..utilities.subscriber_locations import BaseLocation -from ..utilities.subscriber_locations import subscriber_locations +from ..utilities.subscriber_locations import SubscriberLocations class LastLocation(BaseLocation, Query): @@ -86,7 +86,7 @@ def __init__( self.hours = hours self.table = table self.subscriber_identifier = subscriber_identifier - self.subscriber_locs = subscriber_locations( + self.subscriber_locs = 
SubscriberLocations( start=self.start, stop=self.stop, spatial_unit=self.spatial_unit, diff --git a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py index 12e8f441cd..34d193bd15 100644 --- a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py +++ b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py @@ -12,7 +12,7 @@ from typing import List from flowmachine.core import Query, make_spatial_unit -from ..utilities.subscriber_locations import BaseLocation, subscriber_locations +from ..utilities.subscriber_locations import BaseLocation, SubscriberLocations class MostFrequentLocation(BaseLocation, Query): @@ -88,7 +88,7 @@ def __init__( self.hours = hours self.table = table self.subscriber_identifier = subscriber_identifier - self.subscriber_locs = subscriber_locations( + self.subscriber_locs = SubscriberLocations( start=self.start, stop=self.stop, spatial_unit=self.spatial_unit, diff --git a/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py b/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py index 9e4b8cc62c..df5bd08700 100644 --- a/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py +++ b/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py @@ -13,7 +13,7 @@ from typing import List from .metaclasses import SubscriberFeature -from ..utilities.subscriber_locations import subscriber_locations +from ..utilities.subscriber_locations import SubscriberLocations from flowmachine.core import make_spatial_unit @@ -99,7 +99,7 @@ def __init__( self.start = start self.stop = stop - self.ul = subscriber_locations( + self.ul = SubscriberLocations( self.start, self.stop, spatial_unit=make_spatial_unit("lat-lon"), diff --git a/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py b/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py index ffc8c1e78d..1eef18bf68 100644 --- a/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py +++ b/flowmachine/flowmachine/features/subscriber/subscriber_location_cluster.py @@ -13,7 +13,7 @@ from typing import List, Union -from ..utilities import subscriber_locations +from ..utilities import SubscriberLocations from ...core import make_spatial_unit from ...core.query import Query from ...core.mixins import GeoDataMixin @@ -130,7 +130,7 @@ def subscriber_location_cluster( buffer = kwargs.pop("buffer", 0) cd = CallDays( - subscriber_locations( + SubscriberLocations( start=start, stop=stop, hours=hours, @@ -192,9 +192,7 @@ class HartiganCluster(BaseCluster): Examples -------- - >>> cd = CallDays('2016-01-01', '2016-01-04', level='polygon', - polygon_table='infrastructure.sites', - geom_col='geom_point') + >>> cd = CallDays( '2016-01-01', '2016-01-04', spatial_unit=make_spatial_unit('versioned-site')) >>> har = HartiganCluster(cd, 2.5) @@ -295,7 +293,7 @@ def join_to_cluster_components(self, query): spatial_unit=make_spatial_unit('versioned-site')) >>> cd = CallDays(start='2016-01-01', stop='2016-01-04', - level='versioned-site') + spatial_unit=make_spatial_unit('versioned-site')) >>> har = HartiganCluster(cd, 50, call_threshold=1) diff --git a/flowmachine/flowmachine/features/subscriber/unique_location_counts.py b/flowmachine/flowmachine/features/subscriber/unique_location_counts.py index 6db14ffe25..f5c966f440 100644 --- a/flowmachine/flowmachine/features/subscriber/unique_location_counts.py +++ 
b/flowmachine/flowmachine/features/subscriber/unique_location_counts.py @@ -14,7 +14,7 @@ from typing import List from flowmachine.utils import get_columns_for_level -from ..utilities.subscriber_locations import subscriber_locations +from ..utilities.subscriber_locations import SubscriberLocations from .metaclasses import SubscriberFeature @@ -103,7 +103,7 @@ def __init__( size=None, ): - self.ul = subscriber_locations( + self.ul = SubscriberLocations( start=start, stop=stop, level=level, diff --git a/flowmachine/flowmachine/features/utilities/__init__.py b/flowmachine/flowmachine/features/utilities/__init__.py index f00a24d0bd..a1f0bbc6f9 100644 --- a/flowmachine/flowmachine/features/utilities/__init__.py +++ b/flowmachine/flowmachine/features/utilities/__init__.py @@ -6,7 +6,7 @@ Utilities for working with features. """ from .group_values import GroupValues -from .subscriber_locations import subscriber_locations +from .subscriber_locations import SubscriberLocations from .feature_collection import feature_collection diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index 7ead644d70..7203cd450d 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -24,45 +24,101 @@ logger = structlog.get_logger("flowmachine.debug", submodule=__name__) -class _SubscriberCells(Query): - # Passing table='all' means it will look at all tables with location - # data. +class SubscriberLocations(Query): + """ + Class representing all the locations for which a subscriber has been found. + Can be at the level of a tower, lat-lon, or an admin unit. + + Parameters + ---------- + start : str + iso format date range for the beginning of the time frame, + e.g. 2016-01-01 or 2016-01-01 14:03:01 + stop : str + As above + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. + hours : tuple of ints, default 'all' + subset the result within certain hours, e.g. (4,17) + This will subset the query only with these hours, but + across all specified days. Or set to 'all' to include + all hours. + table : str, default 'all' + schema qualified name of the table which the analysis is + based upon. If 'all' it will pull together all of the tables + specified as flowmachine.yml under 'location_tables' + subscriber_identifier : str, default 'msisdn' + whether to identify a subscriber by either msisdn, imei or imsi. + ignore_nulls : bool, default True + ignores those values that are null. Sometime data appears for which + the cell is null. If set to true this will ignore those lines. If false + these lines with null cells should still be present, although they contain + no information on the subscribers location, they still tell us that the subscriber made + a call at that time. + + Notes + ----- + * A date without a hours and mins will be interpreted as + midnight of that day, so to get data within a single day + pass (e.g.) '2016-01-01', '2016-01-02'. + + * Use 24 hr format! + + Examples + -------- + >>> subscriber_locs = SubscriberLocations('2016-01-01 13:30:30', + '2016-01-02 16:25:00') + >>> subscriber_locs.head() + subscriber time cell + subscriberA 2016-01-01 12:42:11 233241 + subscriberA 2016-01-01 12:52:11 234111 + subscriberB 2016-01-01 12:52:16 234111 + ... 
+ + """ + def __init__( self, start, stop, + *, + spatial_unit=make_spatial_unit("cell"), hours="all", table="all", subscriber_identifier="msisdn", ignore_nulls=True, - *, subscriber_subset=None, ): self.start = start self.stop = stop + self.spatial_unit = spatial_unit self.hours = hours self.table = table self.subscriber_identifier = subscriber_identifier self.ignore_nulls = ignore_nulls - self.spatial_unit = make_spatial_unit("cell") self.tables = table cols = [self.subscriber_identifier, "datetime", "location_id"] - self.unioned = EventsTablesUnion( - self.start, - self.stop, - columns=cols, - tables=self.table, - hours=self.hours, - subscriber_subset=subscriber_subset, - subscriber_identifier=self.subscriber_identifier, + self.unioned = location_joined_query( + EventsTablesUnion( + self.start, + self.stop, + columns=cols, + tables=self.table, + hours=self.hours, + subscriber_subset=subscriber_subset, + subscriber_identifier=self.subscriber_identifier, + ), + spatial_unit=self.spatial_unit, + time_col="datetime", ) super().__init__() @property def column_names(self) -> List[str]: - return ["subscriber", "time", "location_id"] + return ["subscriber", "time"] + self.spatial_unit.location_id_columns def _make_query(self): @@ -71,9 +127,11 @@ def _make_query(self): else: where_clause = "" + location_cols = ", ".join(self.spatial_unit.location_id_columns) + sql = f""" SELECT - subscriber, datetime as time, location_id + subscriber, datetime as time, {location_cols} FROM ({self.unioned.get_query()}) AS foo {where_clause} @@ -115,84 +173,3 @@ def join_aggregate(self, metric, method="avg"): def __getitem__(self, item): return self.subset(col="subscriber", subset=item) - - -def subscriber_locations( - start, - stop, - *, - spatial_unit=make_spatial_unit("cell"), - hours="all", - table="all", - subscriber_identifier="msisdn", - ignore_nulls=True, - subscriber_subset=None, -): - """ - Class representing all the locations for which a subscriber has been found. - Can be at the level of a tower, lat-lon, or an admin unit. - - Parameters - ---------- - start : str - iso format date range for the beginning of the time frame, - e.g. 2016-01-01 or 2016-01-01 14:03:01 - stop : str - As above - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell - Spatial unit to which subscriber locations will be mapped. See the - docstring of make_spatial_unit for more information. - hours : tuple of ints, default 'all' - subset the result within certain hours, e.g. (4,17) - This will subset the query only with these hours, but - across all specified days. Or set to 'all' to include - all hours. - table : str, default 'all' - schema qualified name of the table which the analysis is - based upon. If 'all' it will pull together all of the tables - specified as flowmachine.yml under 'location_tables' - subscriber_identifier : str, default 'msisdn' - whether to identify a subscriber by either msisdn, imei or imsi. - ignore_nulls : bool, default True - ignores those values that are null. Sometime data appears for which - the cell is null. If set to true this will ignore those lines. If false - these lines with null cells should still be present, although they contain - no information on the subscribers location, they still tell us that the subscriber made - a call at that time. - - Notes - ----- - * A date without a hours and mins will be interpreted as - midnight of that day, so to get data within a single day - pass (e.g.) '2016-01-01', '2016-01-02'. - - * Use 24 hr format! 
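A minimal usage sketch of the renamed class with a non-default spatial unit (illustrative only; assumes a configured flowmachine connection and the demo dataset):

from flowmachine.core import make_spatial_unit
from flowmachine.features import SubscriberLocations

sl = SubscriberLocations(
    "2016-01-01",
    "2016-01-02",
    spatial_unit=make_spatial_unit("admin", level=3),
)
# column_names is now ["subscriber", "time"] plus the spatial unit's
# location_id_columns, rather than a hard-coded "location_id" column.
print(sl.column_names)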
- - Examples - -------- - >>> subscriber_locs = subscriber_locations('2016-01-01 13:30:30', - '2016-01-02 16:25:00' - spatial_unit = None) - >>> subscriber_locs.head() - subscriber time cell - subscriberA 2016-01-01 12:42:11 233241 - subscriberA 2016-01-01 12:52:11 234111 - subscriberB 2016-01-01 12:52:16 234111 - ... - - """ - # Here we call the hidden class _SubscriberCells which is every spotting - # of all subscribers. We then join to the appropriate level if necessary. - subscriber_cells = _SubscriberCells( - start, - stop, - hours, - table=table, - subscriber_subset=subscriber_subset, - subscriber_identifier=subscriber_identifier, - ignore_nulls=ignore_nulls, - ) - - return location_joined_query( - subscriber_cells, spatial_unit=spatial_unit, time_col="time" - ) diff --git a/flowmachine/tests/test_calldays.py b/flowmachine/tests/test_calldays.py index aa7ea18e26..65783d9527 100644 --- a/flowmachine/tests/test_calldays.py +++ b/flowmachine/tests/test_calldays.py @@ -12,7 +12,7 @@ from flowmachine.core import make_spatial_unit -from flowmachine.features import CallDays, subscriber_locations +from flowmachine.features import CallDays, SubscriberLocations import numpy as np @@ -20,7 +20,7 @@ def test_calldays_column_names(exemplar_spatial_unit_param): """Test that CallDays column_names property is correct""" cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-03", spatial_unit=exemplar_spatial_unit_param ) ) @@ -39,7 +39,7 @@ def test_call_days_returns_expected_counts_per_subscriber(get_dataframe): ) for (subscriber, start, end, calls) in test_values: cd = CallDays( - subscriber_locations( + SubscriberLocations( start, end, spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -59,7 +59,7 @@ def test_call_days_returns_expected_counts_per_subscriber_tower(get_dataframe): ) for (subscriber, location, start, end, calls) in test_values: cd = CallDays( - subscriber_locations( + SubscriberLocations( start, end, spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -75,7 +75,7 @@ def test_locations_are_only_repeated_once_per_subscriber(get_dataframe): """ cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-03", spatial_unit=make_spatial_unit("cell") ) ) diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 3c0c53e723..a039a6aeb5 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -8,7 +8,7 @@ import numpy as np -from flowmachine.features import subscriber_locations +from flowmachine.features import SubscriberLocations from flowmachine.core import JoinToLocation, location_joined_query, make_spatial_unit @@ -16,7 +16,7 @@ def test_join_to_location_column_names(exemplar_spatial_unit_param): """ Test that JoinToLocation's column_names property is accurate.""" if not exemplar_spatial_unit_param.has_geography: pytest.skip("JoinToLocation does not accept CellSpatialUnit objects") - table = subscriber_locations( + table = SubscriberLocations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) joined = JoinToLocation(table, spatial_unit=exemplar_spatial_unit_param) @@ -29,7 +29,7 @@ def test_join_to_location_raises_value_error(): geography information. 
""" with pytest.raises(ValueError): - table = subscriber_locations( + table = SubscriberLocations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) joined = JoinToLocation(table, spatial_unit=make_spatial_unit("cell")) @@ -53,7 +53,7 @@ def test_join_with_versioned_cells(get_dataframe, get_length): """ Test that flowmachine.JoinToLocation can fetch the cell version. """ - ul = subscriber_locations( + ul = SubscriberLocations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) df = get_dataframe( @@ -80,7 +80,7 @@ def test_join_with_lat_lon(get_dataframe): """ Test that flowmachine.JoinToLocation can get the lat-lon values of the cell """ - ul = subscriber_locations( + ul = SubscriberLocations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) df = get_dataframe(JoinToLocation(ul, spatial_unit=make_spatial_unit("lat-lon"))) @@ -106,7 +106,7 @@ def test_join_with_polygon(get_dataframe, get_length): Test that flowmachine.JoinToLocation can get the (arbitrary) polygon of each cell. """ - ul = subscriber_locations( + ul = SubscriberLocations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) j = JoinToLocation( @@ -129,7 +129,7 @@ def test_join_to_admin(get_dataframe, get_length): """ Test that flowmachine.JoinToLocation can join to a admin region. """ - ul = subscriber_locations( + ul = SubscriberLocations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) df = get_dataframe( @@ -144,7 +144,7 @@ def test_join_to_grid(get_dataframe, get_length): """ Test that we can join to a grid square """ - ul = subscriber_locations( + ul = SubscriberLocations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) df = get_dataframe( @@ -159,7 +159,7 @@ def test_location_joined_query_return_type(exemplar_spatial_unit_param): JoinToLocation object when spatial_unit != CellSpatialUnit(), and returns query when spatial_unit == CellSpatialUnit(). 
""" - table = subscriber_locations( + table = SubscriberLocations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) joined = location_joined_query(table, spatial_unit=exemplar_spatial_unit_param) diff --git a/flowmachine/tests/test_meaningful_locations.py b/flowmachine/tests/test_meaningful_locations.py index 874a60c72c..e5059823af 100644 --- a/flowmachine/tests/test_meaningful_locations.py +++ b/flowmachine/tests/test_meaningful_locations.py @@ -8,7 +8,7 @@ from flowmachine.features import ( HartiganCluster, CallDays, - subscriber_locations, + SubscriberLocations, EventScore, ) from flowmachine.features.subscriber.meaningful_locations import ( @@ -34,7 +34,7 @@ def test_column_names_meaningful_locations(get_column_names_from_run): mfl = MeaningfulLocations( clusters=HartiganCluster( calldays=CallDays( - subscriber_locations=subscriber_locations( + subscriber_locations=SubscriberLocations( start="2016-01-01", stop="2016-01-02", spatial_unit=make_spatial_unit("versioned-site"), @@ -66,7 +66,7 @@ def test_column_names_meaningful_locations_aggregate( meaningful_locations=MeaningfulLocations( clusters=HartiganCluster( calldays=CallDays( - subscriber_locations=subscriber_locations( + subscriber_locations=SubscriberLocations( start="2016-01-01", stop="2016-01-02", spatial_unit=make_spatial_unit("versioned-site"), @@ -96,7 +96,7 @@ def test_meaningful_locations_aggregate_disallowed_spatial_unit_raises(): meaningful_locations=MeaningfulLocations( clusters=HartiganCluster( calldays=CallDays( - subscriber_locations=subscriber_locations( + subscriber_locations=SubscriberLocations( start="2016-01-01", stop="2016-01-02", spatial_unit=make_spatial_unit("versioned-site"), @@ -127,7 +127,7 @@ def test_column_names_meaningful_locations_od( mfl_a = MeaningfulLocations( clusters=HartiganCluster( calldays=CallDays( - subscriber_locations=subscriber_locations( + subscriber_locations=SubscriberLocations( start="2016-01-01", stop="2016-01-02", spatial_unit=make_spatial_unit("versioned-site"), @@ -147,7 +147,7 @@ def test_column_names_meaningful_locations_od( mfl_b = MeaningfulLocations( clusters=HartiganCluster( calldays=CallDays( - subscriber_locations=subscriber_locations( + subscriber_locations=SubscriberLocations( start="2016-01-01", stop="2016-01-02", spatial_unit=make_spatial_unit("versioned-site"), @@ -185,7 +185,7 @@ def test_meaningful_locations_results( mfl = MeaningfulLocations( clusters=HartiganCluster( calldays=CallDays( - subscriber_locations=subscriber_locations( + subscriber_locations=SubscriberLocations( start="2016-01-01", stop="2016-01-02", spatial_unit=make_spatial_unit("versioned-site"), @@ -223,7 +223,7 @@ def test_meaningful_locations_aggregation_results( mfl = MeaningfulLocations( clusters=HartiganCluster( calldays=CallDays( - subscriber_locations=subscriber_locations( + subscriber_locations=SubscriberLocations( start="2016-01-01", stop="2016-01-02", spatial_unit=make_spatial_unit("versioned-site"), @@ -259,7 +259,7 @@ def test_meaningful_locations_od_raises_for_bad_spatial_unit( mfl = MeaningfulLocations( clusters=HartiganCluster( calldays=CallDays( - subscriber_locations=subscriber_locations( + subscriber_locations=SubscriberLocations( start="2016-01-01", stop="2016-01-02", spatial_unit=make_spatial_unit("versioned-site"), @@ -293,7 +293,7 @@ def test_meaningful_locations_od_results(get_dataframe): mfl_a = MeaningfulLocations( clusters=HartiganCluster( calldays=CallDays( - subscriber_locations=subscriber_locations( + subscriber_locations=SubscriberLocations( 
start="2016-01-01", stop="2016-01-02", spatial_unit=make_spatial_unit("versioned-site"), @@ -313,7 +313,7 @@ def test_meaningful_locations_od_results(get_dataframe): mfl_b = MeaningfulLocations( clusters=HartiganCluster( calldays=CallDays( - subscriber_locations=subscriber_locations( + subscriber_locations=SubscriberLocations( start="2016-01-02", stop="2016-01-03", spatial_unit=make_spatial_unit("versioned-site"), diff --git a/flowmachine/tests/test_subscriber_cells.py b/flowmachine/tests/test_subscriber_cells.py deleted file mode 100644 index 70a1e231ac..0000000000 --- a/flowmachine/tests/test_subscriber_cells.py +++ /dev/null @@ -1,16 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. - -from flowmachine.features.utilities.subscriber_locations import _SubscriberCells - -import pytest - -pytestmark = pytest.mark.usefixtures("skip_datecheck") - - -@pytest.mark.parametrize("ignore_nulls", [True, False]) -def test_column_names(ignore_nulls): - """ Test that column_names property matches head(0) for _SubscriberCells""" - sc = _SubscriberCells("2016-01-01", "2016-01-04", ignore_nulls=ignore_nulls) - assert sc.head(0).columns.tolist() == sc.column_names diff --git a/flowmachine/tests/test_subscriber_location_cluster.py b/flowmachine/tests/test_subscriber_location_cluster.py index 65b1a53add..360224abc0 100644 --- a/flowmachine/tests/test_subscriber_location_cluster.py +++ b/flowmachine/tests/test_subscriber_location_cluster.py @@ -24,7 +24,7 @@ HartiganCluster, subscriber_location_cluster, EventScore, - subscriber_locations, + SubscriberLocations, ) @@ -32,7 +32,7 @@ def test_hartigan_column_names(get_column_names_from_run): """Test that Hartigan has correct column_names property.""" cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -44,7 +44,7 @@ def test_hartigan_column_names(get_column_names_from_run): def test_joined_hartigan_column_names(get_column_names_from_run): """Test that Hartigan has correct column_names property.""" cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -68,7 +68,7 @@ def test_hartigan_type_error(): def test_joined_hartigan_type_error(): """Test that joining hartigan to something which isn't query like raises a type error.""" cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -109,7 +109,7 @@ def test_cluster_is_within_envelope(get_dataframe): Test that all the clusters are within the enveloped formed by all the towers in the cluster. """ cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -127,7 +127,7 @@ def test_first_call_day_in_first_cluster(get_dataframe): Test that the first ranked call day of each subscriber is in the first cluster of each subscriber. 
""" cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -154,7 +154,7 @@ def test_bigger_radius_yields_fewer_clusters(get_dataframe): """ radius = [1, 2, 5, 10, 50] cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -174,7 +174,7 @@ def test_different_call_days_format(get_dataframe): Test whether we can pass different call days format such as table name, SQL query and CallDays class. """ cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -200,7 +200,7 @@ def test_call_threshold_works(get_dataframe): Test whether a call threshold above 1 limits the number of clusters. """ cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -219,7 +219,7 @@ def test_buffered_hartigan(): Test whether Hartigan produces buffered clusters when buffer is larger than 0. """ cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -239,7 +239,7 @@ def test_all_options_hartigan(): Test whether Hartigan works when changing all options. """ cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) @@ -257,7 +257,7 @@ def test_join_returns_the_same_clusters(): Test whether joining to another table for which the start and stop time are the same yields the same clusters. """ cd = CallDays( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) diff --git a/flowmachine/tests/test_subscriber_locations.py b/flowmachine/tests/test_subscriber_locations.py index 83ef6f513b..ce0cc81e00 100644 --- a/flowmachine/tests/test_subscriber_locations.py +++ b/flowmachine/tests/test_subscriber_locations.py @@ -2,17 +2,32 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +import pytest from flowmachine.core import make_spatial_unit -from flowmachine.features.utilities.subscriber_locations import subscriber_locations +from flowmachine.features.utilities.subscriber_locations import SubscriberLocations + +pytestmark = pytest.mark.usefixtures("skip_datecheck") + + +@pytest.mark.parametrize("ignore_nulls", [True, False]) +def test_column_names(exemplar_spatial_unit_param, ignore_nulls): + """ Test that column_names property matches head(0) for SubscriberLocations""" + sl = SubscriberLocations( + "2016-01-01", + "2016-01-04", + spatial_unit=exemplar_spatial_unit_param, + ignore_nulls=ignore_nulls, + ) + assert sl.head(0).columns.tolist() == sl.column_names def test_can_get_pcods(get_dataframe): """ - subscriber_locations() can make queries at the p-code level. + SubscriberLocations() can make queries at the p-code level. 
""" - subscriber_pcod = subscriber_locations( + subscriber_pcod = SubscriberLocations( "2016-01-01 13:30:30", "2016-01-02 16:25:00", spatial_unit=make_spatial_unit( diff --git a/flowmachine/tests/test_unique_location_counts.py b/flowmachine/tests/test_unique_location_counts.py index 64b8562c9a..5af2beffa7 100644 --- a/flowmachine/tests/test_unique_location_counts.py +++ b/flowmachine/tests/test_unique_location_counts.py @@ -6,7 +6,7 @@ from flowmachine.core.errors import BadLevelError from flowmachine.core import make_spatial_unit -from flowmachine.features import UniqueLocationCounts, subscriber_locations +from flowmachine.features import UniqueLocationCounts, SubscriberLocations def test_returns_errors(): @@ -32,7 +32,7 @@ def test_correct_counts(get_dataframe): ulc = UniqueLocationCounts("2016-01-01", "2016-01-02", level="cell", hours=(5, 17)) df = get_dataframe(ulc) dful = get_dataframe( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("cell"), diff --git a/flowmachine/tests/test_unique_subscriber_counts.py b/flowmachine/tests/test_unique_subscriber_counts.py index 2666db8f8b..7c5ccb85ff 100644 --- a/flowmachine/tests/test_unique_subscriber_counts.py +++ b/flowmachine/tests/test_unique_subscriber_counts.py @@ -10,7 +10,7 @@ from flowmachine.core import make_spatial_unit from flowmachine.features import UniqueSubscriberCounts -from flowmachine.features.utilities import subscriber_locations +from flowmachine.features.utilities import SubscriberLocations @pytest.mark.usefixtures("skip_datecheck") @@ -36,7 +36,7 @@ def test_correct_counts(get_dataframe): ) df = get_dataframe(usc) dful = get_dataframe( - subscriber_locations( + SubscriberLocations( "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("cell"), From 135aca0e4c9e837cf9d4bc75b88c81cdfc925973 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sun, 9 Jun 2019 17:45:35 +0100 Subject: [PATCH 091/138] Fix test_utils.py --- flowmachine/tests/test_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/flowmachine/tests/test_utils.py b/flowmachine/tests/test_utils.py index dd1c4afc37..71447f4b9c 100644 --- a/flowmachine/tests/test_utils.py +++ b/flowmachine/tests/test_utils.py @@ -239,8 +239,8 @@ def test_print_dependency_tree(): expected_output = textwrap.dedent( """\ - - - - + - + - - - - @@ -250,6 +250,8 @@ def test_print_dependency_tree(): - - - + - + - - - - From f916a34550eda24c74019953ea4ca46b89ae1023 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sun, 9 Jun 2019 20:29:41 +0100 Subject: [PATCH 092/138] Update ContactReferenceLocationStats --- .../contact_reference_locations_stats.py | 23 +++++-------- .../test_contact_reference_locations_stats.py | 34 +++++++++++-------- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/contact_reference_locations_stats.py b/flowmachine/flowmachine/features/subscriber/contact_reference_locations_stats.py index 6392f6d4af..ea16b20613 100644 --- a/flowmachine/flowmachine/features/subscriber/contact_reference_locations_stats.py +++ b/flowmachine/flowmachine/features/subscriber/contact_reference_locations_stats.py @@ -10,13 +10,6 @@ from .metaclasses import SubscriberFeature -from .modal_location import ModalLocation -from .contact_balance import ContactBalance -from ..spatial.distance_matrix import DistanceMatrix -from .daily_location import daily_location - -from flowmachine.utils import get_columns_for_level, list_of_dates - valid_stats = 
{"count", "sum", "avg", "max", "min", "median", "stddev", "variance"} @@ -31,13 +24,14 @@ class ContactReferenceLocationStats(SubscriberFeature): targeted subscribers along with the number of events between them. contact_locations: flowmachine.core.Query A flowmachine Query instance that contains a subscriber column. In - addition to that the query must have a spatial level or the target + addition to that the query must have a spatial unit or the target geometry column that contains the subscribers' reference locations. statistic : {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'sum' Defaults to sum, aggregation statistic over the durations. geom_column: The column containing the subscribers' reference locations. This is - only required if the Query does not contain a spatial level. + required if the Query does not contain a spatial unit with 'lat' and + 'lon' columns. Example ------- @@ -76,14 +70,13 @@ def __init__( ) if self.geom_column is None: - level = getattr(self.contact_locations_query, "level", None) - if level is None: - raise ValueError( - "The contact locations must have a spatial level whenever the geometry column is not specified." + try: + self.contact_locations_query.spatial_unit.verify_criterion( + "has_lat_lon_columns" ) - if not level in ["versioned-cell", "versioned-site", "lat-lon"]: + except AttributeError: raise ValueError( - f"The {level} for the contact_locations_query is not supported." + "The contact locations must have a spatial unit whenever the geometry column is not specified." ) super().__init__() diff --git a/flowmachine/tests/test_contact_reference_locations_stats.py b/flowmachine/tests/test_contact_reference_locations_stats.py index 50fdfbc7e8..3487ea7c2a 100644 --- a/flowmachine/tests/test_contact_reference_locations_stats.py +++ b/flowmachine/tests/test_contact_reference_locations_stats.py @@ -5,13 +5,14 @@ from flowmachine.features.subscriber.contact_reference_locations_stats import * import pytest -from flowmachine.core import CustomQuery +from flowmachine.core import CustomQuery, make_spatial_unit +from flowmachine.core.errors import InvalidSpatialUnitError from flowmachine.utils import list_of_dates -from flowmachine.features import daily_location, ContactBalance +from flowmachine.features import daily_location, ContactBalance, ModalLocation @pytest.mark.parametrize( - "statistic,msisdn,level,want", + "statistic,msisdn,spatial_unit_type,want", [ ("avg", "gwAynWXp4eWvxGP7", "versioned-cell", 298.7215), ("avg", "gwAynWXp4eWvxGP7", "versioned-site", 298.7215), @@ -20,7 +21,7 @@ ], ) def test_contact_reference_location_stats( - get_dataframe, statistic, msisdn, level, want + get_dataframe, statistic, msisdn, spatial_unit_type, want ): """ Test a few hand-picked ContactReferenceLocationStats. 
""" cb = ContactBalance("2016-01-01", "2016-01-03") @@ -28,7 +29,7 @@ def test_contact_reference_location_stats( *[ daily_location( d, - level=level, + spatial_unit=make_spatial_unit(spatial_unit_type), subscriber_subset=cb.counterparts_subset(include_subscribers=True), ) for d in list_of_dates("2016-01-01", "2016-01-03") @@ -48,7 +49,7 @@ def test_contact_reference_location_stats_custom_geometry(get_dataframe): *[ daily_location( d, - level="versioned-cell", + spatial_unit=make_spatial_unit("versioned-cell"), subscriber_subset=cb.counterparts_subset(include_subscribers=True), ) for d in list_of_dates("2016-01-01", "2016-01-03") @@ -72,7 +73,7 @@ def test_contact_reference_location_stats_false_statistic_raises(): *[ daily_location( d, - level="versioned-cell", + spatial_unit=make_spatial_unit("versioned-cell"), subscriber_subset=cb.counterparts_subset(include_subscribers=True), ) for d in list_of_dates("2016-01-01", "2016-01-03") @@ -82,33 +83,36 @@ def test_contact_reference_location_stats_false_statistic_raises(): query = ContactReferenceLocationStats(cb, ml, statistic="error") -def test_contact_reference_location_false_level_raises(): - """ Test ValueError is raised for contact_location with non-compliant level. """ +def test_contact_reference_location_bad_spatial_unit_raises(): + """ + Test InvalidSpatialUnitError is raised for contact_location with + non-compliant spatial unit. + """ cb = ContactBalance("2016-01-01", "2016-01-03") ml = ModalLocation( *[ daily_location( d, - level="admin3", + spatial_unit=make_spatial_unit("admin", level=3), subscriber_subset=cb.counterparts_subset(include_subscribers=True), ) for d in list_of_dates("2016-01-01", "2016-01-03") ] ) - with pytest.raises(ValueError): + with pytest.raises(InvalidSpatialUnitError): query = ContactReferenceLocationStats(cb, ml) -def test_contact_reference_location_no_level_raises(): - """ Test ValueError is raised for contact_location without level attribute. """ +def test_contact_reference_location_no_spatial_unit_raises(): + """ Test ValueError is raised for contact_location without spatial_unit attribute. 
""" cb = ContactBalance("2016-01-01", "2016-01-03") - # by encapsulating ModalLocations in a CustomQuery we remove the level + # by encapsulating ModalLocations in a CustomQuery we remove the spatial_unit # attribute from it which should raise an error ml = ModalLocation( *[ daily_location( d, - level="versioned-cell", + spatial_unit=make_spatial_unit("versioned-cell"), subscriber_subset=cb.counterparts_subset(include_subscribers=True), ) for d in list_of_dates("2016-01-01", "2016-01-03") From ea49cca3ada105255e423354b77c1ed44fd3eba7 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sun, 9 Jun 2019 20:53:24 +0100 Subject: [PATCH 093/138] Update LocationEntropy --- .../features/subscriber/entropy.py | 43 +++++-------------- flowmachine/tests/test_subscriber_entropy.py | 10 +++-- 2 files changed, 18 insertions(+), 35 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/entropy.py b/flowmachine/flowmachine/features/subscriber/entropy.py index 9239d75d6e..14f7fe1c8b 100644 --- a/flowmachine/flowmachine/features/subscriber/entropy.py +++ b/flowmachine/flowmachine/features/subscriber/entropy.py @@ -15,7 +15,7 @@ from .contact_balance import ContactBalance from ..utilities.sets import EventsTablesUnion from ..utilities.subscriber_locations import SubscriberLocations -from flowmachine.utils import get_columns_for_level +from flowmachine.core import make_spatial_unit class BaseEntropy(SubscriberFeature, metaclass=ABCMeta): @@ -220,29 +220,9 @@ class LocationEntropy(BaseEntropy): ---------- start, stop : str iso-format start and stop datetimes - level : str, default 'cell' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn' Either msisdn, or imei, the column that identifies the subscriber. 
subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None @@ -277,8 +257,7 @@ def __init__( start, stop, *, - level="cell", - column_name=None, + spatial_unit=make_spatial_unit("cell"), subscriber_identifier="msisdn", hours="all", subscriber_subset=None, @@ -289,26 +268,26 @@ def __init__( self.subscriber_locations = SubscriberLocations( start=start, stop=stop, - level=level, - column_name=column_name, + spatial_unit=spatial_unit, table=tables, hours=hours, subscriber_identifier=subscriber_identifier, subscriber_subset=subscriber_subset, ignore_nulls=ignore_nulls, ) - self.location_cols = ", ".join( - get_columns_for_level(level=level, column_name=column_name) - ) + super().__init__() @property def _absolute_freq_query(self): + location_cols = ", ".join( + self.subscriber_locations.spatial_unit.location_id_columns + ) return f""" SELECT subscriber, COUNT(*) AS absolute_freq FROM ({self.subscriber_locations.get_query()}) u - GROUP BY subscriber, {self.location_cols} + GROUP BY subscriber, {location_cols} HAVING COUNT(*) > 0 """ diff --git a/flowmachine/tests/test_subscriber_entropy.py b/flowmachine/tests/test_subscriber_entropy.py index a006d62a26..b5ec5fa3bf 100644 --- a/flowmachine/tests/test_subscriber_entropy.py +++ b/flowmachine/tests/test_subscriber_entropy.py @@ -2,10 +2,12 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -from flowmachine.features.subscriber.entropy import * -import numpy as np import pytest +import numpy as np + +from flowmachine.core import make_spatial_unit +from flowmachine.features.subscriber.entropy import * class MockEntropy(BaseEntropy): @@ -67,7 +69,9 @@ def test_subscriber_location_entropy(get_dataframe): df = get_dataframe(query).set_index("subscriber") assert df.loc["0DB8zw67E9mZAPK2"].entropy == pytest.approx(2.996_587) - query = LocationEntropy("2016-01-02", "2016-01-05", level="admin1") + query = LocationEntropy( + "2016-01-02", "2016-01-05", spatial_unit=make_spatial_unit("admin", level=1) + ) df = get_dataframe(query).set_index("subscriber") assert df.loc["0DB8zw67E9mZAPK2"].entropy == pytest.approx(1.214_889_6) From 75a5f2940db427e05463b71667c09c9ed3153669 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sun, 9 Jun 2019 22:43:41 +0100 Subject: [PATCH 094/138] Update LocationVisits --- .../features/subscriber/location_visits.py | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/location_visits.py b/flowmachine/flowmachine/features/subscriber/location_visits.py index d81d20ccb5..ad7e6066b6 100644 --- a/flowmachine/flowmachine/features/subscriber/location_visits.py +++ b/flowmachine/flowmachine/features/subscriber/location_visits.py @@ -14,7 +14,6 @@ from typing import List from flowmachine.features.subscriber.metaclasses import SubscriberFeature -from flowmachine.utils import get_columns_for_level class LocationVisits(SubscriberFeature): @@ -25,7 +24,8 @@ class LocationVisits(SubscriberFeature): Examples -------- >>> lv = LocationVisits('2016-01-01', '2016-01-04', - level = 'admin3', method = 'last', hours = (5,17)) + spatial_unit=make_spatial_unit('admin', level=3), + method='last', hours=(5,17)) >>> lv.head(4) subscriber name dl_count 0 038OVABN11Ak4W5P Dolpa 5 @@ -36,42 +36,33 @@ class LocationVisits(SubscriberFeature): def __init__(self, day_trajectories): self.day_trajectories = day_trajectories - self.level = day_trajectories.level - 
self.column_name = day_trajectories.column_name + self.spatial_unit = day_trajectories.spatial_unit super().__init__() @property def column_names(self) -> List[str]: - return ( - ["subscriber"] - + get_columns_for_level(self.level, self.column_name) - + ["dl_count"] - ) + return ["subscriber"] + self.spatial_unit.location_id_columns + ["dl_count"] def _make_query(self): """ Default query method implemented in the metaclass Query(). """ - relevant_columns = ", ".join( - get_columns_for_level(self.level, self.column_name) - ) + location_columns = ", ".join(self.spatial_unit.location_id_columns) - sql = """ + sql = f""" SELECT day_trajectories.subscriber, - day_trajectories.{rc}, + day_trajectories.{location_columns}, COUNT(*) AS dl_count FROM - ({day_trajectories}) AS day_trajectories + ({self.day_trajectories.get_query()}) AS day_trajectories GROUP BY day_trajectories.subscriber, - day_trajectories.{rc} + day_trajectories.{location_columns} ORDER BY day_trajectories.subscriber, COUNT(*) DESC - """.format( - rc=relevant_columns, day_trajectories=self.day_trajectories.get_query() - ) + """ return sql From 4e9cc22be846624c7115b4916652b74963c18637 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sun, 9 Jun 2019 23:05:58 +0100 Subject: [PATCH 095/138] Raise InvalidSpatialUnitError in location_joined_query --- .../flowmachine/core/join_to_location.py | 4 ++++ flowmachine/tests/test_join_to_location.py | 19 ++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index ae9403e2e8..23f1371c6c 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -15,6 +15,8 @@ from typing import List from .query import Query +from .spatial_unit import SpatialUnitMixin +from .errors import InvalidSpatialUnitError class JoinToLocation(Query): @@ -129,6 +131,8 @@ def location_joined_query(left, *, spatial_unit, time_col="time"): flowmachine.Query Either a JoinToLocation object, or the input parameter 'left' """ + if not isinstance(spatial_unit, SpatialUnitMixin): + raise InvalidSpatialUnitError(f"{spatial_unit} is not a spatial unit.") if spatial_unit.has_geography: return JoinToLocation(left, spatial_unit=spatial_unit, time_col=time_col) else: diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index a039a6aeb5..7687e148de 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -10,6 +10,7 @@ from flowmachine.features import SubscriberLocations from flowmachine.core import JoinToLocation, location_joined_query, make_spatial_unit +from flowmachine.core.errors import InvalidSpatialUnitError def test_join_to_location_column_names(exemplar_spatial_unit_param): @@ -25,10 +26,10 @@ def test_join_to_location_column_names(exemplar_spatial_unit_param): def test_join_to_location_raises_value_error(): """ - Test that JoinToLocation raises a ValueError if spatial_unit does not have - geography information. + Test that JoinToLocation raises an InvalidSpatialUnitError if spatial_unit + does not have geography information. 
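The new guard in location_joined_query, sketched in isolation (illustrative; a configured connection is assumed):

from flowmachine.core import location_joined_query, make_spatial_unit
from flowmachine.core.errors import InvalidSpatialUnitError
from flowmachine.features import SubscriberLocations

sl = SubscriberLocations(
    "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell")
)
try:
    # Anything that is not a spatial unit object is now rejected up front.
    location_joined_query(sl, spatial_unit="admin3")
except InvalidSpatialUnitError:
    pass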
""" - with pytest.raises(ValueError): + with pytest.raises(InvalidSpatialUnitError): table = SubscriberLocations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) @@ -167,3 +168,15 @@ def test_location_joined_query_return_type(exemplar_spatial_unit_param): assert joined is table else: assert isinstance(joined, JoinToLocation) + + +def test_ocation_joined_query_raises_error(): + """ + Test that location_joined_query raises an error if spatial_unit is not a + SpatialUnit object. + """ + table = SubscriberLocations( + "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") + ) + with pytest.raises(InvalidSpatialUnitError): + location_joined_query(table, spatial_unit="foo") From 514f1dd61b54f28fae4cb54fc25d17493925574b Mon Sep 17 00:00:00 2001 From: James Harrison Date: Sun, 9 Jun 2019 23:06:31 +0100 Subject: [PATCH 096/138] Update UniqueLocationCounts --- .../subscriber/unique_location_counts.py | 55 +++++-------------- .../tests/test_unique_location_counts.py | 22 +++++--- 2 files changed, 28 insertions(+), 49 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/unique_location_counts.py b/flowmachine/flowmachine/features/subscriber/unique_location_counts.py index f5c966f440..cbc0b1938a 100644 --- a/flowmachine/flowmachine/features/subscriber/unique_location_counts.py +++ b/flowmachine/flowmachine/features/subscriber/unique_location_counts.py @@ -13,7 +13,7 @@ """ from typing import List -from flowmachine.utils import get_columns_for_level +from flowmachine.core import make_spatial_unit from ..utilities.subscriber_locations import SubscriberLocations from .metaclasses import SubscriberFeature @@ -31,29 +31,9 @@ class UniqueLocationCounts(SubscriberFeature): e.g. 2016-01-01 or 2016-01-01 14:03:01 stop : str As above - level : str, default 'cell' - Levels can be one of: - 'cell': - The identifier as it is found in the CDR itself - 'versioned-cell': - The identifier as found in the CDR combined with the version from - the cells table. - 'versioned-site': - The ID found in the sites table, coupled with the version - number. - 'polygon': - A custom set of polygons that live in the database. In which - case you can pass the parameters column_name, which is the column - you want to return after the join, and table_name, the table where - the polygons reside (with the schema), and additionally geom_col - which is the column with the geometry information (will default to - 'geom') - 'admin*': - An admin region of interest, such as admin3. Must live in the - database in the standard location. - 'grid': - A square in a regular grid, in addition pass size to - determine the size of the polygon. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. hours : tuple of ints, default 'all' subset the result within certain hours, e.g. 
(4,17) This will subset the query only with these hours, but @@ -79,7 +59,8 @@ class UniqueLocationCounts(SubscriberFeature): Examples -------- >>> ulc = UniqueLocationCounts('2016-01-01', '2016-01-04', - level = 'admin3', method = 'last', hours = (5,17)) + spatial_unit=make_spatial_unit('admin', level=3), + method='last', hours=(5,17)) >>> ulc.head(4) subscriber unique_location_counts 0 038OVABN11Ak4W5P 3 @@ -92,29 +73,23 @@ def __init__( start, stop, *, - level="cell", + spatial_unit=make_spatial_unit("cell"), hours="all", tables="all", subscriber_identifier="msisdn", ignore_nulls=True, - column_name=None, subscriber_subset=None, - polygon_table=None, - size=None, ): self.ul = SubscriberLocations( start=start, stop=stop, - level=level, + spatial_unit=spatial_unit, hours=hours, table=tables, subscriber_identifier=subscriber_identifier, ignore_nulls=ignore_nulls, - column_name=column_name, subscriber_subset=subscriber_subset, - polygon_table=polygon_table, - size=size, ) super().__init__() @@ -128,18 +103,14 @@ def _make_query(self): metaclass Query(). """ - relevant_columns = ",".join( - get_columns_for_level(self.ul.level, self.ul.column_name) - ) - sql = """ + location_columns = ",".join(self.ul.spatial_unit.location_id_columns) + sql = f""" SELECT subscriber, COUNT(*) as unique_location_counts FROM - (SELECT DISTINCT subscriber, {rc} - FROM ({all_locs}) AS all_locs) AS _ + (SELECT DISTINCT subscriber, {location_columns} + FROM ({self.ul.get_query()}) AS all_locs) AS _ GROUP BY subscriber - """.format( - all_locs=self.ul.get_query(), rc=relevant_columns - ) + """ return sql diff --git a/flowmachine/tests/test_unique_location_counts.py b/flowmachine/tests/test_unique_location_counts.py index 5af2beffa7..6ac067f060 100644 --- a/flowmachine/tests/test_unique_location_counts.py +++ b/flowmachine/tests/test_unique_location_counts.py @@ -4,23 +4,26 @@ import pytest -from flowmachine.core.errors import BadLevelError +from flowmachine.core.errors import InvalidSpatialUnitError from flowmachine.core import make_spatial_unit from flowmachine.features import UniqueLocationCounts, SubscriberLocations def test_returns_errors(): """ - Test level exists + Test spatial unit exists """ - with pytest.raises(BadLevelError): - UniqueLocationCounts("2016-01-01", "2016-01-02", level="foo") + with pytest.raises(InvalidSpatialUnitError): + UniqueLocationCounts("2016-01-01", "2016-01-02", spatial_unit="foo") -def test_column_names_unique_location_counts(exemplar_level_param): +def test_column_names_unique_location_counts(exemplar_spatial_unit_param): """ Test that column_names property matches head(0) for UniqueLocationCounts""" lv = UniqueLocationCounts( - "2016-01-01", "2016-01-02", **exemplar_level_param, hours=(5, 17) + "2016-01-01", + "2016-01-02", + spatial_unit=exemplar_spatial_unit_param, + hours=(5, 17), ) assert lv.head(0).columns.tolist() == lv.column_names @@ -29,7 +32,12 @@ def test_correct_counts(get_dataframe): """ UniqueLocationCounts returns correct counts. 
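For reference, the migrated constructor in one hedged sketch — the old level, column_name, polygon_table and size arguments are replaced by a single spatial_unit:

from flowmachine.core import make_spatial_unit
from flowmachine.features import UniqueLocationCounts

ulc = UniqueLocationCounts(
    "2016-01-01",
    "2016-01-04",
    spatial_unit=make_spatial_unit("admin", level=3),
    hours=(5, 17),
)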
""" - ulc = UniqueLocationCounts("2016-01-01", "2016-01-02", level="cell", hours=(5, 17)) + ulc = UniqueLocationCounts( + "2016-01-01", + "2016-01-02", + spatial_unit=make_spatial_unit("cell"), + hours=(5, 17), + ) df = get_dataframe(ulc) dful = get_dataframe( SubscriberLocations( From 0db0e8e47a5b426dd9313920f6e2aa74ca79d6d0 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Jun 2019 01:08:02 +0100 Subject: [PATCH 097/138] Update FirstLocation --- .../features/subscriber/first_location.py | 91 +++++-------------- flowmachine/tests/test_first_location.py | 8 +- 2 files changed, 28 insertions(+), 71 deletions(-) diff --git a/flowmachine/flowmachine/features/subscriber/first_location.py b/flowmachine/flowmachine/features/subscriber/first_location.py index 70423a53c5..28f02a83b9 100644 --- a/flowmachine/flowmachine/features/subscriber/first_location.py +++ b/flowmachine/flowmachine/features/subscriber/first_location.py @@ -12,7 +12,7 @@ """ from typing import List -from flowmachine.utils import get_columns_for_level +from flowmachine.core import make_spatial_unit from .metaclasses import SubscriberFeature from ..utilities.subscriber_locations import SubscriberLocations @@ -30,16 +30,22 @@ class FirstLocation(SubscriberFeature): String representing the beginning of the focal time period stop : str String representing the end of the focal period - location : str, tuple, list of str, or list of tuple + location : str, dict, tuple, list of str, list of dict, or list of tuple str representing the location of interest. Could be a cell or an admin region for instance. You must specify - level to match this. i.e. location='ER0980', level='cell'. + spatial_unit to match this. i.e. location='ER0980', + spatial_unit=make_spatial_unit('cell'). Can also pass a list of strings e.g. ['ER0980', 'CW2020'] will return the time at which the subscriber was first at any of these locations. Pass the argument 'any', to find the first time a subscriber pops up at any location. - For the levels versioned-cell, versioned-site may be a tuple or list thereof. - For the level lat-lon, this _must_ be a tuple. + For spatial units with multiple location_id_columns, see + `SpatialUnitMixin.location_subset_clause` or + `LatLonSpatialUnit.location_subset_clause` for a description of the + allowed formats for the location argument. + spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell + Spatial unit to which subscriber locations will be mapped. See the + docstring of make_spatial_unit for more information. subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn' Either msisdn, or imei, the column that identifies the subscriber. subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None @@ -58,24 +64,21 @@ def __init__( stop, *, location, - level="cell", + spatial_unit=make_spatial_unit("cell"), hours="all", table="all", subscriber_identifier="msisdn", ignore_nulls=True, - column_name=None, subscriber_subset=None, - polygon_table=None, - size=None, ): """ """ - if location == "any" and level != "cell": + if location == "any" and spatial_unit != make_spatial_unit("cell"): raise ValueError( - "Invalid parameter combination: location='any' can only be used with level='cell'." + "Invalid parameter combination: location='any' can only be used with cell spatial unit." 
) self.start = start @@ -85,15 +88,12 @@ def __init__( self.ul = SubscriberLocations( self.start, self.stop, - level=level, + spatial_unit=spatial_unit, hours=hours, table=table, subscriber_identifier=subscriber_identifier, ignore_nulls=ignore_nulls, - column_name=column_name, subscriber_subset=subscriber_subset, - polygon_table=polygon_table, - size=size, ) self.table = self.ul.table @@ -110,73 +110,26 @@ def _make_query(self): Default query method implemented in the metaclass Query(). """ + clause = self._get_locations_clause(self.location) - column_name = get_columns_for_level(self.ul.level) - - clause = self._get_locations_clause(self.location, column_name) - - sql = """ + sql = f""" SELECT relevant_locs.subscriber, min(time) AS time FROM - (SELECT * FROM ({subscriber_locs}) AS subscriber_locs + (SELECT * FROM ({self.ul.get_query()}) AS subscriber_locs {clause}) AS relevant_locs GROUP BY relevant_locs.subscriber - """.format( - subscriber_locs=self.ul.get_query(), clause=clause - ) + """ return sql - def _get_locations_clause(self, location, column_name): + def _get_locations_clause(self, location): """ Private method for getting location clause in statement. """ if location == "any": return "" - if len(column_name) == 1: # polygon, admin, cell, grid - if isinstance(location, tuple) or isinstance(location, list): - in_list = "('" + "','".join(location) + "')" - return "WHERE {} in {}".format(column_name[0], in_list) - else: - return "WHERE {} = '{}'".format(column_name[0], location) - elif self.ul.level == "lat-lon": - if isinstance(location, tuple) or isinstance(location, list): - in_list = ( - "('" - + "','".join( - "ST_SetSRID(ST_Point({}, {}), 4326)".format(lon, lat) - for lon, lat in location - ) - + "')" - ) - return "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) in {}".format( - in_list - ) - else: - return "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point({}, {}), 4326)'".format( - *location - ) - else: # Versioned things - if isinstance(location, str): # Deal with single string - location = (location,) - elif isinstance( - location, list - ): # Deal with possible single strings in list - location = [l if isinstance(l, tuple) else (l,) for l in location] - if isinstance(location, tuple): - return "WHERE " + " AND ".join( - "{} = '{}'".format(c, l) for c, l in zip(column_name, location) - ) - else: - ands = " OR ".join( - "({})".format( - " AND ".join( - "{} = '{}'".format(c, l) for c, l in zip(column_name, loc) - ) - ) - for loc in location - ) - return "WHERE " + ands + else: + return self.ul.spatial_unit.location_subset_clause(self.location) diff --git a/flowmachine/tests/test_first_location.py b/flowmachine/tests/test_first_location.py index 0520dda177..275d03d16a 100644 --- a/flowmachine/tests/test_first_location.py +++ b/flowmachine/tests/test_first_location.py @@ -6,6 +6,7 @@ Tests for the class FirstLocation """ +from flowmachine.core import make_spatial_unit from flowmachine.features.subscriber import FirstLocation @@ -14,7 +15,10 @@ def test_time_at_first_location_correct(get_dataframe): FirstLocation() dataframe contains hand-picked records. 
""" dfl = FirstLocation( - "2016-01-01", "2016-01-04", location="QeBRM8", level="versioned-site" + "2016-01-01", + "2016-01-04", + location="QeBRM8", + spatial_unit=make_spatial_unit("versioned-site"), ) df = get_dataframe(dfl) @@ -32,7 +36,7 @@ def test_handles_list_of_locations(get_dataframe): "2016-01-01", "2016-01-04", location=["QeBRM8", "m9jL23" "LVnDQL"], - level="versioned-site", + spatial_unit=make_spatial_unit("versioned-site"), ) df = get_dataframe(dfl) From 942e2e3484aa1c7630d8cbc99ac8f3cbc7eb0f5b Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Jun 2019 01:08:23 +0100 Subject: [PATCH 098/138] Start implementing location_subset_clause --- flowmachine/flowmachine/core/spatial_unit.py | 165 ++++++++++++++++++- 1 file changed, 157 insertions(+), 8 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 68aac76734..9e851eecee 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -115,6 +115,104 @@ def verify_criterion(self, criterion, negate=False): + criteria[criterion]["message"] ) + def location_subset_clause(self, locations, check_column_names=True): + """ + Return a SQL "WHERE" clause to subset a query (joined to this spatial + unit) to a location or set of locations. + + Parameters + ---------- + locations : str, dict, list of str or list of dict + Location or list of locations to subset to. + If this has type str or list of str, the values are assumed to + correspond to the first column in self.location_id_columns. + If it has type dict or list of dict, the dict keys should + correspond to the column names in self.location_id_columns. + check_column_names : bool, default True + If True, check that all dict keys can be found in + self.location_id_columns. + + Returns + ------- + str + SQL where clause. + + See also + -------- + LatLonSpatialUnit.location_subset_clause + """ + raise NotImplementedError( + "location_subset_clause is not fully implemented yet." 
+ ) + if isinstance(locations, list) or isinstance(locations, tuple): + if isinstance(locations[0], dict): + # multiple locations, multiple columns + # TODO: Check keys are subset of self.location_id_columns when check_column_names==True + ands = [ + " AND ".join(f"{key} = '{value}'" for key, value in loc.items()) + for loc in locations + ] + return "WHERE (" + ") OR (".join(ands) + ")" + else: + # multiple locations, first column + locs_list_string = ", ".join(f"'{l}'" for l in locations) + return f"WHERE {self.location_id_columns[0]} IN ({locs_list_string})" + elif isinstance(locations, dict): + # one location, multiple columns + # TODO: Check keys are subset of self.location_id_columns when check_column_names==True + return "WHERE " + " AND ".join( + f"{key} = '{value}'" for key, value in locations.items() + ) + else: + # one location, first column + return f"WHERE {self.location_id_columns[0]} = '{locations}'" + + # From FirstLocation._get_locations_clause: + # if len(column_name) == 1: # polygon, admin, cell, grid + # if isinstance(location, tuple) or isinstance(location, list): + # in_list = "('" + "','".join(location) + "')" + # return "WHERE {} in {}".format(column_name[0], in_list) + # else: + # return "WHERE {} = '{}'".format(column_name[0], location) + # elif self.ul.level == "lat-lon": + # if isinstance(location, tuple) or isinstance(location, list): + # in_list = ( + # "('" + # + "','".join( + # "ST_SetSRID(ST_Point({}, {}), 4326)".format(lon, lat) + # for lon, lat in location + # ) + # + "')" + # ) + # return "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) in {}".format( + # in_list + # ) + # else: + # return "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point({}, {}), 4326)'".format( + # *location + # ) + # else: # Versioned things + # if isinstance(location, str): # Deal with single string + # location = (location,) + # elif isinstance( + # location, list + # ): # Deal with possible single strings in list + # location = [l if isinstance(l, tuple) else (l,) for l in location] + # if isinstance(location, tuple): + # return "WHERE " + " AND ".join( + # "{} = '{}'".format(c, l) for c, l in zip(column_name, location) + # ) + # else: + # ands = " OR ".join( + # "({})".format( + # " AND ".join( + # "{} = '{}'".format(c, l) for c, l in zip(column_name, loc) + # ) + # ) + # for loc in location + # ) + # return "WHERE " + ands + class CellSpatialUnit(SpatialUnitMixin): """ @@ -303,9 +401,9 @@ class LatLonSpatialUnit(GeomSpatialUnit): Parameters ---------- - geom_table_column_names : str or list + geom_table_column_names : str or list, default [] Name(s) of the column(s) to fetch from geom_table. - location_id_column_names : str or list + location_id_column_names : str or list, default [] Name(s) of the column(s) which identify the locations. Must be a subset of the column_names for this query. geom_table : str or flowmachine.Query, optional @@ -316,17 +414,19 @@ class LatLonSpatialUnit(GeomSpatialUnit): geom_column : str, default "geom_point" Name of the column in geom_table that defines the point geometry from which latitude and longitude will be extracted. - geom_table_join_on : str + geom_table_join_on : str, optional Name of the column from geom_table to join on. - location_table_join_on : str + Required if geom_table != connection.location_table. + location_table_join_on : str, optional Name of the column from connection.location_table to join on. + Required if geom_table != connection.location_table. 
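Sketch of the intended end state of this change (note the tuple concatenation added to __init__ here is reworked in the "Fix LatLonSpatialUnit" commit further down, which appends "lon" and "lat" in the location_id_columns property instead):

from flowmachine.core import make_spatial_unit

su = make_spatial_unit("versioned-site")
# The identifying columns passed by the helper, plus the appended lon/lat pair.
print(su.location_id_columns)  # expected: ['site_id', 'version', 'lon', 'lat']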
""" def __init__( self, *, geom_table_column_names=(), - location_id_column_names=("lat", "lon"), + location_id_column_names=(), geom_table=None, geom_column="geom_point", geom_table_join_on=None, @@ -336,7 +436,7 @@ def __init__( self._loc_on = location_table_join_on super().__init__( geom_table_column_names=geom_table_column_names, - location_id_column_names=location_id_column_names, + location_id_column_names=location_id_column_names + ("lon", "lat"), geom_table=geom_table, geom_column=geom_column, ) @@ -356,6 +456,55 @@ def _join_clause(self, loc_table_alias, geom_table_alias): ON {loc_table_alias}.{self._loc_on} = {geom_table_alias}.{self._geom_on} """ + def location_subset_clause(self, locations, check_column_names=True): + """ + Return a SQL "WHERE" clause to subset a query (joined to this spatial + unit) to a location or set of locations. This method differs from the + default implementation in its handling of lat-lon values, i.e. it returns + WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = ST_SetSRID(ST_Point(, ), 4326)' + instead of + WHERE lon = '' AND lat = '' + + Parameters + ---------- + locations : str, tuple, dict, list of str, list of tuple or list of dict + Location or list of locations to subset to. + If this has type tuple (length 2) or list of tuple, the values are + assumed to correspond to the 'lon' and 'lat' columns. + If this has type str or list of str, the values are assumed to + correspond to the first column in self.location_id_columns. + If it has type dict or list of dict, the dict keys should + correspond to the column names in self.location_id_columns. + check_column_names : bool, default True + If True, check that all dict keys can be found in + self.location_id_columns. + + Returns + ------- + str + SQL where clause. + + See also + -------- + SpatialUnitMixin.location_subset_clause + """ + raise NotImplementedError( + "LatLonSpatialUnit.location_subset_clause is not implemented yet." + ) + # TODO: Implement this. + # Should raise an error if locations is a string or list of string and + # location_id_columns[0] == 'lon'. + # Should return example in docstring (or + # "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) IN (...)") if + # locations is a tuple or list of tuples. + # Should return same as SpatialUnitMixin.location_subset_clause if + # locations is a dict or list of dict and "lat" and "lon" columns are + # not included in keys. 
+ # If lat" and "lon" are included in the dict keys, should be able to + # create a new dict with "lat", "lon" keys replaced by + # "ST_SetSRID(ST_Point(lon, lat), 4326)" (and set corresponding value), + # and then call super().location_subset_clause with check_column_names=False + class PolygonSpatialUnit(GeomSpatialUnit): """ @@ -414,7 +563,7 @@ def versioned_cell_spatial_unit(): return LatLonSpatialUnit( geom_table_column_names=["version"], - location_id_column_names=["location_id", "version", "lon", "lat"], + location_id_column_names=["location_id", "version"], geom_table="infrastructure.cells", ) @@ -430,7 +579,7 @@ def versioned_site_spatial_unit(): """ return LatLonSpatialUnit( geom_table_column_names=["id AS site_id", "version"], - location_id_column_names=["site_id", "version", "lon", "lat"], + location_id_column_names=["site_id", "version"], geom_table="infrastructure.sites", geom_table_join_on="id", location_table_join_on="site_id", From 117f61cfbc444279e32883238361ab39896a708d Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Jun 2019 10:25:15 +0100 Subject: [PATCH 099/138] Fix LatLonSpatialUnit --- flowmachine/flowmachine/core/spatial_unit.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 9e851eecee..692501e305 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -406,6 +406,7 @@ class LatLonSpatialUnit(GeomSpatialUnit): location_id_column_names : str or list, default [] Name(s) of the column(s) which identify the locations. Must be a subset of the column_names for this query. + "lon" and "lat" will be appended to this list of names. geom_table : str or flowmachine.Query, optional Name of the table containing the geography information. Can be either the name of a table, with the schema, or a @@ -436,7 +437,7 @@ def __init__( self._loc_on = location_table_join_on super().__init__( geom_table_column_names=geom_table_column_names, - location_id_column_names=location_id_column_names + ("lon", "lat"), + location_id_column_names=location_id_column_names, geom_table=geom_table, geom_column=geom_column, ) @@ -456,6 +457,13 @@ def _join_clause(self, loc_table_alias, geom_table_alias): ON {loc_table_alias}.{self._loc_on} = {geom_table_alias}.{self._geom_on} """ + @property + def location_id_columns(self) -> List[str]: + """ + Names of the columns that identify a location. + """ + return list(self._locid_cols) + ["lon", "lat"] + def location_subset_clause(self, locations, check_column_names=True): """ Return a SQL "WHERE" clause to subset a query (joined to this spatial From 73b0da4242a48cd117ef36623963459f728996b8 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Jun 2019 12:30:51 +0100 Subject: [PATCH 100/138] Fix test_cache_utils.py --- flowmachine/tests/test_cache_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flowmachine/tests/test_cache_utils.py b/flowmachine/tests/test_cache_utils.py index c23aaedfb1..d2121ed434 100644 --- a/flowmachine/tests/test_cache_utils.py +++ b/flowmachine/tests/test_cache_utils.py @@ -471,7 +471,7 @@ def test_cache_reset_protects_tables(flowmachine_connect): Resetting the cache should preserve Table entries. 
""" # Regression test for https://github.com/Flowminder/FlowKit/issues/832 - dl_query = daily_location(date="2016-01-03", level="admin3", method="last") + dl_query = daily_location(date="2016-01-03", method="last") reset_cache(flowmachine_connect, dl_query.redis) for dep in dl_query._get_stored_dependencies(): assert dep.md5 in [x.md5 for x in Query.get_stored()] @@ -499,7 +499,7 @@ def test_cache_metadata_write_error(flowmachine_connect, dummy_redis, monkeypatc # Regression test for https://github.com/Flowminder/FlowKit/issues/833 writer_mock = Mock(side_effect=TestException) - dl_query = daily_location(date="2016-01-03", level="admin3", method="last") + dl_query = daily_location(date="2016-01-03", method="last") assert not dl_query.is_stored monkeypatch.setattr("flowmachine.core.cache.write_cache_metadata", writer_mock) From 1d6c18eadd912a422372fbf103b37b1c6f710f53 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Jun 2019 12:31:26 +0100 Subject: [PATCH 101/138] Fix versioned-site spatial unit --- flowmachine/flowmachine/core/spatial_unit.py | 70 +++++++++++++++----- 1 file changed, 53 insertions(+), 17 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 692501e305..2e8a1ed109 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -122,12 +122,16 @@ def location_subset_clause(self, locations, check_column_names=True): Parameters ---------- - locations : str, dict, list of str or list of dict + locations : str, dict, or list/tuple thereof Location or list of locations to subset to. - If this has type str or list of str, the values are assumed to - correspond to the first column in self.location_id_columns. - If it has type dict or list of dict, the dict keys should - correspond to the column names in self.location_id_columns. + This should have one of the following formats: + str, or list/tuple of str + Values correspond to the first column in + self.location_id_columns. + dict, or list/tuple of dict + Dict keys correspond to the column names in + self.location_id_columns, and values correspond to the + values in those columns. check_column_names : bool, default True If True, check that all dict keys can be found in self.location_id_columns. 
@@ -346,11 +350,25 @@ def _make_query(self): self._get_aliased_geom_table_cols(geom_table_alias) ) + loc_table_cols_string = f"{loc_table_alias}.id AS location_id" + + geom_table_col_aliases = [ + get_name_and_alias(c)[1] for c in self._geom_table_cols + ] + if not ( + "date_of_first_service" in geom_table_col_aliases + and "date_of_last_service" in geom_table_col_aliases + ): + # If we're not selecting dates from the geom table, we need to + # select them from the location table + loc_table_cols_string += f""", + {loc_table_alias}.date_of_first_service, + {loc_table_alias}.date_of_last_service + """ + sql = f""" SELECT - {loc_table_alias}.id AS location_id, - {loc_table_alias}.date_of_first_service, - {loc_table_alias}.date_of_last_service, + {loc_table_cols_string}, {geom_table_cols_string} FROM {self.connection.location_table} AS {loc_table_alias} {join_clause} @@ -360,10 +378,18 @@ def _make_query(self): @property def column_names(self) -> List[str]: - return ["location_id", "date_of_first_service", "date_of_last_service"] + [ + cols = ["location_id"] + geom_table_cols = [ get_name_and_alias(c)[1] for c in self._get_aliased_geom_table_cols("geom_table") ] + if not ( + "date_of_first_service" in geom_table_cols + and "date_of_last_service" in geom_table_cols + ): + cols += ["date_of_first_service", "date_of_last_service"] + cols += geom_table_cols + return cols def get_geom_query(self): """ @@ -475,14 +501,19 @@ def location_subset_clause(self, locations, check_column_names=True): Parameters ---------- - locations : str, tuple, dict, list of str, list of tuple or list of dict + locations : tuple, str, dict, or list/tuple thereof Location or list of locations to subset to. - If this has type tuple (length 2) or list of tuple, the values are - assumed to correspond to the 'lon' and 'lat' columns. - If this has type str or list of str, the values are assumed to - correspond to the first column in self.location_id_columns. - If it has type dict or list of dict, the dict keys should - correspond to the column names in self.location_id_columns. + This should have one of the following formats: + tuple (length 2), or list/tuple of tuple + Values are (longitude, latitude) pairs, corresponding to + the 'lon' and 'lat' columns. + str, or list/tuple of str + Values correspond to the first column in + self.location_id_columns. + dict, or list/tuple of dict + Dict keys correspond to the column names in + self.location_id_columns, and values correspond to the + values in those columns. check_column_names : bool, default True If True, check that all dict keys can be found in self.location_id_columns. 
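The tuple and dict formats documented in the docstring above are only implemented later in this series (patch 106). The sketch below shows the clauses that docstring describes, using the "lon-lat" naming adopted from patch 102 onwards; the coordinates are made up and a flowmachine connection is assumed.

from flowmachine.core.spatial_unit import make_spatial_unit

lonlat = make_spatial_unit("lon-lat")  # named "lat-lon" prior to patch 102

lonlat.location_subset_clause([(83.09, 27.65)])  # length-1 list of (lon, lat)
# -> "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point(83.09, 27.65), 4326)'"

lonlat.location_subset_clause([(83.09, 27.65), (83.25, 27.66)])
# -> "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) IN ('ST_SetSRID(ST_Point(83.09, 27.65), 4326)', 'ST_SetSRID(ST_Point(83.25, 27.66), 4326)')"

lonlat.location_subset_clause({"lon": 83.09, "lat": 27.65})
# -> "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point(83.09, 27.65), 4326)'"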
@@ -586,7 +617,12 @@ def versioned_site_spatial_unit(): flowmachine.core.spatial_unit.LatLonSpatialUnit """ return LatLonSpatialUnit( - geom_table_column_names=["id AS site_id", "version"], + geom_table_column_names=[ + "date_of_first_service", + "date_of_last_service", + "id AS site_id", + "version", + ], location_id_column_names=["site_id", "version"], geom_table="infrastructure.sites", geom_table_join_on="id", From 6ac9ca884c1fad3b8aa592549d6eb71163b9e567 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Jun 2019 13:25:40 +0100 Subject: [PATCH 102/138] Use (lon,lat) order everywhere, for consistency with ST_Point --- flowmachine/flowmachine/core/grid.py | 10 ++-- .../flowmachine/core/join_to_location.py | 2 +- flowmachine/flowmachine/core/spatial_unit.py | 54 +++++++++---------- .../flowmachine/features/spatial/circles.py | 10 ++-- .../features/spatial/distance_matrix.py | 6 +-- .../features/spatial/location_area.py | 2 +- .../contact_reference_locations_stats.py | 6 +-- .../features/subscriber/displacement.py | 8 +-- .../features/subscriber/first_location.py | 4 +- .../features/subscriber/radius_of_gyration.py | 8 +-- .../features/utilities/spatial_aggregates.py | 2 +- .../utilities/subscriber_locations.py | 2 +- flowmachine/flowmachine/models/pwo.py | 2 +- flowmachine/flowmachine/utils.py | 14 +++-- flowmachine/tests/conftest.py | 2 +- flowmachine/tests/test_circles.py | 4 +- .../test_contact_reference_locations_stats.py | 2 +- flowmachine/tests/test_displacement.py | 10 ++-- flowmachine/tests/test_geomixin.py | 8 +-- flowmachine/tests/test_indexes.py | 4 +- flowmachine/tests/test_join_to_location.py | 18 +++---- flowmachine/tests/test_last_location.py | 8 +-- .../tests/test_location_introversion.py | 8 +-- .../tests/test_meaningful_locations.py | 4 +- .../tests/test_most_frequent_locations.py | 8 +-- flowmachine/tests/test_spatial_aggregate.py | 8 +-- .../tests/test_spatial_distancematrix.py | 2 +- flowmachine/tests/test_spatial_unit.py | 14 +++-- .../tests/test_total_network_objects.py | 8 +-- 29 files changed, 117 insertions(+), 121 deletions(-) diff --git a/flowmachine/flowmachine/core/grid.py b/flowmachine/flowmachine/core/grid.py index 5e6b2ef44f..a56552d1e0 100644 --- a/flowmachine/flowmachine/core/grid.py +++ b/flowmachine/flowmachine/core/grid.py @@ -49,13 +49,13 @@ def _geo_augmented_query(self): """ return ( - f"SELECT grid_id, geom_square as geom, row_number() OVER (ORDER BY latitude, longitude) as gid FROM ({self.get_query()}) as x", + f"SELECT grid_id, geom_square as geom, row_number() OVER (ORDER BY longitude, latitude) as gid FROM ({self.get_query()}) as x", ["grid_id", "geom", "gid"], ) @property def column_names(self) -> List[str]: - return ["grid_id", "geom_square", "geom_point", "latitude", "longitude"] + return ["grid_id", "geom_square", "geom_point", "longitude", "latitude"] def _make_query(self): @@ -75,11 +75,11 @@ def _make_query(self): sql = f""" SELECT - '{str(self.size).replace(".", "_")}' || '_' || row_number() OVER (ORDER BY latitude, longitude) AS grid_id, + '{str(self.size).replace(".", "_")}' || '_' || row_number() OVER (ORDER BY longitude, latitude) AS grid_id, geom_square, geom_point, - latitude, - longitude + longitude, + latitude FROM ({grid_sql}) AS grid """ diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index 23f1371c6c..29baa1022c 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -23,7 +23,7 @@ class 
JoinToLocation(Query): """ Intermediate class which joins any query object, or python string representing a query object, to some geographical level. - This can be simply the site with a version a lat-lon value, an + This can be simply the site with a version, a lon-lat value, an admin region, a gridded map, or any arbitrary polygon. This will return everything in the original query, plus an additional column or columns representing the spatial region that the infrastructure diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 2e8a1ed109..05b3c9b3d9 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -42,11 +42,11 @@ def has_geography(self): return hasattr(self, "get_geom_query") @property - def has_lat_lon_columns(self): + def has_lon_lat_columns(self): """ - True if spatial unit has lat/lon columns. + True if spatial unit has lon/lat columns. """ - return "lat" in self.location_id_columns and "lon" in self.location_id_columns + return "lon" in self.location_id_columns and "lat" in self.location_id_columns @property def is_network_object(self): @@ -75,7 +75,7 @@ def verify_criterion(self, criterion, negate=False): criterion : str One of: 'has_geography' - 'has_lat_lon_columns' + 'has_lon_lat_columns' 'is_network_object' 'is_polygon' negate : bool, default False @@ -94,9 +94,9 @@ def verify_criterion(self, criterion, negate=False): "property": self.has_geography, "message": f"{'has' if negate else 'does not have'} geography information.", }, - "has_lat_lon_columns": { - "property": self.has_lat_lon_columns, - "message": f"{'has' if negate else 'does not have'} latitude/longitude columns.", + "has_lon_lat_columns": { + "property": self.has_lon_lat_columns, + "message": f"{'has' if negate else 'does not have'} longitude/latitude columns.", }, "is_network_object": { "property": self.is_network_object, @@ -143,7 +143,7 @@ def location_subset_clause(self, locations, check_column_names=True): See also -------- - LatLonSpatialUnit.location_subset_clause + LonLatSpatialUnit.location_subset_clause """ raise NotImplementedError( "location_subset_clause is not fully implemented yet." @@ -417,13 +417,13 @@ def get_geom_query(self): return sql -class LatLonSpatialUnit(GeomSpatialUnit): +class LonLatSpatialUnit(GeomSpatialUnit): """ Class that provides a mapping from cell/site IDs in the location table to - latitude and longitude. + longitude and latitude. In addition to the requested geom_table_column_names, this query returns - latitude and longitude values in columns "lat" and "lon". + longitude and latitude values in columns "lon" and "lat". Parameters ---------- @@ -440,7 +440,7 @@ class LatLonSpatialUnit(GeomSpatialUnit): Defaults to connection.location_table geom_column : str, default "geom_point" Name of the column in geom_table that defines the point geometry from - which latitude and longitude will be extracted. + which longitude and latitude will be extracted. geom_table_join_on : str, optional Name of the column from geom_table to join on. Required if geom_table != connection.location_table. @@ -494,7 +494,7 @@ def location_subset_clause(self, locations, check_column_names=True): """ Return a SQL "WHERE" clause to subset a query (joined to this spatial unit) to a location or set of locations. This method differs from the - default implementation in its handling of lat-lon values, i.e. it returns + default implementation in its handling of lon-lat values, i.e. 
it returns WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = ST_SetSRID(ST_Point(, ), 4326)' instead of WHERE lon = '' AND lat = '' @@ -528,7 +528,7 @@ def location_subset_clause(self, locations, check_column_names=True): SpatialUnitMixin.location_subset_clause """ raise NotImplementedError( - "LatLonSpatialUnit.location_subset_clause is not implemented yet." + "LonLatSpatialUnit.location_subset_clause is not implemented yet." ) # TODO: Implement this. # Should raise an error if locations is a string or list of string and @@ -537,7 +537,7 @@ def location_subset_clause(self, locations, check_column_names=True): # "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) IN (...)") if # locations is a tuple or list of tuples. # Should return same as SpatialUnitMixin.location_subset_clause if - # locations is a dict or list of dict and "lat" and "lon" columns are + # locations is a dict or list of dict and "lon" and "lat" columns are # not included in keys. # If lat" and "lon" are included in the dict keys, should be able to # create a new dict with "lat", "lon" keys replaced by @@ -590,17 +590,17 @@ def _join_clause(self, loc_table_alias, geom_table_alias): def versioned_cell_spatial_unit(): """ - Returns a LatLonSpatialUnit that maps cell location_id to a cell version - and lat-lon coordinates. + Returns a LonLatSpatialUnit that maps cell location_id to a cell version + and lon-lat coordinates. Returns ------- - flowmachine.core.spatial_unit.LatLonSpatialUnit + flowmachine.core.spatial_unit.LonLatSpatialUnit """ if Query.connection.location_table != "infrastructure.cells": raise InvalidSpatialUnitError("Versioned cell spatial unit is unavailable.") - return LatLonSpatialUnit( + return LonLatSpatialUnit( geom_table_column_names=["version"], location_id_column_names=["location_id", "version"], geom_table="infrastructure.cells", @@ -609,14 +609,14 @@ def versioned_cell_spatial_unit(): def versioned_site_spatial_unit(): """ - Returns a LatLonSpatialUnit that maps cell location_id to a site version - and lat-lon coordinates. + Returns a LonLatSpatialUnit that maps cell location_id to a site version + and lon-lat coordinates. Returns ------- - flowmachine.core.spatial_unit.LatLonSpatialUnit + flowmachine.core.spatial_unit.LonLatSpatialUnit """ - return LatLonSpatialUnit( + return LonLatSpatialUnit( geom_table_column_names=[ "date_of_first_service", "date_of_last_service", @@ -704,8 +704,8 @@ def make_spatial_unit( Can be one of: 'cell' The identifier as found in the CDR. - 'lat-lon' - Latitude and longitude of cell/site locations. + 'lon-lat' + Longitude and latitude of cell/site locations. 'versioned-cell' The identifier as found in the CDR combined with the version from the cells table. 
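The docstring above lists the available spatial unit types; the sketch below shows hypothetical calls for a few of them. The admin level and grid size are arbitrary example values, and an active flowmachine connection is assumed.

from flowmachine.core.spatial_unit import make_spatial_unit

cell = make_spatial_unit("cell")              # CellSpatialUnit: CDR location_id only
lonlat = make_spatial_unit("lon-lat")         # LonLatSpatialUnit: cell/site coordinates
vsite = make_spatial_unit("versioned-site")   # site ID, version, lon and lat
admin3 = make_spatial_unit("admin", level=3)  # admin level 3 regions
grid = make_spatial_unit("grid", size=5)      # grid with cells of the given size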
@@ -758,8 +758,8 @@ def make_spatial_unit( return versioned_cell_spatial_unit() elif spatial_unit_type == "versioned-site": return versioned_site_spatial_unit() - elif spatial_unit_type == "lat-lon": - return LatLonSpatialUnit() + elif spatial_unit_type == "lon-lat": + return LonLatSpatialUnit() elif spatial_unit_type == "admin": if level is None: raise ValueError( diff --git a/flowmachine/flowmachine/features/spatial/circles.py b/flowmachine/flowmachine/features/spatial/circles.py index f7d8b95bc3..abfd810e18 100644 --- a/flowmachine/flowmachine/features/spatial/circles.py +++ b/flowmachine/flowmachine/features/spatial/circles.py @@ -27,7 +27,7 @@ class Circle: Parameters ---------- lon, lat : int - The latitude and longitude of the circle centre + The longitude and latitude of the circle centre radius : float The radius in meters to use as the extent of the circle names : str @@ -51,7 +51,7 @@ def __init__(self, lon, lat, radius, name): def __repr__(self): - return f"Circle(lat={self.lat},long={self.lon},radius={self.radius},name={self.name})" + return f"Circle(lon={self.lon},lat={self.lat},radius={self.radius},name={self.name})" class CircleGeometries(GeoDataMixin, Query): @@ -123,12 +123,12 @@ def raster_sum(self, raster): Examples -------- - >>> lats = [85.3240,83.9956] - >>> lons = [27.7172,28.2380] + >>> lons = [85.3240,83.9956] + >>> lats = [27.7172,28.2380] >>> names = ['Kathmandu','Pokhara'] >>> radii = [4000,11000] - >>> circles = [Circle(*vals) for vals in zip(lats,lons,radii,names)] + >>> circles = [Circle(*vals) for vals in zip(lons,lats,radii,names)] >>> cp = CircleGeometries(circles) >>> rs = cp.raster_sum('population.small_nepal_raster') >>> rs.get_dataframe() diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 93c0adc0b2..4606d21c13 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -25,9 +25,9 @@ class DistanceMatrix(GraphMixin, Query): Parameters ---------- - spatial_unit : flowmachine.core.spatial_unit.LatLonSpatialUnit, default versioned-cell + spatial_unit : flowmachine.core.spatial_unit.LonLatSpatialUnit, default versioned-cell Locations to compute distances for. - Note: only point locations (i.e. spatial_unit.has_lat_lon_columns) are + Note: only point locations (i.e. spatial_unit.has_lon_lat_columns) are supported at this time. return_geometry : bool If True, geometries are returned in query @@ -43,7 +43,7 @@ def __init__(self, spatial_unit=None, return_geometry=False): else: self.spatial_unit = spatial_unit - self.spatial_unit.verify_criterion("has_lat_lon_columns") + self.spatial_unit.verify_criterion("has_lon_lat_columns") self.return_geometry = return_geometry diff --git a/flowmachine/flowmachine/features/spatial/location_area.py b/flowmachine/flowmachine/features/spatial/location_area.py index dff07d9f5b..c3e44bf010 100644 --- a/flowmachine/flowmachine/features/spatial/location_area.py +++ b/flowmachine/flowmachine/features/spatial/location_area.py @@ -215,7 +215,7 @@ class LocationArea(GeoDataMixin, Query): Parameters ---------- point_collection : str or list, default 'sites' - A point collection with latitude and longitudes. + A point collection with longitude and latitudes. This parameter can fetch a table in the database (if a str is passed) or a list collection of tuples. 
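Since DistanceMatrix now calls verify_criterion("has_lon_lat_columns"), only point-based spatial units are accepted. A minimal hypothetical sketch, assuming a flowmachine connection is available:

from flowmachine.core.spatial_unit import make_spatial_unit
from flowmachine.features.spatial.distance_matrix import DistanceMatrix

# Accepted: versioned-site spatial units expose lon and lat columns.
dm = DistanceMatrix(spatial_unit=make_spatial_unit("versioned-site"))

# Rejected: an admin spatial unit has no lon/lat columns, so verify_criterion
# would raise an error here.
# DistanceMatrix(spatial_unit=make_spatial_unit("admin", level=3))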
diff --git a/flowmachine/flowmachine/features/subscriber/contact_reference_locations_stats.py b/flowmachine/flowmachine/features/subscriber/contact_reference_locations_stats.py index ea16b20613..fe2e305582 100644 --- a/flowmachine/flowmachine/features/subscriber/contact_reference_locations_stats.py +++ b/flowmachine/flowmachine/features/subscriber/contact_reference_locations_stats.py @@ -30,8 +30,8 @@ class ContactReferenceLocationStats(SubscriberFeature): Defaults to sum, aggregation statistic over the durations. geom_column: The column containing the subscribers' reference locations. This is - required if the Query does not contain a spatial unit with 'lat' and - 'lon' columns. + required if the Query does not contain a spatial unit with 'lon' and + 'lat' columns. Example ------- @@ -72,7 +72,7 @@ def __init__( if self.geom_column is None: try: self.contact_locations_query.spatial_unit.verify_criterion( - "has_lat_lon_columns" + "has_lon_lat_columns" ) except AttributeError: raise ValueError( diff --git a/flowmachine/flowmachine/features/subscriber/displacement.py b/flowmachine/flowmachine/features/subscriber/displacement.py index ba0e3bd266..f4676fe9c1 100644 --- a/flowmachine/flowmachine/features/subscriber/displacement.py +++ b/flowmachine/flowmachine/features/subscriber/displacement.py @@ -122,13 +122,13 @@ def __init__( raise ValueError( "Argument 'modal_locations' should be an instance of ModalLocation class" ) - hl.spatial_unit.verify_criterion("has_lat_lon_columns") + hl.spatial_unit.verify_criterion("has_lon_lat_columns") else: hl = ModalLocation( *[ daily_location( date, - spatial_unit=make_spatial_unit("lat-lon"), + spatial_unit=make_spatial_unit("lon-lat"), hours=hours, method=method, table=table, @@ -143,7 +143,7 @@ def __init__( sl = SubscriberLocations( self.start, self.stop_sl, - spatial_unit=make_spatial_unit("lat-lon"), + spatial_unit=make_spatial_unit("lon-lat"), hours=hours, table=table, subscriber_identifier=subscriber_identifier, @@ -178,7 +178,7 @@ def column_names(self) -> List[str]: def _make_query(self): - dist_string = get_dist_string("lat_home_loc", "lon_home_loc", "lat", "lon") + dist_string = get_dist_string("lon_home_loc", "lat_home_loc", "lon", "lat") if self.unit == "km": divisor = 1000 diff --git a/flowmachine/flowmachine/features/subscriber/first_location.py b/flowmachine/flowmachine/features/subscriber/first_location.py index 28f02a83b9..dba4a6c826 100644 --- a/flowmachine/flowmachine/features/subscriber/first_location.py +++ b/flowmachine/flowmachine/features/subscriber/first_location.py @@ -30,7 +30,7 @@ class FirstLocation(SubscriberFeature): String representing the beginning of the focal time period stop : str String representing the end of the focal period - location : str, dict, tuple, list of str, list of dict, or list of tuple + location : str, dict, tuple, or list/tuple thereof str representing the location of interest. Could be a cell or an admin region for instance. You must specify spatial_unit to match this. i.e. location='ER0980', @@ -41,7 +41,7 @@ class FirstLocation(SubscriberFeature): first time a subscriber pops up at any location. For spatial units with multiple location_id_columns, see `SpatialUnitMixin.location_subset_clause` or - `LatLonSpatialUnit.location_subset_clause` for a description of the + `LonLatSpatialUnit.location_subset_clause` for a description of the allowed formats for the location argument. 
spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell Spatial unit to which subscriber locations will be mapped. See the diff --git a/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py b/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py index df5bd08700..bcf68a5d78 100644 --- a/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py +++ b/flowmachine/flowmachine/features/subscriber/radius_of_gyration.py @@ -102,7 +102,7 @@ def __init__( self.ul = SubscriberLocations( self.start, self.stop, - spatial_unit=make_spatial_unit("lat-lon"), + spatial_unit=make_spatial_unit("lon-lat"), hours=hours, table=table, subscriber_subset=subscriber_subset, @@ -121,15 +121,15 @@ def _make_query(self): av_dist = f""" SELECT subscriber_locs.subscriber, - avg(lat) AS av_lat, - avg(lon) AS av_long + avg(lon) AS av_lon, + avg(lat) AS av_lat FROM ({self.ul.get_query()}) AS subscriber_locs GROUP BY subscriber_locs.subscriber """ distance_string = """ ST_Distance(ST_Point(locs.lon, locs.lat)::geography, - ST_point(mean.av_long, mean.av_lat)::geography) + ST_point(mean.av_lon, mean.av_lat)::geography) """ # It seems like I'm creating the sub query twice here diff --git a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py index 56941c8ac3..3ed6282215 100644 --- a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py +++ b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py @@ -136,7 +136,7 @@ def _make_query(self): # We need to do this because it may be the case that # a location is identified by more than one column, as - # is the case for lat-lon values + # is the case for lon-lat values loc_list = ", ".join(f"location.{lc}" for lc in location_cols) loc_list_no_schema = ", ".join(location_cols) diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index 7203cd450d..07f40c03d4 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -27,7 +27,7 @@ class SubscriberLocations(Query): """ Class representing all the locations for which a subscriber has been found. - Can be at the level of a tower, lat-lon, or an admin unit. + Can be at the level of a tower, lon-lat, or an admin unit. Parameters ---------- diff --git a/flowmachine/flowmachine/models/pwo.py b/flowmachine/flowmachine/models/pwo.py index 32347e2559..87920f17d8 100644 --- a/flowmachine/flowmachine/models/pwo.py +++ b/flowmachine/flowmachine/models/pwo.py @@ -193,7 +193,7 @@ class PopulationWeightedOpportunities(Model): documentation for other available methods. spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default versioned-site - Note: DistanceMatrix only supports spatial units with 'lat' and 'lon' + Note: DistanceMatrix only supports spatial units with 'lon' and 'lat' columns at this time. **kwargs : arguments diff --git a/flowmachine/flowmachine/utils.py b/flowmachine/flowmachine/utils.py index 73d1d73dc5..102c8ac9c6 100644 --- a/flowmachine/flowmachine/utils.py +++ b/flowmachine/flowmachine/utils.py @@ -204,17 +204,15 @@ def time_period_add(date, n, unit="days"): return date_string -def get_dist_string(lo1, la1, lo2, la2): +def get_dist_string(lon1, lat1, lon2, lat2): """ function for getting the distance - query string between to lat-lon points. + query string between two lon-lat points. 
+ """ + return f""" + ST_Distance(ST_Point({lon1}, {lat1})::geography, + ST_point({lon2}, {lat2})::geography) """ - return """ - ST_Distance(ST_Point({}, {})::geography, - ST_point({}, {})::geography) - """.format( - lo1, la1, lo2, la2 - ) def proj4string(conn, crs=None): diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index e85922cbe0..e091cda620 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -73,7 +73,7 @@ def exemplar_level_param(request): {"spatial_unit_type": "versioned-site"}, {"spatial_unit_type": "versioned-cell"}, {"spatial_unit_type": "cell"}, - {"spatial_unit_type": "lat-lon"}, + {"spatial_unit_type": "lon-lat"}, {"spatial_unit_type": "grid", "size": 5}, { "spatial_unit_type": "polygon", diff --git a/flowmachine/tests/test_circles.py b/flowmachine/tests/test_circles.py index 29d2a4ffbd..4768864686 100644 --- a/flowmachine/tests/test_circles.py +++ b/flowmachine/tests/test_circles.py @@ -127,9 +127,9 @@ def test_circle_column_names(): def test_circle_string_rep(): """Test that Circle objects have correct string representation.""" - lat, lon, radius, name = 2, 3, 4, "bob" + lon, lat, radius, name = 2, 3, 4, "bob" cl = Circle(lon, lat, radius, name) - assert f"Circle(lat={lat},long={lon},radius={radius},name={name})" == str(cl) + assert f"Circle(lon={lon},lat={lat},radius={radius},name={name})" == str(cl) def test_circle_loc_creation(): diff --git a/flowmachine/tests/test_contact_reference_locations_stats.py b/flowmachine/tests/test_contact_reference_locations_stats.py index 3487ea7c2a..c50f0c342e 100644 --- a/flowmachine/tests/test_contact_reference_locations_stats.py +++ b/flowmachine/tests/test_contact_reference_locations_stats.py @@ -16,7 +16,7 @@ [ ("avg", "gwAynWXp4eWvxGP7", "versioned-cell", 298.7215), ("avg", "gwAynWXp4eWvxGP7", "versioned-site", 298.7215), - ("avg", "gwAynWXp4eWvxGP7", "lat-lon", 298.7215), + ("avg", "gwAynWXp4eWvxGP7", "lon-lat", 298.7215), ("stddev", "V7MBRewnwQGE91gY", "versioned-cell", 182.519_128), ], ) diff --git a/flowmachine/tests/test_displacement.py b/flowmachine/tests/test_displacement.py index e0ecc7faa3..002fdab2db 100644 --- a/flowmachine/tests/test_displacement.py +++ b/flowmachine/tests/test_displacement.py @@ -51,7 +51,7 @@ def test_pass_modal_location(get_dataframe): ml = ModalLocation( *[ - daily_location(d, spatial_unit=make_spatial_unit("lat-lon")) + daily_location(d, spatial_unit=make_spatial_unit("lon-lat")) for d in list_of_dates("2016-01-01", "2016-01-06") ] ) @@ -65,10 +65,10 @@ def test_pass_modal_location(get_dataframe): assert val == pytest.approx(169.926194) -def test_error_when_modal_location_not_latlong(): +def test_error_when_modal_location_not_lon_lat(): """ Test that error is raised if home location passed to class - is not using lat-lon spatial unit + is not using lon-lat spatial unit """ ml = ModalLocation( @@ -90,7 +90,7 @@ def test_get_all_users_in_modal_location(get_dataframe): ml = ModalLocation( *[ - daily_location(d, spatial_unit=make_spatial_unit("lat-lon"), hours=(12, 13)) + daily_location(d, spatial_unit=make_spatial_unit("lon-lat"), hours=(12, 13)) for d in list_of_dates(p1[0], p1[1]) ] ) @@ -114,7 +114,7 @@ def test_subscriber_with_home_loc_but_no_calls_is_nan(get_dataframe): ml = ModalLocation( *[ - daily_location(d, spatial_unit=make_spatial_unit("lat-lon"), hours=(12, 13)) + daily_location(d, spatial_unit=make_spatial_unit("lon-lat"), hours=(12, 13)) for d in list_of_dates(p1[0], p1[1]) ] ) diff --git a/flowmachine/tests/test_geomixin.py 
b/flowmachine/tests/test_geomixin.py index 962d2e78ce..f518c744a4 100644 --- a/flowmachine/tests/test_geomixin.py +++ b/flowmachine/tests/test_geomixin.py @@ -149,7 +149,7 @@ def test_reprojection(): """ dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lon-lat") ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert js["features"][0]["geometry"]["coordinates"] == [ @@ -164,7 +164,7 @@ def test_geojson_cache(): Test geojson is cached locally. """ dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lon-lat") ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert js == dl._geojson[proj4string(dl.connection, 2770)] @@ -173,7 +173,7 @@ def test_geojson_cache(): def test_geojson_cache_exluded_from_pickle(): """Test that cached geojson is not going to get pickled.""" dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lon-lat") ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 assert "_geojson" not in dl.__getstate__() # Check excluded from pickle @@ -182,7 +182,7 @@ def test_geojson_cache_exluded_from_pickle(): def test_geojson_caching_off(): """Test that switching off caching clears the cache, and doesn't add to it.""" dl = daily_location( - "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lon-lat") ).aggregate() js = dl.to_geojson(crs=2770) # OSGB36 dl.turn_off_caching() # Check caching for geojson switches off diff --git a/flowmachine/tests/test_indexes.py b/flowmachine/tests/test_indexes.py index bb3700f7d6..0fd0b4ade7 100644 --- a/flowmachine/tests/test_indexes.py +++ b/flowmachine/tests/test_indexes.py @@ -15,8 +15,8 @@ def test_default_indexes(): '"subscriber"', ] assert daily_location( - "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") - ).index_cols == [["lat", "lon"], '"subscriber"'] + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lon-lat") + ).index_cols == [["lon", "lat"], '"subscriber"'] assert SubscriberDegree("2016-01-01", "2016-01-02").index_cols == ['"subscriber"'] diff --git a/flowmachine/tests/test_join_to_location.py b/flowmachine/tests/test_join_to_location.py index 7687e148de..0b16384f0a 100644 --- a/flowmachine/tests/test_join_to_location.py +++ b/flowmachine/tests/test_join_to_location.py @@ -77,29 +77,29 @@ def test_join_with_versioned_cells(get_dataframe, get_length): assert (should_be_version_one.version == 1).all() -def test_join_with_lat_lon(get_dataframe): +def test_join_with_lon_lat(get_dataframe): """ - Test that flowmachine.JoinToLocation can get the lat-lon values of the cell + Test that flowmachine.JoinToLocation can get the lon-lat values of the cell """ ul = SubscriberLocations( "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell") ) - df = get_dataframe(JoinToLocation(ul, spatial_unit=make_spatial_unit("lat-lon"))) + df = get_dataframe(JoinToLocation(ul, spatial_unit=make_spatial_unit("lon-lat"))) - expected_cols = sorted(["subscriber", "time", "location_id", "lat", "lon"]) + expected_cols = sorted(["subscriber", "time", "location_id", "lon", "lat"]) assert sorted(df.columns) == expected_cols # Pick out one cell that moves location and assert that the - # lat-lons are right + # lon-lats are right focal_cell = 
"dJb0Wd" - lat1, long1 = (27.648837800000003, 83.09284486) - lat2, long2 = (27.661443318109132, 83.25769074752517) + lon1, lat1 = (83.09284486, 27.648837800000003) + lon2, lat2 = (83.25769074752517, 27.661443318109132) post_move = df[(df.time > move_date) & (df["location_id"] == focal_cell)] pre_move = df[(df.time < move_date) & (df["location_id"] == focal_cell)] # And check them all one-by-one + np.isclose(pre_move.lon, lon1).all() np.isclose(pre_move.lat, lat1).all() - np.isclose(pre_move.lon, long1).all() + np.isclose(post_move.lon, lon2).all() np.isclose(post_move.lat, lat2).all() - np.isclose(post_move.lon, long2).all() def test_join_with_polygon(get_dataframe, get_length): diff --git a/flowmachine/tests/test_last_location.py b/flowmachine/tests/test_last_location.py index 7e7cfce884..6dabaa3060 100644 --- a/flowmachine/tests/test_last_location.py +++ b/flowmachine/tests/test_last_location.py @@ -35,15 +35,15 @@ def test_last_loc_vsite(get_dataframe): assert "dJb0Wd" == df.loc["zGWn8opVmOQAD6xY"].site_id -def test_last_loc_lat_lon(get_dataframe): +def test_last_loc_lon_lat(get_dataframe): """ - LastLocation() can make queries at the lat-lon level. + LastLocation() can make queries at the lon-lat level. """ last_loc = LastLocation( - "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lon-lat") ) df = get_dataframe(last_loc) df.set_index("subscriber", inplace=True) - assert pytest.approx(29.135638957790576) == float(df.loc["yqw50eNyEwOxNDGL"].lat) assert pytest.approx(83.09669810947962) == float(df.loc["yqw50eNyEwOxNDGL"].lon) + assert pytest.approx(29.135638957790576) == float(df.loc["yqw50eNyEwOxNDGL"].lat) diff --git a/flowmachine/tests/test_location_introversion.py b/flowmachine/tests/test_location_introversion.py index a4f26c57d2..a0be69e526 100644 --- a/flowmachine/tests/test_location_introversion.py +++ b/flowmachine/tests/test_location_introversion.py @@ -43,17 +43,17 @@ def test_some_results(get_dataframe): ) -def test_lat_lng_introversion(get_dataframe): +def test_lon_lat_introversion(get_dataframe): df = get_dataframe( LocationIntroversion( - "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("lat-lon") + "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("lon-lat") ) ) assert pytest.approx(0.0681818181818182) == df.introversion.max() assert 1.0 == df.extroversion.max() - assert [28.2715052907426, 83.7762949093138] == df.sort_values("extroversion").iloc[ + assert [83.7762949093138, 28.2715052907426] == df.sort_values("extroversion").iloc[ -1 - ][["lat", "lon"]].tolist() + ][["lon", "lat"]].tolist() def test_no_result_is_greater_than_one(get_dataframe): diff --git a/flowmachine/tests/test_meaningful_locations.py b/flowmachine/tests/test_meaningful_locations.py index e5059823af..a0182c91b6 100644 --- a/flowmachine/tests/test_meaningful_locations.py +++ b/flowmachine/tests/test_meaningful_locations.py @@ -112,7 +112,7 @@ def test_meaningful_locations_aggregate_disallowed_spatial_unit_raises(): labels=labels, label="evening", ), - spatial_unit=make_spatial_unit("lat-lon"), + spatial_unit=make_spatial_unit("lon-lat"), ) @@ -280,7 +280,7 @@ def test_meaningful_locations_od_raises_for_bad_spatial_unit( mfl_od = MeaningfulLocationsOD( meaningful_locations_a=mfl, meaningful_locations_b=mfl, - spatial_unit=make_spatial_unit("lat-lon"), + spatial_unit=make_spatial_unit("lon-lat"), ) diff --git a/flowmachine/tests/test_most_frequent_locations.py 
b/flowmachine/tests/test_most_frequent_locations.py index 4259f373c6..b7e167a2a9 100644 --- a/flowmachine/tests/test_most_frequent_locations.py +++ b/flowmachine/tests/test_most_frequent_locations.py @@ -37,19 +37,19 @@ def test_vsites(get_dataframe): assert "qvkp6J" == df.loc["zvaOknzKbEVD2eME"].site_id -def test_lat_lons(get_dataframe): +def test_lon_lats(get_dataframe): """ - MostFrequentLocations() has the correct values at the lat-lon spatial unit. + MostFrequentLocations() has the correct values at the lon-lat spatial unit. """ mfl = MostFrequentLocation( - "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lat-lon") + "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("lon-lat") ) df = get_dataframe(mfl) df.set_index("subscriber", inplace=True) - assert pytest.approx(28.941925079951545) == float(df.loc["1QBlwRo4Kd5v3Ogz"].lat) assert pytest.approx(82.61895799084449) == float(df.loc["1QBlwRo4Kd5v3Ogz"].lon) + assert pytest.approx(28.941925079951545) == float(df.loc["1QBlwRo4Kd5v3Ogz"].lat) def test_most_fequent_admin(get_dataframe): diff --git a/flowmachine/tests/test_spatial_aggregate.py b/flowmachine/tests/test_spatial_aggregate.py index b1c8cfd2ab..2b1807668d 100644 --- a/flowmachine/tests/test_spatial_aggregate.py +++ b/flowmachine/tests/test_spatial_aggregate.py @@ -23,16 +23,16 @@ def test_can_be_aggregated_admin3(get_dataframe): assert ["pcod", "total"] == list(df.columns) -def test_can_be_aggregated_latlong(get_dataframe): +def test_can_be_aggregated_lon_lat(get_dataframe): """ - Query can be aggregated to a spatial level with lat-lon data. + Query can be aggregated to a spatial level with lon-lat data. """ hl = ModalLocation( *[ - daily_location(d, spatial_unit=make_spatial_unit("lat-lon"), method="last") + daily_location(d, spatial_unit=make_spatial_unit("lon-lat"), method="last") for d in list_of_dates("2016-01-01", "2016-01-03") ] ) agg = hl.aggregate() df = get_dataframe(agg) - assert ["lat", "lon", "total"] == list(df.columns) + assert ["lon", "lat", "total"] == list(df.columns) diff --git a/flowmachine/tests/test_spatial_distancematrix.py b/flowmachine/tests/test_spatial_distancematrix.py index 17952bcf32..b9802cd014 100644 --- a/flowmachine/tests/test_spatial_distancematrix.py +++ b/flowmachine/tests/test_spatial_distancematrix.py @@ -26,7 +26,7 @@ def test_some_results(get_dataframe): @pytest.mark.parametrize( "spatial_unit_type, length", - [("versioned-cell", 62), ("versioned-site", 35), ("lat-lon", 62)], + [("versioned-cell", 62), ("versioned-site", 35), ("lon-lat", 62)], ) def test_result_has_correct_length(spatial_unit_type, length, get_length): """ diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 387f59a38b..3aa48835d8 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -37,7 +37,7 @@ def test_get_geom_query_column_names( @pytest.mark.parametrize( "make_spatial_unit_args, loc_cols", [ - ({"spatial_unit_type": "lat-lon"}, ["lat", "lon"]), + ({"spatial_unit_type": "lon-lat"}, ["lon", "lat"]), ( {"spatial_unit_type": "versioned-cell"}, ["location_id", "version", "lon", "lat"], @@ -111,9 +111,7 @@ def test_missing_location_columns_raises_error(): GeomSpatialUnit are not a subset of column_names. 
""" with pytest.raises(ValueError, match="['NOT_A_COLUMN']"): - su = LatLonSpatialUnit( - location_id_column_names=["location_id", "lat", "lon", "NOT_A_COLUMN"] - ) + su = LonLatSpatialUnit(location_id_column_names=["location_id", "NOT_A_COLUMN"]) @pytest.mark.parametrize( @@ -128,7 +126,7 @@ def test_missing_location_columns_raises_error(): {"spatial_unit_type": "versioned-site"}, {"spatial_unit_type": "versioned-cell"}, {"spatial_unit_type": "cell"}, - {"spatial_unit_type": "lat-lon"}, + {"spatial_unit_type": "lon-lat"}, {"spatial_unit_type": "grid", "size": 5}, { "spatial_unit_type": "polygon", @@ -224,12 +222,12 @@ def test_make_spatial_unit_raises_errors(make_spatial_unit_args): [ ({"spatial_unit_type": "cell"}, "has_geography", False), ({"spatial_unit_type": "versioned-cell"}, "has_geography", True), - ({"spatial_unit_type": "admin", "level": 3}, "has_lat_lon_columns", False), - ({"spatial_unit_type": "lat-lon"}, "has_lat_lon_columns", True), + ({"spatial_unit_type": "admin", "level": 3}, "has_lon_lat_columns", False), + ({"spatial_unit_type": "lon-lat"}, "has_lon_lat_columns", True), ({"spatial_unit_type": "admin", "level": 3}, "is_network_object", False), ({"spatial_unit_type": "cell"}, "is_network_object", True), ({"spatial_unit_type": "versioned-site"}, "is_network_object", True), - ({"spatial_unit_type": "lat-lon"}, "is_polygon", False), + ({"spatial_unit_type": "lon-lat"}, "is_polygon", False), ({"spatial_unit_type": "grid", "size": 10}, "is_polygon", True), ], ) diff --git a/flowmachine/tests/test_total_network_objects.py b/flowmachine/tests/test_total_network_objects.py index bf9e61a295..976d03ee31 100644 --- a/flowmachine/tests/test_total_network_objects.py +++ b/flowmachine/tests/test_total_network_objects.py @@ -15,15 +15,15 @@ from flowmachine.features import TotalNetworkObjects, AggregateNetworkObjects -def test_tno_at_lat_lng(get_dataframe): +def test_tno_at_lon_lat(get_dataframe): """ - Regression test for #108. TNO should work at lat-lon level. + Regression test for #108. TNO should work at lon-lat level. 
""" tno = TotalNetworkObjects( start="2016-01-01", stop="2016-01-07", network_object=make_spatial_unit("versioned-cell"), - spatial_unit=make_spatial_unit("lat-lon"), + spatial_unit=make_spatial_unit("lon-lat"), ) assert tno.get_dataframe().sum().value == 330 @@ -92,7 +92,7 @@ def test_bad_total_by(): @pytest.mark.parametrize( "bad_arg, spatial_unit_type", - [("spatial_unit", "cell"), ("network_object", "lat-lon")], + [("spatial_unit", "cell"), ("network_object", "lon-lat")], ) def test_bad_spatial_units(bad_arg, spatial_unit_type): """ From 23760b1c9fb4a9af6155a45fb00894bbf79740e6 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Jun 2019 13:56:45 +0100 Subject: [PATCH 103/138] Use location_id_columns in spatial aggregates --- .../features/utilities/spatial_aggregates.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py index 3ed6282215..f0201eaa4a 100644 --- a/flowmachine/flowmachine/features/utilities/spatial_aggregates.py +++ b/flowmachine/flowmachine/features/utilities/spatial_aggregates.py @@ -35,11 +35,11 @@ def __init__(self, *, locations): @property def column_names(self) -> List[str]: - return self.locations.column_names[1:] + ["total"] + return self.spatial_unit.location_id_columns + ["total"] def _make_query(self): - aggregate_cols = ",".join(self.locations.column_names[1:]) + aggregate_cols = ",".join(self.spatial_unit.location_id_columns) sql = f""" SELECT @@ -57,8 +57,8 @@ def _make_query(self): class JoinedSpatialAggregate(GeoDataMixin, Query): """ Creates spatially aggregated data from two objects, one of which is - a metric of subscribers, and the other of which represents the subscribers - location. + a metric of subscribers, and the other of which represents the subscribers' + locations. A general class that join metric information about a subscriber with location information about a subscriber and aggregates to the geometric level. @@ -67,14 +67,12 @@ class JoinedSpatialAggregate(GeoDataMixin, Query): ---------- metric : Query A query object that represents a subscriber level metric such - as radius of gyration. The underlying data must have the - first column as 'subscriber'. All subsequent columns must be - numeric and will be meaned. + as radius of gyration. The underlying data must have a 'subscriber' + column. All other columns must be numeric and will be aggregated. locations : Query A query object that represents the locations of subscribers. - The first column should be 'subscriber', and subsequent columns - locations. - method : {"mean", "median", "mode"} + Must have a 'subscriber' column, and a 'spatial_unit' attribute. + method : {"avg", "max", "min", "median", "mode", "stddev", "variance"} Method of aggregation. 
Examples @@ -97,7 +95,7 @@ class JoinedSpatialAggregate(GeoDataMixin, Query): allowed_methods = {"avg", "max", "min", "median", "mode", "stddev", "variance"} - def __init__(self, *, metric, locations, method="mean"): + def __init__(self, *, metric, locations, method="avg"): self.metric = metric self.locations = locations # self.spatial_unit is used in self._geo_augmented_query @@ -129,7 +127,8 @@ def __init__(self, *, metric, locations, method="mean"): def _make_query(self): metric_cols = self.metric.column_names - location_cols = [cn for cn in self.locations.column_names if cn != "subscriber"] + metric_cols_no_subscriber = [cn for cn in metric_cols if cn != "subscriber"] + location_cols = self.spatial_unit.location_id_columns # Make some comma separated strings for use in the SQL query metric_list = ", ".join(f"metric.{c}" for c in metric_cols) @@ -157,11 +156,11 @@ def _make_query(self): if self.method == "mode": av_cols = ", ".join( f"pg_catalog.mode() WITHIN GROUP(ORDER BY {mc}) AS {mc}" - for mc in metric_cols[1:] + for mc in metric_cols_no_subscriber ) else: av_cols = ", ".join( - f"{self.method}({mc}) AS {mc}" for mc in metric_cols[1:] + f"{self.method}({mc}) AS {mc}" for mc in metric_cols_no_subscriber ) # Now do the group by bit @@ -177,6 +176,6 @@ def _make_query(self): @property def column_names(self) -> List[str]: - return [ - cn for cn in self.locations.column_names if cn != "subscriber" - ] + self.metric.column_names[1:] + return self.spatial_unit.location_id_columns + [ + cn for cn in self.metric.column_names if cn != "subscriber" + ] From 7166f8a477c186812a2762c3d808b30b3da850b3 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Jun 2019 16:02:16 +0100 Subject: [PATCH 104/138] Fix test_query_formatting --- flowmachine/tests/test_query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flowmachine/tests/test_query.py b/flowmachine/tests/test_query.py index b31686277e..a9fa8e37ae 100644 --- a/flowmachine/tests/test_query.py +++ b/flowmachine/tests/test_query.py @@ -145,8 +145,8 @@ def test_query_formatting(): dl = daily_location("2016-01-01", method="last") assert "" == format(dl) assert ( - "" - == f"{dl:level,column_names}" + ", column_names: ['subscriber', 'pcod']>" + == f"{dl:spatial_unit,column_names}" ) with pytest.raises( From 567c59c0bd1aa7c18d4bab4b660e4a4ef89b8899 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Jun 2019 16:28:10 +0100 Subject: [PATCH 105/138] No need for GeomSpatialUnit to be abstract --- flowmachine/flowmachine/core/spatial_unit.py | 39 +++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 05b3c9b3d9..2bc5303601 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -8,7 +8,6 @@ The helper function 'make_spatial_unit' can be used to create spatial unit objects. """ from typing import List -from abc import ABCMeta, abstractmethod from flowmachine.utils import get_name_and_alias from flowmachine.core.errors import InvalidSpatialUnitError @@ -237,14 +236,11 @@ def __hash__(self): return hash(self.__class__.__name__) -class GeomSpatialUnit(SpatialUnitMixin, Query, metaclass=ABCMeta): +class GeomSpatialUnit(SpatialUnitMixin, Query): """ Base class for spatial units that map location IDs in connection.location_table to geographic locations. 
- Derived classes must implement the _join_clause method, to determine how to - join the location table to the table with geography data. - Parameters ---------- geom_table_column_names : str or list @@ -259,6 +255,12 @@ class GeomSpatialUnit(SpatialUnitMixin, Query, metaclass=ABCMeta): Defaults to connection.location_table geom_column : str, default "geom" Name of the column in geom_table that defines the geometry. + geom_table_join_on : str, optional + Name of the column from geom_table to join on. + Required if geom_table != connection.location_table. + location_table_join_on : str, optional + Name of the column from connection.location_table to join on. + Required if geom_table != connection.location_table. """ def __init__( @@ -268,6 +270,8 @@ def __init__( location_id_column_names, geom_table=None, geom_column="geom", + geom_table_join_on=None, + location_table_join_on=None, ): if isinstance(geom_table_column_names, str): self._geom_table_cols = (geom_table_column_names,) @@ -297,6 +301,9 @@ def __init__( else: self.geom_table = Table(name=geom_table) + self._geom_on = geom_table_join_on + self._loc_on = location_table_join_on + super().__init__() def __eq__(self, other): @@ -312,7 +319,6 @@ def __hash__(self): def _get_aliased_geom_table_cols(self, table_alias): return [f"{table_alias}.{c}" for c in self._geom_table_cols] - @abstractmethod def _join_clause(self, loc_table_alias, geom_table_alias): """ Returns a SQL join clause to join the location table to the geography @@ -331,7 +337,13 @@ def _join_clause(self, loc_table_alias, geom_table_alias): str SQL join clause """ - raise NotImplementedError + if self._loc_on is None or self._geom_on is None: + raise ValueError("No columns specified for join.") + return f""" + LEFT JOIN + ({self.geom_table.get_query()}) AS {geom_table_alias} + ON {loc_table_alias}.{self._loc_on} = {geom_table_alias}.{self._geom_on} + """ def _make_query(self): loc_table_alias = "loc_table" @@ -459,13 +471,13 @@ def __init__( geom_table_join_on=None, location_table_join_on=None, ): - self._geom_on = geom_table_join_on - self._loc_on = location_table_join_on super().__init__( geom_table_column_names=geom_table_column_names, location_id_column_names=location_id_column_names, geom_table=geom_table, geom_column=geom_column, + geom_table_join_on=geom_table_join_on, + location_table_join_on=location_table_join_on, ) def _get_aliased_geom_table_cols(self, table_alias): @@ -474,15 +486,6 @@ def _get_aliased_geom_table_cols(self, table_alias): f"ST_Y({table_alias}.{self._geom_col}::geometry) AS lat", ] - def _join_clause(self, loc_table_alias, geom_table_alias): - if self._loc_on is None or self._geom_on is None: - raise ValueError("No columns specified for join.") - return f""" - LEFT JOIN - ({self.geom_table.get_query()}) AS {geom_table_alias} - ON {loc_table_alias}.{self._loc_on} = {geom_table_alias}.{self._geom_on} - """ - @property def location_id_columns(self) -> List[str]: """ From ec759f6f50774ebd38fccf9e727dc2ad5172110f Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Jun 2019 18:24:27 +0100 Subject: [PATCH 106/138] Implement location_subset_clause --- flowmachine/flowmachine/core/spatial_unit.py | 165 +++++++++++-------- 1 file changed, 92 insertions(+), 73 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 2bc5303601..1985dc6e93 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -19,6 +19,21 @@ # implement a method 
to check whether the required data can be found in the DB. +def _substitute_lat_lon(location_dict): + """ + Replace "lat" and "lon" keys in location_dict with "ST_SetSRID(ST_Point(lon, lat), 4326)" + This function is used by `LonLatSpatialUnit.location_subset_clause()` + """ + location_copy = location_dict.copy() + if "lon" in location_copy and "lat" in location_copy: + lon = location_copy.pop("lon") + lat = location_copy.pop("lat") + location_copy[ + "ST_SetSRID(ST_Point(lon, lat), 4326)" + ] = f"ST_SetSRID(ST_Point({lon}, {lat}), 4326)" + return location_copy + + class SpatialUnitMixin: """ Mixin for spatial unit classes, which provides a 'location_id_columns' property @@ -144,78 +159,43 @@ def location_subset_clause(self, locations, check_column_names=True): -------- LonLatSpatialUnit.location_subset_clause """ - raise NotImplementedError( - "location_subset_clause is not fully implemented yet." - ) if isinstance(locations, list) or isinstance(locations, tuple): if isinstance(locations[0], dict): - # multiple locations, multiple columns - # TODO: Check keys are subset of self.location_id_columns when check_column_names==True + # Multiple locations, multiple columns + if check_column_names: + unrecognised_columns = ( + set().union(*locations).difference(self.location_id_columns) + ) + if unrecognised_columns: + raise ValueError( + f"Columns {unrecognised_columns} are not in location_id_columns." + ) ands = [ " AND ".join(f"{key} = '{value}'" for key, value in loc.items()) for loc in locations ] return "WHERE (" + ") OR (".join(ands) + ")" else: - # multiple locations, first column + # Multiple locations, first column locs_list_string = ", ".join(f"'{l}'" for l in locations) return f"WHERE {self.location_id_columns[0]} IN ({locs_list_string})" elif isinstance(locations, dict): - # one location, multiple columns - # TODO: Check keys are subset of self.location_id_columns when check_column_names==True + # Single location, multiple columns + if check_column_names: + unrecognised_columns = set(locations).difference( + self.location_id_columns + ) + if unrecognised_columns: + raise ValueError( + f"Columns {unrecognised_columns} are not in location_id_columns." 
+ ) return "WHERE " + " AND ".join( f"{key} = '{value}'" for key, value in locations.items() ) else: - # one location, first column + # Single location, first column return f"WHERE {self.location_id_columns[0]} = '{locations}'" - # From FirstLocation._get_locations_clause: - # if len(column_name) == 1: # polygon, admin, cell, grid - # if isinstance(location, tuple) or isinstance(location, list): - # in_list = "('" + "','".join(location) + "')" - # return "WHERE {} in {}".format(column_name[0], in_list) - # else: - # return "WHERE {} = '{}'".format(column_name[0], location) - # elif self.ul.level == "lat-lon": - # if isinstance(location, tuple) or isinstance(location, list): - # in_list = ( - # "('" - # + "','".join( - # "ST_SetSRID(ST_Point({}, {}), 4326)".format(lon, lat) - # for lon, lat in location - # ) - # + "')" - # ) - # return "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) in {}".format( - # in_list - # ) - # else: - # return "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point({}, {}), 4326)'".format( - # *location - # ) - # else: # Versioned things - # if isinstance(location, str): # Deal with single string - # location = (location,) - # elif isinstance( - # location, list - # ): # Deal with possible single strings in list - # location = [l if isinstance(l, tuple) else (l,) for l in location] - # if isinstance(location, tuple): - # return "WHERE " + " AND ".join( - # "{} = '{}'".format(c, l) for c, l in zip(column_name, location) - # ) - # else: - # ands = " OR ".join( - # "({})".format( - # " AND ".join( - # "{} = '{}'".format(c, l) for c, l in zip(column_name, loc) - # ) - # ) - # for loc in location - # ) - # return "WHERE " + ands - class CellSpatialUnit(SpatialUnitMixin): """ @@ -498,7 +478,7 @@ def location_subset_clause(self, locations, check_column_names=True): Return a SQL "WHERE" clause to subset a query (joined to this spatial unit) to a location or set of locations. This method differs from the default implementation in its handling of lon-lat values, i.e. it returns - WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = ST_SetSRID(ST_Point(, ), 4326)' + WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point(, ), 4326)' instead of WHERE lon = '' AND lat = '' @@ -507,9 +487,13 @@ def location_subset_clause(self, locations, check_column_names=True): locations : tuple, str, dict, or list/tuple thereof Location or list of locations to subset to. This should have one of the following formats: - tuple (length 2), or list/tuple of tuple + list/tuple of tuple Values are (longitude, latitude) pairs, corresponding to the 'lon' and 'lat' columns. + Note: cannot pass a single (lon, lat) tuple, as this would + be ambiguous (could be a tuple of str, see below). For a + single location, either pass a length-1 tuple ((lon, lat),) + or a dict {"lon": lon, "lat": lat}. str, or list/tuple of str Values correspond to the first column in self.location_id_columns. @@ -530,22 +514,57 @@ def location_subset_clause(self, locations, check_column_names=True): -------- SpatialUnitMixin.location_subset_clause """ - raise NotImplementedError( - "LonLatSpatialUnit.location_subset_clause is not implemented yet." - ) - # TODO: Implement this. - # Should raise an error if locations is a string or list of string and - # location_id_columns[0] == 'lon'. - # Should return example in docstring (or - # "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) IN (...)") if - # locations is a tuple or list of tuples. 
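# A minimal sketch of the WHERE clauses the default location_subset_clause
# implementation above is expected to produce (illustrative only; assumes a
# connected flowmachine session so make_spatial_unit can build the spatial
# unit, using the versioned-site unit as an example):
#
#     from flowmachine.core import make_spatial_unit
#
#     su = make_spatial_unit("versioned-site")
#     su.location_subset_clause("loc1")
#     # "WHERE site_id = 'loc1'"
#     su.location_subset_clause(["loc1", "loc2"])
#     # "WHERE site_id IN ('loc1', 'loc2')"
#     su.location_subset_clause({"site_id": "loc1", "version": "0"})
#     # "WHERE site_id = 'loc1' AND version = '0'"
#     su.location_subset_clause([{"site_id": "loc1"}, {"site_id": "loc2"}])
#     # "WHERE (site_id = 'loc1') OR (site_id = 'loc2')"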
- # Should return same as SpatialUnitMixin.location_subset_clause if - # locations is a dict or list of dict and "lon" and "lat" columns are - # not included in keys. - # If lat" and "lon" are included in the dict keys, should be able to - # create a new dict with "lat", "lon" keys replaced by - # "ST_SetSRID(ST_Point(lon, lat), 4326)" (and set corresponding value), - # and then call super().location_subset_clause with check_column_names=False + # TODO: Once we have a class for representing lon-lat pairs + # (see https://github.com/Flowminder/FlowKit/issues/915), accept these + # instead of tuples to remove ambiguity. + if isinstance(locations, list) or isinstance(locations, tuple): + if isinstance(locations[0], tuple): + if len(locations) == 1: + # Single location, lat-lon columns + lon, lat = locations[0] + return f"WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point({lon}, {lat}), 4326)'" + else: + # Multiple locations, lat-lon columns + locs_list_string = ", ".join( + f"'ST_SetSRID(ST_Point({lon}, {lat}), 4326)'" + for lon, lat in locations + ) + return f"WHERE ST_SetSRID(ST_Point(lon, lat), 4326) IN ({locs_list_string})" + elif isinstance(locations[0], dict): + # Multiple locations, multiple columns + if check_column_names: + unrecognised_columns = ( + set().union(*locations).difference(self.location_id_columns) + ) + if unrecognised_columns: + raise ValueError( + f"Columns {unrecognised_columns} are not in location_id_columns." + ) + locations_copy = [ + _substitute_lat_lon(location) for location in locations + ] + return super().location_subset_clause( + locations_copy, check_column_names=False + ) + else: + # Multiple locations, first column + return super().location_subset_clause(locations) + elif isinstance(locations, dict): + # Single location, multiple columns + if check_column_names: + unrecognised_columns = set(locations).difference( + self.location_id_columns + ) + if unrecognised_columns: + raise ValueError( + f"Columns {unrecognised_columns} are not in location_id_columns." + ) + return super().location_subset_clause( + _substitute_lat_lon(locations), check_column_names=False + ) + else: + # Single location, first column + return super().location_subset_clause(locations) class PolygonSpatialUnit(GeomSpatialUnit): From 5cfd4c010bf4466b37e57d2c91c4a2c70a2e27ed Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 00:01:59 +0100 Subject: [PATCH 107/138] Add tests for location_subset_clause --- flowmachine/flowmachine/core/spatial_unit.py | 2 +- flowmachine/tests/test_first_location.py | 14 ++++ flowmachine/tests/test_spatial_unit.py | 72 ++++++++++++++++++++ 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 1985dc6e93..cac93cf6de 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -492,7 +492,7 @@ def location_subset_clause(self, locations, check_column_names=True): the 'lon' and 'lat' columns. Note: cannot pass a single (lon, lat) tuple, as this would be ambiguous (could be a tuple of str, see below). For a - single location, either pass a length-1 tuple ((lon, lat),) + single location, either pass a length-1 list [(lon, lat)] or a dict {"lon": lon, "lat": lat}. 
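# A sketch of the lon-lat handling described above (illustrative only; the
# expected strings mirror the parametrised cases added to
# test_spatial_unit.py later in this patch):
#
#     from flowmachine.core import make_spatial_unit
#
#     su = make_spatial_unit("versioned-site")
#     su.location_subset_clause([("lon1", "lat1")])
#     # "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point(lon1, lat1), 4326)'"
#     su.location_subset_clause({"lon": "lon1", "lat": "lat1"})
#     # same clause as above, produced via _substitute_lat_lon and the
#     # default dict handling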
str, or list/tuple of str Values correspond to the first column in diff --git a/flowmachine/tests/test_first_location.py b/flowmachine/tests/test_first_location.py index 275d03d16a..dd84f60ae4 100644 --- a/flowmachine/tests/test_first_location.py +++ b/flowmachine/tests/test_first_location.py @@ -5,6 +5,7 @@ """ Tests for the class FirstLocation """ +import pytest from flowmachine.core import make_spatial_unit from flowmachine.features.subscriber import FirstLocation @@ -52,3 +53,16 @@ def test_can_be_called_with_any(get_dataframe): df = get_dataframe(dfl) df.set_index("subscriber", inplace=True) assert str(df.loc["0MQ4RYeKn7lryxGa"]) == "2016-01-03 01:38:56+00:00" + + +def test_raises_error_for_bad_parameters(): + """ + FirstLocation raises a ValueError if called with location="any" and spatial_unit != cell + """ + with pytest.raises(ValueError): + fl = FirstLocation( + "2016-01-03", + "2016-01-04", + location="any", + spatial_unit=make_spatial_unit("versioned-site"), + ) diff --git a/flowmachine/tests/test_spatial_unit.py b/flowmachine/tests/test_spatial_unit.py index 3aa48835d8..938f49be2f 100644 --- a/flowmachine/tests/test_spatial_unit.py +++ b/flowmachine/tests/test_spatial_unit.py @@ -249,3 +249,75 @@ def test_verify_criterion_raises_value_error(): su = CellSpatialUnit() with pytest.raises(ValueError): su.verify_criterion("BAD_CRITERION") + + +@pytest.mark.parametrize("spatial_unit_type", ["cell", "lon-lat"]) +@pytest.mark.parametrize( + "locations", [{"BAD_COLUMN": "DUMMY_VALUE"}, [{"BAD_COLUMN": "DUMMY_VALUE"}]] +) +def test_location_subset_clause_raises_error(spatial_unit_type, locations): + """ + Test that the location_subset_clause method raises a ValueError if + incorrect columns are passed in a dict. + """ + with pytest.raises(ValueError): + make_spatial_unit(spatial_unit_type).location_subset_clause(locations) + + +@pytest.mark.parametrize( + "spatial_unit_type, locations, expected", + [ + ("cell", "loc", "WHERE location_id = 'loc'"), + ("cell", ["loc1", "loc2"], "WHERE location_id IN ('loc1', 'loc2')"), + ("cell", {"location_id": "loc"}, "WHERE location_id = 'loc'"), + ( + "cell", + [{"location_id": "loc1"}, {"location_id": "loc2"}], + "WHERE (location_id = 'loc1') OR (location_id = 'loc2')", + ), + ( + "versioned-site", + {"site_id": "loc", "version": "v"}, + "WHERE site_id = 'loc' AND version = 'v'", + ), + ( + "versioned-site", + [ + {"site_id": "loc1", "version": "v1"}, + {"site_id": "loc2", "version": "v2"}, + ], + "WHERE (site_id = 'loc1' AND version = 'v1') OR (site_id = 'loc2' AND version = 'v2')", + ), + ("versioned-site", "loc", "WHERE site_id = 'loc'"), + ("versioned-site", ["loc1", "loc2"], "WHERE site_id IN ('loc1', 'loc2')"), + ( + "versioned-site", + [("lon1", "lat1")], + "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point(lon1, lat1), 4326)'", + ), + ( + "versioned-site", + [("lon1", "lat1"), ("lon2", "lat2")], + "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) IN ('ST_SetSRID(ST_Point(lon1, lat1), 4326)', 'ST_SetSRID(ST_Point(lon2, lat2), 4326)')", + ), + ( + "versioned-site", + {"lon": "lon1", "lat": "lat1"}, + "WHERE ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point(lon1, lat1), 4326)'", + ), + ( + "versioned-site", + [ + {"site_id": "site1", "lon": "lon1", "lat": "lat1"}, + {"site_id": "site2", "lon": "lon2", "lat": "lat2"}, + ], + "WHERE (site_id = 'site1' AND ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point(lon1, lat1), 4326)') OR (site_id = 'site2' AND ST_SetSRID(ST_Point(lon, lat), 4326) = 'ST_SetSRID(ST_Point(lon2, 
lat2), 4326)')", + ), + ], +) +def test_location_subset_clause_return_value(spatial_unit_type, locations, expected): + """ + Test that location_subset_clause returns some correct strings. + """ + su = make_spatial_unit(spatial_unit_type) + assert expected == su.location_subset_clause(locations) From b06dd3c1876287680c90940615398b7eccc7eb25 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 00:25:10 +0100 Subject: [PATCH 108/138] Add test for versioned-site SQL --- .../functional_tests/test_sql_strings_and_results.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py index dded120468..2c78f0e66d 100644 --- a/flowmachine/tests/functional_tests/test_sql_strings_and_results.py +++ b/flowmachine/tests/functional_tests/test_sql_strings_and_results.py @@ -235,3 +235,12 @@ def test_daily_location_6_df(get_dataframe, diff_reporter): ) df = get_dataframe(dl) verify(df.to_csv(), diff_reporter) + + +def test_versioned_site_sql(diff_reporter): + """ + Verify the SQL for a versioned-site spatial unit. + """ + su = make_spatial_unit("versioned-site") + sql = pretty_sql(su.get_query()) + verify(sql, diff_reporter) From 5807d02a0925222fb45774d9b3a69652440c27a8 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 00:25:37 +0100 Subject: [PATCH 109/138] Update approved SQL files --- ...lts.test_daily_location_1_sql.approved.txt | 68 +++++++++++------- ...lts.test_daily_location_2_sql.approved.txt | 70 +++++++++++-------- ...lts.test_daily_location_4_sql.approved.txt | 68 +++++++++++------- ...lts.test_daily_location_6_sql.approved.txt | 68 +++++++++++------- ...sults.test_versioned_site_sql.approved.txt | 20 ++++++ 5 files changed, 184 insertions(+), 110 deletions(-) create mode 100644 flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_versioned_site_sql.approved.txt diff --git a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_1_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_1_sql.approved.txt index 1ddc30695c..dd66178bcb 100644 --- a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_1_sql.approved.txt +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_1_sql.approved.txt @@ -5,13 +5,13 @@ FROM (SELECT subscriber_locs.subscriber, pcod, row_number() OVER (PARTITION BY subscriber_locs.subscriber ORDER BY time DESC) AS rank - FROM (SELECT l.subscriber, - l.time, - l.location_id, - sites.pcod - FROM (SELECT subscriber, - datetime AS time, - location_id + FROM (SELECT subscriber, + datetime AS time, + pcod + FROM (SELECT l.datetime, + l.location_id, + l.subscriber, + sites.pcod FROM (SELECT events.calls.datetime, events.calls.location_id, events.calls.msisdn AS subscriber @@ -26,24 +26,38 @@ FROM (SELECT subscriber_locs.subscriber, events.sms.msisdn AS subscriber FROM events.sms WHERE (events.sms.datetime >= '2016-01-01 00:00:00') - AND (events.sms.datetime < '2016-01-02 00:00:00')) AS foo - WHERE (location_id IS NOT NULL) - AND (location_id <> '')) AS l - INNER JOIN (SELECT location_id, - version, - date_of_first_service, - date_of_last_service, - admin3pcod AS pcod - FROM (SELECT locinfo.id AS location_id, - locinfo.version, - locinfo.date_of_first_service, - 
locinfo.date_of_last_service, - polygon.admin3pcod - FROM infrastructure.cells AS locinfo - INNER JOIN geography.admin3 AS polygon ON st_within(locinfo.geom_point::geometry, - st_setsrid(polygon.geom, 4326)::geometry)) AS map) AS sites ON (l.location_id = sites.location_id) - AND (l.time)::date BETWEEN COALESCE(sites.date_of_first_service, - ('-infinity')::timestamptz) - AND COALESCE(sites.date_of_last_service, - ('infinity')::timestamptz)) AS subscriber_locs) AS final_time + AND (events.sms.datetime < '2016-01-02 00:00:00')) AS l + INNER JOIN (SELECT loc_table.id AS location_id, + loc_table.date_of_first_service, + loc_table.date_of_last_service, + geom_table.admin3pcod AS pcod + FROM infrastructure.cells AS loc_table + INNER JOIN (SELECT gid, + admin0name, + admin0pcod, + admin1name, + admin1pcod, + admin2name, + admin2pcod, + admin3name, + admin3pcod, + admin3refn, + admin3altn, + admin3al_1, + date, + validon, + validto, + shape_star, + shape_stle, + shape_leng, + shape_area, + geom + FROM geography.admin3) AS geom_table ON st_within(loc_table.geom_point::geometry, + st_setsrid(geom_table.geom, 4326)::geometry)) AS sites ON (l.location_id = sites.location_id) + AND (l.datetime)::date BETWEEN COALESCE(sites.date_of_first_service, + ('-infinity')::timestamptz) + AND COALESCE(sites.date_of_last_service, + ('infinity')::timestamptz)) AS foo + WHERE (location_id IS NOT NULL) + AND (location_id <> '')) AS subscriber_locs) AS final_time WHERE rank = 1 \ No newline at end of file diff --git a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_2_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_2_sql.approved.txt index b0646f18d8..663d3a1f39 100644 --- a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_2_sql.approved.txt +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_2_sql.approved.txt @@ -1,19 +1,19 @@ SELECT ranked.subscriber, - admin2pcod + pcod FROM (SELECT times_visited.subscriber, - admin2pcod, + pcod, row_number() OVER (PARTITION BY times_visited.subscriber ORDER BY total DESC) AS rank FROM (SELECT subscriber_locs.subscriber, - admin2pcod, + pcod, count(*) AS total - FROM (SELECT l.subscriber, - l.time, - l.location_id, - sites.admin2pcod - FROM (SELECT subscriber, - datetime AS time, - location_id + FROM (SELECT subscriber, + datetime AS time, + pcod + FROM (SELECT l.datetime, + l.subscriber, + l.location_id, + sites.pcod FROM (SELECT events.calls.datetime, events.calls.imei AS subscriber, events.calls.location_id @@ -34,24 +34,36 @@ FROM (SELECT times_visited.subscriber, AND (events.sms.datetime < '2016-01-05 00:00:00') AND (to_char(events.sms.datetime, 'HH24:MI') >= '03:00') AND (to_char(events.sms.datetime, 'HH24:MI') < '09:00') - AND events.sms.imei IN ('2GJxeNazvlgZbqj6', '7qKmzkeMbmk5nOa0', '8dpPLR15XwR7jQyN', '1NqnrAB9bRd597x2')) AS foo) AS l - INNER JOIN (SELECT location_id, - version, - date_of_first_service, - date_of_last_service, - admin2pcod AS pcod - FROM (SELECT locinfo.id AS location_id, - locinfo.version, - locinfo.date_of_first_service, - locinfo.date_of_last_service, - polygon.admin2pcod - FROM infrastructure.cells AS locinfo - INNER JOIN geography.admin2 AS polygon ON st_within(locinfo.geom_point::geometry, - st_setsrid(polygon.geom, 4326)::geometry)) AS map) AS sites ON (l.location_id = sites.location_id) - AND (l.time)::date BETWEEN 
COALESCE(sites.date_of_first_service, - ('-infinity')::timestamptz) - AND COALESCE(sites.date_of_last_service, - ('infinity')::timestamptz) + AND events.sms.imei IN ('2GJxeNazvlgZbqj6', '7qKmzkeMbmk5nOa0', '8dpPLR15XwR7jQyN', '1NqnrAB9bRd597x2')) AS l + INNER JOIN (SELECT loc_table.id AS location_id, + loc_table.date_of_first_service, + loc_table.date_of_last_service, + geom_table.admin2pcod AS pcod + FROM infrastructure.cells AS loc_table + INNER JOIN (SELECT gid, + admin0name, + admin0pcod, + admin1name, + admin1pcod, + admin2name, + admin2pcod, + admin2refn, + admin2altn, + admin2al_1, + date, + validon, + validto, + shape_star, + shape_stle, + shape_leng, + shape_area, + geom + FROM geography.admin2) AS geom_table ON st_within(loc_table.geom_point::geometry, + st_setsrid(geom_table.geom, 4326)::geometry)) AS sites ON (l.location_id = sites.location_id) + AND (l.datetime)::date BETWEEN COALESCE(sites.date_of_first_service, + ('-infinity')::timestamptz) + AND COALESCE(sites.date_of_last_service, + ('infinity')::timestamptz)) AS foo ORDER BY time) AS subscriber_locs - GROUP BY subscriber_locs.subscriber, admin2pcod) AS times_visited) AS ranked + GROUP BY subscriber_locs.subscriber, pcod) AS times_visited) AS ranked WHERE rank = 1 \ No newline at end of file diff --git a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt index d2052511a3..90ce29ced5 100644 --- a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_4_sql.approved.txt @@ -5,13 +5,13 @@ FROM (SELECT subscriber_locs.subscriber, pcod, row_number() OVER (PARTITION BY subscriber_locs.subscriber ORDER BY time DESC) AS rank - FROM (SELECT l.subscriber, - l.time, - l.location_id, - sites.pcod - FROM (SELECT subscriber, - datetime AS time, - location_id + FROM (SELECT subscriber, + datetime AS time, + pcod + FROM (SELECT l.datetime, + l.location_id, + l.subscriber, + sites.pcod FROM (SELECT tbl.datetime, tbl.location_id, tbl.subscriber @@ -24,24 +24,38 @@ FROM (SELECT subscriber_locs.subscriber, AND (( (to_char(events.calls.datetime, 'HH24:MI') < '06:00') OR (to_char(events.calls.datetime, 'HH24:MI') >= '22:00')))) AS tbl INNER JOIN (SELECT * - FROM ((VALUES ('dr9xNYK006wykgXj'))) AS tmp(subscriber)) AS subset_query ON tbl.subscriber = subset_query.subscriber) AS foo - WHERE (location_id IS NOT NULL) - AND (location_id <> '')) AS l - INNER JOIN (SELECT location_id, - version, - date_of_first_service, - date_of_last_service, - admin3pcod AS pcod - FROM (SELECT locinfo.id AS location_id, - locinfo.version, - locinfo.date_of_first_service, - locinfo.date_of_last_service, - polygon.admin3pcod - FROM infrastructure.cells AS locinfo - INNER JOIN geography.admin3 AS polygon ON st_within(locinfo.geom_point::geometry, - st_setsrid(polygon.geom, 4326)::geometry)) AS map) AS sites ON (l.location_id = sites.location_id) - AND (l.time)::date BETWEEN COALESCE(sites.date_of_first_service, - ('-infinity')::timestamptz) - AND COALESCE(sites.date_of_last_service, - ('infinity')::timestamptz)) AS subscriber_locs) AS final_time + FROM ((VALUES ('dr9xNYK006wykgXj'))) AS tmp(subscriber)) AS subset_query ON tbl.subscriber = subset_query.subscriber) AS l + INNER JOIN (SELECT loc_table.id AS location_id, + 
loc_table.date_of_first_service, + loc_table.date_of_last_service, + geom_table.admin3pcod AS pcod + FROM infrastructure.cells AS loc_table + INNER JOIN (SELECT gid, + admin0name, + admin0pcod, + admin1name, + admin1pcod, + admin2name, + admin2pcod, + admin3name, + admin3pcod, + admin3refn, + admin3altn, + admin3al_1, + date, + validon, + validto, + shape_star, + shape_stle, + shape_leng, + shape_area, + geom + FROM geography.admin3) AS geom_table ON st_within(loc_table.geom_point::geometry, + st_setsrid(geom_table.geom, 4326)::geometry)) AS sites ON (l.location_id = sites.location_id) + AND (l.datetime)::date BETWEEN COALESCE(sites.date_of_first_service, + ('-infinity')::timestamptz) + AND COALESCE(sites.date_of_last_service, + ('infinity')::timestamptz)) AS foo + WHERE (location_id IS NOT NULL) + AND (location_id <> '')) AS subscriber_locs) AS final_time WHERE rank = 1 \ No newline at end of file diff --git a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt index 8aa54a7ca4..7479e283ab 100644 --- a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_daily_location_6_sql.approved.txt @@ -5,13 +5,13 @@ FROM (SELECT subscriber_locs.subscriber, pcod, row_number() OVER (PARTITION BY subscriber_locs.subscriber ORDER BY time DESC) AS rank - FROM (SELECT l.subscriber, - l.time, - l.location_id, - sites.pcod - FROM (SELECT subscriber, - datetime AS time, - location_id + FROM (SELECT subscriber, + datetime AS time, + pcod + FROM (SELECT l.datetime, + l.location_id, + l.subscriber, + sites.pcod FROM (SELECT tbl.datetime, tbl.location_id, tbl.subscriber @@ -27,24 +27,38 @@ FROM (SELECT subscriber_locs.subscriber, msisdn AS subscriber FROM events.calls WHERE ((datetime)::date = '2016-01-01') - AND (duration > 2000)) AS subset_query ON tbl.subscriber = subset_query.subscriber) AS foo - WHERE (location_id IS NOT NULL) - AND (location_id <> '')) AS l - INNER JOIN (SELECT location_id, - version, - date_of_first_service, - date_of_last_service, - admin3pcod AS pcod - FROM (SELECT locinfo.id AS location_id, - locinfo.version, - locinfo.date_of_first_service, - locinfo.date_of_last_service, - polygon.admin3pcod - FROM infrastructure.cells AS locinfo - INNER JOIN geography.admin3 AS polygon ON st_within(locinfo.geom_point::geometry, - st_setsrid(polygon.geom, 4326)::geometry)) AS map) AS sites ON (l.location_id = sites.location_id) - AND (l.time)::date BETWEEN COALESCE(sites.date_of_first_service, - ('-infinity')::timestamptz) - AND COALESCE(sites.date_of_last_service, - ('infinity')::timestamptz)) AS subscriber_locs) AS final_time + AND (duration > 2000)) AS subset_query ON tbl.subscriber = subset_query.subscriber) AS l + INNER JOIN (SELECT loc_table.id AS location_id, + loc_table.date_of_first_service, + loc_table.date_of_last_service, + geom_table.admin3pcod AS pcod + FROM infrastructure.cells AS loc_table + INNER JOIN (SELECT gid, + admin0name, + admin0pcod, + admin1name, + admin1pcod, + admin2name, + admin2pcod, + admin3name, + admin3pcod, + admin3refn, + admin3altn, + admin3al_1, + date, + validon, + validto, + shape_star, + shape_stle, + shape_leng, + shape_area, + geom + FROM geography.admin3) AS geom_table ON st_within(loc_table.geom_point::geometry, + 
st_setsrid(geom_table.geom, 4326)::geometry)) AS sites ON (l.location_id = sites.location_id) + AND (l.datetime)::date BETWEEN COALESCE(sites.date_of_first_service, + ('-infinity')::timestamptz) + AND COALESCE(sites.date_of_last_service, + ('infinity')::timestamptz)) AS foo + WHERE (location_id IS NOT NULL) + AND (location_id <> '')) AS subscriber_locs) AS final_time WHERE rank = 1 \ No newline at end of file diff --git a/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_versioned_site_sql.approved.txt b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_versioned_site_sql.approved.txt new file mode 100644 index 0000000000..cddcd5c6ee --- /dev/null +++ b/flowmachine/tests/functional_tests/approved_files/test_sql_strings_and_results.test_versioned_site_sql.approved.txt @@ -0,0 +1,20 @@ +SELECT loc_table.id AS location_id, + geom_table.date_of_first_service, + geom_table.date_of_last_service, + geom_table.id AS site_id, + geom_table.version, + st_x(geom_table.geom_point::geometry) AS lon, + st_y(geom_table.geom_point::geometry) AS lat +FROM infrastructure.cells AS loc_table + LEFT JOIN (SELECT id, + version, + name, + type, + status, + structure_type, + is_cow, + date_of_first_service, + date_of_last_service, + geom_point, + geom_polygon + FROM infrastructure.sites) AS geom_table ON loc_table.site_id = geom_table.id \ No newline at end of file From 9108494ce2e5ca0622ab79689ed2347f1f541a68 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 00:36:56 +0100 Subject: [PATCH 110/138] Update _get_stored_dependencies tests --- flowmachine/tests/test_cache.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flowmachine/tests/test_cache.py b/flowmachine/tests/test_cache.py index 2823cafa31..62af454bf9 100644 --- a/flowmachine/tests/test_cache.py +++ b/flowmachine/tests/test_cache.py @@ -197,7 +197,7 @@ def test_deps_cache_multi(): dl1.store().result() hl1 = ModalLocation(daily_location("2016-01-01"), daily_location("2016-01-02")) dep = dl1.md5 - assert 3 == len(hl1._get_stored_dependencies()) + assert 4 == len(hl1._get_stored_dependencies()) assert dep in [x.md5 for x in hl1._get_stored_dependencies()] @@ -214,7 +214,7 @@ def test_deps_cache_chain(): flow = Flows(hl1, hl2) bad_dep = dl1.md5 good_dep = hl1.md5 - assert 5 == len(flow._get_stored_dependencies()) + assert 6 == len(flow._get_stored_dependencies()) assert good_dep in [x.md5 for x in flow._get_stored_dependencies()] assert bad_dep not in [x.md5 for x in flow._get_stored_dependencies()] @@ -231,7 +231,7 @@ def test_deps_cache_broken_chain(): hl2 = ModalLocation(daily_location("2016-01-03"), daily_location("2016-01-04")) flow = Flows(hl1, hl2) dep = dl1.md5 - assert 7 == len(flow._get_stored_dependencies()) + assert 8 == len(flow._get_stored_dependencies()) assert dep in [x.md5 for x in flow._get_stored_dependencies()] From 365c9f6f37bab4d12b30e6707e3505b8836310fa Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 01:06:01 +0100 Subject: [PATCH 111/138] Ensure differrent instances of CellSpatialUnit don't result in different query_ids --- flowmachine/flowmachine/core/spatial_unit.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index cac93cf6de..f12e241773 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -207,13 +207,19 @@ class 
CellSpatialUnit(SpatialUnitMixin): _locid_cols = ("location_id",) + def __repr__(self): + # Define this so that str(CellSpatialUnit()) will always return the + # same string (otherwise 2 identical queries with different instances + # of this spatial unit will have different query_ids). + return self.__class__.__name__ + "()" + def __eq__(self, other): return isinstance(other, CellSpatialUnit) def __hash__(self): # We may never need CellSpatialUnits to be hashable, but we define this # just in case. - return hash(self.__class__.__name__) + return hash(str(self)) class GeomSpatialUnit(SpatialUnitMixin, Query): From 94613b59b658f2c883bd0b23ab71a653c93acbcb Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 10:08:27 +0100 Subject: [PATCH 112/138] Fix test_query_formatting --- flowmachine/tests/test_query.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/flowmachine/tests/test_query.py b/flowmachine/tests/test_query.py index a9fa8e37ae..293168e2f9 100644 --- a/flowmachine/tests/test_query.py +++ b/flowmachine/tests/test_query.py @@ -11,6 +11,7 @@ import pytest from sqlalchemy.exc import ProgrammingError +from flowmachine.core import make_spatial_unit from flowmachine.core.query import Query from flowmachine.features import daily_location @@ -142,10 +143,12 @@ def test_query_formatting(): Test that query can be formatted as a string, with query attributes specified in the `fmt` argument being included. """ - dl = daily_location("2016-01-01", method="last") + dl = daily_location( + "2016-01-01", spatial_unit=make_spatial_unit("cell"), method="last" + ) assert "" == format(dl) assert ( - ", column_names: ['subscriber', 'pcod']>" + "" == f"{dl:spatial_unit,column_names}" ) From f9bc629b9811829836940bcda56f08ea0e84eb46 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 10:08:53 +0100 Subject: [PATCH 113/138] Fix test_print_dependency_tree --- flowmachine/tests/test_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flowmachine/tests/test_utils.py b/flowmachine/tests/test_utils.py index 71447f4b9c..c1489a4406 100644 --- a/flowmachine/tests/test_utils.py +++ b/flowmachine/tests/test_utils.py @@ -240,7 +240,11 @@ def test_print_dependency_tree(): """\ - + - + - - + - + - - - - @@ -250,10 +254,6 @@ def test_print_dependency_tree(): - - - - - - - - - - - - - """ From e7439d74c8b0e6908a6685adfbc02a7cc1ff04bb Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 10:34:26 +0100 Subject: [PATCH 114/138] Remove now-redundant level utilities --- .../flowmachine/core/errors/__init__.py | 2 - .../core/errors/flowmachine_errors.py | 20 -- flowmachine/flowmachine/features/__init__.py | 3 - .../flowmachine/features/spatial/__init__.py | 4 - .../features/spatial/cell_mappings.py | 221 ------------------ flowmachine/flowmachine/utils.py | 56 ----- flowmachine/tests/conftest.py | 34 --- flowmachine/tests/test_cell_to_x.py | 40 ---- flowmachine/tests/test_utils.py | 27 --- 9 files changed, 407 deletions(-) delete mode 100644 flowmachine/flowmachine/features/spatial/cell_mappings.py delete mode 100644 flowmachine/tests/test_cell_to_x.py diff --git a/flowmachine/flowmachine/core/errors/__init__.py b/flowmachine/flowmachine/core/errors/__init__.py index 7dd502b620..4a36a37d12 100644 --- a/flowmachine/flowmachine/core/errors/__init__.py +++ b/flowmachine/flowmachine/core/errors/__init__.py @@ -9,7 +9,6 @@ from .flowmachine_errors import ( NameTooLongError, NotConnectedError, - BadLevelError, InvalidSpatialUnitError, 
MissingDateError, ) @@ -17,7 +16,6 @@ __all__ = [ "NameTooLongError", "NotConnectedError", - "BadLevelError", "InvalidSpatialUnitError", "MissingDateError", ] diff --git a/flowmachine/flowmachine/core/errors/flowmachine_errors.py b/flowmachine/flowmachine/core/errors/flowmachine_errors.py index 38112718f5..d4996fadc3 100644 --- a/flowmachine/flowmachine/core/errors/flowmachine_errors.py +++ b/flowmachine/flowmachine/core/errors/flowmachine_errors.py @@ -84,26 +84,6 @@ class NotConnectedError(Exception): def __init__(self): Exception.__init__( self, "No connection found. Do you need to call flowmachine.connect()?" - ) - - -class BadLevelError(Exception): - """ - Raised when any class is given an error it does not recognise. - - Parameters - ---------- - level : str - The bad level. - allowed_levels : list of str, optional - List of allowed levels that the user may pass. - """ - - def __init__(self, level, allowed_levels=None): - msg = "Unrecognised level {}".format(level) - if allowed_levels is not None: - msg += ", level must be one of {}".format(allowed_levels) - Exception.__init__(self, msg) class InvalidSpatialUnitError(ValueError): diff --git a/flowmachine/flowmachine/features/__init__.py b/flowmachine/flowmachine/features/__init__.py index f1e4041139..a76f97e882 100644 --- a/flowmachine/flowmachine/features/__init__.py +++ b/flowmachine/flowmachine/features/__init__.py @@ -82,9 +82,6 @@ "DistanceMatrix", "VersionedInfrastructure", "Grid", - "CellToAdmin", - "CellToPolygon", - "CellToGrid", "Circle", "CircleGeometries", ] diff --git a/flowmachine/flowmachine/features/spatial/__init__.py b/flowmachine/flowmachine/features/spatial/__init__.py index 2bf3dacfc9..1a10c3735f 100644 --- a/flowmachine/flowmachine/features/spatial/__init__.py +++ b/flowmachine/flowmachine/features/spatial/__init__.py @@ -9,7 +9,6 @@ from .distance_matrix import DistanceMatrix from .location_cluster import LocationCluster from .versioned_infrastructure import VersionedInfrastructure -from .cell_mappings import CellToAdmin, CellToPolygon, CellToGrid from .circles import Circle, CircleGeometries __all__ = [ @@ -17,9 +16,6 @@ "DistanceMatrix", "LocationCluster", "VersionedInfrastructure", - "CellToAdmin", - "CellToPolygon", - "CellToGrid", "Circle", "CircleGeometries", ] diff --git a/flowmachine/flowmachine/features/spatial/cell_mappings.py b/flowmachine/flowmachine/features/spatial/cell_mappings.py deleted file mode 100644 index 754f81068c..0000000000 --- a/flowmachine/flowmachine/features/spatial/cell_mappings.py +++ /dev/null @@ -1,221 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. - -""" -Classes that deal with mapping cells (or towers or sites) -to a spatial level, mostly be performing a spatial join. -Examples of this include CellToAdmin or CellToGrid. -""" -from typing import List - -from ...core import Query, Grid - - -class CellToPolygon(Query): - """ - Class that maps a cell with a lat-lon to a geographical - region. - - Parameters - ---------- - column_name : str, optional - The name of the column to fetch from the geometry - table in the database. Can also be a list of names. - polygon_table : str, or flowmachine.Query optional - name of the table containing the geography information. - Can be either the name of a table, with the schema, a flowmachine.Query - object, or a string representing a query. 
- geom_col : str, default 'geom' - column that defines the geography. - """ - - def __init__(self, *, column_name, polygon_table, geom_col="geom"): - - if type(column_name) is str: - self.column_name = [column_name] - else: - self.column_name = column_name - self.polygon_table = polygon_table - self.geom_col = geom_col - self.location_info_table_fqn = self.connection.location_table - self.location_info_table = self.connection.location_table.split(".")[-1] - - super().__init__() - - def _get_subtable(self): - """ - Private method which takes the table and returns a query - representing the object. This is necessary as the table can - be passed in a variety of ways. - """ - - if issubclass(self.polygon_table.__class__, Query): - return f"({self.polygon_table.get_query()}) AS polygon" - elif "select " in self.polygon_table.lower(): - return f"({self.polygon_table}) AS polygon" - else: - return f"{self.polygon_table} AS polygon" - - @property - def column_names(self) -> List[str]: - return [ - "location_id", - "version", - "date_of_first_service", - "date_of_last_service", - ] + self.column_name - - def _make_query(self): - - # if the subscriber wants to select a geometry from the sites table there - # is no need to join the table with itself. - if ( - isinstance(self.polygon_table, str) - and self.location_info_table_fqn == self.polygon_table.lower().strip() - ): - - columns = ", ".join(f"locinfo.{c}" for c in self.column_name) - - # Create a table - tower_admins = f""" - SELECT - locinfo.id AS location_id, - locinfo.version, - locinfo.date_of_first_service, - locinfo.date_of_last_service, - {columns} - FROM - {self.location_info_table_fqn} AS locinfo - """ - - # otherwise performs the geometric join - else: - columns = ", ".join(f"polygon.{c}" for c in self.column_name) - - # Create a table - tower_admins = f""" - SELECT - locinfo.id AS location_id, - locinfo.version, - locinfo.date_of_first_service, - locinfo.date_of_last_service, - {columns} - FROM - {self.location_info_table_fqn} AS locinfo - INNER JOIN - {self._get_subtable()} - ON ST_within( - locinfo.geom_point::geometry, - ST_SetSRID(polygon.{self.geom_col}, 4326)::geometry - ) - """ - - return tower_admins - - -class CellToAdmin(Query): - """ - Maps all cells (aka sites) to a admin region. This is a thin wrapper to - the more general class CellToPolygon, which assumes that you have - the standard set-up. - - Parameters - ---------- - level : {'adminN'} - One of admin1, admin2 etc. - column_name : str, optional - Pass a string of the column to use as the - identifier of the admin region. By default - this will be admin*pcod. But you may wish - to use something else, such as admin3name. - """ - - def __init__(self, *, level, column_name=None): - self.level = level - # If there is no column_name passed then we can use - # the default, which is of the form admin3name. - if column_name is None: - self.column_name = self._get_standard_name() - else: - self.column_name = column_name - table = f"geography.{self.level}" - self.mapping = CellToPolygon(column_name=self.column_name, polygon_table=table) - - super().__init__() - - def _get_standard_name(self): - """ - Returns the standard name of the column that identifies - the name of the region. 
- """ - - return f"{self.level}pcod" - - @property - def column_names(self) -> List[str]: - columns = self.mapping.column_names - columns.remove(self.column_name) - # If the user has asked for the standard column_name - # then we will alias this column as 'pcod', otherwise - # we'll won't alias it at all. - if self.column_name == self._get_standard_name(): - col_name = "pcod" - else: - col_name = self.column_name - return columns + [col_name] - - def _make_query(self): - - columns = self.mapping.column_names - columns.remove(self.column_name) - other_cols = ", ".join(columns) - - # If the user has asked for the standard column_name - # then we will alias this column as 'pcod', otherwise - # we'll won't alias it at all. - if self.column_name == self._get_standard_name(): - col_name = f"{self.column_name} AS pcod" - else: - col_name = self.column_name - - sql = f""" - SELECT - {other_cols}, - {col_name} - FROM - ({self.mapping.get_query()}) AS map - """ - - return sql - - -class CellToGrid(Query): - """ - Query representing a mapping between all the sites in the database - and a grid of arbitrary size. - - Parameters - ---------- - size : float or int - Size of the grid in kilometres - """ - - def __init__(self, *, size): - """ - - """ - - self.size = size - self.grid = Grid(self.size) - self.mapping = CellToPolygon( - polygon_table=self.grid, column_name=["grid_id"], geom_col="geom_square" - ) - super().__init__() - - @property - def column_names(self) -> List[str]: - return self.mapping.column_names - - def _make_query(self): - return self.mapping.get_query() diff --git a/flowmachine/flowmachine/utils.py b/flowmachine/flowmachine/utils.py index 6ccd6396e3..300aee4949 100644 --- a/flowmachine/flowmachine/utils.py +++ b/flowmachine/flowmachine/utils.py @@ -21,8 +21,6 @@ from time import sleep from typing import List, Union -from flowmachine.core.errors import BadLevelError - logger = structlog.get_logger("flowmachine.debug", submodule=__name__) @@ -50,60 +48,6 @@ def getsecret(key: str, default: str) -> str: return default -def get_columns_for_level( - level: str, column_name: Union[str, List[str]] = None -) -> List[str]: - """ - Get a list of the location related columns - - Parameters - ---------- - level : {'cell', 'versioned-cell', 'versioned-site', 'lat-lon', 'grid', 'adminX'} - Level to get location columns for - column_name : str, or list of strings, optional - name of the column or list of column names. None by default - if this is not none then the function trivially returns the - column name as a list. 
- Returns - ------- - relevant_columns : list - A list of the database columns for this level - - Examples - -------- - >>> get_columns_for_level("admin3") - ['name'] - - """ - if level == "polygon" and not column_name: - raise ValueError("Must pass a column name for level=polygon") - - if column_name: - if isinstance(column_name, str): - relevant_columns = [column_name] - elif isinstance(column_name, list): - relevant_columns = list(column_name) - else: - raise TypeError("column name should be a list or a string") - return relevant_columns - - if level.startswith("admin"): - return ["pcod"] - - returns = { - "cell": ["location_id"], - "versioned-cell": ["location_id", "version", "lon", "lat"], - "versioned-site": ["site_id", "version", "lon", "lat"], - "lat-lon": ["lat", "lon"], - "grid": ["grid_id"], - } - - try: - return returns[level] - except KeyError: - raise BadLevelError(level) - - def parse_datestring( datestring: Union[str, datetime.datetime, datetime.date] ) -> datetime.datetime: diff --git a/flowmachine/tests/conftest.py b/flowmachine/tests/conftest.py index e091cda620..c0560ebd53 100644 --- a/flowmachine/tests/conftest.py +++ b/flowmachine/tests/conftest.py @@ -28,40 +28,6 @@ flowkit_toplevel_dir = os.path.join(here, "..", "..") -@pytest.fixture( - params=[ - {"level": "admin2"}, - {"level": "admin2", "column_name": "admin2name"}, - {"level": "versioned-site"}, - {"level": "versioned-cell"}, - {"level": "cell"}, - {"level": "lat-lon"}, - {"level": "grid", "size": 5}, - { - "level": "polygon", - "column_name": "admin3pcod", - "polygon_table": "geography.admin3", - }, - ], - ids=lambda x: x["level"], -) -def exemplar_level_param(request): - """ - A fixture which yields a succession of plausible default parameter - combinations for levels. - - Parameters - ---------- - request - - Yields - ------ - dict - - """ - yield request.param - - @pytest.fixture( params=[ {"spatial_unit_type": "admin", "level": 2}, diff --git a/flowmachine/tests/test_cell_to_x.py b/flowmachine/tests/test_cell_to_x.py deleted file mode 100644 index 820dbb3940..0000000000 --- a/flowmachine/tests/test_cell_to_x.py +++ /dev/null @@ -1,40 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
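# The level-based column lookup removed above is now provided by the spatial
# unit classes themselves; a rough equivalent of
# get_columns_for_level("versioned-site") under the new API would be
# (illustrative sketch, assuming a connected flowmachine session):
#
#     from flowmachine.core import make_spatial_unit
#
#     make_spatial_unit("versioned-site").location_id_columns
#     # ['site_id', 'version', 'lon', 'lat']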
- -from flowmachine.features import CellToPolygon, CellToAdmin, CellToGrid -import pytest - - -@pytest.mark.parametrize( - "mapper, args", - [ - (CellToAdmin, {"level": "admin3"}), - (CellToAdmin, {"level": "admin3", "column_name": "admin3pcod"}), - ( - CellToPolygon, - {"column_name": "admin3name", "polygon_table": "geography.admin3"}, - ), - ( - CellToPolygon, - { - "column_name": "id", - "polygon_table": "infrastructure.sites", - "geom_col": "geom_point", - }, - ), - ( - CellToPolygon, - { - "column_name": "id", - "polygon_table": "SELECT * FROM infrastructure.sites", - "geom_col": "geom_point", - }, - ), - (CellToGrid, {"size": 5}), - ], -) -def test_cell_to_x_mapping_column_names(mapper, args): - """Test that the CellToX mappers have accurate column_names properties.""" - instance = mapper(**args) - assert instance.head(0).columns.tolist() == instance.column_names diff --git a/flowmachine/tests/test_utils.py b/flowmachine/tests/test_utils.py index c1489a4406..8d3df0950f 100644 --- a/flowmachine/tests/test_utils.py +++ b/flowmachine/tests/test_utils.py @@ -15,7 +15,6 @@ from flowmachine.core import CustomQuery from flowmachine.core.subscriber_subsetter import make_subscriber_subsetter -from flowmachine.core.errors import BadLevelError from flowmachine.features import daily_location, EventTableSubset from flowmachine.utils import * from flowmachine.utils import _makesafe @@ -111,32 +110,6 @@ def test_sql_validation(): pretty_sql(sql) -@pytest.mark.parametrize( - "level, column_name, error", - [ - ("polygon", None, ValueError), - ("polygon", 9, TypeError), - ("badlevel", None, BadLevelError), - ], -) -def test_columns_for_level_errors(level, column_name, error): - """ - Test that get_columns_for_level raises correct errors - """ - with pytest.raises(error): - get_columns_for_level(level, column_name) - - -def test_column_list(): - """ - Test that supplying the column name as a list returns it as a new list. - """ - passed_cols = ["frogs", "dogs"] - returned_cols = get_columns_for_level("admin0", passed_cols) - assert passed_cols == returned_cols - assert id(passed_cols) != id(returned_cols) - - def test_datestring_parse_error(): """ Test that correct error is raised when failing to parse a datestring. From f72ffd5220b5dc5eda116c3dbeb995c4bcf53341 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 10:36:44 +0100 Subject: [PATCH 115/138] Put back closing bracket --- flowmachine/flowmachine/core/errors/flowmachine_errors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flowmachine/flowmachine/core/errors/flowmachine_errors.py b/flowmachine/flowmachine/core/errors/flowmachine_errors.py index d4996fadc3..d7a423651f 100644 --- a/flowmachine/flowmachine/core/errors/flowmachine_errors.py +++ b/flowmachine/flowmachine/core/errors/flowmachine_errors.py @@ -84,6 +84,7 @@ class NotConnectedError(Exception): def __init__(self): Exception.__init__( self, "No connection found. Do you need to call flowmachine.connect()?" 
+ ) class InvalidSpatialUnitError(ValueError): From 13c03000f1360acf6b747d3c5cd2c9e349d50651 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 16:45:53 +0100 Subject: [PATCH 116/138] Create spatial unit objects in marshmallow schemas --- .../aggregate_network_objects.py | 2 +- .../server/query_schemas/aggregation_unit.py | 36 +++++++++++++++++++ .../server/query_schemas/custom_fields.py | 10 ------ .../server/query_schemas/daily_location.py | 5 +-- .../query_schemas/dfs_metric_total_amount.py | 5 +-- .../core/server/query_schemas/flows.py | 2 +- .../core/server/query_schemas/geography.py | 16 ++++----- .../query_schemas/location_event_counts.py | 5 +-- .../query_schemas/location_introversion.py | 4 +-- .../query_schemas/meaningful_locations.py | 24 +++++++------ .../server/query_schemas/modal_location.py | 3 +- .../query_schemas/total_network_objects.py | 5 +-- .../query_schemas/unique_subscriber_counts.py | 6 ++-- 13 files changed, 77 insertions(+), 46 deletions(-) create mode 100644 flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py diff --git a/flowmachine/flowmachine/core/server/query_schemas/aggregate_network_objects.py b/flowmachine/flowmachine/core/server/query_schemas/aggregate_network_objects.py index 559fe888a6..e43d2ad42b 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/aggregate_network_objects.py +++ b/flowmachine/flowmachine/core/server/query_schemas/aggregate_network_objects.py @@ -9,7 +9,7 @@ from flowmachine.features import AggregateNetworkObjects from .base_exposed_query import BaseExposedQuery from .total_network_objects import TotalNetworkObjectsSchema, TotalNetworkObjectsExposed -from .custom_fields import AggregationUnit, Statistic, AggregateBy +from .custom_fields import Statistic, AggregateBy __all__ = ["AggregateNetworkObjectsSchema", "AggregateNetworkObjectsExposed"] diff --git a/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py b/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py new file mode 100644 index 0000000000..29dc7345dc --- /dev/null +++ b/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py @@ -0,0 +1,36 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +from flowmachine.core import make_spatial_unit + +from marshmallow.fields import String +from marshmallow.validate import OneOf + + +class AggregationUnit(String): + """ + A string representing an aggregation unit (for example: "admin0", "admin1", "admin2", ...) + """ + + def __init__(self, required=True, **kwargs): + validate = OneOf(["admin0", "admin1", "admin2", "admin3"]) + super().__init__(required=required, validate=validate, **kwargs) + + +def get_spatial_unit_obj(aggregation_unit_string): + """ + Given an aggregation unit string (as validated by AggregationUnit()), + return a FlowMachine spatial unit object. 
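# For example (an illustrative sketch, assuming a connected flowmachine
# session):
#
#     get_spatial_unit_obj("admin3")
#     # an admin spatial unit with level=3 and
#     # region_id_column_name=["admin3name", "admin3pcod"]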
+ """ + if "admin" in aggregation_unit_string: + level = int(aggregation_unit_string[-1]) + spatial_unit_args = { + "spatial_unit_type": "admin", + "level": level, + "region_id_column_name": [ + f"{aggregation_unit_string}name", + f"{aggregation_unit_string}pcod", + ], + } + return make_spatial_unit(**spatial_unit_args) diff --git a/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py b/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py index fa2221d6c5..b55704df26 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py +++ b/flowmachine/flowmachine/core/server/query_schemas/custom_fields.py @@ -9,16 +9,6 @@ from marshmallow.validate import Range, Length, OneOf -class AggregationUnit(fields.String): - """ - A string representing an aggregation unit (for example: "admin0", "admin1", "admin2", ...) - """ - - def __init__(self, required=True, **kwargs): - validate = OneOf(["admin0", "admin1", "admin2", "admin3"]) - super().__init__(required=required, validate=validate, **kwargs) - - class EventTypes(fields.List): """ A string representing an event type, for example "calls", "sms", "mds", "topups". diff --git a/flowmachine/flowmachine/core/server/query_schemas/daily_location.py b/flowmachine/flowmachine/core/server/query_schemas/daily_location.py index 932898e36d..ac18997cab 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/daily_location.py +++ b/flowmachine/flowmachine/core/server/query_schemas/daily_location.py @@ -7,7 +7,8 @@ from flowmachine.features import daily_location from .base_exposed_query import BaseExposedQuery -from .custom_fields import AggregationUnit, SubscriberSubset +from .custom_fields import SubscriberSubset +from .aggregation_unit import AggregationUnit, get_spatial_unit_obj __all__ = ["DailyLocationSchema", "DailyLocationExposed"] @@ -44,7 +45,7 @@ def _flowmachine_query_obj(self): """ return daily_location( date=self.date, - level=self.aggregation_unit, + spatial_unit=get_spatial_unit_obj(self.aggregation_unit), method=self.method, subscriber_subset=self.subscriber_subset, ) diff --git a/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py b/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py index dbeb2565dd..284f43aa1b 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py +++ b/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py @@ -7,7 +7,8 @@ from flowmachine.features.dfs import DFSTotalMetricAmount from .base_exposed_query import BaseExposedQuery -from .custom_fields import AggregationUnit, DFSMetric +from .custom_fields import DFSMetric +from .aggregation_unit import AggregationUnit __all__ = ["DFSTotalMetricAmountSchema", "DFSTotalMetricAmountExposed"] @@ -46,5 +47,5 @@ def _flowmachine_query_obj(self): metric=self.metric, start_date=self.start_date, end_date=self.end_date, - aggregation_unit=self.aggregation_unit, + aggregation_unit=self.aggregation_unit.as_string, ) diff --git a/flowmachine/flowmachine/core/server/query_schemas/flows.py b/flowmachine/flowmachine/core/server/query_schemas/flows.py index 386c89f716..d271e2cd26 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/flows.py +++ b/flowmachine/flowmachine/core/server/query_schemas/flows.py @@ -11,7 +11,7 @@ from .base_exposed_query import BaseExposedQuery from .daily_location import DailyLocationSchema, DailyLocationExposed from .modal_location import ModalLocationSchema, ModalLocationExposed -from .custom_fields 
import AggregationUnit +from .aggregation_unit import AggregationUnit __all__ = ["FlowsSchema", "FlowsExposed"] diff --git a/flowmachine/flowmachine/core/server/query_schemas/geography.py b/flowmachine/flowmachine/core/server/query_schemas/geography.py index e4dc4b49d7..9ee3e0ccd7 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/geography.py +++ b/flowmachine/flowmachine/core/server/query_schemas/geography.py @@ -5,9 +5,9 @@ from marshmallow import Schema, post_load, fields from marshmallow.validate import OneOf -from flowmachine.core.geotable import GeoTable +from flowmachine.core import CustomQuery from .base_exposed_query import BaseExposedQuery -from .custom_fields import AggregationUnit +from .aggregation_unit import AggregationUnit, get_spatial_unit_obj __all__ = ["GeographySchema", "GeographyExposed"] @@ -36,12 +36,8 @@ def _flowmachine_query_obj(self): ------- Query """ - return GeoTable( - name=self.aggregation_unit, - schema="geography", - columns=[ - f"{self.aggregation_unit}name", - f"{self.aggregation_unit}pcod", - "geom", - ], + spatial_unit = get_spatial_unit_obj(self.aggregation_unit) + return CustomQuery( + sql=spatial_unit.get_geom_query(), + column_names=spatial_unit.location_id_columns + ["geom"], ) diff --git a/flowmachine/flowmachine/core/server/query_schemas/location_event_counts.py b/flowmachine/flowmachine/core/server/query_schemas/location_event_counts.py index d6f1c444ce..51d62d209c 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/location_event_counts.py +++ b/flowmachine/flowmachine/core/server/query_schemas/location_event_counts.py @@ -7,7 +7,8 @@ from flowmachine.features import TotalLocationEvents from .base_exposed_query import BaseExposedQuery -from .custom_fields import AggregationUnit, EventTypes, SubscriberSubset +from .custom_fields import EventTypes, SubscriberSubset +from .aggregation_unit import AggregationUnit, get_spatial_unit_obj __all__ = ["LocationEventCountsSchema", "LocationEventCountsExposed"] @@ -68,6 +69,6 @@ def _flowmachine_query_obj(self): interval=self.interval, direction=self.direction, table=self.event_types, - level=self.aggregation_unit, + spatial_unit=get_spatial_unit_obj(self.aggregation_unit), subscriber_subset=self.subscriber_subset, ) diff --git a/flowmachine/flowmachine/core/server/query_schemas/location_introversion.py b/flowmachine/flowmachine/core/server/query_schemas/location_introversion.py index 44d74c3b27..1bb1296306 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/location_introversion.py +++ b/flowmachine/flowmachine/core/server/query_schemas/location_introversion.py @@ -7,7 +7,7 @@ from flowmachine.features import LocationIntroversion from .base_exposed_query import BaseExposedQuery -from .custom_fields import AggregationUnit +from .aggregation_unit import AggregationUnit, get_spatial_unit_obj __all__ = ["LocationIntroversionSchema", "LocationIntroversionExposed"] @@ -47,6 +47,6 @@ def _flowmachine_query_obj(self): return LocationIntroversion( start=self.start_date, stop=self.end_date, - level=self.aggregation_unit, + spatial_unit=get_spatial_unit_obj(self.aggregation_unit), direction=self.direction, ) diff --git a/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py b/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py index c3e7aaaeaf..3cc962a088 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py +++ b/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py @@ -6,6 +6,7 @@ from 
marshmallow.validate import OneOf from typing import Union, Dict, List +from flowmachine.core import make_spatial_unit from flowmachine.features import ( MeaningfulLocations, MeaningfulLocationsOD, @@ -16,12 +17,8 @@ SubscriberLocations, ) from .base_exposed_query import BaseExposedQuery -from .custom_fields import ( - AggregationUnit, - SubscriberSubset, - TowerHourOfDayScores, - TowerDayOfWeekScores, -) +from .custom_fields import SubscriberSubset, TowerHourOfDayScores, TowerDayOfWeekScores +from .aggregation_unit import AggregationUnit, get_spatial_unit_obj __all__ = [ "MeaningfulLocationsAggregateSchema", @@ -68,7 +65,9 @@ def _make_meaningful_locations_object( q_subscriber_locations = SubscriberLocations( start=start_date, stop=end_date, - level="versioned-site", # note this 'level' is not the same as the exposed parameter 'aggregation_unit' + spatial_unit=make_spatial_unit( + "versioned-site" + ), # note this 'spatial_unit' is not the same as the exposed parameter 'aggregation_unit' subscriber_subset=subscriber_subset, ) q_call_days = CallDays(subscriber_locations=q_subscriber_locations) @@ -83,7 +82,9 @@ def _make_meaningful_locations_object( stop=end_date, score_hour=tower_hour_of_day_scores, score_dow=tower_day_of_week_scores, - level="versioned-site", # note this 'level' is not the same as the exposed parameter 'aggregation_unit' + spatial_unit=make_spatial_unit( + "versioned-site" + ), # note this 'spatial_unit' is not the same as the exposed parameter 'aggregation_unit' subscriber_subset=subscriber_subset, ) q_meaningful_locations = MeaningfulLocations( @@ -132,7 +133,8 @@ def __init__( tower_hour_of_day_scores=tower_hour_of_day_scores, ) self.q_meaningful_locations_aggreate = MeaningfulLocationsAggregate( - meaningful_locations=q_meaningful_locations, level=aggregation_unit + meaningful_locations=q_meaningful_locations, + spatial_unit=get_spatial_unit_obj(aggregation_unit), ) @property @@ -216,7 +218,7 @@ def __init__( self.q_meaningful_locations_od = MeaningfulLocationsOD( meaningful_locations_a=locs_a, meaningful_locations_b=locs_b, - level=aggregation_unit, + spatial_unit=get_spatial_unit_obj(aggregation_unit), ) @property @@ -306,7 +308,7 @@ def __init__( self.q_meaningful_locations_od = MeaningfulLocationsOD( meaningful_locations_a=locs_a, meaningful_locations_b=locs_b, - level=aggregation_unit, + spatial_unit=get_spatial_unit_obj(aggregation_unit), ) @property diff --git a/flowmachine/flowmachine/core/server/query_schemas/modal_location.py b/flowmachine/flowmachine/core/server/query_schemas/modal_location.py index 941baff5a8..b59f241fdf 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/modal_location.py +++ b/flowmachine/flowmachine/core/server/query_schemas/modal_location.py @@ -7,7 +7,8 @@ from marshmallow_oneofschema import OneOfSchema from .base_exposed_query import BaseExposedQuery -from .custom_fields import AggregationUnit, SubscriberSubset +from .custom_fields import SubscriberSubset +from .aggregation_unit import AggregationUnit from .daily_location import DailyLocationSchema, DailyLocationExposed diff --git a/flowmachine/flowmachine/core/server/query_schemas/total_network_objects.py b/flowmachine/flowmachine/core/server/query_schemas/total_network_objects.py index b05fd3d3c8..83a00e9022 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/total_network_objects.py +++ b/flowmachine/flowmachine/core/server/query_schemas/total_network_objects.py @@ -8,7 +8,8 @@ from flowmachine.features import TotalNetworkObjects from 
flowmachine.features.network.total_network_objects import valid_periods from .base_exposed_query import BaseExposedQuery -from .custom_fields import AggregationUnit, TotalBy +from .custom_fields import TotalBy +from .aggregation_unit import AggregationUnit, get_spatial_unit_obj __all__ = ["TotalNetworkObjectsSchema", "TotalNetworkObjectsExposed"] @@ -46,6 +47,6 @@ def _flowmachine_query_obj(self): return TotalNetworkObjects( start=self.start_date, stop=self.end_date, - level=self.aggregation_unit, + spatial_unit=get_spatial_unit_obj(self.aggregation_unit), total_by=self.total_by, ) diff --git a/flowmachine/flowmachine/core/server/query_schemas/unique_subscriber_counts.py b/flowmachine/flowmachine/core/server/query_schemas/unique_subscriber_counts.py index b1e837dc75..0a4d00a585 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/core/server/query_schemas/unique_subscriber_counts.py @@ -7,7 +7,7 @@ from flowmachine.features import UniqueSubscriberCounts from .base_exposed_query import BaseExposedQuery -from .custom_fields import AggregationUnit +from .aggregation_unit import AggregationUnit, get_spatial_unit_obj __all__ = ["UniqueSubscriberCountsSchema", "UniqueSubscriberCountsExposed"] @@ -41,5 +41,7 @@ def _flowmachine_query_obj(self): Query """ return UniqueSubscriberCounts( - start=self.start_date, stop=self.end_date, level=self.aggregation_unit + start=self.start_date, + stop=self.end_date, + spatial_unit=get_spatial_unit_obj(self.aggregation_unit), ) From d8bf609eb0b0de9cdb058e321cf0bfcbdf85341d Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 17:52:50 +0100 Subject: [PATCH 117/138] Update test_query_object_construction approved --- ..._construction.test_construct_query.approved.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt index a313ee266e..1a73699109 100644 --- a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt +++ b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt @@ -1,5 +1,5 @@ { - "007f984a39f97f26684116efcf40b8f7": { + "c37c473bd8cc49d51eed2e5a97e76cfd": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "daily_location", @@ -9,7 +9,7 @@ "subscriber_subset": null } }, - "a81e3c6d78666534dd7726f76ff1aaac": { + "bd6b43f14e436ae0052420a9ddb2d661": { "query_kind": "location_event_counts", "start_date": "2016-01-01", "end_date": "2016-01-02", @@ -19,7 +19,7 @@ "event_types": null, "subscriber_subset": null }, - "cde391fae9e861c0f1666e8f9cc6a5c5": { + "bb9aa378359757d78a335e3dbe5db0c0": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "modal_location", @@ -42,11 +42,11 @@ ] } }, - "4e8eec45e2c4d396dec9f65dc1b780bd": { + "75e55979926972e18c392b05448c8ea5": { "query_kind": "geography", "aggregation_unit": "admin3" }, - "f698fa2a4cf75254ffdac05cdd4c5377": { + "236ecf7f48f00b894074f7e82c51c920": { "query_kind": "meaningful_locations_aggregate", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -139,7 +139,7 @@ "tower_cluster_call_threshold": 0, "subscriber_subset": null }, - "785186014a77aa54378cc4ae67203d53": { + "dc343cb5ccb8adb2bb83cc4709363cdf": { "query_kind": "meaningful_locations_between_label_od_matrix", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -233,7 +233,7 @@ 
"tower_cluster_call_threshold": 0, "subscriber_subset": null }, - "f769334285671b759f18aefb9b58904d": { + "d219c8404a525589a2e54d2ace5f7039": { "query_kind": "meaningful_locations_between_dates_od_matrix", "aggregation_unit": "admin1", "start_date_a": "2016-01-01", From 06e546a2c1987627a09d62397cef9a0a575d64a6 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 17:53:11 +0100 Subject: [PATCH 118/138] Fix integration tests --- docs/cache_queries.py | 59 +++++++++++++------ .../server/query_schemas/aggregation_unit.py | 9 +-- .../query_schemas/dfs_metric_total_amount.py | 2 +- .../test_action_get_sql.py | 6 +- .../test_action_run_query.py | 6 +- .../test_helper_functions.py | 6 +- .../flowmachine_server_tests/test_server.py | 7 ++- 7 files changed, 62 insertions(+), 33 deletions(-) diff --git a/docs/cache_queries.py b/docs/cache_queries.py index a5eaac13ab..df4e013673 100644 --- a/docs/cache_queries.py +++ b/docs/cache_queries.py @@ -16,18 +16,23 @@ print("Constructing query objects") +admin1_spatial_unit = flowmachine.core.make_spatial_unit("admin", level=1) +admin3_spatial_unit = flowmachine.core.make_spatial_unit("admin", level=3) +versioned_site_spatial_unit = flowmachine.core.make_spatial_unit("versioned-site") +versioned_cell_spatial_unit = flowmachine.core.make_spatial_unit("versioned-cell") + # FlowClient example usage example_usage_queries = [ flowmachine.features.utilities.spatial_aggregates.SpatialAggregate( locations=flowmachine.features.daily_location( - date="2016-01-01", level="admin3", method="last" + date="2016-01-01", spatial_unit=admin3_spatial_unit, method="last" ) ), flowmachine.features.utilities.spatial_aggregates.SpatialAggregate( locations=flowmachine.features.ModalLocation( *[ flowmachine.features.daily_location( - date=dl_date, level="admin3", method="last" + date=dl_date, spatial_unit=admin3_spatial_unit, method="last" ) for dl_date in pd.date_range("2016-01-01", "2016-01-03", freq="D") ] @@ -35,14 +40,17 @@ ), flowmachine.features.Flows( flowmachine.features.daily_location( - date="2016-01-01", level="admin1", method="last" + date="2016-01-01", spatial_unit=admin1_spatial_unit, method="last" ), flowmachine.features.daily_location( - date="2016-01-07", level="admin1", method="last" + date="2016-01-07", spatial_unit=admin1_spatial_unit, method="last" ), ), flowmachine.features.TotalLocationEvents( - start="2016-01-01", stop="2016-01-08", level="admin3", interval="hour" + start="2016-01-01", + stop="2016-01-08", + spatial_unit=admin3_spatial_unit, + interval="hour", ), ] @@ -57,7 +65,9 @@ locations=flowmachine.features.ModalLocation( *[ flowmachine.features.daily_location( - date=dl_date.strftime("%Y-%m-%d"), level="admin3", method="last" + date=dl_date.strftime("%Y-%m-%d"), + spatial_unit=admin3_spatial_unit, + method="last", ) for dl_date in dates ] @@ -69,7 +79,9 @@ flowmachine.features.ModalLocation( *[ flowmachine.features.daily_location( - date=dl_date.strftime("%Y-%m-%d"), level="admin3", method="last" + date=dl_date.strftime("%Y-%m-%d"), + spatial_unit=admin3_spatial_unit, + method="last", ) for dl_date in date_ranges["benchmark"] ] @@ -77,7 +89,9 @@ flowmachine.features.ModalLocation( *[ flowmachine.features.daily_location( - date=dl_date.strftime("%Y-%m-%d"), level="admin3", method="last" + date=dl_date.strftime("%Y-%m-%d"), + spatial_unit=admin3_spatial_unit, + method="last", ) for dl_date in date_ranges[period2] ] @@ -135,7 +149,9 @@ clusters=flowmachine.features.HartiganCluster( calldays=flowmachine.features.CallDays( 
subscriber_locations=flowmachine.features.subscriber_locations( - start="2016-01-01", stop="2016-01-07", level="versioned-site" + start="2016-01-01", + stop="2016-01-07", + spatial_unit=versioned_site_spatial_unit, ) ), radius=1.0, @@ -148,7 +164,7 @@ stop="2016-01-07", score_hour=hour_scores, score_dow=day_scores, - level="versioned-site", + spatial_unit=versioned_site_spatial_unit, ), label=label, ), @@ -161,7 +177,9 @@ clusters=flowmachine.features.HartiganCluster( calldays=flowmachine.features.CallDays( subscriber_locations=flowmachine.features.subscriber_locations( - start="2016-01-01", stop="2016-01-07", level="versioned-site" + start="2016-01-01", + stop="2016-01-07", + spatial_unit=versioned_site_spatial_unit, ) ), radius=1.0, @@ -174,7 +192,7 @@ stop="2016-01-07", score_hour=hour_scores, score_dow=day_scores, - level="versioned-site", + spatial_unit=versioned_site_spatial_unit, ), label="home", ), @@ -182,7 +200,9 @@ clusters=flowmachine.features.HartiganCluster( calldays=flowmachine.features.CallDays( subscriber_locations=flowmachine.features.subscriber_locations( - start="2016-01-01", stop="2016-01-07", level="versioned-site" + start="2016-01-01", + stop="2016-01-07", + spatial_unit=versioned_site_spatial_unit, ) ), radius=1.0, @@ -195,11 +215,11 @@ stop="2016-01-07", score_hour=hour_scores, score_dow=day_scores, - level="versioned-site", + spatial_unit=versioned_site_spatial_unit, ), label="work", ), - level="admin3", + spatial_unit=admin3_spatial_unit, ) ] @@ -209,7 +229,7 @@ start="2016-01-01", stop="2016-01-07", table="events.mds", - level="versioned-cell", + spatial_unit=versioned_cell_spatial_unit, interval="hour", ) ] @@ -217,14 +237,17 @@ # Cell Towers Per Region cell_towers_per_region_queries = [ flowmachine.features.TotalNetworkObjects( - start="2016-01-01", stop="2016-01-08", level="admin3", total_by="month" + start="2016-01-01", + stop="2016-01-08", + spatial_unit=admin3_spatial_unit, + total_by="month", ) ] # Unique Subscriber Counts unique_subscriber_counts_queries = [ flowmachine.features.UniqueSubscriberCounts( - start="2016-01-01", stop="2016-01-08", level="admin3" + start="2016-01-01", stop="2016-01-08", spatial_unit=admin3_spatial_unit ) ] diff --git a/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py b/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py index 29dc7345dc..3f8afab95a 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py +++ b/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py @@ -25,12 +25,5 @@ def get_spatial_unit_obj(aggregation_unit_string): """ if "admin" in aggregation_unit_string: level = int(aggregation_unit_string[-1]) - spatial_unit_args = { - "spatial_unit_type": "admin", - "level": level, - "region_id_column_name": [ - f"{aggregation_unit_string}name", - f"{aggregation_unit_string}pcod", - ], - } + spatial_unit_args = {"spatial_unit_type": "admin", "level": level} return make_spatial_unit(**spatial_unit_args) diff --git a/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py b/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py index 284f43aa1b..33cb07653b 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py +++ b/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py @@ -47,5 +47,5 @@ def _flowmachine_query_obj(self): metric=self.metric, start_date=self.start_date, end_date=self.end_date, - aggregation_unit=self.aggregation_unit.as_string, + 
aggregation_unit=self.aggregation_unit, ) diff --git a/integration_tests/tests/flowmachine_server_tests/test_action_get_sql.py b/integration_tests/tests/flowmachine_server_tests/test_action_get_sql.py index 8df59898e9..ff71f10813 100644 --- a/integration_tests/tests/flowmachine_server_tests/test_action_get_sql.py +++ b/integration_tests/tests/flowmachine_server_tests/test_action_get_sql.py @@ -1,6 +1,7 @@ import pytest from flowmachine.core.server.utils import send_zmq_message_and_receive_reply +from flowmachine.core import make_spatial_unit from flowmachine.features.utilities.spatial_aggregates import SpatialAggregate from flowmachine.features import daily_location from .helpers import poll_until_done @@ -33,7 +34,10 @@ async def test_get_sql(zmq_port, zmq_host): } q = SpatialAggregate( locations=daily_location( - date="2016-01-01", method="last", level="admin3", subscriber_subset=None + date="2016-01-01", + method="last", + spatial_unit=make_spatial_unit("admin", level=3), + subscriber_subset=None, ) ) expected_query_id = q.md5 diff --git a/integration_tests/tests/flowmachine_server_tests/test_action_run_query.py b/integration_tests/tests/flowmachine_server_tests/test_action_run_query.py index 2221af7ad8..1820af482a 100644 --- a/integration_tests/tests/flowmachine_server_tests/test_action_run_query.py +++ b/integration_tests/tests/flowmachine_server_tests/test_action_run_query.py @@ -3,6 +3,7 @@ from flowmachine.core.cache import reset_cache from flowmachine.core.server.utils import send_zmq_message_and_receive_reply +from flowmachine.core import make_spatial_unit from flowmachine.features.utilities.spatial_aggregates import SpatialAggregate from flowmachine.features import daily_location from .helpers import cache_schema_is_empty, get_cache_tables, poll_until_done @@ -31,7 +32,10 @@ async def test_run_query(zmq_port, zmq_host, fm_conn, redis): } q = SpatialAggregate( locations=daily_location( - date="2016-01-01", method="last", level="admin3", subscriber_subset=None + date="2016-01-01", + method="last", + spatial_unit=make_spatial_unit("admin", level=3), + subscriber_subset=None, ) ) expected_query_id = q.md5 diff --git a/integration_tests/tests/flowmachine_server_tests/test_helper_functions.py b/integration_tests/tests/flowmachine_server_tests/test_helper_functions.py index 7a30ec0f02..eb080bdc5a 100644 --- a/integration_tests/tests/flowmachine_server_tests/test_helper_functions.py +++ b/integration_tests/tests/flowmachine_server_tests/test_helper_functions.py @@ -6,6 +6,7 @@ send_zmq_message_and_receive_reply, FM_EXAMPLE_MESSAGE, ) +from flowmachine.core import make_spatial_unit from flowmachine.features.utilities.spatial_aggregates import SpatialAggregate from flowmachine.features import daily_location @@ -34,7 +35,10 @@ def test_send_zmq_message_and_receive_reply(zmq_host, zmq_port): q = SpatialAggregate( locations=daily_location( - date="2016-01-01", method="last", level="admin3", subscriber_subset=None + date="2016-01-01", + method="last", + spatial_unit=make_spatial_unit("admin", level=3), + subscriber_subset=None, ) ) expected_query_id = q.md5 diff --git a/integration_tests/tests/flowmachine_server_tests/test_server.py b/integration_tests/tests/flowmachine_server_tests/test_server.py index 6c64d5ed28..32cdf39c4d 100644 --- a/integration_tests/tests/flowmachine_server_tests/test_server.py +++ b/integration_tests/tests/flowmachine_server_tests/test_server.py @@ -4,6 +4,7 @@ import json +from flowmachine.core import make_spatial_unit from 
flowmachine.features.utilities.spatial_aggregates import SpatialAggregate from flowmachine.features.dfs.total_amount_for_metric import DFSTotalMetricAmount from flowmachine.features import daily_location, ModalLocation @@ -110,7 +111,7 @@ def test_run_daily_location_query(send_zmq_message_and_receive_reply): locations=daily_location( date="2016-01-01", method="most-common", - level="admin3", + spatial_unit=make_spatial_unit("admin", level=3), subscriber_subset=None, ) ) @@ -160,13 +161,13 @@ def test_run_modal_location_query(send_zmq_message_and_receive_reply): daily_location( date="2016-01-01", method="most-common", - level="admin3", + spatial_unit=make_spatial_unit("admin", level=3), subscriber_subset=None, ), daily_location( date="2016-01-02", method="most-common", - level="admin3", + spatial_unit=make_spatial_unit("admin", level=3), subscriber_subset=None, ), ) From ca35d3f3af9277af0ffe2bcfeff3cbb9a4c80281 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 17:57:31 +0100 Subject: [PATCH 119/138] Update approved query IDs again (due to spatial unit change) --- ..._construction.test_construct_query.approved.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt index 1a73699109..ab8e2dcbbd 100644 --- a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt +++ b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt @@ -1,5 +1,5 @@ { - "c37c473bd8cc49d51eed2e5a97e76cfd": { + "75d67100b0382c34b4661a5ec8950fa6": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "daily_location", @@ -9,7 +9,7 @@ "subscriber_subset": null } }, - "bd6b43f14e436ae0052420a9ddb2d661": { + "1a1e4e159d05f2ec1f081ac9c2bfd6d5": { "query_kind": "location_event_counts", "start_date": "2016-01-01", "end_date": "2016-01-02", @@ -19,7 +19,7 @@ "event_types": null, "subscriber_subset": null }, - "bb9aa378359757d78a335e3dbe5db0c0": { + "19419d8079d9c8e90afd057eb0871488": { "query_kind": "spatial_aggregate", "locations": { "query_kind": "modal_location", @@ -42,11 +42,11 @@ ] } }, - "75e55979926972e18c392b05448c8ea5": { + "d87422eb50797c91bbd00b8af93a72aa": { "query_kind": "geography", "aggregation_unit": "admin3" }, - "236ecf7f48f00b894074f7e82c51c920": { + "de6836685ca2805baa1081c09f387e50": { "query_kind": "meaningful_locations_aggregate", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -139,7 +139,7 @@ "tower_cluster_call_threshold": 0, "subscriber_subset": null }, - "dc343cb5ccb8adb2bb83cc4709363cdf": { + "639b435514358b3e2319333e5016f56e": { "query_kind": "meaningful_locations_between_label_od_matrix", "aggregation_unit": "admin1", "start_date": "2016-01-01", @@ -233,7 +233,7 @@ "tower_cluster_call_threshold": 0, "subscriber_subset": null }, - "d219c8404a525589a2e54d2ace5f7039": { + "98dad1d308b631753b3d4e37c7a63861": { "query_kind": "meaningful_locations_between_dates_od_matrix", "aggregation_unit": "admin1", "start_date_a": "2016-01-01", From 653f1dbb0fb4390095f4d56e69203bacc388236a Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 19:24:46 +0100 Subject: [PATCH 120/138] Add Geography class --- .../flowmachine/features/spatial/__init__.py | 2 + .../flowmachine/features/spatial/geography.py | 56 +++++++++++++++++++ flowmachine/tests/test_spatial_geography.py | 52 +++++++++++++++++ 3 files changed, 110 
insertions(+) create mode 100644 flowmachine/flowmachine/features/spatial/geography.py create mode 100644 flowmachine/tests/test_spatial_geography.py diff --git a/flowmachine/flowmachine/features/spatial/__init__.py b/flowmachine/flowmachine/features/spatial/__init__.py index 1a10c3735f..bda6bb8aec 100644 --- a/flowmachine/flowmachine/features/spatial/__init__.py +++ b/flowmachine/flowmachine/features/spatial/__init__.py @@ -7,6 +7,7 @@ """ from .location_area import LocationArea from .distance_matrix import DistanceMatrix +from .geography import Geography from .location_cluster import LocationCluster from .versioned_infrastructure import VersionedInfrastructure from .circles import Circle, CircleGeometries @@ -14,6 +15,7 @@ __all__ = [ "LocationArea", "DistanceMatrix", + "Geography", "LocationCluster", "VersionedInfrastructure", "Circle", diff --git a/flowmachine/flowmachine/features/spatial/geography.py b/flowmachine/flowmachine/features/spatial/geography.py new file mode 100644 index 0000000000..0b395c1a89 --- /dev/null +++ b/flowmachine/flowmachine/features/spatial/geography.py @@ -0,0 +1,56 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# -*- coding: utf-8 -*- +""" +Definition of the Geography class, which returns a table of the +location_id_columns and geometry for a spatial unit. +""" +from typing import List + +from ...core.query import Query +from ...core.mixins import GeoDataMixin + + +class Geography(GeoDataMixin, Query): + """ + This class is a wrapper around the SQL query returned by the + 'get_geom_query' method of spatial unit objects, adding geographic utility + methods (e.g. 'to_geojson'). + + Queries of this type are used to return GeoJSON data via the FlowAPI + 'get_geography' route. + + Parameters + ---------- + spatial_unit : flowmachine.core.spatial_unit.GeomSpatialUnit + Spatial unit to return geography data for. See the docstring of + make_spatial_unit for more information. + """ + + def __init__(self, spatial_unit): + spatial_unit.verify_criterion("has_geography") + self.spatial_unit = spatial_unit + super().__init__() + + @property + def column_names(self) -> List[str]: + return self.spatial_unit.location_id_columns + ["geom"] + + def _geo_augmented_query(self): + locid_columns = self.spatial_unit.location_id_columns + if len(locid_columns) == 1: + gid = f"{locid_columns[0]} AS gid" + else: + gid = "row_number() over() AS gid" + + sql = f""" + SELECT {gid}, * + FROM ({self.get_query()}) AS Q + """ + + return sql, ["gid"] + self.column_names + + def _make_query(self): + return self.spatial_unit.get_geom_query() diff --git a/flowmachine/tests/test_spatial_geography.py b/flowmachine/tests/test_spatial_geography.py new file mode 100644 index 0000000000..46652a25ac --- /dev/null +++ b/flowmachine/tests/test_spatial_geography.py @@ -0,0 +1,52 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +Tests for the Geography class. 
+""" + +import pytest + +import geojson + +from flowmachine.features.spatial import Geography +from flowmachine.core import make_spatial_unit +from flowmachine.core.errors import InvalidSpatialUnitError + + +def test_geography_column_names(exemplar_spatial_unit_param): + """ + Test that column_names property matches head(0) for Geography. + """ + if not exemplar_spatial_unit_param.has_geography: + pytest.skip(f"{exemplar_spatial_unit_param} has no geography.") + geo = Geography(exemplar_spatial_unit_param) + assert geo.head(0).columns.tolist() == geo.column_names + + +def test_geography_raises_error(): + """ + Test that Geography raises an error for an invalid spatial unit. + """ + with pytest.raises(InvalidSpatialUnitError): + geo = Geography(make_spatial_unit("cell")) + + +@pytest.mark.parametrize( + "make_spatial_unit_params", + [ + {"spatial_unit_type": "versioned-cell"}, + {"spatial_unit_type": "versioned-site"}, + {"spatial_unit_type": "lon-lat"}, + {"spatial_unit_type": "admin", "level": 2}, + {"spatial_unit_type": "grid", "size": 5}, + ], +) +def test_valid_geojson(make_spatial_unit_params): + """ + Check that valid geojson is returned. + """ + spatial_unit = make_spatial_unit(**make_spatial_unit_params) + geo = Geography(spatial_unit) + assert geojson.loads(geo.to_geojson_string()).is_valid From f28858237070de3ddfce0b637592fc73f4c68547 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 20:23:52 +0100 Subject: [PATCH 121/138] Use Geography query for getting geography --- .../core/server/action_handlers.py | 52 ++++++++++++------- .../core/server/query_schemas/__init__.py | 1 + .../core/server/query_schemas/geography.py | 8 +-- flowmachine/flowmachine/features/__init__.py | 1 + .../tests/query_tests/test_queries.py | 3 +- 5 files changed, 38 insertions(+), 27 deletions(-) diff --git a/flowmachine/flowmachine/core/server/action_handlers.py b/flowmachine/flowmachine/core/server/action_handlers.py index 9af84dac39..5e317a89f3 100644 --- a/flowmachine/flowmachine/core/server/action_handlers.py +++ b/flowmachine/flowmachine/core/server/action_handlers.py @@ -31,7 +31,7 @@ from flowmachine.core.query_state import QueryStateMachine, QueryState from flowmachine.utils import convert_dict_keys_to_strings from .exceptions import FlowmachineServerError -from .query_schemas import FlowmachineQuerySchema +from .query_schemas import FlowmachineQuerySchema, GeographySchema from .zmq_helpers import ZMQReply __all__ = ["perform_action"] @@ -247,29 +247,43 @@ def action_handler__get_geography(aggregation_unit: str) -> ZMQReply: """ Handler for the 'get_query_geography' action. - Returns query parameters of the query with the given `query_id`. + Returns SQL to get geography for the given `aggregation_unit` as GeoJSON. """ - - # TODO: do we still need to validate the aggregation unit or does this happen - # before (e.g. through marshmallow?) - allowed_aggregation_units = ["admin0", "admin1", "admin2", "admin3", "admin4"] - if aggregation_unit not in allowed_aggregation_units: - error_msg = ( - f"Invalid aggregation unit. 
Must be one of: {allowed_aggregation_units}'" - ) - return ZMQReply(status="error", msg=error_msg) - try: - q = GeoTable( - name=aggregation_unit, - schema="geography", - columns=[f"{aggregation_unit}name", f"{aggregation_unit}pcod", "geom"], + try: + query_obj = GeographySchema().load( + {"query_kind": "geography", "aggregation_unit": aggregation_unit} + ) + except TypeError as exc: + # We need to catch TypeError here, otherwise they propagate up to + # perform_action() and result in a very misleading error message. + orig_error_msg = exc.args[0] + error_msg = ( + f"Internal flowmachine server error: could not create query object using query schema. " + f"The original error was: '{orig_error_msg}'" + ) + return ZMQReply( + status="error", + msg=error_msg, + payload={"params": action_params, "orig_error_msg": orig_error_msg}, + ) + except ValidationError as exc: + # The dictionary of marshmallow errors can contain integers as keys, + # which will raise an error when converting to JSON (where the keys + # must be strings). Therefore we transform the keys to strings here. + error_msg = "Parameter validation failed." + validation_error_messages = convert_dict_keys_to_strings(exc.messages) + return ZMQReply( + status="error", msg=error_msg, payload=validation_error_messages ) - except Exception as e: - return ZMQReply(status="error", msg=f"{e}") + + # We don't cache the query, because it just selects columns from a + # geography table. If we expose an aggregation unit which relies on another + # query to create the geometry (e.g. grid), we may want to reconsider this + # decision. # Explicitly project to WGS84 (SRID=4326) to conform with GeoJSON standard - sql = q.geojson_query(crs=4326) + sql = query_obj.geojson_query(crs=4326) # TODO: put query_run_log back in! 
# query_run_log.info("get_geography", **run_log_dict) payload = {"query_state": QueryState.COMPLETED, "sql": sql} diff --git a/flowmachine/flowmachine/core/server/query_schemas/__init__.py b/flowmachine/flowmachine/core/server/query_schemas/__init__.py index af9129ca3e..4ba7bdb6ea 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/__init__.py +++ b/flowmachine/flowmachine/core/server/query_schemas/__init__.py @@ -4,3 +4,4 @@ from .base_exposed_query import BaseExposedQuery from .flowmachine_query import FlowmachineQuerySchema +from .geography import GeographySchema diff --git a/flowmachine/flowmachine/core/server/query_schemas/geography.py b/flowmachine/flowmachine/core/server/query_schemas/geography.py index 9ee3e0ccd7..83423670a2 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/geography.py +++ b/flowmachine/flowmachine/core/server/query_schemas/geography.py @@ -5,7 +5,7 @@ from marshmallow import Schema, post_load, fields from marshmallow.validate import OneOf -from flowmachine.core import CustomQuery +from flowmachine.features import Geography from .base_exposed_query import BaseExposedQuery from .aggregation_unit import AggregationUnit, get_spatial_unit_obj @@ -36,8 +36,4 @@ def _flowmachine_query_obj(self): ------- Query """ - spatial_unit = get_spatial_unit_obj(self.aggregation_unit) - return CustomQuery( - sql=spatial_unit.get_geom_query(), - column_names=spatial_unit.location_id_columns + ["geom"], - ) + return Geography(get_spatial_unit_obj(self.aggregation_unit)) diff --git a/flowmachine/flowmachine/features/__init__.py b/flowmachine/flowmachine/features/__init__.py index a76f97e882..ee07ab5590 100644 --- a/flowmachine/flowmachine/features/__init__.py +++ b/flowmachine/flowmachine/features/__init__.py @@ -80,6 +80,7 @@ "LocationArea", "LocationCluster", "DistanceMatrix", + "Geography", "VersionedInfrastructure", "Grid", "Circle", diff --git a/integration_tests/tests/query_tests/test_queries.py b/integration_tests/tests/query_tests/test_queries.py index 9320fd0f6f..05490b675c 100644 --- a/integration_tests/tests/query_tests/test_queries.py +++ b/integration_tests/tests/query_tests/test_queries.py @@ -411,8 +411,7 @@ def test_get_geography(access_token_builder, flowapi_url): assert 0 < len(result_geojson["features"]) feature0 = result_geojson["features"][0] assert "Feature" == feature0["type"] - assert "admin3name" in feature0["properties"] - assert "admin3pcod" in feature0["properties"] + assert "pcod" in feature0["properties"] assert "MultiPolygon" == feature0["geometry"]["type"] assert list == type(feature0["geometry"]["coordinates"]) assert 0 < len(feature0["geometry"]["coordinates"]) From 07f53c8ce327a3791eadc430873d489171850445 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 20:24:11 +0100 Subject: [PATCH 122/138] Update worked examples --- .../worked_examples/cell-towers-per-region.ipynb | 5 ++--- .../worked_examples/commuting-patterns.ipynb | 14 ++++++-------- .../worked_examples/flows-above-normal.ipynb | 8 ++++---- .../source/worked_examples/mobile-data-usage.ipynb | 3 ++- .../worked_examples/unique-subscriber-counts.ipynb | 5 ++--- 5 files changed, 16 insertions(+), 19 deletions(-) diff --git a/docs/source/worked_examples/cell-towers-per-region.ipynb b/docs/source/worked_examples/cell-towers-per-region.ipynb index 73f5b4daa7..9f4bd53b46 100644 --- a/docs/source/worked_examples/cell-towers-per-region.ipynb +++ b/docs/source/worked_examples/cell-towers-per-region.ipynb @@ -157,14 +157,13 @@ "source": [ 
"towers_per_admin3_geodataframe = (\n", " regions_geodataframe.join(\n", - " towers_per_admin3.set_index(\"pcod\"), on=\"admin3pcod\", how=\"left\"\n", + " towers_per_admin3.set_index(\"pcod\"), on=\"pcod\", how=\"left\"\n", " )\n", " .fillna(0)\n", " .drop(columns=[\"centroid\", \"datetime\"])\n", " .rename(\n", " columns={\n", - " \"admin3pcod\": \"P-code\",\n", - " \"admin3name\": \"Name\",\n", + " \"pcod\": \"P-code\",\n", " \"value\": \"Number of towers\",\n", " }\n", " )\n", diff --git a/docs/source/worked_examples/commuting-patterns.ipynb b/docs/source/worked_examples/commuting-patterns.ipynb index 1364407459..ee73831a0c 100644 --- a/docs/source/worked_examples/commuting-patterns.ipynb +++ b/docs/source/worked_examples/commuting-patterns.ipynb @@ -199,12 +199,12 @@ " regions_geodataframe.drop(columns=\"centroid\")\n", " .join(\n", " home_locations.drop(columns=\"label\").set_index(\"pcod\"),\n", - " on=\"admin3pcod\",\n", + " on=\"pcod\",\n", " how=\"left\",\n", " )\n", " .join(\n", " work_locations.drop(columns=\"label\").set_index(\"pcod\"),\n", - " on=\"admin3pcod\",\n", + " on=\"pcod\",\n", " lsuffix=\"_home\",\n", " rsuffix=\"_work\",\n", " how=\"left\",\n", @@ -215,8 +215,7 @@ "# Rename columns for map labels\n", "locations_geodataframe = locations_geodataframe.rename(\n", " columns={\n", - " \"admin3pcod\": \"P-code\",\n", - " \"admin3name\": \"Name\",\n", + " \"pcod\": \"P-code\",\n", " \"total_home\": \"Total (home)\",\n", " \"total_work\": \"Total (work)\",\n", " }\n", @@ -341,10 +340,10 @@ "# Join location counts to geography data\n", "commuters_geodataframe = (\n", " regions_geodataframe.drop(columns=\"centroid\")\n", - " .join(commuters_into_region, on=\"admin3pcod\", how=\"left\")\n", + " .join(commuters_into_region, on=\"pcod\", how=\"left\")\n", " .join(\n", " commuters_out_from_region,\n", - " on=\"admin3pcod\",\n", + " on=\"pcod\",\n", " lsuffix=\"_in\",\n", " rsuffix=\"_out\",\n", " how=\"left\",\n", @@ -355,8 +354,7 @@ "# Rename columns for map labels\n", "commuters_geodataframe = commuters_geodataframe.rename(\n", " columns={\n", - " \"admin3pcod\": \"P-code\",\n", - " \"admin3name\": \"Name\",\n", + " \"pcod\": \"P-code\",\n", " \"total_in\": \"Commuters in\",\n", " \"total_out\": \"Commuters out\",\n", " }\n", diff --git a/docs/source/worked_examples/flows-above-normal.ipynb b/docs/source/worked_examples/flows-above-normal.ipynb index 237d242742..6f2239536a 100644 --- a/docs/source/worked_examples/flows-above-normal.ipynb +++ b/docs/source/worked_examples/flows-above-normal.ipynb @@ -253,12 +253,12 @@ " home_locations_results[period]\n", " .set_index(\"pcod\")\n", " .rename(columns={\"total\": f\"Total ({period} period)\"}),\n", - " on=\"admin3pcod\",\n", + " on=\"pcod\",\n", " how=\"left\",\n", " ).fillna(0)\n", "\n", "home_locations_geodataframe = home_locations_geodataframe.rename(\n", - " columns={\"admin3pcod\": \"P-code\", \"admin3name\": \"Name\"}\n", + " columns={\"pcod\": \"P-code\"}\n", ")" ] }, @@ -389,7 +389,7 @@ "outputs": [], "source": [ "in_out_flows_geodataframe = (\n", - " regions_geodataframe.set_index(\"admin3pcod\")\n", + " regions_geodataframe.set_index(\"pcod\")\n", " .join(\n", " [\n", " inflows_above_normal.rename(\n", @@ -407,7 +407,7 @@ "\n", "in_out_flows_geodataframe = in_out_flows_geodataframe.drop(\n", " columns=\"centroid\"\n", - ").rename(columns={\"admin3pcod\": \"P-code\", \"admin3name\": \"Name\"})" + ").rename(columns={\"pcod\": \"P-code\"})" ] }, { diff --git a/docs/source/worked_examples/mobile-data-usage.ipynb 
b/docs/source/worked_examples/mobile-data-usage.ipynb index 669d09ab92..373965f7bd 100644 --- a/docs/source/worked_examples/mobile-data-usage.ipynb +++ b/docs/source/worked_examples/mobile-data-usage.ipynb @@ -28,6 +28,7 @@ "outputs": [], "source": [ "import flowmachine\n", + "from flowmachine.core import make_spatial_unit\n", "import os\n", "import numpy as np\n", "import geopandas as gpd\n", @@ -82,7 +83,7 @@ " start=\"2016-01-01\",\n", " stop=\"2016-01-07\",\n", " table=\"events.mds\",\n", - " level=\"versioned-cell\",\n", + " spatial_unit=make_spatial_unit(\"versioned-cell\"),\n", " interval=\"hour\",\n", ")" ] diff --git a/docs/source/worked_examples/unique-subscriber-counts.ipynb b/docs/source/worked_examples/unique-subscriber-counts.ipynb index c32020789d..1e45ca370c 100644 --- a/docs/source/worked_examples/unique-subscriber-counts.ipynb +++ b/docs/source/worked_examples/unique-subscriber-counts.ipynb @@ -156,14 +156,13 @@ "source": [ "subscribers_per_admin3_geodataframe = (\n", " regions_geodataframe.join(\n", - " subscribers_per_admin3.set_index(\"pcod\"), on=\"admin3pcod\", how=\"left\"\n", + " subscribers_per_admin3.set_index(\"pcod\"), on=\"pcod\", how=\"left\"\n", " )\n", " .fillna(0)\n", " .drop(columns=[\"centroid\"])\n", " .rename(\n", " columns={\n", - " \"admin3pcod\": \"P-code\",\n", - " \"admin3name\": \"Name\",\n", + " \"pcod\": \"P-code\",\n", " \"unique_subscriber_counts\": \"Number of subscribers\",\n", " }\n", " )\n", From a7eccf73fd15792514c1758e2100198ecfbd9908 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 22:49:44 +0100 Subject: [PATCH 123/138] Correct Geography query_id --- ..._query_object_construction.test_construct_query.approved.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt index ab8e2dcbbd..857f5ed6d6 100644 --- a/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt +++ b/flowmachine/tests/test_query_object_construction.test_construct_query.approved.txt @@ -42,7 +42,7 @@ ] } }, - "d87422eb50797c91bbd00b8af93a72aa": { + "07784743904211f6ad33c26c668df5d0": { "query_kind": "geography", "aggregation_unit": "admin3" }, From f45c97ab5be3a7a42580ce3a590d201bebb73391 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 11 Jun 2019 22:49:55 +0100 Subject: [PATCH 124/138] Update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5203fbf6c0..b83456b715 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Added - The dev provisioning Ansible playbook now automatically generates an SSH key pair for the `flowkit` user. [#892](https://github.com/Flowminder/FlowKit/issues/892) +- Added new classes to represent spatial units in FlowMachine. +- Added a `Geography` query class, to get geography data for a spatial unit. ### Changed - The quick-start script now only pulls the docker images for the services that are actually started up. [#898](https://github.com/Flowminder/FlowKit/issues/898) +- Location-related FlowMachine queries now take a `spatial_unit` parameter instead of `level`. ### Fixed @@ -20,6 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
- Distances in `Displacement` are now calculated with longitude and latitude the corrcet way around. [#913](https://github.com/Flowminder/FlowKit/issues/913) ### Removed +- Removed `cell_mappings.py`, `get_columns_for_level` and `BadLevelError`. ## [0.6.4] From 5669daeabfa035d37fc5022e2d65301c9e6f7850 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 12 Jun 2019 12:49:29 +0100 Subject: [PATCH 125/138] Return geography error messages through FlowAPI --- flowapi/flowapi/geography.py | 6 +- .../core/server/action_handlers.py | 75 +++++++++++-------- 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/flowapi/flowapi/geography.py b/flowapi/flowapi/geography.py index 3956c3a976..a50b14ed1f 100644 --- a/flowapi/flowapi/geography.py +++ b/flowapi/flowapi/geography.py @@ -54,7 +54,11 @@ async def get_geography(aggregation_unit): ) if reply["status"] == "error": - return jsonify({"status": "Error", "msg": "Internal server error"}), 500 + try: + msg = reply["msg"] + except KeyError: + msg = "Internal server error" + return jsonify({"status": "Error", "msg": msg}), 500 try: query_state = reply["payload"]["query_state"] diff --git a/flowmachine/flowmachine/core/server/action_handlers.py b/flowmachine/flowmachine/core/server/action_handlers.py index 5e317a89f3..6160c99ade 100644 --- a/flowmachine/flowmachine/core/server/action_handlers.py +++ b/flowmachine/flowmachine/core/server/action_handlers.py @@ -251,43 +251,52 @@ def action_handler__get_geography(aggregation_unit: str) -> ZMQReply: """ try: try: - query_obj = GeographySchema().load( - {"query_kind": "geography", "aggregation_unit": aggregation_unit} - ) - except TypeError as exc: - # We need to catch TypeError here, otherwise they propagate up to - # perform_action() and result in a very misleading error message. - orig_error_msg = exc.args[0] - error_msg = ( - f"Internal flowmachine server error: could not create query object using query schema. " - f"The original error was: '{orig_error_msg}'" - ) + try: + query_obj = GeographySchema().load( + {"aggregation_unit": aggregation_unit} + ) + except TypeError as exc: + # We need to catch TypeError here, otherwise they propagate up to + # perform_action() and result in a very misleading error message. + orig_error_msg = exc.args[0] + error_msg = ( + f"Internal flowmachine server error: could not create query object using query schema. " + f"The original error was: '{orig_error_msg}'" + ) + return ZMQReply( + status="error", + msg=error_msg, + payload={ + "params": {"aggregation_unit": aggregation_unit}, + "orig_error_msg": orig_error_msg, + }, + ) + except ValidationError as exc: + # The dictionary of marshmallow errors can contain integers as keys, + # which will raise an error when converting to JSON (where the keys + # must be strings). Therefore we transform the keys to strings here. + error_msg = "Parameter validation failed." + validation_error_messages = convert_dict_keys_to_strings(exc.messages) return ZMQReply( - status="error", - msg=error_msg, - payload={"params": action_params, "orig_error_msg": orig_error_msg}, + status="error", msg=error_msg, payload=validation_error_messages ) - except ValidationError as exc: - # The dictionary of marshmallow errors can contain integers as keys, - # which will raise an error when converting to JSON (where the keys - # must be strings). Therefore we transform the keys to strings here. - error_msg = "Parameter validation failed." 
- validation_error_messages = convert_dict_keys_to_strings(exc.messages) - return ZMQReply( - status="error", msg=error_msg, payload=validation_error_messages - ) - # We don't cache the query, because it just selects columns from a - # geography table. If we expose an aggregation unit which relies on another - # query to create the geometry (e.g. grid), we may want to reconsider this - # decision. + # We don't cache the query, because it just selects columns from a + # geography table. If we expose an aggregation unit which relies on another + # query to create the geometry (e.g. grid), we may want to reconsider this + # decision. - # Explicitly project to WGS84 (SRID=4326) to conform with GeoJSON standard - sql = query_obj.geojson_query(crs=4326) - # TODO: put query_run_log back in! - # query_run_log.info("get_geography", **run_log_dict) - payload = {"query_state": QueryState.COMPLETED, "sql": sql} - return ZMQReply(status="success", payload=payload) + sql = query_obj.geojson_sql + # TODO: put query_run_log back in! + # query_run_log.info("get_geography", **run_log_dict) + payload = {"query_state": QueryState.COMPLETED, "sql": sql} + return ZMQReply(status="success", payload=payload) + except Exception as exc: + # If we don't catch exceptions here, the server will die and FlowAPI will hang indefinitely. + error_msg = f"Internal flowmachine server error: '{exc.args[0]}'" + return ZMQReply( + status="error", msg=error_msg, payload={"error_msg": exc.args[0]} + ) def action_handler__get_available_dates() -> ZMQReply: From d3810ac2d4cde6f86c29af03ec724763242996f6 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 12 Jun 2019 12:51:24 +0100 Subject: [PATCH 126/138] Add comment about query_kind parameter --- .../core/server/query_schemas/aggregate_network_objects.py | 1 + .../flowmachine/core/server/query_schemas/daily_location.py | 1 + .../core/server/query_schemas/dfs_metric_total_amount.py | 1 + flowmachine/flowmachine/core/server/query_schemas/dummy_query.py | 1 + flowmachine/flowmachine/core/server/query_schemas/flows.py | 1 + .../core/server/query_schemas/joined_spatial_aggregate.py | 1 + .../core/server/query_schemas/location_event_counts.py | 1 + .../core/server/query_schemas/location_introversion.py | 1 + .../core/server/query_schemas/meaningful_locations.py | 1 + .../flowmachine/core/server/query_schemas/modal_location.py | 1 + .../flowmachine/core/server/query_schemas/radius_of_gyration.py | 1 + .../flowmachine/core/server/query_schemas/spatial_aggregate.py | 1 + .../core/server/query_schemas/total_network_objects.py | 1 + .../core/server/query_schemas/unique_subscriber_counts.py | 1 + 14 files changed, 14 insertions(+) diff --git a/flowmachine/flowmachine/core/server/query_schemas/aggregate_network_objects.py b/flowmachine/flowmachine/core/server/query_schemas/aggregate_network_objects.py index e43d2ad42b..5642c8941d 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/aggregate_network_objects.py +++ b/flowmachine/flowmachine/core/server/query_schemas/aggregate_network_objects.py @@ -20,6 +20,7 @@ class InputToAggregateNetworkObjectsSchema(OneOfSchema): class AggregateNetworkObjectsSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["aggregate_network_objects"])) total_network_objects = fields.Nested( InputToAggregateNetworkObjectsSchema, required=True diff --git a/flowmachine/flowmachine/core/server/query_schemas/daily_location.py 
b/flowmachine/flowmachine/core/server/query_schemas/daily_location.py index ac18997cab..60ffd88b1f 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/daily_location.py +++ b/flowmachine/flowmachine/core/server/query_schemas/daily_location.py @@ -14,6 +14,7 @@ class DailyLocationSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["daily_location"])) date = fields.Date(required=True) method = fields.String(required=True, validate=OneOf(["last", "most-common"])) diff --git a/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py b/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py index 33cb07653b..3a548d1a02 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py +++ b/flowmachine/flowmachine/core/server/query_schemas/dfs_metric_total_amount.py @@ -14,6 +14,7 @@ class DFSTotalMetricAmountSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["dfs_metric_total_amount"])) metric = DFSMetric() start_date = fields.Date(required=True) diff --git a/flowmachine/flowmachine/core/server/query_schemas/dummy_query.py b/flowmachine/flowmachine/core/server/query_schemas/dummy_query.py index 6be74065cc..7128f0d0b3 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/dummy_query.py +++ b/flowmachine/flowmachine/core/server/query_schemas/dummy_query.py @@ -16,6 +16,7 @@ class DummyQuerySchema(Schema): Dummy query useful for testing. """ + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["dummy_query"])) dummy_param = fields.String(required=True) diff --git a/flowmachine/flowmachine/core/server/query_schemas/flows.py b/flowmachine/flowmachine/core/server/query_schemas/flows.py index d271e2cd26..cee63ea02a 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/flows.py +++ b/flowmachine/flowmachine/core/server/query_schemas/flows.py @@ -25,6 +25,7 @@ class InputToFlowsSchema(OneOfSchema): class FlowsSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["flows"])) from_location = fields.Nested(InputToFlowsSchema, required=True) to_location = fields.Nested(InputToFlowsSchema, required=True) diff --git a/flowmachine/flowmachine/core/server/query_schemas/joined_spatial_aggregate.py b/flowmachine/flowmachine/core/server/query_schemas/joined_spatial_aggregate.py index 5f87f72c85..4878cfba95 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/joined_spatial_aggregate.py +++ b/flowmachine/flowmachine/core/server/query_schemas/joined_spatial_aggregate.py @@ -25,6 +25,7 @@ class JoinableMetrics(OneOfSchema): class JoinedSpatialAggregateSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["joined_spatial_aggregate"])) locations = fields.Nested(InputToSpatialAggregate, required=True) metric = fields.Nested(JoinableMetrics, required=True) diff --git a/flowmachine/flowmachine/core/server/query_schemas/location_event_counts.py b/flowmachine/flowmachine/core/server/query_schemas/location_event_counts.py index 51d62d209c..101c17e8be 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/location_event_counts.py +++ b/flowmachine/flowmachine/core/server/query_schemas/location_event_counts.py @@ -14,6 +14,7 @@ class LocationEventCountsSchema(Schema): + # query_kind 
parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["location_event_counts"])) start_date = fields.Date(required=True) end_date = fields.Date(required=True) diff --git a/flowmachine/flowmachine/core/server/query_schemas/location_introversion.py b/flowmachine/flowmachine/core/server/query_schemas/location_introversion.py index 1bb1296306..d559a17079 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/location_introversion.py +++ b/flowmachine/flowmachine/core/server/query_schemas/location_introversion.py @@ -13,6 +13,7 @@ class LocationIntroversionSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["location_introversion"])) start_date = fields.Date(required=True) end_date = fields.Date(required=True) diff --git a/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py b/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py index 3cc962a088..c6b522cbaa 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py +++ b/flowmachine/flowmachine/core/server/query_schemas/meaningful_locations.py @@ -31,6 +31,7 @@ class MeaningfulLocationsAggregateSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["meaningful_locations_aggregate"])) start_date = fields.Date(required=True) end_date = fields.Date(required=True) diff --git a/flowmachine/flowmachine/core/server/query_schemas/modal_location.py b/flowmachine/flowmachine/core/server/query_schemas/modal_location.py index b59f241fdf..05096c2353 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/modal_location.py +++ b/flowmachine/flowmachine/core/server/query_schemas/modal_location.py @@ -18,6 +18,7 @@ class InputToModalLocationSchema(OneOfSchema): class ModalLocationSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["modal_location"])) locations = fields.Nested( InputToModalLocationSchema, many=True, validate=Length(min=1) diff --git a/flowmachine/flowmachine/core/server/query_schemas/radius_of_gyration.py b/flowmachine/flowmachine/core/server/query_schemas/radius_of_gyration.py index 222e2c90a1..15aea5abae 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/radius_of_gyration.py +++ b/flowmachine/flowmachine/core/server/query_schemas/radius_of_gyration.py @@ -13,6 +13,7 @@ class RadiusOfGyrationSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["radius_of_gyration"])) start_date = fields.Date(required=True) end_date = fields.Date(required=True) diff --git a/flowmachine/flowmachine/core/server/query_schemas/spatial_aggregate.py b/flowmachine/flowmachine/core/server/query_schemas/spatial_aggregate.py index 844408473a..28e7c3f88b 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/spatial_aggregate.py +++ b/flowmachine/flowmachine/core/server/query_schemas/spatial_aggregate.py @@ -28,6 +28,7 @@ class InputToSpatialAggregate(OneOfSchema): class SpatialAggregateSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["spatial_aggregate"])) locations = fields.Nested(InputToSpatialAggregate, required=True) diff --git a/flowmachine/flowmachine/core/server/query_schemas/total_network_objects.py b/flowmachine/flowmachine/core/server/query_schemas/total_network_objects.py 
index 83a00e9022..0da302409a 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/total_network_objects.py +++ b/flowmachine/flowmachine/core/server/query_schemas/total_network_objects.py @@ -15,6 +15,7 @@ class TotalNetworkObjectsSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["total_network_objects"])) start_date = fields.Date(required=True) end_date = fields.Date(required=True) diff --git a/flowmachine/flowmachine/core/server/query_schemas/unique_subscriber_counts.py b/flowmachine/flowmachine/core/server/query_schemas/unique_subscriber_counts.py index 0a4d00a585..7827bc5cff 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/core/server/query_schemas/unique_subscriber_counts.py @@ -13,6 +13,7 @@ class UniqueSubscriberCountsSchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["unique_subscriber_counts"])) start_date = fields.Date(required=True) end_date = fields.Date(required=True) From 45b6a108a35c23fd4cdcce787c240be71611b6da Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 12 Jun 2019 12:52:21 +0100 Subject: [PATCH 127/138] Add geojson_sql property --- .../flowmachine/core/server/query_schemas/geography.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/flowmachine/flowmachine/core/server/query_schemas/geography.py b/flowmachine/flowmachine/core/server/query_schemas/geography.py index 83423670a2..80dc17cb1f 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/geography.py +++ b/flowmachine/flowmachine/core/server/query_schemas/geography.py @@ -13,6 +13,7 @@ class GeographySchema(Schema): + # query_kind parameter is required here for claims validation query_kind = fields.String(validate=OneOf(["geography"])) aggregation_unit = AggregationUnit() @@ -37,3 +38,12 @@ def _flowmachine_query_obj(self): Query """ return Geography(get_spatial_unit_obj(self.aggregation_unit)) + + @property + def geojson_sql(self): + """ + Return a SQL string for getting the geography as GeoJSON. + """ + # Explicitly project to WGS84 (SRID=4326) to conform with GeoJSON standard + sql = self._flowmachine_query_obj.geojson_query(crs=4326) + return sql From b0c19b0a707cd6aac67e6272e5d2f102fc692190 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 12 Jun 2019 12:52:59 +0100 Subject: [PATCH 128/138] Docstring and consistent gid --- .../core/server/query_schemas/aggregation_unit.py | 4 ++++ flowmachine/flowmachine/features/spatial/geography.py | 8 +------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py b/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py index 3f8afab95a..567b2d2995 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py +++ b/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py @@ -1,6 +1,10 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +""" +Definition of a custom marshmallow field for aggregation units, and function +for getting the corresponding SpatialUnit object. 
+""" from flowmachine.core import make_spatial_unit diff --git a/flowmachine/flowmachine/features/spatial/geography.py b/flowmachine/flowmachine/features/spatial/geography.py index 0b395c1a89..59411c2192 100644 --- a/flowmachine/flowmachine/features/spatial/geography.py +++ b/flowmachine/flowmachine/features/spatial/geography.py @@ -39,14 +39,8 @@ def column_names(self) -> List[str]: return self.spatial_unit.location_id_columns + ["geom"] def _geo_augmented_query(self): - locid_columns = self.spatial_unit.location_id_columns - if len(locid_columns) == 1: - gid = f"{locid_columns[0]} AS gid" - else: - gid = "row_number() over() AS gid" - sql = f""" - SELECT {gid}, * + SELECT row_number() over() AS gid, * FROM ({self.get_query()}) AS Q """ From ee3f00b839d866cd5974a4115d4f8fb9652d2f52 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 12 Jun 2019 13:38:04 +0100 Subject: [PATCH 129/138] Change one more 'level' to 'spatial_unit' --- docs/cache_queries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cache_queries.py b/docs/cache_queries.py index df4e013673..c3cee6cd93 100644 --- a/docs/cache_queries.py +++ b/docs/cache_queries.py @@ -168,7 +168,7 @@ ), label=label, ), - level="admin3", + spatial_unit=admin3_spatial_unit, ) for label in ["home", "work"] ] + [ From 415208f4bf9c4bd2a847ab0dc738e75d229521d2 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 12 Jun 2019 17:48:01 +0100 Subject: [PATCH 130/138] Fix cache_queries.py --- docs/cache_queries.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/cache_queries.py b/docs/cache_queries.py index c3cee6cd93..2147c436f0 100644 --- a/docs/cache_queries.py +++ b/docs/cache_queries.py @@ -18,8 +18,8 @@ admin1_spatial_unit = flowmachine.core.make_spatial_unit("admin", level=1) admin3_spatial_unit = flowmachine.core.make_spatial_unit("admin", level=3) -versioned_site_spatial_unit = flowmachine.core.make_spatial_unit("versioned-site") -versioned_cell_spatial_unit = flowmachine.core.make_spatial_unit("versioned-cell") +vsite_spatial_unit = flowmachine.core.make_spatial_unit("versioned-site") +vcell_spatial_unit = flowmachine.core.make_spatial_unit("versioned-cell") # FlowClient example usage example_usage_queries = [ @@ -148,10 +148,10 @@ meaningful_locations=flowmachine.features.MeaningfulLocations( clusters=flowmachine.features.HartiganCluster( calldays=flowmachine.features.CallDays( - subscriber_locations=flowmachine.features.subscriber_locations( + subscriber_locations=flowmachine.features.SubscriberLocations( start="2016-01-01", stop="2016-01-07", - spatial_unit=versioned_site_spatial_unit, + spatial_unit=vsite_spatial_unit, ) ), radius=1.0, @@ -164,7 +164,7 @@ stop="2016-01-07", score_hour=hour_scores, score_dow=day_scores, - spatial_unit=versioned_site_spatial_unit, + spatial_unit=vsite_spatial_unit, ), label=label, ), @@ -176,10 +176,10 @@ meaningful_locations_a=flowmachine.features.MeaningfulLocations( clusters=flowmachine.features.HartiganCluster( calldays=flowmachine.features.CallDays( - subscriber_locations=flowmachine.features.subscriber_locations( + subscriber_locations=flowmachine.features.SubscriberLocations( start="2016-01-01", stop="2016-01-07", - spatial_unit=versioned_site_spatial_unit, + spatial_unit=vsite_spatial_unit, ) ), radius=1.0, @@ -192,17 +192,17 @@ stop="2016-01-07", score_hour=hour_scores, score_dow=day_scores, - spatial_unit=versioned_site_spatial_unit, + spatial_unit=vsite_spatial_unit, ), label="home", ), 
meaningful_locations_b=flowmachine.features.MeaningfulLocations( clusters=flowmachine.features.HartiganCluster( calldays=flowmachine.features.CallDays( - subscriber_locations=flowmachine.features.subscriber_locations( + subscriber_locations=flowmachine.features.SubscriberLocations( start="2016-01-01", stop="2016-01-07", - spatial_unit=versioned_site_spatial_unit, + spatial_unit=vsite_spatial_unit, ) ), radius=1.0, @@ -215,7 +215,7 @@ stop="2016-01-07", score_hour=hour_scores, score_dow=day_scores, - spatial_unit=versioned_site_spatial_unit, + spatial_unit=vsite_spatial_unit, ), label="work", ), @@ -229,7 +229,7 @@ start="2016-01-01", stop="2016-01-07", table="events.mds", - spatial_unit=versioned_cell_spatial_unit, + spatial_unit=vcell_spatial_unit, interval="hour", ) ] From d846eee7bb16686d1e880ce5accd3c98fa8325a1 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 27 Jun 2019 10:20:52 +0100 Subject: [PATCH 131/138] Update UniqueLocationCountsExposed to pass a spatial_unit parameter --- .../core/server/query_schemas/unique_location_counts.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/core/server/query_schemas/unique_location_counts.py b/flowmachine/flowmachine/core/server/query_schemas/unique_location_counts.py index 6d967765b2..b08cdcc870 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/unique_location_counts.py +++ b/flowmachine/flowmachine/core/server/query_schemas/unique_location_counts.py @@ -7,7 +7,8 @@ from flowmachine.features import UniqueLocationCounts from .base_exposed_query import BaseExposedQuery -from .custom_fields import SubscriberSubset, AggregationUnit +from .custom_fields import SubscriberSubset +from .aggregation_unit import AggregationUnit, get_spatial_unit_obj __all__ = ["UniqueLocationCountsSchema", "UniqueLocationCountsExposed"] @@ -47,6 +48,6 @@ def _flowmachine_query_obj(self): return UniqueLocationCounts( start=self.start_date, stop=self.end_date, - level=self.aggregation_unit, + spatial_unit=get_spatial_unit_obj(self.aggregation_unit), subscriber_subset=self.subscriber_subset, ) From d4439c5a5ef022bb32e009286167c61c07ba1438 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 27 Jun 2019 17:07:33 +0100 Subject: [PATCH 132/138] Add test for AggregateNetworkObjects --- flowmachine/tests/test_total_network_objects.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/flowmachine/tests/test_total_network_objects.py b/flowmachine/tests/test_total_network_objects.py index 976d03ee31..b06cc7c8ab 100644 --- a/flowmachine/tests/test_total_network_objects.py +++ b/flowmachine/tests/test_total_network_objects.py @@ -106,6 +106,17 @@ def test_bad_spatial_units(bad_arg, spatial_unit_type): ) +def test_bad_aggregate_by(): + """Test that invalid 'aggregate_by' param raises value error.""" + with pytest.raises(ValueError): + AggregateNetworkObjects( + total_network_objects=TotalNetworkObjects( + start="2016-01-01", stop="2016-12-30", table="calls" + ), + aggregate_by="BAD_AGGREGATE_BY", + ) + + def test_bad_statistic(): """Test that invalid stat for aggregate raises value error.""" with pytest.raises(ValueError): From 4e6fc2e9e47138a1ebd35cd666a9a4d5c0adfa41 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 27 Jun 2019 17:50:28 +0100 Subject: [PATCH 133/138] Add subscriber_location_cluster tests --- .../tests/test_subscriber_location_cluster.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/flowmachine/tests/test_subscriber_location_cluster.py 
b/flowmachine/tests/test_subscriber_location_cluster.py index 360224abc0..5929c4a4ac 100644 --- a/flowmachine/tests/test_subscriber_location_cluster.py +++ b/flowmachine/tests/test_subscriber_location_cluster.py @@ -295,6 +295,20 @@ def test_unlisted_methods_raises_error(): ) +def test_bad_subscriber_identifier_raises_error(): + """ + Test that passing an invalid subscriber_identifier raises an error. + """ + with pytest.raises(ValueError): + subscriber_location_cluster( + method="hartigan", + start="2016-01-01", + stop="2016-01-04", + radius=1, + subscriber_identifier="BAD_SUBSCRIBER_ID", + ) + + def test_lack_of_radius_with_hartigan_raises_error(): """ Test whether not passing a radius raises when choosing `hartigan` as a method raises an error @@ -312,3 +326,33 @@ def test_subscriber_location_clusters_defaults(): ) assert 0 == clus.buffer assert 0 == clus.call_threshold + + +def test_hartigan_cluster_bad_calldays_column_names_raises_error(): + """ + Test that using calldays without 'site_id' and 'version' columns raises an error. + """ + cd = CallDays( + SubscriberLocations( + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("lon-lat") + ) + ) + with pytest.raises(ValueError): + HartiganCluster(calldays=cd, radius=50) + + +def test_joined_hartigan_cluster_bad_query_column_names_raises_error(): + """ + Test that joining a HartiganCluster to a query without 'site_id' and 'version' columns raises an error. + """ + cd = CallDays( + SubscriberLocations( + "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") + ) + ) + HartiganCluster(calldays=cd, radius=50) + es = EventScore( + start="2016-01-01", stop="2016-01-04", spatial_unit=make_spatial_unit("lon-lat") + ) + with pytest.raises(ValueError): + hartigan.join_to_cluster_components(es) From 900c0d6b4dadd942e090140c02d319e82d6d9da1 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 27 Jun 2019 18:06:43 +0100 Subject: [PATCH 134/138] Add tests for Displacement --- flowmachine/tests/test_displacement.py | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/flowmachine/tests/test_displacement.py b/flowmachine/tests/test_displacement.py index 011120468a..dfd30fcfb0 100644 --- a/flowmachine/tests/test_displacement.py +++ b/flowmachine/tests/test_displacement.py @@ -33,6 +33,18 @@ def test_returns_expected_values(stat, sub_a_expected, sub_b_expected, get_dataf assert df.loc[sub_b_id].statistic == pytest.approx(sub_b_expected) +def test_returns_expected_result_for_unit_m(get_dataframe): + """ + Test that we get expected results when unit='m'. + """ + sub_a_id, sub_b_id = "j6QYNbMJgAwlVORP", "NG1km5NzBg5JD8nj" + df = get_dataframe( + Displacement("2016-01-01", "2016-01-07", statistic="max", unit="m") + ).set_index("subscriber") + assert df.loc[sub_a_id].statistic == pytest.approx(500809.349) + assert df.loc[sub_b_id].statistic == pytest.approx(387024.628) + + def test_min_displacement_zero(get_dataframe): """ When time period for diplacement and home location are the same min displacement @@ -79,6 +91,23 @@ def test_error_when_modal_location_not_lon_lat(): Displacement("2016-01-01", "2016-01-02", modal_locations=ml, statistic="avg") +def test_error_when_not_modal_location(): + """ + Test that error is raised if modal_locations is not a ModalLocation. 
+ """ + dl = daily_location("2016-01-01", spatial_unit=make_spatial_unit("lon-lat")) + with pytest.raises(ValueError): + Displacement("2016-01-01", "2016-01-02", modal_locations=dl, statistic="avg") + + +def test_invalid_statistic_raises_error(): + """ + Test that passing an invalid statistic raises an error. + """ + with pytest.raises(ValueError): + Displacement("2016-01-01", "2016-01-07", statistic="BAD_STATISTIC") + + def test_get_all_users_in_modal_location(get_dataframe): """ This tests that diplacement values are returned for all subscribers From edf6c37597108bbc20bfcb57c4466a174cb1a1a4 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 27 Jun 2019 18:27:50 +0100 Subject: [PATCH 135/138] Fix Hartigan test --- flowmachine/tests/test_subscriber_location_cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/tests/test_subscriber_location_cluster.py b/flowmachine/tests/test_subscriber_location_cluster.py index 5929c4a4ac..055fced2a9 100644 --- a/flowmachine/tests/test_subscriber_location_cluster.py +++ b/flowmachine/tests/test_subscriber_location_cluster.py @@ -350,7 +350,7 @@ def test_joined_hartigan_cluster_bad_query_column_names_raises_error(): "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site") ) ) - HartiganCluster(calldays=cd, radius=50) + hartigan = HartiganCluster(calldays=cd, radius=50) es = EventScore( start="2016-01-01", stop="2016-01-04", spatial_unit=make_spatial_unit("lon-lat") ) From 1848050d6c6da3bf4795d7d5da9878d6260ed9ec Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 28 Jun 2019 12:08:33 +0100 Subject: [PATCH 136/138] Add type annotations --- .../flowmachine/core/join_to_location.py | 12 ++- .../server/query_schemas/aggregation_unit.py | 5 +- flowmachine/flowmachine/core/spatial_unit.py | 85 +++++++++++-------- .../location/location_introversion.py | 13 +-- .../features/location/total_events.py | 3 +- .../location/unique_subscriber_counts.py | 5 +- .../features/network/total_network_objects.py | 7 +- .../features/spatial/distance_matrix.py | 9 +- .../flowmachine/features/spatial/geography.py | 3 +- .../features/subscriber/daily_location.py | 6 +- .../features/subscriber/entropy.py | 3 +- .../features/subscriber/first_location.py | 3 +- .../features/subscriber/last_location.py | 5 +- .../subscriber/meaningful_locations.py | 10 ++- .../subscriber/most_frequent_location.py | 5 +- .../subscriber/per_location_event_stats.py | 3 +- .../flowmachine/features/subscriber/scores.py | 5 +- .../subscriber/subscriber_call_durations.py | 7 +- .../subscriber/unique_location_counts.py | 3 +- .../flowmachine/features/utilities/sets.py | 5 +- .../utilities/subscriber_locations.py | 3 +- flowmachine/flowmachine/models/pwo.py | 18 ++-- flowmachine/flowmachine/utils.py | 4 +- 23 files changed, 133 insertions(+), 89 deletions(-) diff --git a/flowmachine/flowmachine/core/join_to_location.py b/flowmachine/flowmachine/core/join_to_location.py index 29baa1022c..5dade56426 100644 --- a/flowmachine/flowmachine/core/join_to_location.py +++ b/flowmachine/flowmachine/core/join_to_location.py @@ -12,10 +12,10 @@ to return a JoinToLocation object if a join is required, or the original query object otherwise. 
""" -from typing import List +from typing import List, Union from .query import Query -from .spatial_unit import SpatialUnitMixin +from .spatial_unit import SpatialUnitMixin, AnySpatialUnit, GeomSpatialUnit from .errors import InvalidSpatialUnitError @@ -51,7 +51,9 @@ class JoinToLocation(Query): """ - def __init__(self, left, *, spatial_unit, time_col="time"): + def __init__( + self, left: Query, *, spatial_unit: GeomSpatialUnit, time_col: str = "time" + ): # No need to join if spatial_unit has no geography information (i.e. just cell ID) spatial_unit.verify_criterion("has_geography") self.spatial_unit = spatial_unit @@ -108,7 +110,9 @@ def _make_query(self): return sql -def location_joined_query(left, *, spatial_unit, time_col="time"): +def location_joined_query( + left: Query, *, spatial_unit: AnySpatialUnit, time_col: str = "time" +): """ Helper function which returns JoinToLocation(left_query, spatial_unit, time_col) if spatial_unit has geography information, otherwise returns left_query. diff --git a/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py b/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py index 567b2d2995..abfb01ed50 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py +++ b/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py @@ -6,7 +6,8 @@ for getting the corresponding SpatialUnit object. """ -from flowmachine.core import make_spatial_unit +from flowmachine.core import make_ +from flowmachine.core.spatial_unit import GeomSpatialUnit from marshmallow.fields import String from marshmallow.validate import OneOf @@ -22,7 +23,7 @@ def __init__(self, required=True, **kwargs): super().__init__(required=required, validate=validate, **kwargs) -def get_spatial_unit_obj(aggregation_unit_string): +def get_spatial_unit_obj(aggregation_unit_string) -> GeomSpatialUnit: """ Given an aggregation unit string (as validated by AggregationUnit()), return a FlowMachine spatial unit object. diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index f12e241773..529a3484a1 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -7,7 +7,7 @@ The helper function 'make_spatial_unit' can be used to create spatial unit objects. """ -from typing import List +from typing import Union, List, Iterable, Optional from flowmachine.utils import get_name_and_alias from flowmachine.core.errors import InvalidSpatialUnitError @@ -49,21 +49,21 @@ def location_id_columns(self) -> List[str]: return list(self._locid_cols) @property - def has_geography(self): + def has_geography(self) -> bool: """ True if spatial unit has geography information. """ return hasattr(self, "get_geom_query") @property - def has_lon_lat_columns(self): + def has_lon_lat_columns(self) -> bool: """ True if spatial unit has lon/lat columns. """ return "lon" in self.location_id_columns and "lat" in self.location_id_columns @property - def is_network_object(self): + def is_network_object(self) -> bool: """ True if spatial unit is a network object (cell or site). """ @@ -73,13 +73,13 @@ def is_network_object(self): ) @property - def is_polygon(self): + def is_polygon(self) -> bool: """ True if spatial unit's geographies are polygons. 
""" return isinstance(self, PolygonSpatialUnit) - def verify_criterion(self, criterion, negate=False): + def verify_criterion(self, criterion, negate=False) -> None: """ Check whether this spatial unit meets a criterion, and raise an InvalidSpatialUnitError if not. @@ -129,7 +129,7 @@ def verify_criterion(self, criterion, negate=False): + criteria[criterion]["message"] ) - def location_subset_clause(self, locations, check_column_names=True): + def location_subset_clause(self, locations, check_column_names: bool = True) -> str: """ Return a SQL "WHERE" clause to subset a query (joined to this spatial unit) to a location or set of locations. @@ -252,12 +252,12 @@ class GeomSpatialUnit(SpatialUnitMixin, Query): def __init__( self, *, - geom_table_column_names, - location_id_column_names, - geom_table=None, - geom_column="geom", - geom_table_join_on=None, - location_table_join_on=None, + geom_table_column_names: Union[str, Iterable[str]], + location_id_column_names: Union[str, Iterable[str]], + geom_table: Optional[Union[Query, str]] = None, + geom_column: str = "geom", + geom_table_join_on: Optional[str] = None, + location_table_join_on: Optional[str] = None, ): if isinstance(geom_table_column_names, str): self._geom_table_cols = (geom_table_column_names,) @@ -302,10 +302,10 @@ def __hash__(self): # Must define this because we explicitly define self.__eq__ return hash(self.md5) - def _get_aliased_geom_table_cols(self, table_alias): + def _get_aliased_geom_table_cols(self, table_alias: str) -> List[str]: return [f"{table_alias}.{c}" for c in self._geom_table_cols] - def _join_clause(self, loc_table_alias, geom_table_alias): + def _join_clause(self, loc_table_alias: str, geom_table_alias: str) -> str: """ Returns a SQL join clause to join the location table to the geography table. 
The join clause is not used if self.geom_table and @@ -389,7 +389,7 @@ def column_names(self) -> List[str]: cols += geom_table_cols return cols - def get_geom_query(self): + def get_geom_query(self) -> str: """ Returns a SQL query which can be used to map locations (identified by the values in self.location_id_columns) to their geometries (in a column @@ -450,12 +450,12 @@ class LonLatSpatialUnit(GeomSpatialUnit): def __init__( self, *, - geom_table_column_names=(), - location_id_column_names=(), - geom_table=None, - geom_column="geom_point", - geom_table_join_on=None, - location_table_join_on=None, + geom_table_column_names: Union[str, Iterable[str]] = (), + location_id_column_names: Union[str, Iterable[str]] = (), + geom_table: Optional[Union[Query, str]] = None, + geom_column: str = "geom_point", + geom_table_join_on: Optional[str] = None, + location_table_join_on: Optional[str] = None, ): super().__init__( geom_table_column_names=geom_table_column_names, @@ -466,7 +466,7 @@ def __init__( location_table_join_on=location_table_join_on, ) - def _get_aliased_geom_table_cols(self, table_alias): + def _get_aliased_geom_table_cols(self, table_alias: str) -> List[str]: return super()._get_aliased_geom_table_cols(table_alias) + [ f"ST_X({table_alias}.{self._geom_col}::geometry) AS lon", f"ST_Y({table_alias}.{self._geom_col}::geometry) AS lat", @@ -479,7 +479,7 @@ def location_id_columns(self) -> List[str]: """ return list(self._locid_cols) + ["lon", "lat"] - def location_subset_clause(self, locations, check_column_names=True): + def location_subset_clause(self, locations, check_column_names: bool = True) -> str: """ Return a SQL "WHERE" clause to subset a query (joined to this spatial unit) to a location or set of locations. This method differs from the @@ -591,7 +591,13 @@ class PolygonSpatialUnit(GeomSpatialUnit): Name of the column in geom_table that defines the geography. """ - def __init__(self, *, geom_table_column_names, geom_table, geom_column="geom"): + def __init__( + self, + *, + geom_table_column_names: Union[str, Iterable[str]], + geom_table: Union[Query, str], + geom_column: str = "geom", + ): if isinstance(geom_table_column_names, str): location_id_column_names = get_name_and_alias(geom_table_column_names)[1] else: @@ -605,7 +611,7 @@ def __init__(self, *, geom_table_column_names, geom_table, geom_column="geom"): geom_column=geom_column, ) - def _join_clause(self, loc_table_alias, geom_table_alias): + def _join_clause(self, loc_table_alias: str, geom_table_alias: str) -> str: return f""" INNER JOIN ({self.geom_table.get_query()}) AS {geom_table_alias} @@ -616,7 +622,7 @@ def _join_clause(self, loc_table_alias, geom_table_alias): """ -def versioned_cell_spatial_unit(): +def versioned_cell_spatial_unit() -> LonLatSpatialUnit: """ Returns a LonLatSpatialUnit that maps cell location_id to a cell version and lon-lat coordinates. @@ -635,7 +641,7 @@ def versioned_cell_spatial_unit(): ) -def versioned_site_spatial_unit(): +def versioned_site_spatial_unit() -> LonLatSpatialUnit: """ Returns a LonLatSpatialUnit that maps cell location_id to a site version and lon-lat coordinates. @@ -658,7 +664,9 @@ def versioned_site_spatial_unit(): ) -def admin_spatial_unit(*, level, region_id_column_name=None): +def admin_spatial_unit( + *, level: int, region_id_column_name: Optional[str] = None +) -> PolygonSpatialUnit: """ Returns a PolygonSpatialUnit object that maps all cells (aka sites) to an admin region. 
This assumes that you have geography data in the standard @@ -692,7 +700,7 @@ def admin_spatial_unit(*, level, region_id_column_name=None): return PolygonSpatialUnit(geom_table_column_names=col_name, geom_table=table) -def grid_spatial_unit(*, size): +def grid_spatial_unit(*, size: Union[float, int]) -> PolygonSpatialUnit: """ Returns a PolygonSpatialUnit that maps all the sites in the database to a grid of arbitrary size. @@ -714,15 +722,18 @@ def grid_spatial_unit(*, size): ) +AnySpatialUnit = Union[CellSpatialUnit, GeomSpatialUnit] + + def make_spatial_unit( - spatial_unit_type, + spatial_unit_type: str, *, - level=None, - region_id_column_name=None, - size=None, - geom_table=None, - geom_column="geom", -): + level: Optional[int] = None, + region_id_column_name: Optional[Union[str, Iterable[str]]] = None, + size: Union[float, int] = None, + geom_table: Optional[Union[Query, str]] = None, + geom_column: str = "geom", +) -> Union[CellSpatialUnit, GeomSpatialUnit]: """ Helper function to create an object representing a spatial unit. diff --git a/flowmachine/flowmachine/features/location/location_introversion.py b/flowmachine/flowmachine/features/location/location_introversion.py index 5daa44da1b..9a2f88541f 100644 --- a/flowmachine/flowmachine/features/location/location_introversion.py +++ b/flowmachine/flowmachine/features/location/location_introversion.py @@ -4,7 +4,7 @@ # -*- coding: utf-8 -*- -from typing import List +from typing import List, Union """ Location introversion [1]_ calculates the proportion @@ -24,6 +24,7 @@ from ...core import location_joined_query, make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit from ..utilities import EventsTablesUnion @@ -71,12 +72,12 @@ class LocationIntroversion(GeoDataMixin, Query): def __init__( self, - start, - stop, + start: str, + stop: str, *, - table="all", - spatial_unit=make_spatial_unit("cell"), - direction="both", + table: str = "all", + spatial_unit: AnySpatialUnit = make_spatial_unit("cell"), + direction: str = "both", hours="all", subscriber_subset=None, subscriber_identifier="msisdn", diff --git a/flowmachine/flowmachine/features/location/total_events.py b/flowmachine/flowmachine/features/location/total_events.py index 75f7680192..460905a2d1 100644 --- a/flowmachine/flowmachine/features/location/total_events.py +++ b/flowmachine/flowmachine/features/location/total_events.py @@ -14,6 +14,7 @@ """ from ...core import location_joined_query, make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit from ..utilities import EventsTablesUnion from ...core import Query @@ -54,7 +55,7 @@ def __init__( stop: str, *, table: Union[None, List[str]] = None, - spatial_unit=make_spatial_unit("cell"), + spatial_unit: AnySpatialUnit = make_spatial_unit("cell"), interval: str = "hour", direction: str = "both", hours="all", diff --git a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py index 350789ece6..06d518664e 100644 --- a/flowmachine/flowmachine/features/location/unique_subscriber_counts.py +++ b/flowmachine/flowmachine/features/location/unique_subscriber_counts.py @@ -4,7 +4,7 @@ # -*- coding: utf-8 -*- -from typing import List +from typing import List, Union """ Class for UniqueSubscriberCounts. 
UniqueSubscriberCounts counts @@ -18,6 +18,7 @@ from ...core.query import Query from ...core.mixins import GeoDataMixin from ...core import make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit from ..utilities.subscriber_locations import SubscriberLocations @@ -74,7 +75,7 @@ def __init__( self, start, stop, - spatial_unit=make_spatial_unit("cell"), + spatial_unit: AnySpatialUnit = make_spatial_unit("cell"), hours="all", table="all", ): diff --git a/flowmachine/flowmachine/features/network/total_network_objects.py b/flowmachine/flowmachine/features/network/total_network_objects.py index 816bc6db21..e4bed2fba6 100644 --- a/flowmachine/flowmachine/features/network/total_network_objects.py +++ b/flowmachine/flowmachine/features/network/total_network_objects.py @@ -11,10 +11,11 @@ """ -from typing import List +from typing import List, Optional from ...core.mixins import GeoDataMixin from ...core import location_joined_query, make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit from ...core.query import Query from ..utilities import EventsTablesUnion @@ -68,8 +69,8 @@ def __init__( *, table="all", total_by="day", - network_object=make_spatial_unit("cell"), - spatial_unit=None, + network_object: AnySpatialUnit = make_spatial_unit("cell"), + spatial_unit: Optional[AnySpatialUnit] = None, hours="all", subscriber_subset=None, subscriber_identifier="msisdn", diff --git a/flowmachine/flowmachine/features/spatial/distance_matrix.py b/flowmachine/flowmachine/features/spatial/distance_matrix.py index 4606d21c13..8ba1995318 100644 --- a/flowmachine/flowmachine/features/spatial/distance_matrix.py +++ b/flowmachine/flowmachine/features/spatial/distance_matrix.py @@ -7,11 +7,12 @@ matrix from a given point collection. """ -from typing import List +from typing import List, Optional from ...core.query import Query from ...core.mixins import GraphMixin from ...core import make_spatial_unit +from ...core.spatial_unit import LonLatSpatialUnit class DistanceMatrix(GraphMixin, Query): @@ -37,7 +38,11 @@ class DistanceMatrix(GraphMixin, Query): """ - def __init__(self, spatial_unit=None, return_geometry=False): + def __init__( + self, + spatial_unit: Optional[LonLatSpatialUnit] = None, + return_geometry: bool = False, + ): if spatial_unit is None: self.spatial_unit = make_spatial_unit("versioned-cell") else: diff --git a/flowmachine/flowmachine/features/spatial/geography.py b/flowmachine/flowmachine/features/spatial/geography.py index 59411c2192..499e466502 100644 --- a/flowmachine/flowmachine/features/spatial/geography.py +++ b/flowmachine/flowmachine/features/spatial/geography.py @@ -11,6 +11,7 @@ from ...core.query import Query from ...core.mixins import GeoDataMixin +from ...core.spatial_unit import GeomSpatialUnit class Geography(GeoDataMixin, Query): @@ -29,7 +30,7 @@ class Geography(GeoDataMixin, Query): make_spatial_unit for more information. 
""" - def __init__(self, spatial_unit): + def __init__(self, spatial_unit: GeomSpatialUnit): spatial_unit.verify_criterion("has_geography") self.spatial_unit = spatial_unit super().__init__() diff --git a/flowmachine/flowmachine/features/subscriber/daily_location.py b/flowmachine/flowmachine/features/subscriber/daily_location.py index 64503c47a2..a80cdf9afd 100644 --- a/flowmachine/flowmachine/features/subscriber/daily_location.py +++ b/flowmachine/flowmachine/features/subscriber/daily_location.py @@ -12,8 +12,10 @@ """ import datetime +from typing import Optional from ...core import make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit from .last_location import LastLocation from .most_frequent_location import MostFrequentLocation @@ -21,7 +23,7 @@ def locate_subscribers( start, stop, - spatial_unit=None, + spatial_unit: Optional[AnySpatialUnit] = None, hours="all", method="last", table="all", @@ -130,7 +132,7 @@ def daily_location( date, stop=None, *, - spatial_unit=None, + spatial_unit: Optional[AnySpatialUnit] = None, hours="all", method="last", table="all", diff --git a/flowmachine/flowmachine/features/subscriber/entropy.py b/flowmachine/flowmachine/features/subscriber/entropy.py index 14f7fe1c8b..8985c4a199 100644 --- a/flowmachine/flowmachine/features/subscriber/entropy.py +++ b/flowmachine/flowmachine/features/subscriber/entropy.py @@ -16,6 +16,7 @@ from ..utilities.sets import EventsTablesUnion from ..utilities.subscriber_locations import SubscriberLocations from flowmachine.core import make_spatial_unit +from flowmachine.core.spatial_unit import AnySpatialUnit class BaseEntropy(SubscriberFeature, metaclass=ABCMeta): @@ -257,7 +258,7 @@ def __init__( start, stop, *, - spatial_unit=make_spatial_unit("cell"), + spatial_unit: AnySpatialUnit = make_spatial_unit("cell"), subscriber_identifier="msisdn", hours="all", subscriber_subset=None, diff --git a/flowmachine/flowmachine/features/subscriber/first_location.py b/flowmachine/flowmachine/features/subscriber/first_location.py index dba4a6c826..f18b2150a1 100644 --- a/flowmachine/flowmachine/features/subscriber/first_location.py +++ b/flowmachine/flowmachine/features/subscriber/first_location.py @@ -13,6 +13,7 @@ from typing import List from flowmachine.core import make_spatial_unit +from flowmachine.core.spatial_unit import AnySpatialUnit from .metaclasses import SubscriberFeature from ..utilities.subscriber_locations import SubscriberLocations @@ -64,7 +65,7 @@ def __init__( stop, *, location, - spatial_unit=make_spatial_unit("cell"), + spatial_unit: AnySpatialUnit = make_spatial_unit("cell"), hours="all", table="all", subscriber_identifier="msisdn", diff --git a/flowmachine/flowmachine/features/subscriber/last_location.py b/flowmachine/flowmachine/features/subscriber/last_location.py index 7542cf3d6b..df87460335 100644 --- a/flowmachine/flowmachine/features/subscriber/last_location.py +++ b/flowmachine/flowmachine/features/subscriber/last_location.py @@ -10,9 +10,10 @@ """ -from typing import List +from typing import List, Optional from flowmachine.core import Query, make_spatial_unit +from flowmachine.core.spatial_unit import AnySpatialUnit from ..utilities.subscriber_locations import BaseLocation from ..utilities.subscriber_locations import SubscriberLocations @@ -68,7 +69,7 @@ def __init__( self, start, stop, - spatial_unit=None, + spatial_unit: Optional[AnySpatialUnit] = None, hours="all", table="all", subscriber_identifier="msisdn", diff --git a/flowmachine/flowmachine/features/subscriber/meaningful_locations.py 
b/flowmachine/flowmachine/features/subscriber/meaningful_locations.py index 904ddcaf31..e79f2e94b0 100644 --- a/flowmachine/flowmachine/features/subscriber/meaningful_locations.py +++ b/flowmachine/flowmachine/features/subscriber/meaningful_locations.py @@ -2,9 +2,10 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -from typing import Dict, Any, List, Union +from typing import Dict, Any, List, Union, Optional from ...core import GeoTable, Query, Grid, make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit from . import LabelEventScore, HartiganCluster, EventScore @@ -81,7 +82,10 @@ class MeaningfulLocationsAggregate(Query): """ def __init__( - self, *, meaningful_locations: MeaningfulLocations, spatial_unit=None + self, + *, + meaningful_locations: MeaningfulLocations, + spatial_unit: Optional[AnySpatialUnit] = None, ) -> None: self.meaningful_locations = meaningful_locations if spatial_unit is None: @@ -130,7 +134,7 @@ def __init__( *, meaningful_locations_a: MeaningfulLocations, meaningful_locations_b: MeaningfulLocations, - spatial_unit=None, + spatial_unit: Optional[AnySpatialUnit] = None, ) -> None: self.flow = meaningful_locations_a.join( meaningful_locations_b, diff --git a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py index 34d193bd15..656564c4e6 100644 --- a/flowmachine/flowmachine/features/subscriber/most_frequent_location.py +++ b/flowmachine/flowmachine/features/subscriber/most_frequent_location.py @@ -9,9 +9,10 @@ """ -from typing import List +from typing import List, Optional from flowmachine.core import Query, make_spatial_unit +from flowmachine.core.spatial_unit import AnySpatialUnit from ..utilities.subscriber_locations import BaseLocation, SubscriberLocations @@ -66,7 +67,7 @@ def __init__( self, start, stop, - spatial_unit=None, + spatial_unit: Optional[AnySpatialUnit] = None, hours="all", table="all", subscriber_identifier="msisdn", diff --git a/flowmachine/flowmachine/features/subscriber/per_location_event_stats.py b/flowmachine/flowmachine/features/subscriber/per_location_event_stats.py index 564434d15b..4a2608490d 100644 --- a/flowmachine/flowmachine/features/subscriber/per_location_event_stats.py +++ b/flowmachine/flowmachine/features/subscriber/per_location_event_stats.py @@ -7,6 +7,7 @@ from typing import List from ...core import location_joined_query, make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit from ..utilities.sets import EventsTablesUnion from .metaclasses import SubscriberFeature @@ -67,7 +68,7 @@ def __init__( stop, statistic="avg", *, - spatial_unit=make_spatial_unit("cell"), + spatial_unit: AnySpatialUnit = make_spatial_unit("cell"), hours="all", tables="all", subscriber_identifier="msisdn", diff --git a/flowmachine/flowmachine/features/subscriber/scores.py b/flowmachine/flowmachine/features/subscriber/scores.py index d4433608f9..465e7b3256 100644 --- a/flowmachine/flowmachine/features/subscriber/scores.py +++ b/flowmachine/flowmachine/features/subscriber/scores.py @@ -8,12 +8,13 @@ on a scoring dictionary. 
""" -from typing import Dict, Union, Tuple +from typing import Dict, Union, Tuple, Optional from typing import List from ..utilities import EventsTablesUnion from ...core import Query, location_joined_query, make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit class EventScore(Query): @@ -78,7 +79,7 @@ def __init__( *, start: str, stop: str, - spatial_unit=None, + spatial_unit: Optional[AnySpatialUnit] = None, hours: Union[str, Tuple[int, int]] = "all", table: Union[str, List[str]] = "all", score_hour: List[float] = [ diff --git a/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py b/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py index 1a5cb64798..2ad8af3940 100644 --- a/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py +++ b/flowmachine/flowmachine/features/subscriber/subscriber_call_durations.py @@ -10,9 +10,10 @@ """ import warnings -from typing import List +from typing import List, Optional from ...core import location_joined_query, make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit from ..utilities import EventsTablesUnion from .metaclasses import SubscriberFeature @@ -165,7 +166,7 @@ def __init__( subscriber_identifier="msisdn", direction="out", statistic="sum", - spatial_unit=None, + spatial_unit: Optional[AnySpatialUnit] = None, hours="all", subscriber_subset=None, ): @@ -377,7 +378,7 @@ def __init__( *, subscriber_identifier="msisdn", statistic="sum", - spatial_unit=None, + spatial_unit: Optional[AnySpatialUnit] = None, hours="all", subscriber_subset=None, ): diff --git a/flowmachine/flowmachine/features/subscriber/unique_location_counts.py b/flowmachine/flowmachine/features/subscriber/unique_location_counts.py index e54d9ab571..bc496e1ea3 100644 --- a/flowmachine/flowmachine/features/subscriber/unique_location_counts.py +++ b/flowmachine/flowmachine/features/subscriber/unique_location_counts.py @@ -14,6 +14,7 @@ from typing import List from flowmachine.core import make_spatial_unit +from flowmachine.core.spatial_unit import AnySpatialUnit from ..utilities.subscriber_locations import SubscriberLocations from .metaclasses import SubscriberFeature @@ -73,7 +74,7 @@ def __init__( start, stop, *, - spatial_unit=make_spatial_unit("cell"), + spatial_unit: AnySpatialUnit = make_spatial_unit("cell"), hours="all", tables="all", subscriber_identifier="msisdn", diff --git a/flowmachine/flowmachine/features/utilities/sets.py b/flowmachine/flowmachine/features/utilities/sets.py index 53e3609f46..5dcbfda2fd 100644 --- a/flowmachine/flowmachine/features/utilities/sets.py +++ b/flowmachine/flowmachine/features/utilities/sets.py @@ -7,11 +7,12 @@ """ -from typing import List +from typing import List, Optional from .event_table_subset import EventTableSubset from .events_tables_union import EventsTablesUnion from ...core import Query, make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit from numpy import inf @@ -176,7 +177,7 @@ def __init__( min_calls, subscriber_identifier="msisdn", direction="both", - spatial_unit=None, + spatial_unit: Optional[AnySpatialUnit] = None, hours="all", subscriber_subset=None, ): diff --git a/flowmachine/flowmachine/features/utilities/subscriber_locations.py b/flowmachine/flowmachine/features/utilities/subscriber_locations.py index 07f40c03d4..5cff7289d8 100644 --- a/flowmachine/flowmachine/features/utilities/subscriber_locations.py +++ b/flowmachine/flowmachine/features/utilities/subscriber_locations.py @@ -18,6 +18,7 @@ from ...core.query import Query 
from ...core import location_joined_query, make_spatial_unit +from ...core.spatial_unit import AnySpatialUnit import structlog @@ -83,7 +84,7 @@ def __init__( start, stop, *, - spatial_unit=make_spatial_unit("cell"), + spatial_unit: AnySpatialUnit = make_spatial_unit("cell"), hours="all", table="all", subscriber_identifier="msisdn", diff --git a/flowmachine/flowmachine/models/pwo.py b/flowmachine/flowmachine/models/pwo.py index 87920f17d8..a7181cd420 100644 --- a/flowmachine/flowmachine/models/pwo.py +++ b/flowmachine/flowmachine/models/pwo.py @@ -26,7 +26,7 @@ """ import warnings -from typing import List +from typing import List, Optional import pandas as pd @@ -36,6 +36,7 @@ from ..core.query import Query from ..core.model import Model, model_result from ..core import make_spatial_unit +from ..core.spatial_unit import LonLatSpatialUnit from ..features.spatial.distance_matrix import DistanceMatrix import structlog @@ -51,19 +52,16 @@ class _populationBuffer(Query): Parameters ---------- - spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit - Spatial unit to which subscriber locations are mapped. See the - docstring of spatial_unit.py for more information. population_object : flowmachine.features.utilities.spatial_aggregates.SpatialAggregate An aggregated subscriber locating object distance_matrix : flowmachine.features.spatial.distance_matrix.DistanceMatrix A distance matrix """ - def __init__(self, spatial_unit, population_object, distance_matrix): - self.spatial_unit = spatial_unit + def __init__(self, population_object, distance_matrix): self.population_object = population_object self.distance_matrix = distance_matrix + self.spatial_unit = self.distance_matrix.spatial_unit super().__init__() @@ -238,7 +236,12 @@ class PopulationWeightedOpportunities(Model): """ def __init__( - self, start, stop, method="home-location", spatial_unit=None, **kwargs + self, + start, + stop, + method="home-location", + spatial_unit: Optional[LonLatSpatialUnit] = None, + **kwargs, ): warnings.warn( @@ -268,7 +271,6 @@ def __init__( ).aggregate() self.population_buffer_object = _populationBuffer( - spatial_unit=self.spatial_unit, population_object=self.population_object, distance_matrix=self.distance_matrix, ) diff --git a/flowmachine/flowmachine/utils.py b/flowmachine/flowmachine/utils.py index 68e9231239..9c2864b0d7 100644 --- a/flowmachine/flowmachine/utils.py +++ b/flowmachine/flowmachine/utils.py @@ -19,7 +19,7 @@ from pglast import prettify from psycopg2._psycopg import adapt from time import sleep -from typing import List, Union +from typing import List, Union, Tuple logger = structlog.get_logger("flowmachine.debug", submodule=__name__) @@ -248,7 +248,7 @@ def _makesafe(x): return adapt(x).getquoted().decode() -def get_name_and_alias(column_name): +def get_name_and_alias(column_name: str) -> Tuple(str): """ Given a column name string, return the column name and alias (if there is one), or return the provided column name twice if there is no alias. 
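The annotations added in this patch all accept units built with make_spatial_unit: AnySpatialUnit is the union of CellSpatialUnit and GeomSpatialUnit, while parameters that need geometry attached (as in geography.py and join_to_location.py) are narrowed to GeomSpatialUnit. A minimal sketch of the annotated call pattern, mirroring docs/cache_queries.py and the tests in this series (it assumes a running FlowDB with the demo dataset and flowmachine connection settings already configured):

import flowmachine

flowmachine.connect()  # assumes connection parameters are configured in the environment

admin3_spatial_unit = flowmachine.core.make_spatial_unit("admin", level=3)
vsite_spatial_unit = flowmachine.core.make_spatial_unit("versioned-site")

# Any spatial unit satisfies a parameter annotated as AnySpatialUnit ...
locations = flowmachine.features.SubscriberLocations(
    "2016-01-01", "2016-01-04", spatial_unit=vsite_spatial_unit
)
calldays = flowmachine.features.CallDays(subscriber_locations=locations)
clusters = flowmachine.features.HartiganCluster(calldays=calldays, radius=1.0)

# ... whereas units with no geography attached (make_spatial_unit("cell")) are
# rejected by queries that verify the has_geography criterion, e.g. JoinToLocation.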
From b2783af757ea9a680d7c647b1f15587728dd04f2 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 28 Jun 2019 12:35:19 +0100 Subject: [PATCH 137/138] Fix type annotation --- flowmachine/flowmachine/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/utils.py b/flowmachine/flowmachine/utils.py index 9c2864b0d7..2caaaae137 100644 --- a/flowmachine/flowmachine/utils.py +++ b/flowmachine/flowmachine/utils.py @@ -248,7 +248,7 @@ def _makesafe(x): return adapt(x).getquoted().decode() -def get_name_and_alias(column_name: str) -> Tuple(str): +def get_name_and_alias(column_name: str) -> Tuple[str]: """ Given a column name string, return the column name and alias (if there is one), or return the provided column name twice if there is no alias. From b007f8b4bda46225d5782dda12eb30badbd2b20d Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 28 Jun 2019 12:47:22 +0100 Subject: [PATCH 138/138] Fix typo --- .../flowmachine/core/server/query_schemas/aggregation_unit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py b/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py index abfb01ed50..015f3ec9a4 100644 --- a/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py +++ b/flowmachine/flowmachine/core/server/query_schemas/aggregation_unit.py @@ -6,7 +6,7 @@ for getting the corresponding SpatialUnit object. """ -from flowmachine.core import make_ +from flowmachine.core import make_spatial_unit from flowmachine.core.spatial_unit import GeomSpatialUnit from marshmallow.fields import String
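PATCH 137 corrects the return annotation of get_name_and_alias (a pair of strings would strictly be Tuple[str, str], but Tuple[str] is at least valid syntax, whereas the original Tuple(str) fails as soon as the module is imported), and PATCH 138 restores the make_spatial_unit import that was truncated in the type-annotation patch. Going by its docstring, get_name_and_alias is expected to behave roughly as below; the concrete return values are illustrative, inferred from the docstring rather than shown in these patches:

from flowmachine.utils import get_name_and_alias

# A column expression with an alias splits into (name, alias) ...
get_name_and_alias("admin3pcod AS pcod")   # expected: ("admin3pcod", "pcod")

# ... and a bare column name is returned as both name and alias.
get_name_and_alias("admin3pcod")           # expected: ("admin3pcod", "admin3pcod")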