import numpy as np
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import BooleanNullable, Double, LatLong
from featuretools.primitives.base import TransformPrimitive
from featuretools.primitives.utils import (
_deconstrct_latlongs,
_haversine_calculate
)
[docs]class CityblockDistance(TransformPrimitive):
"""Calculates the distance between points in a city road grid.
Description:
This distance is calculated using the haversine formula, which
takes into account the curvature of the Earth.
If either input data contains `NaN`s, the calculated
distance with be `NaN`.
This calculation is also known as the Mahnattan distance.
Args:
unit (str): Determines the unit value to output. Could
be miles or kilometers. Default is miles.
Examples:
>>> cityblock_distance = CityblockDistance()
>>> DC = (38, -77)
>>> Boston = (43, -71)
>>> NYC = (40, -74)
>>> distances_mi = cityblock_distance([DC, DC], [NYC, Boston])
>>> np.round(distances_mi, 3).tolist()
[301.519, 672.089]
We can also change the units in which the distance is calculated.
>>> cityblock_distance_kilometers = CityblockDistance(unit='kilometers')
>>> distances_km = cityblock_distance_kilometers([DC, DC], [NYC, Boston])
>>> np.round(distances_km, 3).tolist()
[485.248, 1081.622]
"""
name = "cityblock_distance"
input_types = [ColumnSchema(logical_type=LatLong), ColumnSchema(logical_type=LatLong)]
return_type = ColumnSchema(logical_type=Double, semantic_tags={'numeric'})
commutative = True
[docs] def __init__(self, unit='miles'):
if unit not in ['miles', 'kilometers']:
raise ValueError("Invalid unit given")
self.unit = unit
def get_function(self):
def cityblock(latlong_1, latlong_2):
lat_1s, lon_1s = _deconstrct_latlongs(latlong_1)
lat_2s, lon_2s = _deconstrct_latlongs(latlong_2)
lon_dis = _haversine_calculate(lat_1s, lon_1s, lat_1s, lon_2s,
self.unit)
lat_dist = _haversine_calculate(lat_1s, lon_1s, lat_2s, lon_1s,
self.unit)
return pd.Series(lon_dis + lat_dist)
return cityblock
[docs]class GeoMidpoint(TransformPrimitive):
"""Determines the geographic center of two coordinates.
Examples:
>>> geomidpoint = GeoMidpoint()
>>> geomidpoint([(42.4, -71.1)], [(40.0, -122.4)])
[(41.2, -96.75)]
"""
name = "geomidpoint"
input_types = [ColumnSchema(logical_type=LatLong), ColumnSchema(logical_type=LatLong)]
return_type = ColumnSchema(logical_type=LatLong)
commutative = True
def get_function(self):
def geomidpoint_func(latlong_1, latlong_2):
lat_1s, lon_1s = _deconstrct_latlongs(latlong_1)
lat_2s, lon_2s = _deconstrct_latlongs(latlong_2)
lat_middle = np.array([lat_1s, lat_2s]).transpose().mean(axis=1)
lon_middle = np.array([lon_1s, lon_2s]).transpose().mean(axis=1)
return list(zip(lat_middle, lon_middle))
return geomidpoint_func
[docs]class Haversine(TransformPrimitive):
"""Calculates the approximate haversine distance between two LatLong columns.
Args:
unit (str): Determines the unit value to output. Could
be `miles` or `kilometers`. Default is `miles`.
Examples:
>>> haversine = Haversine()
>>> distances = haversine([(42.4, -71.1), (40.0, -122.4)],
... [(40.0, -122.4), (41.2, -96.75)])
>>> np.round(distances, 3).tolist()
[2631.231, 1343.289]
Output units can be specified
>>> haversine_km = Haversine(unit='kilometers')
>>> distances_km = haversine_km([(42.4, -71.1), (40.0, -122.4)],
... [(40.0, -122.4), (41.2, -96.75)])
>>> np.round(distances_km, 3).tolist()
[4234.555, 2161.814]
"""
name = 'haversine'
input_types = [ColumnSchema(logical_type=LatLong), ColumnSchema(logical_type=LatLong)]
return_type = ColumnSchema(semantic_tags={'numeric'})
commutative = True
[docs] def __init__(self, unit='miles'):
valid_units = ['miles', 'kilometers']
if unit not in valid_units:
error_message = 'Invalid unit %s provided. Must be one of %s' % (unit, valid_units)
raise ValueError(error_message)
self.unit = unit
self.description_template = "the haversine distance in {} between {{}} and {{}}".format(self.unit)
def get_function(self):
def haversine(latlong_1, latlong_2):
lat_1s, lon_1s = _deconstrct_latlongs(latlong_1)
lat_2s, lon_2s = _deconstrct_latlongs(latlong_2)
distance = _haversine_calculate(lat_1s, lon_1s, lat_2s, lon_2s, self.unit)
return distance
return haversine
def generate_name(self, base_feature_names):
name = u"{}(".format(self.name.upper())
name += u", ".join(base_feature_names)
if self.unit != 'miles':
name += u", unit={}".format(self.unit)
name += u")"
return name
[docs]class IsInGeoBox(TransformPrimitive):
"""Determines if coordinates are inside a box defined by two
corner coordinate points.
Description:
Coordinate values should be specified as (latitude, longitude)
tuples. This primitive is unable to handle coordinates and boxes
at the poles, and near +/- 180 degrees latitude.
Args:
point1 (tuple(float, float)): The coordinates
of the first corner of the box. Defaults to (0, 0).
point2 (tuple(float, float)): The coordinates
of the diagonal corner of the box. Defaults to (0, 0).
Example:
>>> is_in_geobox = IsInGeoBox((40.7128, -74.0060), (42.2436, -71.1677))
>>> is_in_geobox([(41.034, -72.254), (39.125, -87.345)]).tolist()
[True, False]
"""
name = "is_in_geobox"
input_types = [ColumnSchema(logical_type=LatLong)]
return_type = ColumnSchema(logical_type=BooleanNullable)
[docs] def __init__(self, point1=(0, 0), point2=(0, 0)):
self.point1 = point1
self.point2 = point2
self.lats = np.sort(np.array([point1[0], point2[0]]))
self.lons = np.sort(np.array([point1[1], point2[1]]))
def get_function(self):
def geobox(latlongs):
if latlongs.hasnans:
latlongs = np.where(latlongs.isnull(), pd.Series([(np.nan, np.nan)] * len(latlongs)), latlongs)
transposed = np.transpose([list(latlon) for latlon in latlongs])
lats = (self.lats[0] <= transposed[0]) & \
(self.lats[1] >= transposed[0])
longs = (self.lons[0] <= transposed[1]) & \
(self.lons[1] >= transposed[1])
return lats & longs
return geobox
[docs]class Latitude(TransformPrimitive):
"""Returns the first tuple value in a list of LatLong tuples.
For use with the LatLong logical type.
Examples:
>>> latitude = Latitude()
>>> latitude([(42.4, -71.1),
... (40.0, -122.4),
... (41.2, -96.75)]).tolist()
[42.4, 40.0, 41.2]
"""
name = 'latitude'
input_types = [ColumnSchema(logical_type=LatLong)]
return_type = ColumnSchema(semantic_tags={'numeric'})
description_template = "the latitude of {}"
def get_function(self):
def latitude(latlong):
return latlong.map(lambda x: x[0] if isinstance(x, tuple) else np.nan)
return latitude
[docs]class Longitude(TransformPrimitive):
"""Returns the second tuple value in a list of LatLong tuples.
For use with the LatLong logical type.
Examples:
>>> longitude = Longitude()
>>> longitude([(42.4, -71.1),
... (40.0, -122.4),
... (41.2, -96.75)]).tolist()
[-71.1, -122.4, -96.75]
"""
name = 'longitude'
input_types = [ColumnSchema(logical_type=LatLong)]
return_type = ColumnSchema(semantic_tags={'numeric'})
description_template = "the longitude of {}"
def get_function(self):
def longitude(latlong):
return latlong.map(lambda x: x[1] if isinstance(x, tuple) else np.nan)
return longitude