import numpy as np
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import BooleanNullable, Double, LatLong
from featuretools.primitives.base import TransformPrimitive
from featuretools.primitives.utils import _haversine_calculate
class CityblockDistance(TransformPrimitive):
    """Calculates the distance between points in a city road grid.

    Description:
        This distance is calculated using the haversine formula, which
        takes into account the curvature of the Earth.
        If either input data contains `NaN`s, the calculated
        distance will be `NaN`.
        This calculation is also known as the Manhattan distance.

        The result is the sum of two haversine legs that both start at
        the first point: one that changes only the longitude and one
        that changes only the latitude.

    Args:
        unit (str): Determines the unit value to output. Could
            be miles or kilometers. Default is miles.

    Raises:
        ValueError: If `unit` is neither `miles` nor `kilometers`.

    Examples:
        >>> cityblock_distance = CityblockDistance()
        >>> DC = (38, -77)
        >>> Boston = (43, -71)
        >>> NYC = (40, -74)
        >>> distances_mi = cityblock_distance([DC, DC], [NYC, Boston])
        >>> np.round(distances_mi, 3).tolist()
        [301.519, 672.089]

        We can also change the units in which the distance is calculated.

        >>> cityblock_distance_kilometers = CityblockDistance(unit='kilometers')
        >>> distances_km = cityblock_distance_kilometers([DC, DC], [NYC, Boston])
        >>> np.round(distances_km, 3).tolist()
        [485.248, 1081.622]
    """

    name = "cityblock_distance"
    input_types = [
        ColumnSchema(logical_type=LatLong),
        ColumnSchema(logical_type=LatLong),
    ]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})
    # NOTE(review): the longitude leg below is measured at the FIRST input's
    # latitude, so swapping the two inputs may not give exactly the same
    # distance. Confirm that flagging this primitive as commutative is intended.
    commutative = True

    def __init__(self, unit="miles"):
        """Store the output unit after checking that it is supported."""
        if unit not in ["miles", "kilometers"]:
            raise ValueError("Invalid unit given")
        self.unit = unit

    def get_function(self):
        """Return the callable that computes the distance for two LatLong columns."""

        def cityblock(latlong_1, latlong_2):
            # Turn each column of (lat, lon) pairs into a 2-D array so the
            # latitude and longitude columns can be sliced out.
            latlong_1 = np.array(latlong_1.tolist())
            latlong_2 = np.array(latlong_2.tolist())
            lat_1s = latlong_1[:, 0]
            lat_2s = latlong_2[:, 0]
            lon_1s = latlong_1[:, 1]
            lon_2s = latlong_2[:, 1]
            # Leg 1: hold latitude at the first point, move to the second
            # point's longitude.
            lon_dist = _haversine_calculate(lat_1s, lon_1s, lat_1s, lon_2s, self.unit)
            # Leg 2: hold longitude at the first point, move to the second
            # point's latitude.
            lat_dist = _haversine_calculate(lat_1s, lon_1s, lat_2s, lon_1s, self.unit)
            return pd.Series(lon_dist + lat_dist)

        return cityblock
class GeoMidpoint(TransformPrimitive):
    """Determines the geographic center of two coordinates.

    Description:
        The center of each pair is computed as the plain average of the
        two latitudes and the plain average of the two longitudes.

    Examples:
        >>> geomidpoint = GeoMidpoint()
        >>> geomidpoint([(42.4, -71.1)], [(40.0, -122.4)])
        [(41.2, -96.75)]
    """

    name = "geomidpoint"
    input_types = [
        ColumnSchema(logical_type=LatLong),
        ColumnSchema(logical_type=LatLong),
    ]
    return_type = ColumnSchema(logical_type=LatLong)
    commutative = True

    def get_function(self):
        """Return the callable that computes per-pair midpoints."""

        def geomidpoint_func(latlong_1, latlong_2):
            first = np.array(latlong_1.tolist())
            second = np.array(latlong_2.tolist())
            first_lats, first_lons = first[:, 0], first[:, 1]
            second_lats, second_lons = second[:, 0], second[:, 1]
            # Stack the two inputs as rows and average down the rows, which
            # yields one midpoint value per coordinate pair.
            mid_lats = np.array([first_lats, second_lats]).mean(axis=0)
            mid_lons = np.array([first_lons, second_lons]).mean(axis=0)
            return list(zip(mid_lats, mid_lons))

        return geomidpoint_func
class Haversine(TransformPrimitive):
    """Calculates the approximate haversine distance between two LatLong columns.

    Args:
        unit (str): Determines the unit value to output. Could
            be `miles` or `kilometers`. Default is `miles`.

    Raises:
        ValueError: If `unit` is not one of the supported values.

    Examples:
        >>> haversine = Haversine()
        >>> distances = haversine([(42.4, -71.1), (40.0, -122.4)],
        ...                       [(40.0, -122.4), (41.2, -96.75)])
        >>> np.round(distances, 3).tolist()
        [2631.231, 1343.289]

        Output units can be specified

        >>> haversine_km = Haversine(unit='kilometers')
        >>> distances_km = haversine_km([(42.4, -71.1), (40.0, -122.4)],
        ...                             [(40.0, -122.4), (41.2, -96.75)])
        >>> np.round(distances_km, 3).tolist()
        [4234.555, 2161.814]
    """

    name = "haversine"
    input_types = [
        ColumnSchema(logical_type=LatLong),
        ColumnSchema(logical_type=LatLong),
    ]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    commutative = True

    def __init__(self, unit="miles"):
        """Validate and store the unit, and build the description template."""
        supported = ["miles", "kilometers"]
        if unit not in supported:
            raise ValueError(
                "Invalid unit %s provided. Must be one of %s" % (unit, supported)
            )
        self.unit = unit
        # The doubled braces survive this format call as "{}" placeholders;
        # only the unit is substituted here.
        template = "the haversine distance in {} between {{}} and {{}}"
        self.description_template = template.format(self.unit)

    def get_function(self):
        """Return the callable that computes the distance for two LatLong columns."""

        def haversine(latlong_1, latlong_2):
            first = np.array(latlong_1.tolist())
            second = np.array(latlong_2.tolist())
            # Column 0 holds latitudes, column 1 holds longitudes.
            first_lats, first_lons = first[:, 0], first[:, 1]
            second_lats, second_lons = second[:, 0], second[:, 1]
            return _haversine_calculate(
                first_lats,
                first_lons,
                second_lats,
                second_lons,
                self.unit,
            )

        return haversine

    def generate_name(self, base_feature_names):
        """Build the feature name, appending the unit only when it is not miles."""
        arguments = ", ".join(base_feature_names)
        if self.unit != "miles":
            arguments += ", unit={}".format(self.unit)
        return "{}({})".format(self.name.upper(), arguments)
class IsInGeoBox(TransformPrimitive):
    """Determines if coordinates are inside a box defined by two
    corner coordinate points.

    Description:
        Coordinate values should be specified as (latitude, longitude)
        tuples. This primitive is unable to handle coordinates and boxes
        at the poles, and near +/- 180 degrees longitude.

        The two corners may be given in any order; the bounds on each
        axis are sorted internally. Points lying exactly on the edge of
        the box are treated as inside.

    Args:
        point1 (tuple(float, float)): The coordinates
            of the first corner of the box. Defaults to (0, 0).
        point2 (tuple(float, float)): The coordinates
            of the diagonal corner of the box. Defaults to (0, 0).

    Example:
        >>> is_in_geobox = IsInGeoBox((40.7128, -74.0060), (42.2436, -71.1677))
        >>> is_in_geobox([(41.034, -72.254), (39.125, -87.345)]).tolist()
        [True, False]
    """

    name = "is_in_geobox"
    input_types = [ColumnSchema(logical_type=LatLong)]
    return_type = ColumnSchema(logical_type=BooleanNullable)

    def __init__(self, point1=(0, 0), point2=(0, 0)):
        """Store the two corners and precompute sorted per-axis bounds."""
        self.point1 = point1
        self.point2 = point2
        # After sorting, index 0 is the lower bound and index 1 the upper
        # bound, whichever corner each value came from.
        self.lats = np.sort(np.array([point1[0], point2[0]]))
        self.lons = np.sort(np.array([point1[1], point2[1]]))

    def get_function(self):
        """Return the callable that tests each coordinate against the box."""

        def geobox(latlongs):
            # After the transpose, row 0 holds latitudes and row 1 longitudes.
            transposed = np.transpose(np.array(latlongs.tolist()))
            lats = (self.lats[0] <= transposed[0]) & (self.lats[1] >= transposed[0])
            longs = (self.lons[0] <= transposed[1]) & (self.lons[1] >= transposed[1])
            return lats & longs

        return geobox
class Latitude(TransformPrimitive):
    """Returns the first tuple value in a list of LatLong tuples.
       For use with the LatLong logical type.

    Examples:
        >>> latitude = Latitude()
        >>> latitude([(42.4, -71.1),
        ...            (40.0, -122.4),
        ...            (41.2, -96.75)]).tolist()
        [42.4, 40.0, 41.2]
    """

    name = "latitude"
    input_types = [ColumnSchema(logical_type=LatLong)]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    description_template = "the latitude of {}"

    def get_function(self):
        """Return the callable that extracts the latitude column."""

        def latitude(latlong):
            # Column 0 of the (lat, lon) pairs is the latitude.
            pairs = np.array(latlong.tolist())
            return pairs[:, 0]

        return latitude
class Longitude(TransformPrimitive):
    """Returns the second tuple value in a list of LatLong tuples.
       For use with the LatLong logical type.

    Examples:
        >>> longitude = Longitude()
        >>> longitude([(42.4, -71.1),
        ...            (40.0, -122.4),
        ...            (41.2, -96.75)]).tolist()
        [-71.1, -122.4, -96.75]
    """

    name = "longitude"
    input_types = [ColumnSchema(logical_type=LatLong)]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    description_template = "the longitude of {}"

    def get_function(self):
        """Return the callable that extracts the longitude column."""

        def longitude(latlong):
            # Column 1 of the (lat, lon) pairs is the longitude.
            pairs = np.array(latlong.tolist())
            return pairs[:, 1]

        return longitude