import holidays
import numpy as np
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import (
AgeFractional,
BooleanNullable,
Categorical,
Datetime,
Ordinal,
)
from featuretools.primitives.base import TransformPrimitive
from featuretools.primitives.utils import HolidayUtil
from featuretools.utils import convert_time_units
from featuretools.utils.gen_utils import Library
class Age(TransformPrimitive):
    """Computes an age in fractional years from a date of birth.

    Description:
        The age is the whole number of days elapsed between the date of
        birth and the reference time, divided by a fixed 365. Leap days
        are therefore not compensated for.

    Examples:
        Determine the age of three people as of Jan 1, 2019

        >>> import pandas as pd
        >>> reference_date = pd.to_datetime("01-01-2019")
        >>> age = Age()
        >>> input_ages = [pd.to_datetime("01-01-2000"),
        ...               pd.to_datetime("05-30-1983"),
        ...               pd.to_datetime("10-17-1997")]
        >>> age(input_ages, time=reference_date).tolist()
        [19.013698630136986, 35.61643835616438, 21.221917808219178]
    """

    name = "age"
    input_types = [ColumnSchema(logical_type=Datetime, semantic_tags={"date_of_birth"})]
    return_type = ColumnSchema(logical_type=AgeFractional, semantic_tags={"numeric"})
    uses_calc_time = True
    compatibility = [Library.PANDAS, Library.DASK]
    description_template = "the age from {}"

    def get_function(self):
        def age(x, time=None):
            # Whole days elapsed since birth, expressed in 365-day years.
            elapsed = time - x
            return elapsed.dt.days / 365

        return age
class DateToHoliday(TransformPrimitive):
    """Maps each datetime to the name of the holiday on that date, if any.

    Description:
        Dates that are not a holiday yield `NaN`. Currently only works for
        the United States and Canada with dates between 1950 and 2100.

    Args:
        country (str): Country to use for determining Holidays.
            Default is 'US'. Should be one of the available countries here:
            https://github.com/dr-prodigy/python-holidays#available-countries

    Examples:
        >>> from datetime import datetime
        >>> date_to_holiday = DateToHoliday()
        >>> dates = pd.Series([datetime(2016, 1, 1),
        ...          datetime(2016, 2, 27),
        ...          datetime(2017, 5, 29, 10, 30, 5),
        ...          datetime(2018, 7, 4)])
        >>> date_to_holiday(dates).tolist()
        ["New Year's Day", nan, 'Memorial Day', 'Independence Day']

        We can also change the country.

        >>> date_to_holiday_canada = DateToHoliday(country='Canada')
        >>> dates = pd.Series([datetime(2016, 7, 1),
        ...          datetime(2016, 11, 15),
        ...          datetime(2017, 12, 26),
        ...          datetime(2018, 9, 3)])
        >>> date_to_holiday_canada(dates).tolist()
        ['Canada Day', nan, 'Boxing Day', 'Labour Day']
    """

    name = "date_to_holiday"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=Categorical, semantic_tags={"category"})

    def __init__(self, country="US"):
        self.country = country
        self.holidayUtil = HolidayUtil(country)

    def get_function(self):
        def date_to_holiday(x):
            holiday_table = self.holidayUtil.to_df()
            dates = pd.DataFrame({"date": x})
            # Drop the time-of-day so timestamps compare equal to holiday dates.
            dates["date"] = dates["date"].dt.date.astype("datetime64[ns]")
            # Left join keeps every input row; non-holidays get a missing name.
            joined = dates.merge(
                holiday_table,
                how="left",
                left_on="date",
                right_on="holiday_date",
            )
            return joined["names"].values

        return date_to_holiday
class Day(TransformPrimitive):
    """Extracts the day-of-month component of a datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 3, 3),
        ...          datetime(2019, 3, 31)]
        >>> day = Day()
        >>> day(dates).tolist()
        [1, 3, 31]
    """

    name = "day"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 32))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the day of the month of {}"

    def get_function(self):
        def day(datetimes):
            # Delegates to the datetime accessor's ``day`` attribute.
            return datetimes.dt.day

        return day
class DayOfYear(TransformPrimitive):
    """Extracts the ordinal day of the year from a datetime.

    Description:
        For each date, returns its 1-based position within its year
        (January 1 is day 1).

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 1, 1),
        ...          datetime(2020, 12, 31),
        ...          datetime(2020, 2, 28)]
        >>> dayOfYear = DayOfYear()
        >>> dayOfYear(dates).tolist()
        [1, 366, 59]
    """

    name = "day_of_year"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 367))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the day of year from {}"

    def get_function(self):
        def dayOfYear(datetimes):
            # Delegates to the datetime accessor's ``dayofyear`` attribute.
            return datetimes.dt.dayofyear

        return dayOfYear
class DaysInMonth(TransformPrimitive):
    """Determines the number of days in the month of a datetime.

    Description:
        For each date, returns how many days the month containing that
        date has. This is the length of the month, not the day-of-month
        of the date itself.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 12, 1),
        ...          datetime(2019, 1, 3),
        ...          datetime(2020, 2, 1)]
        >>> days_in_month = DaysInMonth()
        >>> days_in_month(dates).tolist()
        [31, 31, 29]
    """

    name = "days_in_month"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 32))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the days in the month of {}"

    def get_function(self):
        def days_in_month(vals):
            # Length of the containing month, via the datetime accessor.
            return vals.dt.daysinmonth

        return days_in_month
class DistanceToHoliday(TransformPrimitive):
    """Computes the number of days before or after a given holiday.

    Description:
        For each date, finds the nearest occurrence of the chosen holiday
        and returns the distance to it in days. If the closest occurrence
        is prior to the date given, the distance is negative.

        If a date is missing, return `NaN`.

        Currently only works with dates between 1950 and 2100.

    Args:
        holiday (str): Name of the holiday. Defaults to New Year's Day.
        country (str): Specifies which country's calendar to use for the
            given holiday. Default is `US`.

    Raises:
        ValueError: If `holiday` is not among the holidays known for
            `country`.

    Examples:
        >>> from datetime import datetime
        >>> distance_to_holiday = DistanceToHoliday("New Year's Day")
        >>> dates = [datetime(2010, 1, 1),
        ...          datetime(2012, 5, 31),
        ...          datetime(2017, 7, 31),
        ...          datetime(2020, 12, 31)]
        >>> distance_to_holiday(dates).tolist()
        [0, -151, 154, 1]

        We can also control the country in which we're searching for
        a holiday.

        >>> distance_to_holiday = DistanceToHoliday("Victoria Day", country='Canada')
        >>> dates = [datetime(2010, 1, 1),
        ...          datetime(2012, 5, 31),
        ...          datetime(2017, 7, 31),
        ...          datetime(2020, 12, 31)]
        >>> distance_to_holiday(dates).tolist()
        [143, -10, -70, 144]
    """

    name = "distance_to_holiday"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    default_value = 0

    def __init__(self, holiday="New Year's Day", country="US"):
        self.country = country
        self.holiday = holiday
        self.holidayUtil = HolidayUtil(country)

        # Validate the requested holiday against the distinct known names.
        available_holidays = list(set(self.holidayUtil.federal_holidays.values()))
        if self.holiday not in available_holidays:
            error = "must be one of the available holidays:\n%s" % available_holidays
            raise ValueError(error)

    def get_function(self):
        def distance_to_holiday(x):
            all_holidays = self.holidayUtil.to_df()
            chosen = all_holidays[all_holidays.names == self.holiday]

            dated = pd.DataFrame({"date": x})
            # Remember the caller's index so the result can be realigned
            # after the sort and merge below.
            dated["x_index"] = dated.index
            dated = dated.dropna().sort_values("date")
            # Strip the time-of-day so distances are whole days.
            dated["date"] = dated["date"].dt.date.astype("datetime64[ns]")

            nearest = pd.merge_asof(
                dated,
                chosen,
                left_on="date",
                right_on="holiday_date",
                direction="nearest",
                tolerance=pd.Timedelta("365d"),
            ).set_index("x_index")
            nearest["days_diff"] = (nearest["holiday_date"] - nearest["date"]).dt.days

            # Rows dropped as missing reappear as NaN after reindexing.
            return nearest["days_diff"].reindex_like(x)

        return distance_to_holiday
class Hour(TransformPrimitive):
    """Extracts the hour component of a datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 3, 3, 11, 10, 50),
        ...          datetime(2019, 3, 31, 19, 45, 15)]
        >>> hour = Hour()
        >>> hour(dates).tolist()
        [0, 11, 19]
    """

    name = "hour"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(24))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the hour value of {}"

    def get_function(self):
        def hour(datetimes):
            # Delegates to the datetime accessor's ``hour`` attribute.
            return datetimes.dt.hour

        return hour
class IsLeapYear(TransformPrimitive):
    """Flags whether the year of each datetime is a leap year.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2020, 3, 3, 11, 10, 50),
        ...          datetime(2021, 3, 31, 19, 45, 15)]
        >>> ily = IsLeapYear()
        >>> ily(dates).tolist()
        [False, True, False]
    """

    name = "is_leap_year"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=BooleanNullable)
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "whether the year of {} is a leap year"

    def get_function(self):
        def is_leap_year(datetimes):
            # Delegates to the datetime accessor's ``is_leap_year`` attribute.
            return datetimes.dt.is_leap_year

        return is_leap_year
class IsMonthEnd(TransformPrimitive):
    """Flags whether each datetime falls on the last day of its month.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2021, 2, 28),
        ...          datetime(2020, 2, 29)]
        >>> ime = IsMonthEnd()
        >>> ime(dates).tolist()
        [False, True, True]
    """

    name = "is_month_end"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=BooleanNullable)
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "whether {} is at the end of a month"

    def get_function(self):
        def is_month_end(datetimes):
            # Delegates to the datetime accessor's ``is_month_end`` attribute.
            return datetimes.dt.is_month_end

        return is_month_end
class IsMonthStart(TransformPrimitive):
    """Flags whether each datetime falls on the first day of its month.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2020, 2, 13),
        ...          datetime(2020, 2, 29)]
        >>> ims = IsMonthStart()
        >>> ims(dates).tolist()
        [True, False, False]
    """

    name = "is_month_start"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=BooleanNullable)
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "whether {} is at the start of a month"

    def get_function(self):
        def is_month_start(datetimes):
            # Delegates to the datetime accessor's ``is_month_start`` attribute.
            return datetimes.dt.is_month_start

        return is_month_start
class IsQuarterEnd(TransformPrimitive):
    """Flags whether each datetime falls on the last day of a quarter.

    Examples:
        >>> from datetime import datetime
        >>> iqe = IsQuarterEnd()
        >>> dates = [datetime(2020, 3, 31),
        ...          datetime(2020, 1, 1)]
        >>> iqe(dates).tolist()
        [True, False]
    """

    name = "is_quarter_end"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=BooleanNullable)
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "whether {} is a quarter end"

    def get_function(self):
        def is_quarter_end(datetimes):
            # Delegates to the datetime accessor's ``is_quarter_end`` attribute.
            return datetimes.dt.is_quarter_end

        return is_quarter_end
class IsQuarterStart(TransformPrimitive):
    """Flags whether each datetime falls on the first day of a quarter.

    Examples:
        >>> from datetime import datetime
        >>> iqs = IsQuarterStart()
        >>> dates = [datetime(2020, 3, 31),
        ...          datetime(2020, 1, 1)]
        >>> iqs(dates).tolist()
        [False, True]
    """

    name = "is_quarter_start"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=BooleanNullable)
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "whether {} is a quarter start"

    def get_function(self):
        def is_quarter_start(datetimes):
            # Delegates to the datetime accessor's ``is_quarter_start`` attribute.
            return datetimes.dt.is_quarter_start

        return is_quarter_start
class IsWeekend(TransformPrimitive):
    """Flags whether each datetime falls on a weekend.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 6, 17, 11, 10, 50),
        ...          datetime(2019, 11, 30, 19, 45, 15)]
        >>> is_weekend = IsWeekend()
        >>> is_weekend(dates).tolist()
        [False, False, True]
    """

    name = "is_weekend"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=BooleanNullable)
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "whether {} occurred on a weekend"

    def get_function(self):
        def is_weekend(datetimes):
            # Weekday numbers above 4 are treated as the weekend.
            weekday_number = datetimes.dt.weekday
            return weekday_number > 4

        return is_weekend
class IsYearEnd(TransformPrimitive):
    """Flags whether each datetime falls on the last day of a year.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 12, 31),
        ...          datetime(2019, 1, 1),
        ...          datetime(2019, 11, 30),
        ...          np.nan]
        >>> is_year_end = IsYearEnd()
        >>> is_year_end(dates).tolist()
        [True, False, False, False]
    """

    name = "is_year_end"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=BooleanNullable)
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "whether {} occurred on the end of a year"

    def get_function(self):
        def is_year_end(datetimes):
            # Delegates to the datetime accessor's ``is_year_end`` attribute.
            return datetimes.dt.is_year_end

        return is_year_end
class IsYearStart(TransformPrimitive):
    """Flags whether each datetime falls on the first day of a year.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 12, 31),
        ...          datetime(2019, 1, 1),
        ...          datetime(2019, 11, 30),
        ...          np.nan]
        >>> is_year_start = IsYearStart()
        >>> is_year_start(dates).tolist()
        [False, True, False, False]
    """

    name = "is_year_start"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=BooleanNullable)
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "whether {} occurred on the start of a year"

    def get_function(self):
        def is_year_start(datetimes):
            # Delegates to the datetime accessor's ``is_year_start`` attribute.
            return datetimes.dt.is_year_start

        return is_year_start
class Minute(TransformPrimitive):
    """Extracts the minute component of a datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 3, 3, 11, 10, 50),
        ...          datetime(2019, 3, 31, 19, 45, 15)]
        >>> minute = Minute()
        >>> minute(dates).tolist()
        [0, 10, 45]
    """

    name = "minute"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(60))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the minutes value of {}"

    def get_function(self):
        def minute(datetimes):
            # Delegates to the datetime accessor's ``minute`` attribute.
            return datetimes.dt.minute

        return minute
class Month(TransformPrimitive):
    """Extracts the month component of a datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 6, 17, 11, 10, 50),
        ...          datetime(2019, 11, 30, 19, 45, 15)]
        >>> month = Month()
        >>> month(dates).tolist()
        [3, 6, 11]
    """

    name = "month"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 13))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the month of {}"

    def get_function(self):
        def month(datetimes):
            # Delegates to the datetime accessor's ``month`` attribute.
            return datetimes.dt.month

        return month
class PartOfDay(TransformPrimitive):
    """Determines the part of day of a datetime.

    Description:
        For a list of datetimes, determines the part of day the datetime
        falls into, based on the hour.

        If the hour falls from 4 to 5, the part of day is 'dawn'.
        If the hour falls from 6 to 7, the part of day is 'early morning'.
        If the hour falls from 8 to 10, the part of day is 'late morning'.
        If the hour falls from 11 to 13, the part of day is 'noon'.
        If the hour falls from 14 to 16, the part of day is 'afternoon'.
        If the hour falls from 17 to 19, the part of day is 'evening'.
        If the hour falls from 20 to 22, the part of day is 'night'.
        If the hour is 23, 0, or 1 to 3, the part of day is 'midnight'.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2020, 1, 11, 6, 2, 1),
        ...          datetime(2021, 3, 31, 4, 2, 1),
        ...          datetime(2020, 3, 4, 9, 2, 1)]
        >>> part_of_day = PartOfDay()
        >>> part_of_day(dates).tolist()
        ['early morning', 'dawn', 'late morning']
    """

    name = "part_of_day"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=Categorical, semantic_tags={"category"})
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the part of day {} falls in"

    @staticmethod
    def construct_replacement_dict():
        """Build the mapping from hour value to part-of-day label.

        Returns:
            dict: Maps every hour 0-23 to its label, and `pd.NaT` to `NaN`.
        """
        hours_by_label = {
            "dawn": (4, 5),
            "early morning": (6, 7),
            "late morning": (8, 9, 10),
            "noon": (11, 12, 13),
            "afternoon": (14, 15, 16),
            "evening": (17, 18, 19),
            "night": (20, 21, 22),
            # Hour 0 was previously missing, so times between 00:00 and
            # 00:59 came back as the raw integer 0. The key 24 never
            # matches an hour value; it is kept only so the returned
            # mapping is a superset of the old one.
            "midnight": (23, 24, 0, 1, 2, 3),
        }
        tdict = {pd.NaT: np.nan}
        for label, hours in hours_by_label.items():
            for hour in hours:
                tdict[hour] = label
        return tdict

    def get_function(self):
        # Built once per call to get_function, not once per batch of values.
        replacement_dict = self.construct_replacement_dict()

        def part_of_day(vals):
            ans = vals.dt.hour.replace(replacement_dict)
            return ans

        return part_of_day
class Quarter(TransformPrimitive):
    """Extracts the quarter of the year (1, 2, 3, 4) a datetime falls into.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019,12,1),
        ...          datetime(2019,1,3),
        ...          datetime(2020,2,1)]
        >>> q = Quarter()
        >>> q(dates).tolist()
        [4, 1, 1]
    """

    name = "quarter"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 5))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the quarter that describes {}"

    def get_function(self):
        def quarter(datetimes):
            # Delegates to the datetime accessor's ``quarter`` attribute.
            return datetimes.dt.quarter

        return quarter
class Second(TransformPrimitive):
    """Extracts the seconds component of a datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 3, 3, 11, 10, 50),
        ...          datetime(2019, 3, 31, 19, 45, 15)]
        >>> second = Second()
        >>> second(dates).tolist()
        [0, 50, 15]
    """

    name = "second"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(60))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the seconds value of {}"

    def get_function(self):
        def second(datetimes):
            # Delegates to the datetime accessor's ``second`` attribute.
            return datetimes.dt.second

        return second
class TimeSince(TransformPrimitive):
    """Calculates the time elapsed from each value to a cutoff datetime.

    Description:
        The result is the cutoff time minus the value, so values that lie
        after the cutoff produce negative results.

    Args:
        unit (str): Defines the unit of time to count from.
            Defaults to Seconds. Acceptable values:
            years, months, days, hours, minutes, seconds, milliseconds, nanoseconds

    Examples:
        >>> from datetime import datetime
        >>> time_since = TimeSince()
        >>> times = [datetime(2019, 3, 1, 0, 0, 0, 1),
        ...          datetime(2019, 3, 1, 0, 0, 1, 0),
        ...          datetime(2019, 3, 1, 0, 2, 0, 0)]
        >>> cutoff_time = datetime(2019, 3, 1, 0, 0, 0, 0)
        >>> values = time_since(times, time=cutoff_time)
        >>> list(map(int, values))
        [0, -1, -120]

        Change output to nanoseconds

        >>> from datetime import datetime
        >>> time_since_nano = TimeSince(unit='nanoseconds')
        >>> times = [datetime(2019, 3, 1, 0, 0, 0, 1),
        ...          datetime(2019, 3, 1, 0, 0, 1, 0),
        ...          datetime(2019, 3, 1, 0, 2, 0, 0)]
        >>> cutoff_time = datetime(2019, 3, 1, 0, 0, 0, 0)
        >>> values = time_since_nano(times, time=cutoff_time)
        >>> list(map(lambda x: int(round(x)), values))
        [-1000, -1000000000, -120000000000]
    """

    name = "time_since"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    uses_calc_time = True
    compatibility = [Library.PANDAS, Library.DASK]
    description_template = "the time from {} to the cutoff time"

    def __init__(self, unit="seconds"):
        # Stored lower-cased so the unit is matched case-insensitively.
        self.unit = unit.lower()

    def get_function(self):
        def pd_time_since(array, time):
            # Work in seconds first, then convert to the requested unit.
            elapsed_seconds = (time - array).dt.total_seconds()
            return convert_time_units(elapsed_seconds, self.unit)

        return pd_time_since
class TimeSincePrevious(TransformPrimitive):
    """Compute the time since the previous entry in a list.

    Args:
        unit (str): Defines the unit of time to count from.
            Defaults to Seconds. Acceptable values:
            years, months, days, hours, minutes, seconds, milliseconds, nanoseconds

    Description:
        Given a list of datetimes, compute the time elapsed since the
        previous item in the list, expressed in `unit` (seconds by
        default). The result for the first item in the list will always
        be `NaN`. Entries that are earlier than their predecessor produce
        negative values.

    Examples:
        >>> from datetime import datetime
        >>> time_since_previous = TimeSincePrevious()
        >>> dates = [datetime(2019, 3, 1, 0, 0, 0),
        ...          datetime(2019, 3, 1, 0, 2, 0),
        ...          datetime(2019, 3, 1, 0, 3, 0),
        ...          datetime(2019, 3, 1, 0, 2, 30),
        ...          datetime(2019, 3, 1, 0, 10, 0)]
        >>> time_since_previous(dates).tolist()
        [nan, 120.0, 60.0, -30.0, 450.0]
    """

    name = "time_since_previous"
    input_types = [ColumnSchema(logical_type=Datetime, semantic_tags={"time_index"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    description_template = "the time since the previous instance of {}"

    def __init__(self, unit="seconds"):
        # Stored lower-cased so the unit is matched case-insensitively.
        self.unit = unit.lower()

    def get_function(self):
        def pd_diff(values):
            # Vectorized conversion of the timedeltas to seconds, in place
            # of a per-element Python lambda; the leading NaT from diff()
            # becomes NaN.
            elapsed_seconds = values.diff().dt.total_seconds()
            return convert_time_units(elapsed_seconds, self.unit)

        return pd_diff
class Week(TransformPrimitive):
    """Determines the week of the year from a datetime.

    Description:
        Returns the ISO 8601 week number of a datetime value. Weeks start
        on Monday, and week 1 is the week that contains the year's first
        Thursday. As a consequence, the first days of January can belong
        to week 52 or 53 of the previous year, and the last days of
        December can belong to week 1 of the next year.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 1, 3),
        ...          datetime(2019, 6, 17, 11, 10, 50),
        ...          datetime(2019, 11, 30, 19, 45, 15)]
        >>> week = Week()
        >>> week(dates).tolist()
        [1, 25, 48]
    """

    name = "week"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 54))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the week of the year of {}"

    def get_function(self):
        def week(vals):
            # Prefer isocalendar() where the accessor provides it; fall
            # back to the older ``week`` attribute otherwise.
            if hasattr(vals.dt, "isocalendar"):
                return vals.dt.isocalendar().week
            return vals.dt.week

        return week
class Weekday(TransformPrimitive):
    """Extracts the day of the week from a datetime.

    Description:
        Returns the day of the week as a number, where Monday is day 0
        and Sunday is day 6.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 6, 17, 11, 10, 50),
        ...          datetime(2019, 11, 30, 19, 45, 15)]
        >>> weekday = Weekday()
        >>> weekday(dates).tolist()
        [4, 0, 5]
    """

    name = "weekday"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(7))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the day of the week of {}"

    def get_function(self):
        def weekday(datetimes):
            # Delegates to the datetime accessor's ``weekday`` attribute.
            return datetimes.dt.weekday

        return weekday
class Year(TransformPrimitive):
    """Extracts the year component of a datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2048, 6, 17, 11, 10, 50),
        ...          datetime(1950, 11, 30, 19, 45, 15)]
        >>> year = Year()
        >>> year(dates).tolist()
        [2019, 2048, 1950]
    """

    name = "year"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 3000))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the year of {}"

    def get_function(self):
        def year(datetimes):
            # Delegates to the datetime accessor's ``year`` attribute.
            return datetimes.dt.year

        return year
class IsFederalHoliday(TransformPrimitive):
    """Determines if a given datetime is a federal holiday.

    Description:
        This primitive currently only works for the United States
        and Canada with dates between 1950 and 2100.

    Args:
        country (str): Country to use for determining Holidays.
            Default is 'US'. Should be one of the available countries here:
            https://github.com/dr-prodigy/python-holidays#available-countries

    Raises:
        ValueError: If the holidays library does not implement `country`.

    Examples:
        >>> from datetime import datetime
        >>> is_federal_holiday = IsFederalHoliday(country="US")
        >>> is_federal_holiday([
        ...     datetime(2019, 7, 4, 10, 0, 30),
        ...     datetime(2019, 2, 26)]).tolist()
        [True, False]
    """

    name = "is_federal_holiday"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(logical_type=BooleanNullable)

    def __init__(self, country="US"):
        self.country = country
        try:
            self.holidays = holidays.country_holidays(country=self.country)
        except NotImplementedError:
            available_countries = (
                "https://github.com/dr-prodigy/python-holidays#available-countries"
            )
            error = "must be one of the available countries:\n%s" % available_countries
            raise ValueError(error)
        # Pre-compute the holiday calendar for the years 1950 through 2099.
        years_list = list(range(1950, 2100))
        self.federal_holidays = getattr(holidays, country)(years=years_list)

    def get_function(self):
        def is_federal_holiday(x):
            holidays_df = pd.DataFrame(
                sorted(self.federal_holidays.items()),
                columns=["dates", "names"],
            )
            # Compare on the date only, ignoring the time-of-day.
            is_holiday = x.dt.normalize().isin(holidays_df.dates)
            missing = x.isnull()
            if missing.values.any():
                # Missing inputs yield NaN rather than False; an object
                # dtype is needed to hold NaN next to booleans.
                is_holiday = is_holiday.astype("object")
                is_holiday[missing] = np.nan
            return is_holiday.values

        return is_federal_holiday