Source code for featuretools.primitives.standard.transform.email.is_free_email_domain
import numpy as np
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import BooleanNullable, EmailAddress
from featuretools.primitives.base import TransformPrimitive
[docs]class IsFreeEmailDomain(TransformPrimitive):
"""Determines if an email address is from a free email domain.
Description:
EmailAddress input should be a string. Will return Nan
if an invalid email address is provided, or if the input is
not a string. The list of free email domains used in this primitive
was obtained from https://github.com/willwhite/freemail/blob/master/data/free.txt.
Examples:
>>> is_free_email_domain = IsFreeEmailDomain()
>>> is_free_email_domain(['[email protected]', '[email protected]']).tolist()
[True, False]
"""
name = "is_free_email_domain"
input_types = [ColumnSchema(logical_type=EmailAddress)]
return_type = ColumnSchema(logical_type=BooleanNullable)
filename = "free_email_provider_domains.txt"
def get_function(self):
file_path = self.get_filepath(self.filename)
free_domains = pd.read_csv(file_path, header=None, names=["domain"])
free_domains["domain"] = free_domains.domain.str.strip()
def is_free_email_domain(emails):
# if the input is empty return an empty Series
if len(emails) == 0:
return pd.Series([], dtype="category")
emails_df = pd.DataFrame({"email": emails})
# if all emails are NaN expand won't propogate NaNs and will fail on indexing
if emails_df["email"].isnull().all():
emails_df["domain"] = np.nan
else:
# .str.strip() and .str.split() return NaN for NaN values and propogate NaNs into new columns
emails_df["domain"] = (
emails_df["email"].str.strip().str.split("@", expand=True)[1]
)
emails_df["is_free"] = emails_df["domain"].isin(free_domains["domain"])
# if there are any NaN domain values, change the series type to allow for
# both bools and NaN values and set is_free to NaN for the NaN domains
if emails_df["domain"].isnull().values.any():
emails_df["is_free"] = emails_df["is_free"].astype("object")
emails_df.loc[emails_df["domain"].isnull(), "is_free"] = np.nan
return emails_df.is_free.values
return is_free_email_domain