Source code for featuretools.primitives.standard.transform.email.email_address_to_domain
import numpy as np
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Categorical, EmailAddress
from featuretools.primitives.base import TransformPrimitive
class EmailAddressToDomain(TransformPrimitive):
    """Determines the domain of an email

    Description:
        EmailAddress input should be a string. Will return NaN
        if an invalid email address is provided, or if the input is
        not a string.

    Examples:
        >>> email_address_to_domain = EmailAddressToDomain()
        >>> email_address_to_domain(['name@gmail.com', 'name@featuretools.com']).tolist()
        ['gmail.com', 'featuretools.com']
    """

    name = "email_address_to_domain"
    input_types = [ColumnSchema(logical_type=EmailAddress)]
    return_type = ColumnSchema(logical_type=Categorical, semantic_tags={"category"})

    def get_function(self):
        """Return the callable that maps a collection of emails to their domains."""

        def email_address_to_domain(emails):
            """Extract the text after the first "@" of each email.

            Args:
                emails: Sized collection (list, array, or Series) of email
                    strings; entries may be missing.

            Returns:
                An empty categorical ``pd.Series`` when ``emails`` is empty;
                otherwise an array with one entry per input holding the
                domain, or a missing value where no domain could be found.
            """
            # if the input is empty return an empty Series
            if len(emails) == 0:
                return pd.Series([], dtype="category")

            emails_df = pd.DataFrame({"email": emails})

            # The expanded split only has a column 1 when at least one value
            # contains "@". Skip the split entirely when every value is
            # missing, since the .str accessor is not available on an
            # all-NaN float column.
            parts = None
            if not emails_df["email"].isnull().all():
                # .str.strip() and .str.split() return NaN for NaN values
                # and propagate NaNs into the new columns
                parts = emails_df["email"].str.strip().str.split("@", expand=True)

            if parts is not None and parts.shape[1] > 1:
                emails_df["domain"] = parts[1]
            else:
                # No value produced a domain (all missing, or none contained
                # "@"): indexing column 1 would raise KeyError, so return
                # missing values instead.
                emails_df["domain"] = np.nan
                emails_df["domain"] = emails_df["domain"].astype(object)

            return emails_df.domain.values

        return email_address_to_domain