import pandas as pd
from featuretools.primitives import TransformPrimitive
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import IntegerNullable, NaturalLanguage
# Default separator characters; used as the default value of the
# ``separators`` argument of ``NumUniqueSeparators.__init__``.
NATURAL_LANGUAGE_SEPARATORS = [" ", ".", ",", "!", "?", ";", "\n"]
class NumUniqueSeparators(TransformPrimitive):
    r"""Calculates the number of unique separators.

    Description:
        Given a string and a list of separators, determine
        the number of unique separators in each string. If a string
        is null determined by pd.isnull return pd.NA.

    Args:
        separators (list, optional): a list of separator characters to count.
            ``[" ", ".", ",", "!", "?", ";", "\n"]`` is used by default.
            For string inputs each entry is compared against the individual
            characters of the string, so only single-character entries
            can match.

    Raises:
        AssertionError: if ``separators`` is None.

    Examples:
        >>> x = ["First. Line.", "This. is the second, line!", "notinlist@#$%^%&"]
        >>> num_unique_separators = NumUniqueSeparators([".", ",", "!"])
        >>> num_unique_separators(x).tolist()
        [1, 3, 0]
    """

    name = "num_unique_separators"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})

    def __init__(self, separators=NATURAL_LANGUAGE_SEPARATORS):
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``. The exception type and message are
        # kept so existing callers observe the same failure.
        if separators is None:
            raise AssertionError("separators needs to be defined")
        self.separators = separators

    def get_function(self):
        """Build the callable that computes the feature for a column.

        Returns:
            callable: a function that takes a column, applies the per-value
            count to it with ``column.apply`` and returns the result.
        """
        # The separator set does not depend on the row being processed, so
        # build it once here rather than once per value.
        separator_set = frozenset(self.separators)

        def count_unique_separator(s):
            # Null values propagate as pd.NA instead of being counted.
            if pd.isnull(s):
                return pd.NA
            # ``intersection`` accepts any iterable, so the string is passed
            # directly; the result holds each matching separator once.
            return len(separator_set.intersection(s))

        def get_separator_count(column):
            return column.apply(count_unique_separator)

        return get_separator_count