Source code for featuretools.primitives.standard.transform.natural_language.number_of_mentions
import re
import string
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import IntegerNullable, NaturalLanguage
from featuretools.primitives.standard.transform.natural_language.count_string import (
CountString,
)
class NumberOfMentions(CountString):
    """Determines the number of mentions in a string.

    Description:
        Given a list of strings, determine the number of mentions
        in each string.

        A mention is defined as a string that meets the following criteria:
            - Starts with a '@' character, followed by a sequence of
              one or more regex word characters (letters, digits, underscore)
            - Present at the start of a string or after whitespace
            - Terminated by the end of the string, a whitespace, or a
              punctuation character other than '@'
                - e.g. The string '@yes-no' contains a valid mention ('@yes')
                - e.g. The string '@yes@' does not contain a valid mention

        This implementation handles Unicode characters.

        This implementation does not impose any character limit on mentions.

        If a string is missing, return `NaN`.

    Examples:
        >>> x = ['@user1 @user2', 'this is a string', '@@@__user1@1and_0@expression']
        >>> number_of_mentions = NumberOfMentions()
        >>> number_of_mentions(x).tolist()
        [2.0, 0.0, 0.0]
    """

    name = "number_of_mentions"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})
    default_value = 0

    def __init__(self):
        """Build the mention regex and hand it to ``CountString``.

        The pattern is passed as a case-sensitive regular expression
        (``is_regex=True, ignore_case=False``).
        """
        # Every ASCII punctuation character except '@' may terminate a mention.
        # Derive the characters from string.punctuation directly (rather than
        # by iterating a set) so the generated pattern text is identical on
        # every run, independent of string hash randomization.
        terminators = re.escape(string.punctuation.replace("@", ""))

        # Groups 1-3: '@' at the very start of the string or after whitespace.
        # Group 4:    the handle itself, one or more word characters.
        # Lookahead:  the handle must be followed by whitespace, end of
        #             string, or a non-'@' punctuation character; the
        #             lookahead does not consume it, so the whitespace before
        #             a following mention is still available to match.
        pattern = rf"((^@)|(\s+@))(\w+)(?=\s|$|[{terminators}])"
        super().__init__(string=pattern, is_regex=True, ignore_case=False)