Source code for featuretools.primitives.standard.transform.natural_language.number_of_words_in_quotes

import re
from string import punctuation

import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import IntegerNullable, NaturalLanguage

from featuretools.primitives.base import TransformPrimitive
from featuretools.primitives.standard.transform.natural_language.constants import (
    DELIMITERS,
)


class NumberOfWordsInQuotes(TransformPrimitive):
    """Determines the number of words in quotes in a string.

    Description:
        Given a list of strings, determine the number of words in quotes
        in each string. Every quoted phrase is split on ``DELIMITERS``;
        each piece that is non-empty after stripping ASCII punctuation
        and spaces counts as one word.

        This implementation handles Unicode characters.

        If a string is missing, the result for it is missing (`pd.NA`).

    Args:
        quote_type (str, optional): Specifies what type of quotation marks
            to match. Argument "single" matches on only single quotes (' ').
            Argument "double" matches words between double quotes (" ").
            Argument "both" matches words between either type of quotes.
            Defaults to "both".

    Raises:
        ValueError: If `quote_type` is not "both", "single", or "double".

    Examples:
        >>> x = ['"python" java prolog "Diffie-Hellman" "4.99"',
        ...      "Reach me at 'user@email.com'",
        ...      "'Here's an interesting example!'"]
        >>> number_of_words_in_quotes = NumberOfWordsInQuotes()
        >>> number_of_words_in_quotes(x).tolist()
        [3, 1, 4]
    """

    name = "number_of_words_in_quotes"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})
    default_value = 0

    def __init__(self, quote_type="both"):
        if quote_type not in ("both", "single", "double"):
            raise ValueError(
                f"{quote_type} is not a valid quote_type. Specify 'both', 'single', or 'double'",
            )
        self.quote_type = quote_type

        # An opening quote must sit at the start of the text or right after a
        # non-word character, and a closing quote must not be followed by a
        # word character. This keeps apostrophes inside words ("Here's") from
        # opening or closing a quoted phrase. The outermost group captures the
        # whole phrase, including the quotes and any leading non-word char.
        IN_DOUBLE_QUOTES = r'((^|\W)"(.)*?"(?!\w))'
        IN_SINGLE_QUOTES = r"((^|\W)'(.)*?'(?!\w))"

        if quote_type == "double":
            self.regex = IN_DOUBLE_QUOTES
        elif quote_type == "single":
            self.regex = IN_SINGLE_QUOTES
        else:
            self.regex = f"({IN_SINGLE_QUOTES}|{IN_DOUBLE_QUOTES})"

    def get_function(self):
        """Build the function that counts quoted words for a pandas Series.

        Returns:
            callable: Takes a Series of strings and returns an "Int64" Series
            holding, per row, the number of words found inside quotes.
        """
        # Loop invariants: compile the phrase pattern and build the strip set
        # once instead of once per row / per word.
        quoted_phrases = re.compile(self.regex, re.DOTALL)
        strippable = punctuation + " "

        def count_words_in_quotes(text):
            if pd.isnull(text):
                return pd.NA
            count = 0
            # With groups in the pattern, findall yields one tuple of groups
            # per match; element 0 is the outermost group (the whole phrase).
            for groups in quoted_phrases.findall(text):
                pieces = re.split(DELIMITERS, groups[0])
                # Pieces made up solely of punctuation/spaces are not words.
                count += sum(1 for piece in pieces if piece.strip(strippable))
            return count

        def num_words_in_quotes(array):
            return array.apply(count_words_in_quotes).astype("Int64")

        return num_words_in_quotes