Source code for nlp_primitives.count_string

import re

import numpy as np
from featuretools.primitives import TransformPrimitive
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import IntegerNullable, NaturalLanguage


class CountString(TransformPrimitive):
    """Determines how many times a given string shows up in a text field.

    Description:
        Each input value is normalized with ``process_text`` (optionally
        stripping non-alphanumeric characters and lower-casing) and the
        number of non-overlapping matches of the search pattern is
        returned. Values that are not strings produce ``NaN``.

    Args:
        string (str): The string to determine the count of. Defaults to
            the word "the".
        ignore_case (bool): Determines if case of the string should be
            ignored or not. Defaults to True.
        ignore_non_alphanumeric (bool): If True, characters other than
            ASCII letters, digits and spaces are removed from the text
            (and from a non-regex search string) before counting.
            Defaults to False.
        is_regex (bool): Defines if the string argument is a regex or not.
            A regex is used verbatim; it is never stripped or lower-cased.
            Defaults to False.
        match_whole_words_only (bool): Determines if whole words should be
            matched or not. For example searching for word `the` against
            `then, the, there` should only return `the` if this argument
            was True. Defaults to False.

    Examples:
        >>> count_string = CountString(string="the")
        >>> count_string(["The problem was difficult.",
        ...               "He was there.",
        ...               "The girl went to the store."]).tolist()
        [1.0, 1.0, 2.0]
        >>> # Match case of string
        >>> count_string_ignore_case = CountString(string="the", ignore_case=False)
        >>> count_string_ignore_case(["The problem was difficult.",
        ...                           "He was there.",
        ...                           "The girl went to the store."]).tolist()
        [0.0, 1.0, 1.0]
        >>> # Ignore non-alphanumeric characters in the search
        >>> count_string_ignore_non_alphanumeric = CountString(string="the",
        ...                                                    ignore_non_alphanumeric=True)
        >>> count_string_ignore_non_alphanumeric(["Th*/e problem was difficult.",
        ...                                       "He was there.",
        ...                                       "The girl went to the store."]).tolist()
        [1.0, 1.0, 2.0]
        >>> # Specify the string as a regex
        >>> count_string_is_regex = CountString(string="t.e", is_regex=True)
        >>> count_string_is_regex(["The problem was difficult.",
        ...                        "He was there.",
        ...                        "The girl went to the store."]).tolist()
        [1.0, 1.0, 2.0]
        >>> # Match whole words only
        >>> count_string_match_whole_words_only = CountString(string="the",
        ...                                                   match_whole_words_only=True)
        >>> count_string_match_whole_words_only(["The problem was difficult.",
        ...                                      "He was there.",
        ...                                      "The girl went to the store."]).tolist()
        [1.0, 0.0, 2.0]
    """

    name = "count_string"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})

    def __init__(
        self,
        string="the",
        ignore_case=True,
        ignore_non_alphanumeric=False,
        is_regex=False,
        match_whole_words_only=False,
    ):
        self.string = string
        self.ignore_case = ignore_case
        self.ignore_non_alphanumeric = ignore_non_alphanumeric
        self.match_whole_words_only = match_whole_words_only
        self.is_regex = is_regex

        # we don't want to strip non alphanumeric characters from the pattern
        # ie h.ll. should match "hello" so we can't strip the dots to make hll
        if is_regex:
            # The regex is kept verbatim. Lower-casing it would corrupt
            # case-sensitive escapes (\W -> \w, \S -> \s, \D -> \d, \B -> \b);
            # case-insensitivity is provided by the IGNORECASE flag instead.
            self.pattern = string
        else:
            self.pattern = re.escape(self.process_text(string))

        # \b\b.*\b\b is the same as \b.*\b so we don't have to check if
        # the pattern is given to us as regex and if it already has leading
        # and trailing \b's. The non-capturing group makes the boundaries
        # apply to the whole pattern, so an alternation such as "cat|dog"
        # is not parsed as "\bcat" OR "dog\b".
        if match_whole_words_only:
            self.pattern = r"\b(?:" + self.pattern + r")\b"

        # Flags used when the pattern is compiled in get_function.
        self.regex_flags = re.IGNORECASE if ignore_case else 0

    def process_text(self, text):
        """Normalize ``text`` according to the primitive's options.

        Args:
            text (str): The text to normalize.

        Returns:
            str: ``text`` with every character other than ASCII letters,
            digits and spaces removed when ``ignore_non_alphanumeric`` is
            set, and lower-cased when ``ignore_case`` is set.
        """
        if self.ignore_non_alphanumeric:
            text = re.sub("[^0-9a-zA-Z ]+", "", text)
        if self.ignore_case:
            text = text.lower()
        return text

    def get_function(self):
        """Build the vectorized counting function.

        Returns:
            numpy.vectorize: A callable that maps each element to the number
            of non-overlapping pattern matches in its normalized text, as a
            float. Elements that are not strings map to ``NaN``.
        """
        # Compile once rather than resolving the pattern for every element.
        compiled_pattern = re.compile(self.pattern, self.regex_flags)

        def count_string(words):
            # Missing values (None, NaN, pd.NA, ...) are not strings.
            if not isinstance(words, str):
                return np.nan
            return len(compiled_pattern.findall(self.process_text(words)))

        # float output so that NaN can represent missing values
        return np.vectorize(count_string, otypes=[float])
Table of Contents

Quick search

Source code for nlp_primitives.count_string