Source code for nlp_primitives.number_of_common_words

from typing import Iterable

import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import IntegerNullable, NaturalLanguage

from .constants import common_words_1000


class NumberOfCommonWords(TransformPrimitive):
    """Determines the number of common words in a string.

    Description:
        Given string, determine the number of words that appear in a supplied word set.
        The word set defaults to nlp_primitives.constants.common_words_1000. The string
        is case insensitive. The word bank should consist of only lower case strings. If a string is
        missing, return `pd.NA`.

    Args:
        word_set (set, optional): The set of words to look for in the string. These
            words should all be lower case strings.
        delimiters_regex (str, optional): The regular expression used to determine
            what separates words.

    Examples:
        >>> x = ['Hey! This is some natural language', 'bacon, cheesburger, AND, fries', 'I! Am. A; duck?']
        >>> number_of_common_words = NumberOfCommonWords(word_set={'and', 'some', 'am', 'a', 'the', 'is', 'i'})
        >>> number_of_common_words(x).tolist()
        [2, 1, 3]

        >>> # regex doesn't include a ! so 'Hey!' gets matched to the wordset instead of 'Hey'
        >>> x = ['Hey! This is. some. natural language']
        >>> number_of_common_words = NumberOfCommonWords(word_set={'hey', 'is', 'some'}, delimiters_regex="[ .]")
        >>> number_of_common_words(x).tolist()
        [2]
    """

    name = "number_of_common_words"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})

    default_value = 0

    # NOTE: the default ``word_set`` is built once, when the class body is
    # executed, and is shared by every instance that relies on the default.
    # It is only ever read here; callers should treat it as read-only.
    def __init__(
        self, word_set=set(common_words_1000), delimiters_regex=r"[- \[\].,!\?;\n]"
    ):
        """Store the word set and the delimiter pattern used to split strings.

        Args:
            word_set (set, optional): Lower-case words to count.
            delimiters_regex (str, optional): Pattern passed to ``str.split``
                to break each string into words.
        """
        self.delimiters_regex = delimiters_regex
        self.word_set = word_set

    def get_function(self):
        """Return the callable that computes the per-row common-word count."""

        def get_num_in_word_bank(words):
            """Count the tokens in ``words`` whose lower-cased form is in the word set."""
            # A non-iterable value here means the split produced no token list
            # (presumably a missing input string), so report a missing count.
            if not isinstance(words, Iterable):
                return pd.NA
            # Membership is tested against the lower-cased token only, which
            # is why word_set is expected to hold lower-case strings.
            return sum(1 for w in words if w.lower() in self.word_set)

        def num_common_words(x):
            """Split each string of ``x`` on the delimiter pattern and count matches."""
            words = x.str.split(self.delimiters_regex)
            return words.apply(get_num_in_word_bank)

        return num_common_words
Table of Contents

Quick search

Source code for nlp_primitives.number_of_common_words