Source code for nlp_primitives.number_of_common_words

from typing import Iterable

import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import IntegerNullable, NaturalLanguage

from .constants import common_words_1000


class NumberOfCommonWords(TransformPrimitive):
    """Counts how many words of each string belong to a given word set.

    Description:
        Each string is split into tokens using `delimiters_regex`, every
        token is lower-cased, and the tokens found in `word_set` are
        counted. Repeated occurrences are counted each time they appear.
        Matching is case insensitive with respect to the input string,
        but the word set itself is expected to hold only lower case
        strings. When no word set is supplied, a set built from
        nlp_primitives.constants.common_words_1000 is used. Entries whose
        split result is not iterable (e.g. a missing string) produce
        `pd.NA`.

    Args:
        word_set (set, optional): Words to search for in each string.
            They should all be lower case, because tokens are lower-cased
            before the membership test.
        delimiters_regex (str, optional): Regular expression describing
            what separates one word from the next.

    Examples:
        >>> x = ['Hey! This is some natural language', 'bacon, cheesburger, AND, fries', 'I! Am. A; duck?']
        >>> number_of_common_words = NumberOfCommonWords(word_set={'and', 'some', 'am', 'a', 'the', 'is', 'i'})
        >>> number_of_common_words(x).tolist()
        [2, 1, 3]

        >>> # regex doesn't include a ! so 'Hey!' gets matched to the wordset instead of 'Hey'
        >>> x = ['Hey! This is. some. natural language']
        >>> number_of_common_words = NumberOfCommonWords(word_set={'hey', 'is', 'some'}, delimiters_regex="[ .]")
        >>> number_of_common_words(x).tolist()
        [2]
    """

    name = "number_of_common_words"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})
    default_value = 0

    def __init__(
        self, word_set=set(common_words_1000), delimiters_regex=r"[- \[\].,!\?;\n]"
    ):
        """Store the word set and the delimiter pattern on the instance."""
        self.word_set = word_set
        self.delimiters_regex = delimiters_regex

    def get_function(self):
        """Return a callable that maps a Series of strings to match counts."""

        def count_matches(tokens):
            # A non-iterable value here means the split produced no token
            # list for this row (e.g. the input was missing).
            if not isinstance(tokens, Iterable):
                return pd.NA
            # The word set is assumed to be all lower case, so only the
            # token needs normalising before the lookup.
            lexicon = self.word_set
            return sum(1 for token in tokens if token.lower() in lexicon)

        def num_common_words(x):
            return x.str.split(self.delimiters_regex).apply(count_matches)

        return num_common_words