Source code for nlp_primitives.diversity_score

import numpy as np
import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Double, NaturalLanguage

from nlp_primitives.utilities import clean_tokens


class DiversityScore(TransformPrimitive):
    """Calculates the overall complexity of the text based on the total number of words used in the text.

    Description:
        Given a list of strings, calculates the total number of unique words
        divided by the total number of words in order to give the text a score
        from 0-1 that indicates how unique the words used in it are. This
        primitive only evaluates the 'clean' versions of strings, ignoring
        cases, punctuation, and stopwords in its evaluation. If a string is
        missing, return `NaN`.

    Examples:
        >>> diversity_score = DiversityScore()
        >>> diversity_score(["hi hi hi", "hello its me", "hey what hey what", "a dog ate a basket"]).tolist()
        [0.3333333333333333, 1.0, 0.5, 1.0]
    """

    name = "diversity_score"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})
    default_value = 0

    def get_function(self):
        def diversity_score(x):
            # Score each string independently; missing values propagate as NaN.
            li = []
            for el in x:
                if pd.isnull(el):
                    li.append(np.nan)
                else:
                    # Lowercase, strip punctuation, and drop stopwords before scoring.
                    el = clean_tokens(el)
                    if len(el) < 1:
                        li.append(0.0)
                    else:
                        # Ratio of unique tokens to total tokens, in [0, 1].
                        li.append(float(len(set(el))) / float(len(el)))
            return pd.Series(li)

        return diversity_score
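
Below is a minimal usage sketch (not part of the module source) showing how the transform function returned by get_function can be applied directly to a pandas Series, mirroring the docstring example. Exact scores for strings containing stopwords depend on how clean_tokens filters tokens.

import pandas as pd

from nlp_primitives.diversity_score import DiversityScore

# Build the transform function the primitive would hand to Featuretools.
diversity_score = DiversityScore().get_function()

text = pd.Series(
    [
        "hi hi hi",            # one unique token out of three -> 0.333...
        "a dog ate a basket",  # remaining tokens after cleaning are all unique -> 1.0
        None,                  # missing text -> NaN
    ]
)

print(diversity_score(text).tolist())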