# Source code for nlp_primitives.diversity_score

import numpy as np
import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Numeric, Text

from .utilities import clean_tokens


class DiversityScore(TransformPrimitive):
    """Calculates the lexical diversity of the text as the ratio of unique
       words to total words.

    Description:
        Given a list of strings, calculates the total number of unique words
        divided by the total number of words in order to give the text a score
        from 0-1 that indicates how unique the words used in it are. This
        primitive only evaluates the 'clean' versions of strings, so ignoring cases,
        punctuation, and stopwords in its evaluation.

        If a string is missing, return `NaN`. If a string has no words left
        after cleaning, return `0.0`.

    Examples:
        >>> diversity_score = DiversityScore()
        >>> diversity_score(["hi hi hi", "hello its me", "hey what hey what", "a dog ate a basket"]).tolist()
        [0.3333333333333333, 1.0, 0.5, 1.0]
    """
    name = "diversity_score"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def get_function(self):
        """Return the function that scores every element of a text column."""

        def diversity_score(x):
            scores = []
            for text in x:
                # Missing values propagate as NaN rather than being scored.
                if pd.isnull(text):
                    scores.append(np.nan)
                    continue
                tokens = clean_tokens(text)
                if len(tokens) == 0:
                    # Nothing survived cleaning; avoid dividing by zero.
                    scores.append(0.0)
                else:
                    scores.append(float(len(set(tokens))) / len(tokens))
            # Pin the dtype so an empty input still yields a float Series
            # instead of a pandas-version-dependent default dtype.
            return pd.Series(scores, dtype=float)
        return diversity_score