Source code for nlp_primitives.part_of_speech_count

import nltk
import numpy as np
import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Double, NaturalLanguage

from nlp_primitives.utilities import clean_tokens


class PartOfSpeechCount(TransformPrimitive):
    """Calculates the occurrences of each different part of speech.

    Description:
        Given a list of strings, tags each word in the string with its part of speech.
        This method calculates the total count for each of the 15 different categories of speech.

        Words are grouped by the FIRST LETTER of the tag returned by
        `nltk.pos_tag`, so tags that share a first letter are counted
        together (for example, in the Penn Treebank tag set both `CC` and
        `CD` start with `C`). Tags whose first character is not one of the
        15 tracked letters (e.g. punctuation tags) are ignored.

        Given a list of N strings, this method will return a 15xN matrix. Each row will correspond
        to the appropriate part of speech. An empty input (N = 0) yields
        15 empty rows.

        If a string is missing, return `NaN`.

    Examples:
        >>> x = ['He was eating cheese', '']
        >>> part_of_speech_count = PartOfSpeechCount()
        >>> part_of_speech_count(x).tolist()
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0]]
    """

    name = "part_of_speech_count"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})
    default_value = 0

    # First letters of the part-of-speech tags that are counted, in output-row
    # order. This is the single source of truth for the number of outputs.
    # For more info about the different parts of speech, see here: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    _POS_TAG_PREFIXES = (
        "C",  # cardinal digits
        "D",  # determiner
        "E",  # existential there
        "F",  # foreign word
        "I",  # preposition/subordinating conjunction
        "J",  # adjective
        "L",  # list marker
        "M",  # modal
        "N",  # noun
        "P",  # pronoun
        "R",  # adverb
        "T",  # to
        "U",  # interjection
        "V",  # verb
        "W",  # "wh"-pronoun
    )

    def __init__(self):
        super().__init__()
        # Both attributes equal the number of tracked categories (15); they
        # are derived from the tuple so they cannot drift out of sync with it.
        self.number_output_features = len(self._POS_TAG_PREFIXES)
        self.n = len(self._POS_TAG_PREFIXES)

    def get_function(self):
        """Return the callable that computes the part-of-speech counts.

        Returns:
            A function taking an iterable of strings and returning a
            ``pd.Series`` with one element per tracked category; each element
            is a list holding that category's count for every input string
            (``NaN`` where the input string is missing).
        """
        prefixes = self._POS_TAG_PREFIXES
        n_categories = len(prefixes)

        def part_of_speech_count(series):
            rows = []
            for element in series:
                if pd.isnull(element):
                    # Missing text: every category is NaN for this element.
                    rows.append([np.nan] * n_categories)
                    continue
                tagged = nltk.pos_tag(clean_tokens(element))
                # Bucket each token by the first character of its tag.
                counts = nltk.FreqDist(tag[0] for _, tag in tagged)
                rows.append([float(counts[prefix]) for prefix in prefixes])
            # Force a 2-D (N, n_categories) shape so that an empty input
            # still transposes to n_categories (empty) rows instead of
            # collapsing to an empty result.
            matrix = np.array(rows, dtype=float).reshape(-1, n_categories)
            return pd.Series(matrix.T.tolist())

        return part_of_speech_count
Table of Contents

Quick search

Source code for nlp_primitives.part_of_speech_count