Source code for nlp_primitives.lsa

import nltk
import numpy as np
import pandas as pd
from featuretools.primitives.base import TransformPrimitive
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Double, NaturalLanguage

from .utilities import clean_tokens


class LSA(TransformPrimitive):
    """Calculates the Latent Semantic Analysis Values of NaturalLanguage Input

    Description:
        Given a list of strings, transforms those strings using tf-idf and
        singular value decomposition to go from a sparse matrix to a compact
        matrix with two values for each string. These values represent the
        Latent Semantic Analysis of each string. These values will represent
        their context with respect to
        [nltk's gutenberg corpus](https://www.nltk.org/book/ch02.html#gutenberg-corpus).

        If a string is missing, return `NaN`. If every string is missing (or
        the input is empty), every returned value is `NaN`.

    Args:
        random_seed (int, optional): Seed passed as ``random_state`` to the
            TruncatedSVD step of the pipeline. Defaults to 0.

    Examples:
        >>> lsa = LSA()
        >>> x = ["he helped her walk,", "me me me eat food", "the sentence doth long"]
        >>> res = lsa(x).tolist()
        >>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
        >>> res
        [[0.01, 0.01, 0.01], [0.0, 0.0, 0.01]]

        NaN values are handled, as well as strings without words.

        >>> lsa = LSA()
        >>> x = ["the earth is round", "", np.nan, ".,/"]
        >>> res = lsa(x).tolist()
        >>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
        >>> res
        [[0.02, 0.0, nan, 0.0], [0.02, 0.0, nan, 0.0]]
    """
    name = "lsa"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={'numeric'})
    default_value = 0

    def __init__(self, random_seed=0):
        # TODO: allow user to use own corpus
        # Two output columns: one per SVD component.
        self.number_output_features = 2
        # Not read anywhere inside this class; kept so existing callers that
        # may inspect it keep working.
        self.n = 2
        # The tf-idf + SVD pipeline is built lazily by get_function().
        self.trainer = None
        self.random_seed = random_seed

    def _create_trainer(self):
        """Fit the tf-idf + TruncatedSVD pipeline on nltk's gutenberg corpus.

        The fitted pipeline is stored on ``self.trainer``. NLTK must be able
        to locate the gutenberg corpus data for this to succeed.
        """
        gutenberg = nltk.corpus.gutenberg.sents()
        # n_components is spelled out so the number of SVD components is
        # visibly tied to the two columns (0 and 1) read back in lsa().
        svd = TruncatedSVD(
            n_components=self.number_output_features,
            random_state=self.random_seed,
        )
        self.trainer = make_pipeline(TfidfVectorizer(), svd)
        # The corpus yields token lists; the vectorizer wants plain strings.
        self.trainer.fit([" ".join(sent) for sent in gutenberg])

    def get_function(self):
        """Return the function that computes the two LSA values per string.

        The pipeline is trained on first use and reused afterwards.

        Returns:
            callable: ``lsa(array)``, which takes a pandas Series of strings
            and returns a pandas Series with two elements, each a list with
            one value per input row (``NaN`` where the input was missing).
        """
        if self.trainer is None:
            self._create_trainer()
        dtk = TreebankWordDetokenizer()

        def lsa(array):
            array = pd.Series(array, index=pd.Series(array.index), name='array')
            present = array.dropna()
            if present.empty:
                # The fitted pipeline rejects zero-sample input, so when
                # nothing is left to transform return all-NaN columns directly.
                return pd.Series(
                    [[np.nan] * len(array)
                     for _ in range(self.number_output_features)]
                )
            # clean_tokens comes from .utilities; it is expected to return a
            # token sequence that the detokenizer joins back into one string.
            cleaned = present.apply(
                lambda text: dtk.detokenize(clean_tokens(text))
            )
            components = self.trainer.transform(cleaned)
            frame = pd.DataFrame(array)
            # Index alignment leaves NaN in the rows that were dropped above.
            frame['l1'] = pd.Series(components[:, 0], index=cleaned.index)
            frame['l2'] = pd.Series(components[:, 1], index=cleaned.index)
            # Transpose so the result is one list per output feature.
            return pd.Series(np.array(frame[['l1', 'l2']]).T.tolist())

        return lsa