Source code for featuretools.demo.retail

import pandas as pd
from woodwork.logical_types import NaturalLanguage

import featuretools as ft


def load_retail(id="demo_retail_data", nrows=None, return_single_table=False):
    """Returns the retail entityset example.

    The original dataset can be found
    `here <https://archive.ics.uci.edu/ml/datasets/online+retail>`_.

    We have also made some modifications to the data. We changed the
    column names, converted the ``customer_id`` to a unique fake
    ``customer_name``, dropped duplicates, added columns for ``total`` and
    ``cancelled`` and converted amounts from GBP to USD. You can download
    the modified CSV
    `in gz compressed (7 MB)
    <https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv.gz>`_
    or
    `uncompressed (43 MB)
    <https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv>`_
    formats.

    Args:
        id (str): Id to assign to EntitySet.
        nrows (int): Number of rows to load of the underlying CSV.
            If None, load all.
        return_single_table (bool): If True, return the raw table as a
            single pandas DataFrame rather than an EntitySet.
            Default is False.

    Returns:
        featuretools.EntitySet or pandas.DataFrame: The EntitySet holding
        the ``order_products``, ``products``, ``orders`` and ``customers``
        dataframes, or the DataFrame read from the CSV when
        ``return_single_table`` is True.

    Examples:

        .. ipython::
            :verbatim:

            In [1]: import featuretools as ft

            In [2]: es = ft.demo.load_retail()

            In [3]: es
            Out[3]:
            Entityset: demo_retail_data
              DataFrames:
                orders (shape = [22190, 3])
                products (shape = [3684, 3])
                customers (shape = [4372, 2])
                order_products (shape = [401704, 7])

        Load in subset of data

        .. ipython::
            :verbatim:

            In [4]: es = ft.demo.load_retail(nrows=1000)

            In [5]: es
            Out[5]:
            Entityset: demo_retail_data
              DataFrames:
                orders (shape = [67, 5])
                products (shape = [606, 3])
                customers (shape = [50, 2])
                order_products (shape = [1000, 7])
    """
    # Both downloads share the same base path and query string; only the
    # file extension differs.
    base_url = "https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28"
    query = "?library=featuretools&version=" + ft.__version__
    csv_s3_gz = base_url + ".csv.gz" + query
    csv_s3 = base_url + ".csv" + query

    # Try to read in gz compressed file
    try:
        df = pd.read_csv(csv_s3_gz, nrows=nrows, parse_dates=["order_date"])
    except Exception:
        # Deliberately broad: any failure with the compressed download
        # falls back to the uncompressed file. If that read fails as well,
        # its exception propagates to the caller.
        df = pd.read_csv(csv_s3, nrows=nrows, parse_dates=["order_date"])

    if return_single_table:
        return df

    # Build the EntitySet only once we know the caller actually wants it.
    es = ft.EntitySet(id)

    # The raw log becomes the base dataframe, with an index column named
    # "order_product_id" requested via make_index.
    es.add_dataframe(
        dataframe_name="order_products",
        dataframe=df,
        index="order_product_id",
        make_index=True,
        time_index="order_date",
        logical_types={"description": NaturalLanguage},
    )

    # Split products, then orders, out of the base dataframe.
    es.normalize_dataframe(
        new_dataframe_name="products",
        base_dataframe_name="order_products",
        index="product_id",
        additional_columns=["description"],
    )
    es.normalize_dataframe(
        new_dataframe_name="orders",
        base_dataframe_name="order_products",
        index="order_id",
        additional_columns=["customer_name", "country", "cancelled"],
    )

    # Customers are derived from the freshly created orders dataframe, so
    # this call must come after the orders normalization above.
    es.normalize_dataframe(
        new_dataframe_name="customers",
        base_dataframe_name="orders",
        index="customer_name",
    )

    es.add_last_time_indexes()
    return es