Source code for featuretools.demo.retail

import pandas as pd

import featuretools as ft
import featuretools.variable_types as vtypes


[docs]def load_retail(id='demo_retail_data', nrows=None, return_single_table=False):
    '''Returns the retail entityset example.
    The original dataset can be found `here <https://archive.ics.uci.edu/ml/datasets/online+retail>`_.

    We have also made some modifications to the data. We
    changed the column names, converted the ``customer_id``
    to a unique fake ``customer_name``, dropped duplicates,
    added columns for ``total`` and ``cancelled`` and
    converted amounts from GBP to USD. You can download the modified CSV `in gz compressed (7 MB)
    <"https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv.gz">`_
    or `uncompressed (43 MB)
    <"https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv">`_ formats.

    Args:
        id (str):  Id to assign to EntitySet.
        nrows (int):  Number of rows to load of the underlying CSV.
            If None, load all.
        return_single_table (bool): If True, return a CSV rather than an EntitySet. Default is False.

    Examples:

        .. ipython::
            :verbatim:

            In [1]: import featuretools as ft

            In [2]: es = ft.demo.load_retail()

            In [3]: es
            Out[3]:
            Entityset: demo_retail_data
              Entities:
                orders (shape = [22190, 3])
                products (shape = [3684, 3])
                customers (shape = [4372, 2])
                order_products (shape = [401704, 7])

        Load in subset of data

        .. ipython::
            :verbatim:

            In [4]: es = ft.demo.load_retail(nrows=1000)

            In [5]: es
            Out[5]:
            Entityset: demo_retail_data
              Entities:
                orders (shape = [67, 5])
                products (shape = [606, 3])
                customers (shape = [50, 2])
                order_products (shape = [1000, 7])

'''
    es = ft.EntitySet(id)
    csv_s3_gz = "https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv.gz?version=" + ft.__version__
    csv_s3 = "https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv?version=" + ft.__version__
    # Try to read in gz compressed file
    try:
        df = pd.read_csv(csv_s3_gz,
                         nrows=nrows,
                         parse_dates=["order_date"])
    # Fall back to uncompressed
    except Exception:
        df = pd.read_csv(csv_s3,
                         nrows=nrows,
                         parse_dates=["order_date"])
    if return_single_table:
        return df

    es.entity_from_dataframe("order_products",
                             dataframe=df,
                             index="order_product_id",
                             make_index=True,
                             time_index="order_date",
                             variable_types={'description': vtypes.Text})

    es.normalize_entity(new_entity_id="products",
                        base_entity_id="order_products",
                        index="product_id",
                        additional_variables=["description"])

    es.normalize_entity(new_entity_id="orders",
                        base_entity_id="order_products",
                        index="order_id",
                        additional_variables=["customer_name", "country", "cancelled"])

    es.normalize_entity(new_entity_id="customers",
                        base_entity_id="orders",
                        index="customer_name")
    es.add_last_time_indexes()

    return es