import pandas as pd
from woodwork.logical_types import NaturalLanguage
import featuretools as ft
def load_retail(id="demo_retail_data", nrows=None, return_single_table=False):
"""Returns the retail entityset example.
The original dataset can be found `here <https://archive.ics.uci.edu/ml/datasets/online+retail>`_.
We have also made some modifications to the data. We
changed the column names, converted the ``customer_id``
to a unique fake ``customer_name``, dropped duplicates,
added columns for ``total`` and ``cancelled`` and
converted amounts from GBP to USD. You can download the modified CSV `in gz compressed (7 MB)
<"https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv.gz">`_
or `uncompressed (43 MB)
<"https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv">`_ formats.

    Args:
        id (str): Id to assign to the EntitySet.
        nrows (int): Number of rows to load from the underlying CSV.
            If None, load all rows.
        return_single_table (bool): If True, return a single DataFrame
            rather than an EntitySet. Default is False.

    Examples:

        .. ipython::
            :verbatim:

            In [1]: import featuretools as ft

            In [2]: es = ft.demo.load_retail()

            In [3]: es
            Out[3]:
            Entityset: demo_retail_data
              DataFrames:
                orders (shape = [22190, 3])
                products (shape = [3684, 3])
                customers (shape = [4372, 2])
                order_products (shape = [401704, 7])

        Load in a subset of the data:

        .. ipython::
            :verbatim:

            In [4]: es = ft.demo.load_retail(nrows=1000)

            In [5]: es
            Out[5]:
            Entityset: demo_retail_data
              DataFrames:
                orders (shape = [67, 5])
                products (shape = [606, 3])
                customers (shape = [50, 2])
                order_products (shape = [1000, 7])
"""
es = ft.EntitySet(id)
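    # URLs for the hosted CSV; the library name and version are passed
    # along as query parameters on each request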
csv_s3_gz = (
"https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv.gz?library=featuretools&version="
+ ft.__version__
)
csv_s3 = (
"https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv?library=featuretools&version="
+ ft.__version__
)
# Try to read in gz compressed file
try:
df = pd.read_csv(csv_s3_gz, nrows=nrows, parse_dates=["order_date"])
# Fall back to uncompressed
except Exception:
df = pd.read_csv(csv_s3, nrows=nrows, parse_dates=["order_date"])
if return_single_table:
return df
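    # Load the flat CSV as the base "order_products" dataframe, creating a new
    # index column and typing the free-text description as NaturalLanguage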
es.add_dataframe(
dataframe_name="order_products",
dataframe=df,
index="order_product_id",
make_index=True,
time_index="order_date",
logical_types={"description": NaturalLanguage},
)
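    # Split product-level columns into a separate "products" dataframe,
    # related to "order_products" through product_id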
es.normalize_dataframe(
new_dataframe_name="products",
base_dataframe_name="order_products",
index="product_id",
additional_columns=["description"],
)
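    # Split order-level columns into an "orders" dataframe,
    # related to "order_products" through order_id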
es.normalize_dataframe(
new_dataframe_name="orders",
base_dataframe_name="order_products",
index="order_id",
additional_columns=["customer_name", "country", "cancelled"],
)
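    # Build a "customers" dataframe from the orders, one row per customer_name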
es.normalize_dataframe(
new_dataframe_name="customers",
base_dataframe_name="orders",
index="customer_name",
)
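    # Add last time indexes, which record the last time each instance appears
    # in its child dataframes (used when calculating features at cutoff times)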
es.add_last_time_indexes()
return es