NOTICE

The upcoming release of Featuretools 1.0.0 contains several breaking changes. Users are encouraged to test this version prior to release by installing from GitHub:

pip install https://github.com/alteryx/featuretools/archive/woodwork-integration.zip

For details on migrating to the new version, refer to "Transitioning to Featuretools Version 1.0". Please report any issues in the Featuretools GitHub repository or by posting a message in the Alteryx Open Source Slack.

Source code for featuretools.demo.mock_customer

import pandas as pd
from numpy import random
from numpy.random import choice

import featuretools as ft
from featuretools.variable_types import Categorical, ZIPCode


def load_mock_customer(n_customers=5, n_products=5, n_sessions=35, n_transactions=500,
                       random_seed=0, return_single_table=False, return_entityset=False):
    """Generate a small, reproducible mock e-commerce dataset.

    Four related tables are built: ``customers``, ``products``, ``sessions``
    and ``transactions``. Each session belongs to a randomly chosen customer
    and each transaction belongs to a randomly chosen session and product.

    Args:
        n_customers (int): Number of rows in the customers table.
        n_products (int): Number of rows in the products table.
        n_sessions (int): Number of sessions to generate. Sessions that end
            up with no transactions are dropped from the sessions table,
            because their ``session_start`` cannot be derived.
        n_transactions (int): Number of rows in the transactions table.
        random_seed (int): Seed passed to ``numpy.random.seed``. Note that
            this reseeds NumPy's global random state as a side effect.
        return_single_table (bool): If True, return one DataFrame with all
            four tables merged together. Takes precedence over
            ``return_entityset`` when both are True.
        return_entityset (bool): If True, return a featuretools EntitySet
            containing the four tables and their relationships.

    Returns:
        dict[str, pd.DataFrame] by default, keyed by ``"customers"``,
        ``"sessions"``, ``"transactions"`` and ``"products"``; a single
        merged ``pd.DataFrame`` if ``return_single_table`` is True; or an
        ``ft.EntitySet`` if ``return_entityset`` is True.
    """
    # The order of the random draws below determines the generated data for
    # a given seed, so it must not be rearranged.
    random.seed(random_seed)

    last_date = pd.to_datetime('12/31/2013')
    first_date = pd.to_datetime('1/1/2008')
    first_bday = pd.to_datetime('1/1/1970')

    # Join dates fall between first_date and last_date; birth dates fall
    # between first_bday and first_date.
    join_dates = [random.uniform(0, 1) * (last_date - first_date) + first_date
                  for _ in range(n_customers)]
    birth_dates = [random.uniform(0, 1) * (first_date - first_bday) + first_bday
                   for _ in range(n_customers)]

    customers_df = pd.DataFrame({"customer_id": range(1, n_customers + 1)})
    customers_df["zip_code"] = choice(["60091", "13244"], n_customers)
    customers_df["join_date"] = pd.Series(join_dates).dt.round('1s')
    # 'D' is the canonical day alias; the lowercase form is deprecated in
    # newer pandas releases.
    customers_df["date_of_birth"] = pd.Series(birth_dates).dt.round('1D')

    products_df = pd.DataFrame({"product_id": pd.Categorical(range(1, n_products + 1))})
    products_df["brand"] = choice(["A", "B", "C"], n_products)

    sessions_df = pd.DataFrame({"session_id": range(1, n_sessions + 1)})
    sessions_df["customer_id"] = choice(customers_df["customer_id"], n_sessions)
    sessions_df["device"] = choice(["desktop", "mobile", "tablet"], n_sessions)

    transactions_df = pd.DataFrame({"transaction_id": range(1, n_transactions + 1)})
    transactions_df["session_id"] = choice(sessions_df["session_id"], n_transactions)
    # Sort by session before assigning timestamps so that each session's
    # transactions occupy one contiguous block of time.
    transactions_df = transactions_df.sort_values("session_id").reset_index(drop=True)
    # TODO: make these less regular
    transactions_df["transaction_time"] = pd.date_range('1/1/2014',
                                                        periods=n_transactions,
                                                        freq='65s')
    transactions_df["product_id"] = pd.Categorical(choice(products_df["product_id"],
                                                          n_transactions))
    # Amounts are drawn as integer cents and converted to currency units.
    transactions_df["amount"] = random.randint(500, 15000, n_transactions) / 100

    # A session starts at the time of its first transaction. Timestamps
    # increase with row order, so the first row per session is the earliest.
    first_transactions = transactions_df.drop_duplicates(subset="session_id")
    session_starts = first_transactions[["session_id", "transaction_time"]].rename(
        columns={"transaction_time": "session_start"})
    # Inner merge: sessions that received no transactions are dropped.
    sessions_df = sessions_df.merge(session_starts, on="session_id")

    if return_single_table:
        single_table = (transactions_df
                        .merge(sessions_df, on="session_id")
                        .merge(customers_df, on="customer_id")
                        .merge(products_df, on="product_id"))
        return single_table.reset_index(drop=True)

    if return_entityset:
        es = ft.EntitySet(id="transactions")
        es = es.entity_from_dataframe(entity_id="transactions",
                                      dataframe=transactions_df,
                                      index="transaction_id",
                                      time_index="transaction_time",
                                      variable_types={"product_id": Categorical})

        es = es.entity_from_dataframe(entity_id="products",
                                      dataframe=products_df,
                                      index="product_id")

        es = es.entity_from_dataframe(entity_id="sessions",
                                      dataframe=sessions_df,
                                      index="session_id",
                                      time_index="session_start")

        es = es.entity_from_dataframe(entity_id="customers",
                                      dataframe=customers_df,
                                      index="customer_id",
                                      time_index="join_date",
                                      variable_types={"zip_code": ZIPCode})

        # Each relationship is (parent variable, child variable).
        rels = [ft.Relationship(es["products"]["product_id"],
                                es["transactions"]["product_id"]),
                ft.Relationship(es["sessions"]["session_id"],
                                es["transactions"]["session_id"]),
                ft.Relationship(es["customers"]["customer_id"],
                                es["sessions"]["customer_id"])]
        es = es.add_relationships(rels)
        es.add_last_time_indexes()
        return es

    return {"customers": customers_df,
            "sessions": sessions_df,
            "transactions": transactions_df,
            "products": products_df}