Source code for featuretools.demo.mock_customer

import pandas as pd
from numpy import random
from numpy.random import choice
from woodwork.logical_types import Categorical, PostalCode

import featuretools as ft


def load_mock_customer(
    n_customers=5,
    n_products=5,
    n_sessions=35,
    n_transactions=500,
    random_seed=0,
    return_single_table=False,
    return_entityset=False,
):
    """Return dataframes of mock customer data.

    Generates four related tables (customers, products, sessions and
    transactions) from numpy's global random generator, which is seeded
    with ``random_seed`` so repeated calls with the same arguments yield
    the same data.

    Args:
        n_customers (int): Number of rows in the customers table.
        n_products (int): Number of rows in the products table.
        n_sessions (int): Number of candidate sessions. Sessions that end
            up with no transactions are dropped, so the returned sessions
            table can contain fewer rows than this.
        n_transactions (int): Number of rows in the transactions table.
        random_seed (int): Seed passed to ``numpy.random.seed``. Note that
            this reseeds numpy's *global* random state.
        return_single_table (bool): If True, return one denormalized
            DataFrame joining transactions, sessions, customers and
            products. Takes precedence over ``return_entityset``.
        return_entityset (bool): If True (and ``return_single_table`` is
            False), return a featuretools EntitySet with the four
            dataframes, their relationships and last time indexes.

    Returns:
        dict[str, pd.DataFrame] by default, keyed by ``"customers"``,
        ``"sessions"``, ``"transactions"`` and ``"products"``; a single
        pd.DataFrame if ``return_single_table`` is True; otherwise an
        EntitySet if ``return_entityset`` is True.
    """

    # The order of the random draws below determines the generated data for
    # a given seed; do not reorder them.
    random.seed(random_seed)
    last_date = pd.to_datetime("12/31/2013")
    first_date = pd.to_datetime("1/1/2008")
    first_bday = pd.to_datetime("1/1/1970")

    # Join dates fall uniformly in [first_date, last_date]; birthdays fall
    # uniformly in [first_bday, first_date].
    join_dates = [
        random.uniform(0, 1) * (last_date - first_date) + first_date
        for _ in range(n_customers)
    ]
    birth_dates = [
        random.uniform(0, 1) * (first_date - first_bday) + first_bday
        for _ in range(n_customers)
    ]

    customers_df = pd.DataFrame({"customer_id": range(1, n_customers + 1)})
    customers_df["zip_code"] = choice(
        ["60091", "13244"],
        n_customers,
    )
    customers_df["join_date"] = pd.Series(join_dates).dt.round("1s")
    # Uppercase "D" is the canonical day alias; the lowercase spelling is
    # deprecated in newer pandas releases.
    customers_df["birthday"] = pd.Series(birth_dates).dt.round("1D")

    products_df = pd.DataFrame({"product_id": pd.Categorical(range(1, n_products + 1))})
    products_df["brand"] = choice(["A", "B", "C"], n_products)

    sessions_df = pd.DataFrame({"session_id": range(1, n_sessions + 1)})
    sessions_df["customer_id"] = choice(customers_df["customer_id"], n_sessions)
    sessions_df["device"] = choice(["desktop", "mobile", "tablet"], n_sessions)

    transactions_df = pd.DataFrame({"transaction_id": range(1, n_transactions + 1)})
    transactions_df["session_id"] = choice(sessions_df["session_id"], n_transactions)
    # Sort by session before assigning timestamps so that each session's
    # transactions occupy one contiguous stretch of time.
    transactions_df = transactions_df.sort_values("session_id").reset_index(drop=True)
    transactions_df["transaction_time"] = pd.date_range(
        "1/1/2014",
        periods=n_transactions,
        freq="65s",
    )  # todo make these less regular
    transactions_df["product_id"] = pd.Categorical(
        choice(products_df["product_id"], n_transactions),
    )
    # Integer cents in [500, 15000) converted to a currency amount.
    transactions_df["amount"] = random.randint(500, 15000, n_transactions) / 100

    # calculate and merge in session start
    # based on the times we came up with for transactions
    session_starts = transactions_df.drop_duplicates("session_id")[
        ["session_id", "transaction_time"]
    ].rename(columns={"transaction_time": "session_start"})
    # Default (inner) merge: sessions that received no transactions have no
    # start time and are dropped here.
    sessions_df = sessions_df.merge(session_starts)

    if return_single_table:
        return (
            transactions_df.merge(sessions_df)
            .merge(customers_df)
            .merge(products_df)
            .reset_index(drop=True)
        )
    elif return_entityset:
        es = ft.EntitySet(id="transactions")
        es = es.add_dataframe(
            dataframe_name="transactions",
            dataframe=transactions_df,
            index="transaction_id",
            time_index="transaction_time",
            logical_types={"product_id": Categorical},
        )

        es = es.add_dataframe(
            dataframe_name="products",
            dataframe=products_df,
            index="product_id",
        )

        es = es.add_dataframe(
            dataframe_name="sessions",
            dataframe=sessions_df,
            index="session_id",
            time_index="session_start",
        )

        es = es.add_dataframe(
            dataframe_name="customers",
            dataframe=customers_df,
            index="customer_id",
            time_index="join_date",
            logical_types={"zip_code": PostalCode},
        )

        # Each tuple is (parent dataframe, parent column,
        #                child dataframe, child column).
        rels = [
            ("products", "product_id", "transactions", "product_id"),
            ("sessions", "session_id", "transactions", "session_id"),
            ("customers", "customer_id", "sessions", "customer_id"),
        ]
        es = es.add_relationships(rels)
        es.add_last_time_indexes()
        return es

    return {
        "customers": customers_df,
        "sessions": sessions_df,
        "transactions": transactions_df,
        "products": products_df,
    }
Table of Contents

Quick search

Source code for featuretools.demo.mock_customer