NOTICE

The upcoming release of Featuretools 1.0.0 contains several breaking changes. Users are encouraged to test this version prior to release by installing from GitHub:

pip install https://github.com/alteryx/featuretools/archive/woodwork-integration.zip

For details on migrating to the new version, refer to Transitioning to Featuretools Version 1.0. Please report any issues in the Featuretools GitHub repo or by messaging in Alteryx Open Source Slack.

Source code for featuretools.demo.flight

import math
import re

import pandas as pd
from tqdm import tqdm
from woodwork.logical_types import Boolean, Categorical, Ordinal

import featuretools as ft


[docs]def load_flight(month_filter=None,
                categorical_filter=None,
                nrows=None,
                demo=True,
                return_single_table=False,
                verbose=False):
    '''
    Download, clean, and filter flight data from 2017.
    The original dataset can be found `here <https://www.transtats.bts.gov/DL_SelectFields.asp?DB_Short_Name=On-Time&Table_ID=236>`_.

    Args:

        month_filter (list[int]): Only use data from these months (example is ``[1, 2]``).
            To skip, set to None.
        categorical_filter (dict[str->str]): Use only specified categorical values.
            Example is ``{'dest_city': ['Boston, MA'], 'origin_city': ['Boston, MA']}``
            which returns all flights in OR out of Boston. To skip, set to None.
        nrows (int): Passed to nrows in ``pd.read_csv``. Used before filtering.
        demo (bool): Use only two months of data. If False, use the whole year.
        return_single_table (bool): Exit the function early and return a dataframe.
        verbose (bool): Show a progress bar while loading the data.

    Examples:

        .. ipython::
            :verbatim:

            In [1]: import featuretools as ft

            In [2]: es = ft.demo.load_flight(verbose=True,
               ...:                          month_filter=[1],
               ...:                          categorical_filter={'origin_city':['Boston, MA']})
            100%|xxxxxxxxxxxxxxxxxxxxxxxxx| 100/100 [01:16<00:00,  1.31it/s]

            In [3]: es
            Out[3]:
            Entityset: Flight Data
              DataFrames:
                airports [Rows: 55, Columns: 3]
                flights [Rows: 613, Columns: 9]
                trip_logs [Rows: 9456, Columns: 22]
                airlines [Rows: 10, Columns: 1]
              Relationships:
                trip_logs.flight_id -> flights.flight_id
                flights.carrier -> airlines.carrier
                flights.dest -> airports.dest
    '''

    filename, csv_length = get_flight_filename(demo=demo)

    print('Downloading data ...')
    url = "https://api.featurelabs.com/datasets/{}?library=featuretools&version={}".format(filename, ft.__version__)

    chunksize = math.ceil(csv_length / 99)
    pd.options.display.max_columns = 200
    iter_csv = pd.read_csv(url,
                           compression='zip',
                           iterator=True,
                           nrows=nrows,
                           chunksize=chunksize)
    if verbose:
        iter_csv = tqdm(iter_csv, total=100)

    partial_df_list = []
    for chunk in iter_csv:
        df = filter_data(_clean_data(chunk),
                         month_filter=month_filter,
                         categorical_filter=categorical_filter)
        partial_df_list.append(df)
    data = pd.concat(partial_df_list)

    if return_single_table:
        return data

    es = make_es(data)

    return es


def make_es(data):
    es = ft.EntitySet('Flight Data')
    arr_time_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay',
                        'national_airspace_delay', 'security_delay',
                        'late_aircraft_delay', 'canceled', 'diverted',
                        'taxi_in', 'taxi_out', 'air_time', 'dep_time']

    logical_types = {'flight_num': Categorical,
                     'distance_group': Ordinal(order=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
                     'canceled': Boolean,
                     'diverted': Boolean}

    es.add_dataframe(data,
                     dataframe_name='trip_logs',
                     index='trip_log_id',
                     make_index=True,
                     time_index='date_scheduled',
                     secondary_time_index={'arr_time': arr_time_columns},
                     logical_types=logical_types)

    es.normalize_dataframe('trip_logs', 'flights', 'flight_id',
                           additional_columns=['origin', 'origin_city', 'origin_state',
                                               'dest', 'dest_city', 'dest_state',
                                               'distance_group', 'carrier', 'flight_num'])

    es.normalize_dataframe('flights', 'airlines', 'carrier',
                           make_time_index=False)

    es.normalize_dataframe('flights', 'airports', 'dest',
                           additional_columns=['dest_city', 'dest_state'],
                           make_time_index=False)
    return es


def _clean_data(data):

    # Make column names snake case
    clean_data = data.rename(
        columns={col: convert(col) for col in data})

    # Chance crs -> "scheduled" and other minor clarifications
    clean_data = clean_data.rename(columns={'crs_arr_time': 'scheduled_arr_time',
                                            'crs_dep_time': 'scheduled_dep_time',
                                            'crs_elapsed_time': 'scheduled_elapsed_time',
                                            'nas_delay': 'national_airspace_delay',
                                            'origin_city_name': 'origin_city',
                                            'dest_city_name': 'dest_city',
                                            'cancelled': 'canceled'})

    # Combine strings like 0130 (1:30 AM) with dates (2017-01-01)
    clean_data['scheduled_dep_time'] = clean_data['scheduled_dep_time'].apply(lambda x: str(x)) + clean_data['flight_date'].astype('str')

    # Parse combined string as a date
    clean_data.loc[:, 'scheduled_dep_time'] = pd.to_datetime(
        clean_data['scheduled_dep_time'], format='%H%M%Y-%m-%d', errors='coerce')

    clean_data['scheduled_elapsed_time'] = pd.to_timedelta(clean_data['scheduled_elapsed_time'], unit='m')

    clean_data = _reconstruct_times(clean_data)

    # Create a time index 6 months before scheduled_dep
    clean_data.loc[:, 'date_scheduled'] = clean_data['scheduled_dep_time'].dt.date - \
        pd.Timedelta('120d')

    # A null entry for a delay means no delay
    clean_data = _fill_labels(clean_data)

    # Nulls for scheduled values are too problematic. Remove them.
    clean_data = clean_data.dropna(
        axis='rows', subset=['scheduled_dep_time', 'scheduled_arr_time'])

    # Make a flight id. Define a flight as a combination of:
    # 1. carrier 2. flight number 3. origin airport 4. dest airport
    clean_data.loc[:, 'flight_id'] = clean_data['carrier'] + '-' + \
        clean_data['flight_num'].apply(lambda x: str(x)) + ':' + clean_data['origin'] + '->' + clean_data['dest']

    column_order = [
        'flight_id',
        'flight_num',
        'date_scheduled',
        'scheduled_dep_time',
        'scheduled_arr_time',
        'carrier',
        'origin', 'origin_city', 'origin_state',
        'dest', 'dest_city', 'dest_state',
        'distance_group',
        'dep_time',
        'arr_time',
        'dep_delay', 'taxi_out', 'taxi_in', 'arr_delay',
        'diverted', 'scheduled_elapsed_time', 'air_time', 'distance',
        'carrier_delay', 'weather_delay',
        'national_airspace_delay', 'security_delay', 'late_aircraft_delay',
        'canceled'
    ]

    clean_data = clean_data[column_order]

    return clean_data


def _fill_labels(clean_data):
    labely_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay',
                      'national_airspace_delay', 'security_delay',
                      'late_aircraft_delay', 'canceled', 'diverted',
                      'taxi_in', 'taxi_out', 'air_time']
    for col in labely_columns:
        clean_data.loc[:, col] = clean_data[col].fillna(0)

    return clean_data


def _reconstruct_times(clean_data):
    """ Reconstruct departure_time, scheduled_dep_time,
        arrival_time and scheduled_arr_time by adding known delays
        to known times. We do:
            - dep_time is scheduled_dep + dep_delay
            - arr_time is dep_time + taxiing and air_time
            - scheduled arrival is scheduled_dep + scheduled_elapsed
    """
    clean_data.loc[:, 'dep_time'] = clean_data['scheduled_dep_time'] + \
        pd.to_timedelta(clean_data['dep_delay'], unit='m')

    clean_data.loc[:, 'arr_time'] = clean_data['dep_time'] + \
        pd.to_timedelta(clean_data['taxi_out'] +
                        clean_data['air_time'] +
                        clean_data['taxi_in'], unit='m')

    clean_data.loc[:, 'scheduled_arr_time'] = clean_data['scheduled_dep_time'] + \
        clean_data['scheduled_elapsed_time']
    return clean_data


def filter_data(clean_data,
                month_filter=None,
                categorical_filter=None):
    if month_filter is not None:
        tmp = clean_data['scheduled_dep_time'].dt.month.isin(month_filter)
        clean_data = clean_data[tmp]

    if categorical_filter is not None:
        tmp = False
        for key, values in categorical_filter.items():
            tmp = tmp | clean_data[key].isin(values)
        clean_data = clean_data[tmp]

    return clean_data


def convert(name):
    # Rename columns to underscore
    # Code via SO https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()


def get_flight_filename(demo=True):
    if demo:
        filename = SMALL_FLIGHT_CSV
        rows = 860457
    else:
        filename = BIG_FLIGHT_CSV
        rows = 5162742

    return filename, rows


SMALL_FLIGHT_CSV = 'data_2017_jan_feb.csv.zip'
BIG_FLIGHT_CSV = 'data_all_2017.csv.zip'