Source code for featuretools.demo.flight

import math
import re

import pandas as pd
from tqdm import tqdm
from woodwork.logical_types import Boolean, Categorical, Ordinal

import featuretools as ft


[docs]def load_flight(month_filter=None, categorical_filter=None, nrows=None, demo=True, return_single_table=False, verbose=False): ''' Download, clean, and filter flight data from 2017. The original dataset can be found `here <https://www.transtats.bts.gov/DL_SelectFields.asp?DB_Short_Name=On-Time&Table_ID=236>`_. Args: month_filter (list[int]): Only use data from these months (example is ``[1, 2]``). To skip, set to None. categorical_filter (dict[str->str]): Use only specified categorical values. Example is ``{'dest_city': ['Boston, MA'], 'origin_city': ['Boston, MA']}`` which returns all flights in OR out of Boston. To skip, set to None. nrows (int): Passed to nrows in ``pd.read_csv``. Used before filtering. demo (bool): Use only two months of data. If False, use the whole year. return_single_table (bool): Exit the function early and return a dataframe. verbose (bool): Show a progress bar while loading the data. Examples: .. ipython:: :verbatim: In [1]: import featuretools as ft In [2]: es = ft.demo.load_flight(verbose=True, ...: month_filter=[1], ...: categorical_filter={'origin_city':['Boston, MA']}) 100%|xxxxxxxxxxxxxxxxxxxxxxxxx| 100/100 [01:16<00:00, 1.31it/s] In [3]: es Out[3]: Entityset: Flight Data DataFrames: airports [Rows: 55, Columns: 3] flights [Rows: 613, Columns: 9] trip_logs [Rows: 9456, Columns: 22] airlines [Rows: 10, Columns: 1] Relationships: trip_logs.flight_id -> flights.flight_id flights.carrier -> airlines.carrier flights.dest -> airports.dest ''' filename, csv_length = get_flight_filename(demo=demo) print('Downloading data ...') url = "https://api.featurelabs.com/datasets/{}?library=featuretools&version={}".format(filename, ft.__version__) chunksize = math.ceil(csv_length / 99) pd.options.display.max_columns = 200 iter_csv = pd.read_csv(url, compression='zip', iterator=True, nrows=nrows, chunksize=chunksize) if verbose: iter_csv = tqdm(iter_csv, total=100) partial_df_list = [] for chunk in iter_csv: df = filter_data(_clean_data(chunk), month_filter=month_filter, categorical_filter=categorical_filter) partial_df_list.append(df) data = pd.concat(partial_df_list) if return_single_table: return data es = make_es(data) return es
def make_es(data): es = ft.EntitySet('Flight Data') arr_time_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay', 'national_airspace_delay', 'security_delay', 'late_aircraft_delay', 'canceled', 'diverted', 'taxi_in', 'taxi_out', 'air_time', 'dep_time'] logical_types = {'flight_num': Categorical, 'distance_group': Ordinal(order=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), 'canceled': Boolean, 'diverted': Boolean} es.add_dataframe(data, dataframe_name='trip_logs', index='trip_log_id', make_index=True, time_index='date_scheduled', secondary_time_index={'arr_time': arr_time_columns}, logical_types=logical_types) es.normalize_dataframe('trip_logs', 'flights', 'flight_id', additional_columns=['origin', 'origin_city', 'origin_state', 'dest', 'dest_city', 'dest_state', 'distance_group', 'carrier', 'flight_num']) es.normalize_dataframe('flights', 'airlines', 'carrier', make_time_index=False) es.normalize_dataframe('flights', 'airports', 'dest', additional_columns=['dest_city', 'dest_state'], make_time_index=False) return es def _clean_data(data): # Make column names snake case clean_data = data.rename( columns={col: convert(col) for col in data}) # Chance crs -> "scheduled" and other minor clarifications clean_data = clean_data.rename(columns={'crs_arr_time': 'scheduled_arr_time', 'crs_dep_time': 'scheduled_dep_time', 'crs_elapsed_time': 'scheduled_elapsed_time', 'nas_delay': 'national_airspace_delay', 'origin_city_name': 'origin_city', 'dest_city_name': 'dest_city', 'cancelled': 'canceled'}) # Combine strings like 0130 (1:30 AM) with dates (2017-01-01) clean_data['scheduled_dep_time'] = clean_data['scheduled_dep_time'].apply(lambda x: str(x)) + clean_data['flight_date'].astype('str') # Parse combined string as a date clean_data.loc[:, 'scheduled_dep_time'] = pd.to_datetime( clean_data['scheduled_dep_time'], format='%H%M%Y-%m-%d', errors='coerce') clean_data['scheduled_elapsed_time'] = pd.to_timedelta(clean_data['scheduled_elapsed_time'], unit='m') clean_data = _reconstruct_times(clean_data) # Create a time index 6 months before scheduled_dep clean_data.loc[:, 'date_scheduled'] = clean_data['scheduled_dep_time'].dt.date - \ pd.Timedelta('120d') # A null entry for a delay means no delay clean_data = _fill_labels(clean_data) # Nulls for scheduled values are too problematic. Remove them. clean_data = clean_data.dropna( axis='rows', subset=['scheduled_dep_time', 'scheduled_arr_time']) # Make a flight id. Define a flight as a combination of: # 1. carrier 2. flight number 3. origin airport 4. dest airport clean_data.loc[:, 'flight_id'] = clean_data['carrier'] + '-' + \ clean_data['flight_num'].apply(lambda x: str(x)) + ':' + clean_data['origin'] + '->' + clean_data['dest'] column_order = [ 'flight_id', 'flight_num', 'date_scheduled', 'scheduled_dep_time', 'scheduled_arr_time', 'carrier', 'origin', 'origin_city', 'origin_state', 'dest', 'dest_city', 'dest_state', 'distance_group', 'dep_time', 'arr_time', 'dep_delay', 'taxi_out', 'taxi_in', 'arr_delay', 'diverted', 'scheduled_elapsed_time', 'air_time', 'distance', 'carrier_delay', 'weather_delay', 'national_airspace_delay', 'security_delay', 'late_aircraft_delay', 'canceled' ] clean_data = clean_data[column_order] return clean_data def _fill_labels(clean_data): labely_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay', 'national_airspace_delay', 'security_delay', 'late_aircraft_delay', 'canceled', 'diverted', 'taxi_in', 'taxi_out', 'air_time'] for col in labely_columns: clean_data.loc[:, col] = clean_data[col].fillna(0) return clean_data def _reconstruct_times(clean_data): """ Reconstruct departure_time, scheduled_dep_time, arrival_time and scheduled_arr_time by adding known delays to known times. We do: - dep_time is scheduled_dep + dep_delay - arr_time is dep_time + taxiing and air_time - scheduled arrival is scheduled_dep + scheduled_elapsed """ clean_data.loc[:, 'dep_time'] = clean_data['scheduled_dep_time'] + \ pd.to_timedelta(clean_data['dep_delay'], unit='m') clean_data.loc[:, 'arr_time'] = clean_data['dep_time'] + \ pd.to_timedelta(clean_data['taxi_out'] + clean_data['air_time'] + clean_data['taxi_in'], unit='m') clean_data.loc[:, 'scheduled_arr_time'] = clean_data['scheduled_dep_time'] + \ clean_data['scheduled_elapsed_time'] return clean_data def filter_data(clean_data, month_filter=None, categorical_filter=None): if month_filter is not None: tmp = clean_data['scheduled_dep_time'].dt.month.isin(month_filter) clean_data = clean_data[tmp] if categorical_filter is not None: tmp = False for key, values in categorical_filter.items(): tmp = tmp | clean_data[key].isin(values) clean_data = clean_data[tmp] return clean_data def convert(name): # Rename columns to underscore # Code via SO https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() def get_flight_filename(demo=True): if demo: filename = SMALL_FLIGHT_CSV rows = 860457 else: filename = BIG_FLIGHT_CSV rows = 5162742 return filename, rows SMALL_FLIGHT_CSV = 'data_2017_jan_feb.csv.zip' BIG_FLIGHT_CSV = 'data_all_2017.csv.zip'