In [1]:
# Author: Stephen Situ
# This project builds a regression model on US air quality data, using a deep learning neural network with 3 dense layers
# (TensorFlow/Keras in Python) to predict AQI (Air Quality Index), which typically ranges from 0 to 300.
# The data was split into training and testing sets, one-hot encoded for the categorical variables, and Min/Max scaled.
# With this basic neural network model, we achieve a very low MAE of about 0.0067 on the Min/Max-scaled AQI target.
# Original dataset: https://www.kaggle.com/datasets/calebreigada/us-air-quality-1980present

# Import Libraries
import numpy as np
import pandas as pd
In [2]:
# Read CSV
air_quality = pd.read_csv('US_AQI.csv')
air_quality.head()
Out[2]:
CBSA Code Date AQI Category Defining Parameter Number of Sites Reporting city_ascii state_id state_name lat lng population density timezone
0 10140 1/1/2022 21 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles
1 10140 1/2/2022 12 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles
2 10140 1/3/2022 18 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles
3 10140 1/4/2022 19 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles
4 10140 1/5/2022 17 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles
In [3]:
# Data Summary
air_quality.dtypes
Out[3]:
CBSA Code                      int64
Date                          object
AQI                            int64
Category                      object
Defining Parameter            object
Number of Sites Reporting      int64
city_ascii                    object
state_id                      object
state_name                    object
lat                          float64
lng                          float64
population                     int64
density                        int64
timezone                      object
dtype: object
In [4]:
# Check for duplicate rows
duplicate_rows = air_quality[air_quality.duplicated(keep='last')]
print(duplicate_rows)
Empty DataFrame
Columns: [CBSA Code, Date, AQI, Category, Defining Parameter, Number of Sites Reporting, city_ascii, state_id, state_name, lat, lng, population, density, timezone]
Index: []
In [5]:
# Check for rows with null values
nan_rows = air_quality[air_quality.isna().any(axis=1)]
print(nan_rows)
Empty DataFrame
Columns: [CBSA Code, Date, AQI, Category, Defining Parameter, Number of Sites Reporting, city_ascii, state_id, state_name, lat, lng, population, density, timezone]
Index: []
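Both checks come back empty, so no cleanup is needed here; had either found rows, a minimal fix using pandas' standard drop_duplicates/dropna would be:

# air_quality = air_quality.drop_duplicates(keep='last')  # drop exact duplicate rows
# air_quality = air_quality.dropna()                      # drop rows with missing values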
In [6]:
# Split the Date string column into separate Month, Day, and Year columns
date_split = air_quality["Date"].str.split("/", expand=True)
air_quality["Month"] = date_split[0]
air_quality["Day"] = date_split[1]
air_quality["Year"] = date_split[2]
air_quality.info  # note: without (), this displays the bound method's repr, which embeds the DataFrame
Out[6]:
<bound method DataFrame.info of          CBSA Code       Date  AQI  Category Defining Parameter  \
0            10140   1/1/2022   21      Good              PM2.5   
1            10140   1/2/2022   12      Good              PM2.5   
2            10140   1/3/2022   18      Good              PM2.5   
3            10140   1/4/2022   19      Good              PM2.5   
4            10140   1/5/2022   17      Good              PM2.5   
...            ...        ...  ...       ...                ...   
1048570      13900  6/11/2015   66  Moderate              PM2.5   
1048571      13900  6/12/2015   53  Moderate              PM2.5   
1048572      13900  6/13/2015   58  Moderate              Ozone   
1048573      13900  6/14/2015   61  Moderate              Ozone   
1048574      13900  6/15/2015   34      Good              Ozone   

         Number of Sites Reporting city_ascii state_id    state_name      lat  \
0                                2   Aberdeen       WA    Washington  46.9757   
1                                2   Aberdeen       WA    Washington  46.9757   
2                                2   Aberdeen       WA    Washington  46.9757   
3                                2   Aberdeen       WA    Washington  46.9757   
4                                2   Aberdeen       WA    Washington  46.9757   
...                            ...        ...      ...           ...      ...   
1048570                          2   Bismarck       ND  North Dakota  46.8143   
1048571                          2   Bismarck       ND  North Dakota  46.8143   
1048572                          2   Bismarck       ND  North Dakota  46.8143   
1048573                          2   Bismarck       ND  North Dakota  46.8143   
1048574                          2   Bismarck       ND  North Dakota  46.8143   

              lng  population  density             timezone Month Day  Year  
0       -123.8094       16571      588  America/Los_Angeles     1   1  2022  
1       -123.8094       16571      588  America/Los_Angeles     1   2  2022  
2       -123.8094       16571      588  America/Los_Angeles     1   3  2022  
3       -123.8094       16571      588  America/Los_Angeles     1   4  2022  
4       -123.8094       16571      588  America/Los_Angeles     1   5  2022  
...           ...         ...      ...                  ...   ...  ..   ...  
1048570 -100.7694       90420      817      America/Chicago     6  11  2015  
1048571 -100.7694       90420      817      America/Chicago     6  12  2015  
1048572 -100.7694       90420      817      America/Chicago     6  13  2015  
1048573 -100.7694       90420      817      America/Chicago     6  14  2015  
1048574 -100.7694       90420      817      America/Chicago     6  15  2015  

[1048575 rows x 17 columns]>
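An equivalent, arguably more robust way to derive these columns is pandas' datetime accessor, which also skips the string-to-numeric conversion done below; a sketch, assuming every Date follows the M/D/YYYY layout seen above:

parsed = pd.to_datetime(air_quality["Date"], format="%m/%d/%Y")
air_quality["Month"] = parsed.dt.month  # already numeric, no pd.to_numeric needed
air_quality["Day"] = parsed.dt.day
air_quality["Year"] = parsed.dt.year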
In [7]:
# Remove the original Date column, now redundant
air_quality_clean = air_quality.drop(['Date'], axis=1)
air_quality_clean
Out[7]:
CBSA Code AQI Category Defining Parameter Number of Sites Reporting city_ascii state_id state_name lat lng population density timezone Month Day Year
0 10140 21 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles 1 1 2022
1 10140 12 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles 1 2 2022
2 10140 18 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles 1 3 2022
3 10140 19 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles 1 4 2022
4 10140 17 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles 1 5 2022
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 13900 66 Moderate PM2.5 2 Bismarck ND North Dakota 46.8143 -100.7694 90420 817 America/Chicago 6 11 2015
1048571 13900 53 Moderate PM2.5 2 Bismarck ND North Dakota 46.8143 -100.7694 90420 817 America/Chicago 6 12 2015
1048572 13900 58 Moderate Ozone 2 Bismarck ND North Dakota 46.8143 -100.7694 90420 817 America/Chicago 6 13 2015
1048573 13900 61 Moderate Ozone 2 Bismarck ND North Dakota 46.8143 -100.7694 90420 817 America/Chicago 6 14 2015
1048574 13900 34 Good Ozone 2 Bismarck ND North Dakota 46.8143 -100.7694 90420 817 America/Chicago 6 15 2015

1048575 rows × 16 columns

In [8]:
# Convert the Month, Day, Year string columns to numeric
air_quality_clean['Month'] = pd.to_numeric(air_quality_clean['Month'])
air_quality_clean['Day'] = pd.to_numeric(air_quality_clean['Day'])
air_quality_clean['Year'] = pd.to_numeric(air_quality_clean['Year'])
air_quality_clean.dtypes
Out[8]:
CBSA Code                      int64
AQI                            int64
Category                      object
Defining Parameter            object
Number of Sites Reporting      int64
city_ascii                    object
state_id                      object
state_name                    object
lat                          float64
lng                          float64
population                     int64
density                        int64
timezone                      object
Month                          int64
Day                            int64
Year                           int64
dtype: object
In [9]:
# Filter to 2019 and later to keep the dataset within computational resource constraints
air_quality_clean1 = air_quality_clean[air_quality_clean["Year"] >= 2019]
air_quality_clean1
air_quality_clean1
Out[9]:
CBSA Code AQI Category Defining Parameter Number of Sites Reporting city_ascii state_id state_name lat lng population density timezone Month Day Year
0 10140 21 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles 1 1 2022
1 10140 12 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles 1 2 2022
2 10140 18 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles 1 3 2022
3 10140 19 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles 1 4 2022
4 10140 17 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571 588 America/Los_Angeles 1 5 2022
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
526803 49740 32 Good Ozone 1 Yuma AZ Arizona 32.5995 -114.5491 137612 311 America/Phoenix 12 27 2019
526804 49740 37 Good PM2.5 1 Yuma AZ Arizona 32.5995 -114.5491 137612 311 America/Phoenix 12 28 2019
526805 49740 35 Good Ozone 1 Yuma AZ Arizona 32.5995 -114.5491 137612 311 America/Phoenix 12 29 2019
526806 49740 27 Good Ozone 1 Yuma AZ Arizona 32.5995 -114.5491 137612 311 America/Phoenix 12 30 2019
526807 49740 32 Good Ozone 1 Yuma AZ Arizona 32.5995 -114.5491 137612 311 America/Phoenix 12 31 2019

526808 rows × 16 columns

In [10]:
# One-hot encode the categorical variables
air_quality_clean2 = pd.get_dummies(air_quality_clean1)
air_quality_clean2.dtypes
Out[10]:
CBSA Code                         int64
AQI                               int64
Number of Sites Reporting         int64
lat                             float64
lng                             float64
                                 ...   
timezone_America/Matamoros        uint8
timezone_America/New_York         uint8
timezone_America/Phoenix          uint8
timezone_America/Puerto_Rico      uint8
timezone_Pacific/Honolulu         uint8
Length: 610, dtype: object
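With no columns= argument, pd.get_dummies encodes every object-dtype column from the dtypes summary above (Category, Defining Parameter, city_ascii, state_id, state_name, timezone), which is what expands the frame to 610 columns; note that Category is itself a binned version of AQI, so encoding it hands the model a coarse view of the target, which may partly explain the very low scaled MAE later. To restrict encoding to chosen columns, the standard columns= parameter applies; a minimal sketch (the column choice here is illustrative):

# Encode only two of the categorical columns, leaving the rest as-is:
subset_encoded = pd.get_dummies(air_quality_clean1, columns=["Defining Parameter", "timezone"])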
In [219]:
# Split into train (80%) and test (20%) sets
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(air_quality_clean2, test_size=0.2)
train_data_y = train_data['AQI']
train_data_x = train_data.drop(columns=['AQI'])
test_data_y = test_data['AQI']
test_data_x = test_data.drop(columns=['AQI'])
test_data_x  # display the test features (105,362 rows = 20% of 526,808; 609 columns after dropping AQI)
Out[219]:
CBSA Code Number of Sites Reporting lat lng population density Month Day Year Category_Good ... timezone_America/Detroit timezone_America/Indiana/Indianapolis timezone_America/Indiana/Vincennes timezone_America/Juneau timezone_America/Los_Angeles timezone_America/Matamoros timezone_America/New_York timezone_America/Phoenix timezone_America/Puerto_Rico timezone_Pacific/Honolulu
244428 22280 2 39.5627 -119.1906 20616 65 11 7 2020 1 ... 0 0 0 0 1 0 0 0 0 0
111377 30140 1 40.3412 -76.4228 80620 2388 9 18 2021 1 ... 0 0 0 0 0 0 1 0 0 0
443111 29740 9 32.3265 -106.7893 129538 516 4 10 2019 0 ... 0 0 0 0 0 0 0 0 0 0
279863 30780 3 34.7256 -92.3577 446814 637 4 13 2020 1 ... 0 0 0 0 0 0 0 0 0 0
122409 33340 5 43.0642 -87.9675 1390046 2379 1 11 2021 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
221176 17140 13 39.1413 -84.5060 1684800 1501 3 17 2020 1 ... 0 0 0 0 0 0 1 0 0 0
4806 17300 1 36.5692 -87.3413 183798 606 2 9 2022 1 ... 0 0 0 0 0 0 0 0 0 0
385701 16820 1 38.0375 -78.4855 102164 1779 8 5 2019 1 ... 0 0 0 0 0 0 1 0 0 0
511376 46020 1 39.3455 -120.1848 16666 199 7 1 2019 0 ... 0 0 0 0 1 0 0 0 0 0
191079 49620 2 39.9651 -76.7315 236818 3211 3 8 2021 1 ... 0 0 0 0 0 0 1 0 0 0

105362 rows × 609 columns
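train_test_split shuffles at random each run; passing the standard random_state parameter would make the split reproducible. A minimal sketch (the seed value is arbitrary):

train_data, test_data = train_test_split(air_quality_clean2, test_size=0.2, random_state=42)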

In [221]:
# Convert from pandas DataFrames/Series to NumPy arrays
train_data_x = np.array(train_data_x)
train_data_y = np.array(train_data_y)
test_data_x = np.array(test_data_x)
test_data_y = np.array(test_data_y)
In [224]:
# Min/Max scale the features and the target, fitting on the training set only
from sklearn.preprocessing import MinMaxScaler
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()  # separate scaler for the target, so it can be inverted later

train_data_y = train_data_y.reshape(-1, 1)
train_data_y_s = y_scaler.fit_transform(train_data_y)

test_data_y = test_data_y.reshape(-1, 1)
test_data_y_s = y_scaler.transform(test_data_y)

train_data_x_s = x_scaler.fit_transform(train_data_x)
test_data_x_s = x_scaler.transform(test_data_x)
In [225]:
# Import TensorFlow/Keras libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
In [226]:
# Create the model: 3 dense layers (no activations specified, so each layer is linear)
tf.random.set_seed(42)
air_model = tf.keras.Sequential([
    tf.keras.layers.Dense(10),
    tf.keras.layers.Dense(10),
    tf.keras.layers.Dense(1)
])
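Because none of the Dense layers above specifies an activation, each is linear and the stack collapses to a single affine transformation. If nonlinearity is wanted, a common variant (not what was trained here) adds ReLU to the hidden layers; a sketch:

# Hypothetical nonlinear variant of the same architecture:
alt_model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(1)  # linear output for regression
])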
In [227]:
# Compile the model with MAE loss and the Adam optimizer
air_model.compile(loss=tf.keras.losses.mae, optimizer=tf.keras.optimizers.Adam(), metrics=["mae"])
In [228]:
# Train the model, holding out 10% of the training data as a validation set to monitor overfitting
air_model.fit(x=train_data_x_s, y=train_data_y_s, validation_split=0.1, batch_size=10, epochs=10, shuffle=True, verbose=2)
Epoch 1/10
37931/37931 - 35s - loss: 0.0077 - mae: 0.0077 - val_loss: 0.0070 - val_mae: 0.0070 - 35s/epoch - 923us/step
Epoch 2/10
37931/37931 - 41s - loss: 0.0069 - mae: 0.0069 - val_loss: 0.0073 - val_mae: 0.0073 - 41s/epoch - 1ms/step
Epoch 3/10
37931/37931 - 36s - loss: 0.0069 - mae: 0.0069 - val_loss: 0.0067 - val_mae: 0.0067 - 36s/epoch - 949us/step
Epoch 4/10
37931/37931 - 36s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0067 - val_mae: 0.0067 - 36s/epoch - 954us/step
Epoch 5/10
37931/37931 - 38s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0068 - val_mae: 0.0068 - 38s/epoch - 990us/step
Epoch 6/10
37931/37931 - 42s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0067 - val_mae: 0.0067 - 42s/epoch - 1ms/step
Epoch 7/10
37931/37931 - 38s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0067 - val_mae: 0.0067 - 38s/epoch - 1ms/step
Epoch 8/10
37931/37931 - 43s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0072 - val_mae: 0.0072 - 43s/epoch - 1ms/step
Epoch 9/10
37931/37931 - 40s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0067 - val_mae: 0.0067 - 40s/epoch - 1ms/step
Epoch 10/10
37931/37931 - 36s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0067 - val_mae: 0.0067 - 36s/epoch - 944us/step
Out[228]:
<keras.callbacks.History at 0x11fa514adc0>
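The validation split only reports val_mae per epoch; to stop training automatically once validation loss stops improving, Keras' standard EarlyStopping callback could be attached. A sketch (the patience value is arbitrary):

early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
air_model.fit(x=train_data_x_s, y=train_data_y_s, validation_split=0.1,
              batch_size=10, epochs=10, shuffle=True, verbose=2, callbacks=[early_stop])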
In [231]:
# Predict on the scaled test features
predictions = air_model.predict(x=test_data_x_s, batch_size=10, verbose=2)
predictions.shape
10537/10537 - 6s - 6s/epoch - 604us/step
Out[231]:
(105362, 1)
In [232]:
# Check that the test target has the same shape as the predictions
test_data_y_s.shape
Out[232]:
(105362, 1)
In [233]:
# The computed test MAE of about 0.0067 (on the scaled target) matches the validation MAE seen during training
from sklearn.metrics import mean_absolute_error
mean_absolute_error(test_data_y_s, predictions)
Out[233]:
0.006721152943301766
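Because the predictions and targets are both on the Min/Max-scaled 0-1 scale, this MAE is not in AQI units. Using the separate y_scaler fitted above, the error can be mapped back to raw AQI; a sketch (the result depends on the AQI range seen in the training data):

predictions_aqi = y_scaler.inverse_transform(predictions)  # back to AQI units
test_y_aqi = y_scaler.inverse_transform(test_data_y_s)
print(mean_absolute_error(test_y_aqi, predictions_aqi))
# Equivalently: scaled MAE * (y_scaler.data_max_[0] - y_scaler.data_min_[0])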