# Author: Stephen Situ
# This project takes US air-quality data and builds a regression model to predict
# AQI (Air Quality Index), which typically ranges from 0 to 300, using a deep
# neural network with 3 Dense layers in TensorFlow/Keras (Python). The data was
# split into training and test sets, one-hot encoded for the categorical
# variables, and Min/Max scaled. With this basic network we achieve a very low
# MAE of 0.0067 on the scaled target.
# Original dataset: https://www.kaggle.com/datasets/calebreigada/us-air-quality-1980present
# Import Libraries
import numpy as np
import pandas as pd
# Read CSV
air_quality = pd.read_csv('US_AQI.csv')
air_quality.head()
 | CBSA Code | Date | AQI | Category | Defining Parameter | Number of Sites Reporting | city_ascii | state_id | state_name | lat | lng | population | density | timezone |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10140 | 1/1/2022 | 21 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles |
1 | 10140 | 1/2/2022 | 12 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles |
2 | 10140 | 1/3/2022 | 18 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles |
3 | 10140 | 1/4/2022 | 19 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles |
4 | 10140 | 1/5/2022 | 17 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles |
# Data Summary
air_quality.dtypes
CBSA Code                      int64
Date                          object
AQI                            int64
Category                      object
Defining Parameter            object
Number of Sites Reporting      int64
city_ascii                    object
state_id                      object
state_name                    object
lat                          float64
lng                          float64
population                     int64
density                        int64
timezone                      object
dtype: object
# Check for duplicate rows
duplicate_rows = air_quality[air_quality.duplicated(keep='last')]
print(duplicate_rows)
Empty DataFrame
Columns: [CBSA Code, Date, AQI, Category, Defining Parameter, Number of Sites Reporting, city_ascii, state_id, state_name, lat, lng, population, density, timezone]
Index: []
# Check for null values
nan_rows = air_quality[air_quality.isna().any(axis=1)]
print(nan_rows)
Empty DataFrame
Columns: [CBSA Code, Date, AQI, Category, Defining Parameter, Number of Sites Reporting, city_ascii, state_id, state_name, lat, lng, population, density, timezone]
Index: []
# Split the Date string column into three columns: Month, Day, Year
date_split = air_quality["Date"].str.split("/", expand=True)
air_quality["Month"] = date_split[0]
air_quality["Day"] = date_split[1]
air_quality["Year"] = date_split[2]
air_quality
 | CBSA Code | Date | AQI | Category | Defining Parameter | Number of Sites Reporting | city_ascii | state_id | state_name | lat | lng | population | density | timezone | Month | Day | Year |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10140 | 1/1/2022 | 21 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 1 | 2022 |
1 | 10140 | 1/2/2022 | 12 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 2 | 2022 |
2 | 10140 | 1/3/2022 | 18 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 3 | 2022 |
3 | 10140 | 1/4/2022 | 19 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 4 | 2022 |
4 | 10140 | 1/5/2022 | 17 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 5 | 2022 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1048570 | 13900 | 6/11/2015 | 66 | Moderate | PM2.5 | 2 | Bismarck | ND | North Dakota | 46.8143 | -100.7694 | 90420 | 817 | America/Chicago | 6 | 11 | 2015 |
1048571 | 13900 | 6/12/2015 | 53 | Moderate | PM2.5 | 2 | Bismarck | ND | North Dakota | 46.8143 | -100.7694 | 90420 | 817 | America/Chicago | 6 | 12 | 2015 |
1048572 | 13900 | 6/13/2015 | 58 | Moderate | Ozone | 2 | Bismarck | ND | North Dakota | 46.8143 | -100.7694 | 90420 | 817 | America/Chicago | 6 | 13 | 2015 |
1048573 | 13900 | 6/14/2015 | 61 | Moderate | Ozone | 2 | Bismarck | ND | North Dakota | 46.8143 | -100.7694 | 90420 | 817 | America/Chicago | 6 | 14 | 2015 |
1048574 | 13900 | 6/15/2015 | 34 | Good | Ozone | 2 | Bismarck | ND | North Dakota | 46.8143 | -100.7694 | 90420 | 817 | America/Chicago | 6 | 15 | 2015 |
1048575 rows × 17 columns
# Remove original date column
air_quality_clean = air_quality.drop(['Date'], axis=1)
air_quality_clean
 | CBSA Code | AQI | Category | Defining Parameter | Number of Sites Reporting | city_ascii | state_id | state_name | lat | lng | population | density | timezone | Month | Day | Year |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10140 | 21 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 1 | 2022 |
1 | 10140 | 12 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 2 | 2022 |
2 | 10140 | 18 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 3 | 2022 |
3 | 10140 | 19 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 4 | 2022 |
4 | 10140 | 17 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 5 | 2022 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1048570 | 13900 | 66 | Moderate | PM2.5 | 2 | Bismarck | ND | North Dakota | 46.8143 | -100.7694 | 90420 | 817 | America/Chicago | 6 | 11 | 2015 |
1048571 | 13900 | 53 | Moderate | PM2.5 | 2 | Bismarck | ND | North Dakota | 46.8143 | -100.7694 | 90420 | 817 | America/Chicago | 6 | 12 | 2015 |
1048572 | 13900 | 58 | Moderate | Ozone | 2 | Bismarck | ND | North Dakota | 46.8143 | -100.7694 | 90420 | 817 | America/Chicago | 6 | 13 | 2015 |
1048573 | 13900 | 61 | Moderate | Ozone | 2 | Bismarck | ND | North Dakota | 46.8143 | -100.7694 | 90420 | 817 | America/Chicago | 6 | 14 | 2015 |
1048574 | 13900 | 34 | Good | Ozone | 2 | Bismarck | ND | North Dakota | 46.8143 | -100.7694 | 90420 | 817 | America/Chicago | 6 | 15 | 2015 |
1048575 rows × 16 columns
# Convert the Month, Day, Year columns to numeric
air_quality_clean['Month'] = pd.to_numeric(air_quality_clean['Month'])
air_quality_clean['Day'] = pd.to_numeric(air_quality_clean['Day'])
air_quality_clean['Year'] = pd.to_numeric(air_quality_clean['Year'])
air_quality_clean.dtypes
CBSA Code                      int64
AQI                            int64
Category                      object
Defining Parameter            object
Number of Sites Reporting      int64
city_ascii                    object
state_id                      object
state_name                    object
lat                          float64
lng                          float64
population                     int64
density                        int64
timezone                      object
Month                          int64
Day                            int64
Year                           int64
dtype: object
# Filter to 2019 and later due to computational resource constraints
air_quality_clean1 = air_quality_clean[air_quality_clean["Year"] >= 2019]
air_quality_clean1
 | CBSA Code | AQI | Category | Defining Parameter | Number of Sites Reporting | city_ascii | state_id | state_name | lat | lng | population | density | timezone | Month | Day | Year |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10140 | 21 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 1 | 2022 |
1 | 10140 | 12 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 2 | 2022 |
2 | 10140 | 18 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 3 | 2022 |
3 | 10140 | 19 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 4 | 2022 |
4 | 10140 | 17 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571 | 588 | America/Los_Angeles | 1 | 5 | 2022 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
526803 | 49740 | 32 | Good | Ozone | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612 | 311 | America/Phoenix | 12 | 27 | 2019 |
526804 | 49740 | 37 | Good | PM2.5 | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612 | 311 | America/Phoenix | 12 | 28 | 2019 |
526805 | 49740 | 35 | Good | Ozone | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612 | 311 | America/Phoenix | 12 | 29 | 2019 |
526806 | 49740 | 27 | Good | Ozone | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612 | 311 | America/Phoenix | 12 | 30 | 2019 |
526807 | 49740 | 32 | Good | Ozone | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612 | 311 | America/Phoenix | 12 | 31 | 2019 |
526808 rows × 16 columns
# One-hot encode categorical variables
air_quality_clean2 = pd.get_dummies(air_quality_clean1)
air_quality_clean2.dtypes
CBSA Code                         int64
AQI                               int64
Number of Sites Reporting         int64
lat                             float64
lng                             float64
                                 ...
timezone_America/Matamoros        uint8
timezone_America/New_York         uint8
timezone_America/Phoenix          uint8
timezone_America/Puerto_Rico      uint8
timezone_Pacific/Honolulu         uint8
Length: 610, dtype: object
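# Caveat (a sketch, not part of the pipeline above): Category is AQI binned into
# labels such as Good and Moderate, so its dummy columns hand the model the
# target's range almost directly. A stricter feature set drops it before encoding:
air_quality_no_category = pd.get_dummies(air_quality_clean1.drop(columns=['Category']))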
# Split into Train and Test
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(air_quality_clean2, test_size=0.2)
train_data_y = train_data['AQI']
train_data_x = train_data.drop(columns=['AQI'])
test_data_y = test_data['AQI']
test_data_x = test_data.drop(columns=['AQI'])
 | CBSA Code | Number of Sites Reporting | lat | lng | population | density | Month | Day | Year | Category_Good | ... | timezone_America/Detroit | timezone_America/Indiana/Indianapolis | timezone_America/Indiana/Vincennes | timezone_America/Juneau | timezone_America/Los_Angeles | timezone_America/Matamoros | timezone_America/New_York | timezone_America/Phoenix | timezone_America/Puerto_Rico | timezone_Pacific/Honolulu |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
244428 | 22280 | 2 | 39.5627 | -119.1906 | 20616 | 65 | 11 | 7 | 2020 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
111377 | 30140 | 1 | 40.3412 | -76.4228 | 80620 | 2388 | 9 | 18 | 2021 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
443111 | 29740 | 9 | 32.3265 | -106.7893 | 129538 | 516 | 4 | 10 | 2019 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
279863 | 30780 | 3 | 34.7256 | -92.3577 | 446814 | 637 | 4 | 13 | 2020 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
122409 | 33340 | 5 | 43.0642 | -87.9675 | 1390046 | 2379 | 1 | 11 | 2021 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
221176 | 17140 | 13 | 39.1413 | -84.5060 | 1684800 | 1501 | 3 | 17 | 2020 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4806 | 17300 | 1 | 36.5692 | -87.3413 | 183798 | 606 | 2 | 9 | 2022 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
385701 | 16820 | 1 | 38.0375 | -78.4855 | 102164 | 1779 | 8 | 5 | 2019 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
511376 | 46020 | 1 | 39.3455 | -120.1848 | 16666 | 199 | 7 | 1 | 2019 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
191079 | 49620 | 2 | 39.9651 | -76.7315 | 236818 | 3211 | 3 | 8 | 2021 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
105362 rows × 609 columns
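# Note: train_test_split is unseeded above, so the exact rows shown will differ
# from run to run; passing random_state=<int> makes the split reproducible.
# Also, each row is one day for one city, so a random split puts near-duplicate
# neighboring days in both sets. A temporal split is a hedged alternative
# (a sketch; the 2020/2021 cutoff is illustrative):
train_data_t = air_quality_clean2[air_quality_clean2["Year"] <= 2020]
test_data_t = air_quality_clean2[air_quality_clean2["Year"] >= 2021]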
# Convert the pandas DataFrames/Series to NumPy arrays
train_data_x = np.array(train_data_x)
train_data_y = np.array(train_data_y)
test_data_x = np.array(test_data_x)
test_data_y = np.array(test_data_y)
# Min/Max scale features and target to [0, 1], using separate scalers fit on the
# training data only (fitting on train and transforming test avoids leakage;
# keeping a dedicated y_scaler also lets us invert predictions later)
from sklearn.preprocessing import MinMaxScaler
y_scaler = MinMaxScaler()
train_data_y = train_data_y.reshape(-1, 1)
train_data_y_s = y_scaler.fit_transform(train_data_y)
test_data_y = test_data_y.reshape(-1, 1)
test_data_y_s = y_scaler.transform(test_data_y)
x_scaler = MinMaxScaler()
train_data_x_s = x_scaler.fit_transform(train_data_x)
test_data_x_s = x_scaler.transform(test_data_x)
# Import TensorFlow/Keras libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
# Create a model with 3 Dense layers
# (no activation is specified, so every layer defaults to linear)
tf.random.set_seed(42)
air_model = tf.keras.Sequential([
    tf.keras.layers.Dense(10),  # hidden layer, linear by default
    tf.keras.layers.Dense(10),  # hidden layer, linear by default
    tf.keras.layers.Dense(1)    # single regression output
])
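# Because the layers above have no activation, the stack composes to a single
# linear model. A nonlinear variant (a sketch; all results reported here come
# from the linear stack) adds ReLU activations to the hidden layers:
air_model_relu = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(1)  # keep a linear output for regression
])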
# Compile Model
air_model.compile(loss=tf.keras.losses.mae, optimizer=tf.keras.optimizers.Adam(), metrics=["mae"])
# Train the model with a 10% validation split to monitor overfitting
air_model.fit(x=train_data_x_s, y=train_data_y_s, validation_split=0.1, batch_size=10, epochs=10, shuffle=True, verbose=2)
Epoch 1/10
37931/37931 - 35s - loss: 0.0077 - mae: 0.0077 - val_loss: 0.0070 - val_mae: 0.0070 - 35s/epoch - 923us/step
Epoch 2/10
37931/37931 - 41s - loss: 0.0069 - mae: 0.0069 - val_loss: 0.0073 - val_mae: 0.0073 - 41s/epoch - 1ms/step
Epoch 3/10
37931/37931 - 36s - loss: 0.0069 - mae: 0.0069 - val_loss: 0.0067 - val_mae: 0.0067 - 36s/epoch - 949us/step
Epoch 4/10
37931/37931 - 36s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0067 - val_mae: 0.0067 - 36s/epoch - 954us/step
Epoch 5/10
37931/37931 - 38s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0068 - val_mae: 0.0068 - 38s/epoch - 990us/step
Epoch 6/10
37931/37931 - 42s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0067 - val_mae: 0.0067 - 42s/epoch - 1ms/step
Epoch 7/10
37931/37931 - 38s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0067 - val_mae: 0.0067 - 38s/epoch - 1ms/step
Epoch 8/10
37931/37931 - 43s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0072 - val_mae: 0.0072 - 43s/epoch - 1ms/step
Epoch 9/10
37931/37931 - 40s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0067 - val_mae: 0.0067 - 40s/epoch - 1ms/step
Epoch 10/10
37931/37931 - 36s - loss: 0.0068 - mae: 0.0068 - val_loss: 0.0067 - val_mae: 0.0067 - 36s/epoch - 944us/step
<keras.callbacks.History at 0x11fa514adc0>
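# A validation split only reports overfitting; acting on it takes a callback.
# A sketch using Keras' EarlyStopping (the patience value is illustrative):
# stop when val_loss stalls and restore the best weights, passing
# callbacks=[early_stop] to air_model.fit alongside validation_split.
early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)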
# Predict on the test features
predictions = air_model.predict(x=test_data_x_s, batch_size=10, verbose=2)
predictions.shape
10537/10537 - 6s - 6s/epoch - 604us/step
(105362, 1)
# Check that the scaled test targets have the same shape as the predictions
test_data_y_s.shape
(105362, 1)
# The test-set MAE of ~0.0067 (computed on the Min/Max-scaled target) matches the validation MAE observed during training
from sklearn.metrics import mean_absolute_error
mean_absolute_error(test_data_y_s, predictions)
0.006721152943301766
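# The 0.0067 MAE is in scaled units. A sketch of converting it to raw AQI points
# with the target scaler fit earlier (y_scaler); equivalently, the raw-unit MAE
# is the scaled MAE times the training target's (max - min) range.
predictions_raw = y_scaler.inverse_transform(predictions)
test_y_raw = y_scaler.inverse_transform(test_data_y_s)
mean_absolute_error(test_y_raw, predictions_raw)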