This project applies three related machine learning techniques to the classification of labeled data with continuous features. Linear Discriminant Analysis (LDA) looks for a linear combination of features that separates the classes, so its decision boundary takes the shape of a line or hyperplane. LDA assumes that the data within each class are normally distributed and that the class covariance matrices are identical. Quadratic Discriminant Analysis (QDA) is similar to LDA, but it allows each class its own covariance matrix, so its decision boundary is quadratic rather than linear. A Support Vector Machine (SVM) builds its decision boundary from support vectors and chooses the boundary that maximizes the distance between the different groups, known as maximizing the margin of separation. SVM does not require any distributional assumptions and is more flexible.
First, we ran a Shapiro-Wilk test on each feature to check the normality of the data set and found that we could not assume the data are normally distributed. We then applied all three methods and evaluated their accuracy. They all gave a similar accuracy of about 88% on the training data. To investigate further, we created a scatter plot of correct and incorrect predictions for each method, which lets us examine the results of each method in more depth.
Original Data set: https://www.kaggle.com/datasets/muratkokludataset/pumpkin-seeds-dataset
# Install tidyverse only when it is missing, so that re-running the script
# does not download and reinstall a package that is already available.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpkP6CUZ\downloaded_packages
# Install e1071 only when it is missing. Reinstalling over an existing copy is
# what produced the "cannot remove prior installation" warnings in the
# captured log.
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071", repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'e1071' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'e1071'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\Steve\AppData\Local\R\win-library\4.2\00LOCK\e1071\libs\x64\e1071.dll
## to C:\Users\Steve\AppData\Local\R\win-library\4.2\e1071\libs\x64\e1071.dll:
## Permission denied
## Warning: restored 'e1071'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpkP6CUZ\downloaded_packages
# Install MASS only when it is missing. Reinstalling over an existing copy is
# what produced the "cannot remove prior installation" warnings in the
# captured log.
if (!requireNamespace("MASS", quietly = TRUE)) {
  install.packages("MASS", repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'MASS' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'MASS'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\Steve\AppData\Local\R\win-library\4.2\00LOCK\MASS\libs\x64\MASS.dll to C:
## \Users\Steve\AppData\Local\R\win-library\4.2\MASS\libs\x64\MASS.dll: Permission
## denied
## Warning: restored 'MASS'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpkP6CUZ\downloaded_packages
# Install caret only when it is missing. Reinstalling over an existing copy is
# what produced the "cannot remove prior installation" warnings in the
# captured log.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret", repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'caret' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'caret'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\Steve\AppData\Local\R\win-library\4.2\00LOCK\caret\libs\x64\caret.dll
## to C:\Users\Steve\AppData\Local\R\win-library\4.2\caret\libs\x64\caret.dll:
## Permission denied
## Warning: restored 'caret'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpkP6CUZ\downloaded_packages
# Install readxl only when it is missing. Reinstalling over an existing copy is
# what produced the "cannot remove prior installation" warnings in the
# captured log.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl", repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'readxl'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\Steve\AppData\Local\R\win-library\4.2\00LOCK\readxl\libs\x64\readxl.dll
## to C:\Users\Steve\AppData\Local\R\win-library\4.2\readxl\libs\x64\readxl.dll:
## Permission denied
## Warning: restored 'readxl'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpkP6CUZ\downloaded_packages
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(e1071)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(readxl)
# Read the first worksheet of the pumpkin-seed workbook into `df`, then print
# per-column summary statistics.
df <- read_xlsx(path = "Pumpkin_Seeds_Dataset.xlsx", sheet = 1)
summary(object = df)
## Area Perimeter Major_Axis_Length Minor_Axis_Length
## Min. : 47939 Min. : 868.5 Min. :320.8 Min. :152.2
## 1st Qu.: 70765 1st Qu.:1048.8 1st Qu.:415.0 1st Qu.:211.2
## Median : 79076 Median :1123.7 Median :449.5 Median :224.7
## Mean : 80658 Mean :1130.3 Mean :456.6 Mean :225.8
## 3rd Qu.: 89758 3rd Qu.:1203.3 3rd Qu.:492.7 3rd Qu.:240.7
## Max. :136574 Max. :1559.5 Max. :661.9 Max. :305.8
## Convex_Area Equiv_Diameter Eccentricity Solidity
## Min. : 48366 Min. :247.1 Min. :0.4921 Min. :0.9186
## 1st Qu.: 71512 1st Qu.:300.2 1st Qu.:0.8317 1st Qu.:0.9883
## Median : 79872 Median :317.3 Median :0.8637 Median :0.9903
## Mean : 81508 Mean :319.3 Mean :0.8609 Mean :0.9895
## 3rd Qu.: 90798 3rd Qu.:338.1 3rd Qu.:0.8970 3rd Qu.:0.9915
## Max. :138384 Max. :417.0 Max. :0.9481 Max. :0.9944
## Extent Roundness Aspect_Ration Compactness
## Min. :0.4680 Min. :0.5546 Min. :1.149 Min. :0.5608
## 1st Qu.:0.6589 1st Qu.:0.7519 1st Qu.:1.801 1st Qu.:0.6635
## Median :0.7130 Median :0.7977 Median :1.984 Median :0.7077
## Mean :0.6932 Mean :0.7915 Mean :2.042 Mean :0.7041
## 3rd Qu.:0.7402 3rd Qu.:0.8343 3rd Qu.:2.262 3rd Qu.:0.7435
## Max. :0.8296 Max. :0.9396 Max. :3.144 Max. :0.9049
## Class
## Length:2500
## Class :character
## Mode :character
##
##
##
# Show the column names, types and first few values of the raw data.
utils::str(df)
## tibble [2,500 × 13] (S3: tbl_df/tbl/data.frame)
## $ Area : num [1:2500] 56276 76631 71623 66458 66107 ...
## $ Perimeter : num [1:2500] 888 1068 1083 992 998 ...
## $ Major_Axis_Length: num [1:2500] 326 417 436 382 384 ...
## $ Minor_Axis_Length: num [1:2500] 220 234 211 223 220 ...
## $ Convex_Area : num [1:2500] 56831 77280 72663 67118 67117 ...
## $ Equiv_Diameter : num [1:2500] 268 312 302 291 290 ...
## $ Eccentricity : num [1:2500] 0.738 0.828 0.875 0.812 0.819 ...
## $ Solidity : num [1:2500] 0.99 0.992 0.986 0.99 0.985 ...
## $ Extent : num [1:2500] 0.745 0.715 0.74 0.74 0.675 ...
## $ Roundness : num [1:2500] 0.896 0.844 0.767 0.849 0.834 ...
## $ Aspect_Ration : num [1:2500] 1.48 1.78 2.07 1.71 1.74 ...
## $ Compactness : num [1:2500] 0.821 0.749 0.693 0.762 0.756 ...
## $ Class : chr [1:2500] "Çerçevelik" "Çerçevelik" "Çerçevelik" "Çerçevelik" ...
# Print the leading rows of the raw data.
utils::head(df)
## # A tibble: 6 × 13
## Area Perimeter Major…¹ Minor…² Conve…³ Equiv…⁴ Eccen…⁵ Solid…⁶ Extent Round…⁷
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 56276 888. 326. 220. 56831 268. 0.738 0.990 0.745 0.896
## 2 76631 1068. 417. 234. 77280 312. 0.828 0.992 0.715 0.844
## 3 71623 1083. 436. 211. 72663 302. 0.875 0.986 0.74 0.767
## 4 66458 992. 382. 223. 67118 291. 0.812 0.990 0.740 0.849
## 5 66107 998. 384. 220. 67117 290. 0.819 0.985 0.675 0.834
## 6 73191 1041. 406. 231. 73969 305. 0.822 0.990 0.716 0.848
## # … with 3 more variables: Aspect_Ration <dbl>, Compactness <dbl>, Class <chr>,
## # and abbreviated variable names ¹Major_Axis_Length, ²Minor_Axis_Length,
## # ³Convex_Area, ⁴Equiv_Diameter, ⁵Eccentricity, ⁶Solidity, ⁷Roundness
# Build the modelling data set in one pipeline: correct the misspelled
# Aspect_Ration column name and turn the class label into a factor, which the
# classifiers below use as the response.
pumpkin_data <- df %>%
  rename(Aspect_ratio = Aspect_Ration) %>%
  mutate(Class = as.factor(Class))
str(pumpkin_data)
## tibble [2,500 × 13] (S3: tbl_df/tbl/data.frame)
## $ Area : num [1:2500] 56276 76631 71623 66458 66107 ...
## $ Perimeter : num [1:2500] 888 1068 1083 992 998 ...
## $ Major_Axis_Length: num [1:2500] 326 417 436 382 384 ...
## $ Minor_Axis_Length: num [1:2500] 220 234 211 223 220 ...
## $ Convex_Area : num [1:2500] 56831 77280 72663 67118 67117 ...
## $ Equiv_Diameter : num [1:2500] 268 312 302 291 290 ...
## $ Eccentricity : num [1:2500] 0.738 0.828 0.875 0.812 0.819 ...
## $ Solidity : num [1:2500] 0.99 0.992 0.986 0.99 0.985 ...
## $ Extent : num [1:2500] 0.745 0.715 0.74 0.74 0.675 ...
## $ Roundness : num [1:2500] 0.896 0.844 0.767 0.849 0.834 ...
## $ Aspect_ratio : num [1:2500] 1.48 1.78 2.07 1.71 1.74 ...
## $ Compactness : num [1:2500] 0.821 0.749 0.693 0.762 0.756 ...
## $ Class : Factor w/ 2 levels "Çerçevelik","Ürgüp Sivrisi": 1 1 1 1 1 1 1 1 1 1 ...
# Fix the random-number seed for the rest of the script.
set.seed(123)
# Shapiro-Wilk normality test on the Area column, using every row (both seed
# classes pooled together).
# NOTE(review): the LDA/QDA normality assumption applies within each class;
# confirm whether per-class tests were intended.
shapiro.test(pumpkin_data$Area)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Area
## W = 0.9834, p-value < 2.2e-16
# Shapiro-Wilk normality test on the Perimeter column (all rows).
shapiro.test(pumpkin_data$Perimeter)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Perimeter
## W = 0.98795, p-value = 1.076e-13
# Shapiro-Wilk normality test on the Major_Axis_Length column (all rows).
shapiro.test(pumpkin_data$Major_Axis_Length)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Major_Axis_Length
## W = 0.98202, p-value < 2.2e-16
# Shapiro-Wilk normality test on the Minor_Axis_Length column (all rows).
shapiro.test(pumpkin_data$Minor_Axis_Length)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Minor_Axis_Length
## W = 0.99816, p-value = 0.00591
# Shapiro-Wilk normality test on the Convex_Area column (all rows).
shapiro.test(pumpkin_data$Convex_Area)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Convex_Area
## W = 0.98344, p-value < 2.2e-16
# Shapiro-Wilk normality test on the Equiv_Diameter column (all rows).
shapiro.test(pumpkin_data$Equiv_Diameter)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Equiv_Diameter
## W = 0.9935, p-value = 4.331e-09
# Shapiro-Wilk normality test on the Eccentricity column (all rows).
shapiro.test(pumpkin_data$Eccentricity)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Eccentricity
## W = 0.96585, p-value < 2.2e-16
# Shapiro-Wilk normality test on the Solidity column (all rows).
shapiro.test(pumpkin_data$Solidity)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Solidity
## W = 0.70041, p-value < 2.2e-16
# Shapiro-Wilk normality test on the Extent column (all rows).
shapiro.test(pumpkin_data$Extent)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Extent
## W = 0.90792, p-value < 2.2e-16
# Shapiro-Wilk normality test on the Roundness column (all rows).
shapiro.test(pumpkin_data$Roundness)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Roundness
## W = 0.98731, p-value = 3.924e-14
# Shapiro-Wilk normality test on the Aspect_ratio column (all rows).
shapiro.test(pumpkin_data$Aspect_ratio)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Aspect_ratio
## W = 0.97237, p-value < 2.2e-16
# Shapiro-Wilk normality test on the Compactness column (all rows).
shapiro.test(pumpkin_data$Compactness)
##
## Shapiro-Wilk normality test
##
## data: pumpkin_data$Compactness
## W = 0.99305, p-value = 1.512e-09
# Scatter plot of the raw data: Area against Aspect_ratio, with the seed class
# encoded by both point shape and colour. The title is centred.
pumpkin_data %>%
  ggplot() +
  geom_point(aes(x = Area, y = Aspect_ratio, shape = Class, colour = Class)) +
  labs(x = "Area", y = "Aspect Ratio", title = "Scatter Plot of Pumpkin Seeds") +
  theme(plot.title = element_text(hjust = 0.5))
# Fit LDA with Class as the response and every other column as a predictor.
# The original call also passed type = 'C-Classification' and cost = 10; those
# are svm() arguments that lda() does not use, so they are dropped here.
lda_model <- lda(Class ~ ., data = pumpkin_data)
# Predict on the same rows the model was fitted to, so the accuracy reported
# below is training accuracy rather than an out-of-sample estimate.
pred_lda <- predict(lda_model, pumpkin_data)
# Cross-tabulate predicted class (rows) against true class (columns).
cmatrix_lda <- table(predicted_values = pred_lda$class, true_values = pumpkin_data$Class)
confusionMatrix(cmatrix_lda)
## Confusion Matrix and Statistics
##
## true_values
## predicted_values Çerçevelik Ürgüp Sivrisi
## Çerçevelik 1185 183
## Ürgüp Sivrisi 115 1017
##
## Accuracy : 0.8808
## 95% CI : (0.8675, 0.8932)
## No Information Rate : 0.52
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7607
##
## Mcnemar's Test P-Value : 0.0001039
##
## Sensitivity : 0.9115
## Specificity : 0.8475
## Pos Pred Value : 0.8662
## Neg Pred Value : 0.8984
## Prevalence : 0.5200
## Detection Rate : 0.4740
## Detection Prevalence : 0.5472
## Balanced Accuracy : 0.8795
##
## 'Positive' Class : Çerçevelik
##
# Fit QDA with Class as the response and every other column as a predictor.
# The original call also passed type = 'C-Classification' and cost = 10; those
# are svm() arguments that qda() does not use, so they are dropped here.
qda_model <- qda(Class ~ ., data = pumpkin_data)
# Predict on the same rows the model was fitted to, so the accuracy reported
# below is training accuracy rather than an out-of-sample estimate.
pred_qda <- predict(qda_model, pumpkin_data)
# Cross-tabulate predicted class (rows) against true class (columns).
cmatrix_qda <- table(predicted_values = pred_qda$class, true_values = pumpkin_data$Class)
confusionMatrix(cmatrix_qda)
## Confusion Matrix and Statistics
##
## true_values
## predicted_values Çerçevelik Ürgüp Sivrisi
## Çerçevelik 1192 184
## Ürgüp Sivrisi 108 1016
##
## Accuracy : 0.8832
## 95% CI : (0.87, 0.8955)
## No Information Rate : 0.52
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7655
##
## Mcnemar's Test P-Value : 1.139e-05
##
## Sensitivity : 0.9169
## Specificity : 0.8467
## Pos Pred Value : 0.8663
## Neg Pred Value : 0.9039
## Prevalence : 0.5200
## Detection Rate : 0.4768
## Detection Prevalence : 0.5504
## Balanced Accuracy : 0.8818
##
## 'Positive' Class : Çerçevelik
##
# Fit a C-classification SVM with cost 10, using Class as the response and
# every other column as a predictor, then print the fitted model's summary.
# No kernel is specified, so svm()'s default kernel is used.
svm_model <- svm(
  Class ~ .,
  data = pumpkin_data,
  type = "C-classification",
  cost = 10
)
summary(svm_model)
##
## Call:
## svm(formula = Class ~ ., data = pumpkin_data, type = "C-classification",
## cost = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 688
##
## ( 345 343 )
##
##
## Number of Classes: 2
##
## Levels:
## Çerçevelik Ürgüp Sivrisi
# Predict on the same rows the SVM was fitted to (training predictions).
pred_svm <- predict(svm_model, pumpkin_data)
# Cross-tabulate predicted class (rows) against true class (columns).
cmatrix_svm <- table(predicted_values = pred_svm, true_values = pumpkin_data$Class)
# Report the SVM table. The original call passed cmatrix_qda here, so the
# output captured below is a repeat of the QDA results; re-run to refresh it.
confusionMatrix(cmatrix_svm)
## Confusion Matrix and Statistics
##
## true_values
## predicted_values Çerçevelik Ürgüp Sivrisi
## Çerçevelik 1192 184
## Ürgüp Sivrisi 108 1016
##
## Accuracy : 0.8832
## 95% CI : (0.87, 0.8955)
## No Information Rate : 0.52
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7655
##
## Mcnemar's Test P-Value : 1.139e-05
##
## Sensitivity : 0.9169
## Specificity : 0.8467
## Pos Pred Value : 0.8663
## Neg Pred Value : 0.9039
## Prevalence : 0.5200
## Detection Rate : 0.4768
## Detection Prevalence : 0.5504
## Balanced Accuracy : 0.8818
##
## 'Positive' Class : Çerçevelik
##
# Attach each model's predicted class to the data, then flag every row as
# "Correct" or "Incorrect" per model by comparing the prediction with the true
# Class. A single mutate() replaces the repeated `$` assignments with `=`.
pumpkin_data_final <- pumpkin_data %>%
  mutate(
    LDA_values = pred_lda$class,
    QDA_values = pred_qda$class,
    SVM_values = pred_svm,
    LDA_test = ifelse(Class == LDA_values, "Correct", "Incorrect"),
    QDA_test = ifelse(Class == QDA_values, "Correct", "Incorrect"),
    SVM_test = ifelse(Class == SVM_values, "Correct", "Incorrect")
  )
# Draw the Area / Aspect_ratio scatter for one classifier, colouring each
# point by whether that classifier labelled it correctly (green) or not
# (black) and using point shape for the true class.
#   data        data frame with Area, Aspect_ratio, Class and `result_col`
#   result_col  name of the "Correct"/"Incorrect" column to colour by
#   method      classifier name appended to the plot title
# Returns the ggplot object; at top level it is printed automatically.
plot_classification_result <- function(data, result_col, method) {
  ggplot(data = data) +
    geom_point(
      mapping = aes(
        x = Area,
        y = Aspect_ratio,
        shape = Class,
        colour = .data[[result_col]]
      )
    ) +
    scale_color_manual(values = c("Correct" = "green", "Incorrect" = "black")) +
    # Name the colour legend after the column, as a bare-column mapping would.
    labs(y = "Aspect Ratio", x = "Area", colour = result_col) +
    ggtitle(paste("Scatter Plot of Pumpkin Seed Classification using", method)) +
    theme(plot.title = element_text(hjust = 0.5))
}

plot_classification_result(pumpkin_data_final, "LDA_test", "LDA")
plot_classification_result(pumpkin_data_final, "QDA_test", "QDA")
plot_classification_result(pumpkin_data_final, "SVM_test", "SVM")