Principal component analysis
2021-01-09
Principal component analysis(PCA)
From this section, the machine learning will be discussed by foods data, the data from Mid-infrared spectroscopy will be used for Geographical origin of Extra Virgin Olive Oils (Quadram open dataset,2003).
Load dataset
## Load date
setwd("C:/blog/Dataset")
oil <- read.csv('FTIR_Spectra_olive_oils.csv',header = F,
stringsAsFactors = T)
print(oil[1:10,1:5])
## V1 V2 V3 V4 V5
## 1 Sample Number: 1 1 2 2
## 2 Group Code: 1 1 1 1
## 3 Wavenumbers Greece Greece Greece Greece
## 4 798.892 0.127523009 0.126498181 0.130411785 0.130022227
## 5 800.8215 0.127949615 0.127130974 0.130675401 0.130406662
## 6 802.751 0.129282219 0.128510777 0.13201661 0.132018029
## 7 804.6805 0.131174169 0.13033991 0.133824061 0.134007275
## 8 806.61 0.133590328 0.132527221 0.136095296 0.136270568
## 9 808.5395 0.136425525 0.135308508 0.138943757 0.13887477
## 10 810.469 0.139357827 0.13835292 0.141722779 0.141481132
Data wrangling
The raw data form are not suitable for R programming, therefore we need to transpose it to suitable form.
## data transpose
oil_transpose <- as.data.frame(t(oil))
print(oil_transpose[1:10,1:5])
## V1 V2 V3 V4 V5
## V1 Sample Number: Group Code: Wavenumbers 798.892 800.8215
## V2 1 1 Greece 0.127523009 0.127949615
## V3 1 1 Greece 0.126498181 0.127130974
## V4 2 1 Greece 0.130411785 0.130675401
## V5 2 1 Greece 0.130022227 0.130406662
## V6 3 1 Greece 0.128601989 0.128789565
## V7 3 1 Greece 0.128217254 0.128282253
## V8 4 1 Greece 0.126174933 0.126732773
## V9 4 1 Greece 0.126466053 0.126915413
## V10 5 1 Greece 0.127060105 0.127551128
## rename some col names to make data analysis easier
colnames(oil_transpose)=oil_transpose[1,]
oil_transpose <- oil_transpose[-1,]
library(dplyr)
oil_transpose <- rename(oil_transpose,c("Group" ="Group Code:",
"Number" ="Sample Number:",
"Countries" ="Wavenumbers"))
## Get transposed dataset
oil.data <- oil_transpose
print(oil.data[1:10,1:5])
## Number Group Countries 798.892 800.8215
## V2 1 1 Greece 0.127523009 0.127949615
## V3 1 1 Greece 0.126498181 0.127130974
## V4 2 1 Greece 0.130411785 0.130675401
## V5 2 1 Greece 0.130022227 0.130406662
## V6 3 1 Greece 0.128601989 0.128789565
## V7 3 1 Greece 0.128217254 0.128282253
## V8 4 1 Greece 0.126174933 0.126732773
## V9 4 1 Greece 0.126466053 0.126915413
## V10 5 1 Greece 0.127060105 0.127551128
## V11 5 1 Greece 0.126812707 0.127460743
Principal Component Analysis
The PCA is used to reduce the dimensionality of the spectral value, and initially explore the distribution of the sample.
The original data and related transformated data (MSC, SNV and Savitzky-Golay filtering) are used to obtain the PCA plots.
library(factoextra)
library(FactoMineR)
library(ggrepel)
library(ggplot2)
library(prospectr)
Raw data PCA
Have a look at spectral of different pre-processing for MIR.
colors = c("#CD0000", "#EE7600", "#FFFF00", "#008B45")
col <- as.factor(oil.data$Countries)
group <- c("Greece","Italy","Portuga", "Spain")
RAW spectral
raw_spc <- oil.data[,4:573]
x <- as.numeric(colnames(raw_spc))
y <- t(raw_spc[ , ])
par(mar=(c(5,7,4,2)))
matplot(x, y,col = colors,
type = 'l', lty = 1,
xlab = 'Wavelength (nm)', ylab = 'Absorbance',main = "Raw",
font.lab = 1,
cex.main = 1, cex.axis = 1, cex.lab=2)
legend("topleft",
title = "Country",
xpd = F,ncol = 1,
legend = group,
col = col,
fill = colors,
bty = "n",
cex = 1.5,
text.col = "black", text.font = 1,
horiz = F)
p1 <- recordPlot()
df_raw <- oil.data[,4:573]
df <- as.data.frame(apply(df_raw, 2, as.numeric))
print(df[1:10,1:5])
## 798.892 800.8215 802.751 804.6805 806.61
## 1 0.1275230 0.1279496 0.1292822 0.1311742 0.1335903
## 2 0.1264982 0.1271310 0.1285108 0.1303399 0.1325272
## 3 0.1304118 0.1306754 0.1320166 0.1338241 0.1360953
## 4 0.1300222 0.1304067 0.1320180 0.1340073 0.1362706
## 5 0.1286020 0.1287896 0.1300223 0.1320119 0.1344266
## 6 0.1282173 0.1282823 0.1296366 0.1317986 0.1340615
## 7 0.1261749 0.1267328 0.1282438 0.1298927 0.1317546
## 8 0.1264661 0.1269154 0.1282541 0.1299583 0.1320672
## 9 0.1270601 0.1275511 0.1289000 0.1306090 0.1329558
## 10 0.1268127 0.1274607 0.1287653 0.1306390 0.1331313
Standard Normal Variate
df_snv <- standardNormalVariate(df)
oil.snv <- cbind(oil.data[,1:3], df)
SNV spectral
x <- as.numeric(colnames(oil.snv))
y <- t(oil.snv[ , ])
par(mar=(c(5,7,4,2)))
matplot(x, y,col = colors, ylim = c(0,2),
type = 'l', lty = 1,
xlab = 'Wavelength (nm)', ylab = 'Absorbance',main = "SNV",
font.lab = 1,
cex.main = 1, cex.axis = 1, cex.lab=2)
legend("topleft",
title = "Country",
xpd = F,ncol = 1,
legend = group,
col = col,
fill = colors,
bty = "n",
cex = 1.5,
text.col = "black", text.font = 1,
horiz = F)
p3 <- recordPlot()
p3
The PCA plot of SNV Spectral
oil.snv.pca <- PCA(oil.snv[,4:563],scale.unit = TRUE,ncp = 5,graph = TRUE)
g3 <- fviz_pca_ind(oil.snv.pca, geom.ind = "point",
col.ind = oil.snv$Countries, # color by groups
palette = 'jco',
addEllipses = TRUE, ellipse.type = "convex",
legend.title = "Groups")+
ggtitle("Oil - geographical-SNV")
g3