Blog

Practice makes perfect.

Principal component analysis (Python version)

Zhijun / 2022-03-14


Principal component analysis(PCA)

Load packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
from sklearn.decomposition import PCA as sk_pca
from sklearn.preprocessing import StandardScaler
from scipy.signal import savgol_filter, general_gaussian
from sklearn.decomposition import PCA

Load data (The NIR spectral of oil samples)

oil_data.head()
##    Number  Group Countries  808.5395  ...  1881.3715  1883.301  1885.2305   1887.16
## 0       1      1    Greece  0.103753  ...   0.003744  0.003461   0.002774  0.002253
## 1       1      1    Greece  0.100083  ...   0.004165  0.002345   0.000598 -0.000700
## 2       2      1    Greece  0.098488  ...   0.003454  0.003874   0.003860  0.002053
## 3       2      1    Greece  0.097094  ...   0.004128  0.002877   0.001644  0.000653
## 4       3      1    Greece  0.098733  ...   0.002996  0.001952   0.001568  0.001747
## 
## [5 rows x 563 columns]
oil = oil_data.values[:,4:].astype(str).astype(float)

RAW spectralof oil samples

plt.figure(figsize=(8,6))
with plt.style.context(('ggplot')):
    
    plt.plot(wl, oil.T)
    plt.title('Raw-oil')
plt.show()

Defind some functions

MSC function

# https://nirpyresearch.com/two-scatter-correction-techniques-nir-spectroscopy-python/
def msc(input_data, reference=None):
    ''' Perform Multiplicative scatter correction'''
 
    # mean centre correction
    for i in range(input_data.shape[0]):
        input_data[i,:] -= input_data[i,:].mean()
 
    # Get the reference spectrum. If not given, estimate it from the mean    
    if reference is None:    
        # Calculate mean
        ref = np.mean(input_data, axis=0)
    else:
        ref = reference
 
    # Define a new array and populate it with the corrected data    
    data_msc = np.zeros_like(input_data)
    for i in range(input_data.shape[0]):
        # Run regression
        fit = np.polyfit(ref, input_data[i,:], 1, full=True)
        # Apply correction
        data_msc[i,:] = (input_data[i,:] - fit[0][1]) / fit[0][0] 
 
    return (data_msc, ref)

SNV function

def snv(input_data):
  
    # Define a new array and populate it with the corrected data  
    output_data = np.zeros_like(input_data)
    for i in range(input_data.shape[0]):
 
        # Apply correction
        output_data[i,:] = (input_data[i,:] - np.mean(input_data[i,:])) / np.std(input_data[i,:])
 
    return output_data

Data pre-processing

oil_msc = msc(oil)[0]
oil_snv = snv(oil)

w = 5
p = 2
oil_sg = savgol_filter(oil, w, polyorder = p, deriv=1)

The MSC-transformated NIR spectral of oil samples

plt.figure(figsize=(8,6))
with plt.style.context(('ggplot')):
    
    plt.plot(wl, oil_msc.T)
    plt.title('MSC-oil')
plt.show()

The SNV- transformated NIR spectral of oil samples

plt.figure(figsize=(8,6))
with plt.style.context(('ggplot')):
    
    plt.plot(wl, oil_snv.T)
    plt.title('SNV-oil')
plt.show()

The SG-processed NIR spectral of oil samples

plt.figure(figsize=(8,6))
with plt.style.context(('ggplot')):
    
    plt.plot(wl, oil_sg.T)
    plt.title('SG-oil')
plt.show()

PCA plots using different pre-treatments

PCA plots with raw data

pca_oil_raw = PCA(n_components=3)
principalComponents_oil = pca_oil_raw.fit_transform(oil)

principal_oil_Df = pd.DataFrame(data = principalComponents_oil, columns = ['principal component 1','principal component 2', 'principal component 3'])

principal_oil_Df.tail()
##      principal component 1  principal component 2  principal component 3
## 115              -0.296913               0.050949              -0.111565
## 116               0.118397              -0.395352               0.041215
## 117               0.133961              -0.378638               0.053863
## 118               0.009523              -0.204413               0.054615
## 119               0.032706              -0.263974               0.034556
print('Explained variation per principal component: {}'.format(pca_oil_raw.explained_variance_ratio_))
## Explained variation per principal component: [0.46126247 0.25206043 0.05789436]
plt.figure()
plt.figure(figsize=(8,6))
plt.xticks(fontsize=12)
## (array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]), [Text(0.0, 0, '0.0'), Text(0.2, 0, '0.2'), Text(0.4, 0, '0.4'), Text(0.6000000000000001, 0, '0.6'), Text(0.8, 0, '0.8'), Text(1.0, 0, '1.0')])
plt.yticks(fontsize=14)
## (array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]), [Text(0, 0.0, '0.0'), Text(0, 0.2, '0.2'), Text(0, 0.4, '0.4'), Text(0, 0.6000000000000001, '0.6'), Text(0, 0.8, '0.8'), Text(0, 1.0, '1.0')])
plt.xlabel('PC1 30.6%',fontsize=20)
plt.ylabel('PC2 21.1%',fontsize=20)
plt.title("Principal Component Analysis of raw oil",fontsize=26)

targets = ["Greece","Italy","Portugal", "Spain"]
colors = ["#CD0000", "#008B45", "#000000", "#FF1493"]
for target, color in zip(targets,colors):
    indicesToKeep = oil_data['Countries'] == target
    plt.scatter(principal_oil_Df.loc[indicesToKeep, 'principal component 1']
               , principal_oil_Df.loc[indicesToKeep, 'principal component 2'], c = color, s = 50)

plt.legend(targets,prop={'size': 15})
plt.tight_layout()
plt.show()

PCA plots with SNV data

pca_oil_snv = PCA(n_components=3)
principalComponents_oil_snv = pca_oil_snv.fit_transform(oil_snv)

principal_oil_Df = pd.DataFrame(data = principalComponents_oil_snv, columns = ['principal component 1','principal component 2', 'principal component 3'])

principal_oil_Df.tail()
##      principal component 1  principal component 2  principal component 3
## 115              -0.296916               0.050949              -0.111566
## 116               0.118399              -0.395355               0.041214
## 117               0.133963              -0.378641               0.053862
## 118               0.009524              -0.204415               0.054616
## 119               0.032707              -0.263976               0.034556
print('Explained variation per principal component: {}'.format(pca_oil_snv.explained_variance_ratio_))
## Explained variation per principal component: [0.46126252 0.25206018 0.05789437]
plt.figure()
plt.figure(figsize=(8,6))
plt.xticks(fontsize=12)
## (array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]), [Text(0.0, 0, '0.0'), Text(0.2, 0, '0.2'), Text(0.4, 0, '0.4'), Text(0.6000000000000001, 0, '0.6'), Text(0.8, 0, '0.8'), Text(1.0, 0, '1.0')])
plt.yticks(fontsize=14)
## (array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]), [Text(0, 0.0, '0.0'), Text(0, 0.2, '0.2'), Text(0, 0.4, '0.4'), Text(0, 0.6000000000000001, '0.6'), Text(0, 0.8, '0.8'), Text(0, 1.0, '1.0')])
plt.xlabel('PC1 46.1%',fontsize=20)
plt.ylabel('PC2 25.2%',fontsize=20)
plt.title("Principal Component Analysis of SNV oil",fontsize=26)

targets = ["Greece","Italy","Portugal", "Spain"]
colors = ["#CD0000", "#008B45", "#000000", "#FF1493"]
for target, color in zip(targets,colors):
    indicesToKeep = oil_data['Countries'] == target
    plt.scatter(principal_oil_Df.loc[indicesToKeep, 'principal component 1']
               , principal_oil_Df.loc[indicesToKeep, 'principal component 2'], c = color, s = 50)

plt.legend(targets,prop={'size': 15})
plt.tight_layout()
plt.show()

PCA plots with SG data

pca_oil = PCA(n_components=3)
principalComponents_oil_sg = pca_oil.fit_transform(oil_sg)

principal_oil_Df = pd.DataFrame(data = principalComponents_oil_sg, columns = ['principal component 1','principal component 2', 'principal component 3'])

principal_oil_Df.tail()
##      principal component 1  principal component 2  principal component 3
## 115              -0.069488              -0.045126               0.023853
## 116              -0.054010               0.119382              -0.013546
## 117              -0.048952               0.113395              -0.023043
## 118              -0.018831               0.045615              -0.044294
## 119              -0.028277               0.068972              -0.001258
print('Explained variation per principal component: {}'.format(pca_oil.explained_variance_ratio_))
## Explained variation per principal component: [0.30679875 0.21167453 0.08481524]
plt.figure()
plt.figure(figsize=(8,6))
plt.xticks(fontsize=12)
## (array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]), [Text(0.0, 0, '0.0'), Text(0.2, 0, '0.2'), Text(0.4, 0, '0.4'), Text(0.6000000000000001, 0, '0.6'), Text(0.8, 0, '0.8'), Text(1.0, 0, '1.0')])
plt.yticks(fontsize=14)
## (array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]), [Text(0, 0.0, '0.0'), Text(0, 0.2, '0.2'), Text(0, 0.4, '0.4'), Text(0, 0.6000000000000001, '0.6'), Text(0, 0.8, '0.8'), Text(0, 1.0, '1.0')])
plt.xlabel('PC1 30.6%',fontsize=20)
plt.ylabel('PC2 21.1%',fontsize=20)
plt.title("Principal Component Analysis of SG oil",fontsize=26)

targets = ["Greece","Italy","Portugal", "Spain"]
colors = ["#CD0000", "#008B45", "#000000", "#FF1493"]
for target, color in zip(targets,colors):
    indicesToKeep = oil_data['Countries'] == target
    plt.scatter(principal_oil_Df.loc[indicesToKeep, 'principal component 1']
               , principal_oil_Df.loc[indicesToKeep, 'principal component 2'], c = color, s = 50)

plt.legend(targets,prop={'size': 15})
plt.tight_layout()
plt.show()