R --- LDA and QDA

Classification

This post shows the R code for LDA and QDA by using funtion lda() and qda() in package MASS. To show how to use these function, I created a function, bvn(), to generate bivariate normal dataset based on the assumptions and then used lda() and qda() on the generated datasets.

Details

Resources for Package ‘MASS’
- CRAN - Package ‘MASS’
- Package ‘MASS’ - Reference manual

Example Code
- LDA :
  Suppose our dataset are from
  \[X_{green}=N(\begin{pmatrix} 0.5\\ -0.5 \end{pmatrix}, \Sigma) \quad \text{and} \quad X_{red}=N(\begin{pmatrix} -2\\ 0.7 \end{pmatrix}, \Sigma), \\[5pt] \text{where} \quad \Sigma=\begin{pmatrix} 1& 0.5\\ 0.5&1 \end{pmatrix}.\]
- QDA :
  Suppose our dataset are from
  \[X_{green}=N(\begin{pmatrix} -2\\ 0.7 \end{pmatrix}, \Sigma_1) \quad \text{and} \quad X_{red}=N(\begin{pmatrix} 0.5\\ -0.5 \end{pmatrix}, \Sigma_2), \\[5pt] \text{where} \quad \Sigma_1=\begin{pmatrix} 1& 0.5\\ 0.5&1 \end{pmatrix} \quad \text{and} \quad \Sigma_2=\begin{pmatrix} 0.8& -0.7\\ -0.7& 0.8 \end{pmatrix}.\]

if (!require("MASS")) install.packages("MASS")
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("gridExtra")) install.packages("gridExtra")

library(MASS)
library(ggplot2)
library(grid)
library(gridExtra)

########################################
#### Generate Bivariate Normal Data ####
########################################
set.seed(12345)
bvn=function(m1,m2,sigma1,sigma2,n1,n2){
      d1 <- mvrnorm(n1, mu = m1, Sigma = sigma1 )
      d2 <- mvrnorm(n2, mu = m2, Sigma = sigma2 )
      d=rbind(d1,d2)
      colnames(d)=c("X1","X2")
      d=data.frame(d)
      d$group=c(rep("Red",n1),rep("Green",n2))
      list(data = d)
}

m_1 <- c(0.5, -0.5) # mean of the first group
m_2 <- c(-2, 0.7)  # mean of the second group
sigma_1 <- matrix(c(1,0.5,0.5,1), nrow=2) # covariance matrix
sigma_2 <- matrix(c(0.8,-0.7,-0.7,0.8), nrow=2)

# The training dataset
data_lda = bvn(m_1,m_2,sigma_1,sigma_1,n1=1500,n2=2000)$data
data_qda = bvn(m_1,m_2,sigma_1,sigma_2,n1=1500,n2=2000)$data
head(data_lda, 3)

##          X1          X2 group
## 1 1.0943941 -0.08022845   Red
## 2 1.4497239 -0.22089273   Red
## 3 0.1516277 -0.34094657   Red

###############################
#### Generate Test Dataset ####
###############################
test_lda = bvn(m_1,m_2,sigma_1,sigma_1,n1=100,n2=100)$data
test_qda = bvn(m_1,m_2,sigma_1,sigma_2,n1=100,n2=100)$data

###############################################
#### Scatter Plot of the Generated Dataset ####
###############################################
p1=ggplot(data_lda, aes(x=X1, y=X2)) +
    geom_point(aes(colour = group)) +
    ggtitle("Dataset for LDA")
p2=ggplot(data_qda, aes(x=X1, y=X2)) +
    geom_point(aes(colour = group)) +
    ggtitle("Dataset for QDA")
grid.arrange(p1, p2, ncol = 2)

#############
#### LDA ####
#############
m_lda=lda(group~X1+X2, data=data_lda)
m_lda

## Call:
## lda(group ~ X1 + X2, data = data_lda)
## 
## Prior probabilities of groups:
##     Green       Red 
## 0.5714286 0.4285714 
## 
## Group means:
##               X1         X2
## Green -1.9873353  0.6966975
## Red    0.5254939 -0.4972749
## 
## Coefficients of linear discriminants:
##           LD1
## X1  1.1111290
## X2 -0.8491722

# The Confusion Matrix
m_lda.pred=predict(m_lda,test_lda)
table(true=test_lda$group, pred=m_lda.pred$class)

##        pred
## true    Green Red
##   Green    97   3
##   Red       7  93

#############
#### QDA ####
#############
m_qda=qda(group~X1+X2, data=data_qda)
m_qda

## Call:
## qda(group ~ X1 + X2, data = data_qda)
## 
## Prior probabilities of groups:
##     Green       Red 
## 0.5714286 0.4285714 
## 
## Group means:
##               X1         X2
## Green -1.9913743  0.6829456
## Red    0.4691391 -0.4981505

# The Confusion Matrix
m_qda.pred=predict(m_qda,test_qda)
table(true=test_qda$group, pred=m_qda.pred$class)

##        pred
## true    Green Red
##   Green    98   2
##   Red       4  96