How to plot classification boundaries on a linear discrimination analysis graph in R
I used Linear Discriminant Analysis (LDA) to investigate how well a set of variables differentiates between 3 groups. Then I used a function plot.lda()
to plot my data along two linear discriminants (LD1 on the x-axis and LD2 on the y-axis). Now I would like to add classification boundaries from LDA to the plot. I don't see an argument in a function that allows this. The function partimat()
allows you to visualize the boundaries of the LD classification, but in this case the variables are used as the x and y axes, not linear discriminants. Any advice on how to add classification boundaries in plot.lda
would be greatly appreciated. Below is a sample code:
library(MASS)
# LDA
t.lda = lda(Group ~ Var1 + Var2, data=mydata,
na.action="na.omit", CV=TRUE)
# Scatter plot using the two discriminant dimensions
plot(t.lda,
panel = function(x, y, ...) { points(x, y, ...) },
col = c(4,2,3)[factor(mydata$Group)],
pch = c(17,19,15)[factor(mydata$Group)],
ylim=c(-3,3), xlim=c(-5,5))
Below are some examples of data (3 groups, 2 variables):
> dput(mydata)
structure(list(Group = c("a", "a", "a", "a", "a", "a", "a", "a",
"b", "b", "b", "b", "b", "b", "b", "b", "c", "c", "c", "c", "c",
"c", "c", "c"), Var1 = c(7.5, 6.9, 6.5, 7.3, 8.1, 8, 7.4, 7.8,
8.3, 8.7, 8.9, 9.3, 8.5, 9.6, 9.8, 9.7, 11.2, 10.9, 11.5, 12,
11, 11.6, 11.7, 11.3), Var2 = c(-6.5, -6.2, -6.7, -6.9, -7.1,
-8, -6.5, -6.3, -9.3, -9.5, -9.6, -9.1, -8.9, -8.7, -9.9, -10,
-6.7, -6.4, -6.8, -6.1, -7.1, -8, -6.9, -6.6)), .Names = c("Group",
"Var1", "Var2"), class = "data.frame", row.names = c(NA, -24L
))
> head(mydata)
Group Var1 Var2
1 a 7.5 -6.5
2 a 6.9 -6.2
3 a 6.5 -6.7
4 a 7.3 -6.9
5 a 8.1 -7.1
6 a 8.0 -8.0
EDIT: Following the answer from Roman, I tried to modify the code to plot the classification boundary on a linear discriminant scale (which is what I am trying to achieve), not on the scale of the original variables. However, the border doesn't sit where it should. Any advice on what I am doing wrong here would be much appreciated:
#create new data
np = 300
nd.x = seq(from = min(mydata$Var1), to = max(mydata$Var1), length.out = np)
nd.y = seq(from = min(mydata$Var2), to = max(mydata$Var2), length.out = np)
nd = expand.grid(Var1 = nd.x, Var2 = nd.y)
#run lda and predict using new data
new.lda = lda(Group ~ Var1 + Var2, data=mydata)
prd = as.numeric(predict(new.lda, newdata = nd)$class)
#create LD sequences from min - max values
p = predict(new.lda, newdata= nd)
p.x = seq(from = min(p$x[,1]), to = max(p$x[,1]), length.out = np) #LD1 scores
p.y = seq(from = min(p$x[,2]), to = max(p$x[,2]), length.out = np) #LD2 scores
#create original plot
quartz()
plot(t.lda, panel = function(x, y, ...) { points(x, y, ...) },
col = c(4,2,3)[factor(mydata$Group)],
pch = c(17,19,15)[factor(mydata$Group)],
ylim=c(-3,3), xlim=c(-5,5))
#add classification border on scale of linear discriminants (NOTE: this step currently doesn't work)
contour(x = p.x, y = p.y, z = matrix(prd, nrow = np, ncol = np),
levels = c(1, 2, 3), add = TRUE, drawlabels = FALSE)
source to share
I've adapted my code to follow the example here .
require(MASS)
# generate data
set.seed(357)
Ng <- 100 # number of cases per group
group.a.x <- rnorm(n = Ng, mean = 2, sd = 3)
group.a.y <- rnorm(n = Ng, mean = 2, sd = 3)
group.b.x <- rnorm(n = Ng, mean = 11, sd = 3)
group.b.y <- rnorm(n = Ng, mean = 11, sd = 3)
group.a <- data.frame(x = group.a.x, y = group.a.y, group = "A")
group.b <- data.frame(x = group.b.x, y = group.b.y, group = "B")
my.xy <- rbind(group.a, group.b)
# construct the model
mdl <- lda(group ~ x + y, data = my.xy)
# draw discrimination line
np <- 300
nd.x <- seq(from = min(my.xy$x), to = max(my.xy$x), length.out = np)
nd.y <- seq(from = min(my.xy$y), to = max(my.xy$y), length.out = np)
nd <- expand.grid(x = nd.x, y = nd.y)
prd <- as.numeric(predict(mdl, newdata = nd)$class)
plot(my.xy[, 1:2], col = my.xy$group)
points(mdl$means, pch = "+", cex = 3, col = c("black", "red"))
contour(x = nd.x, y = nd.y, z = matrix(prd, nrow = np, ncol = np),
levels = c(1, 2), add = TRUE, drawlabels = FALSE)
EDIT
If i try
library(MASS)
mydata <- structure(list(Group = c("a", "a", "a", "a", "a", "a", "a", "a",
"b", "b", "b", "b", "b", "b", "b", "b", "c", "c", "c", "c", "c",
"c", "c", "c"), Var1 = c(7.5, 6.9, 6.5, 7.3, 8.1, 8, 7.4, 7.8,
8.3, 8.7, 8.9, 9.3, 8.5, 9.6, 9.8, 9.7, 11.2, 10.9, 11.5, 12,
11, 11.6, 11.7, 11.3), Var2 = c(-6.5, -6.2, -6.7, -6.9, -7.1,
-8, -6.5, -6.3, -9.3, -9.5, -9.6, -9.1, -8.9, -8.7, -9.9, -10,
-6.7, -6.4, -6.8, -6.1, -7.1, -8, -6.9, -6.6)), .Names = c("Group",
"Var1", "Var2"), class = "data.frame", row.names = c(NA, -24L
))
np <- 300
nd.x = seq(from = min(mydata$Var1), to = max(mydata$Var1), length.out = np)
nd.y = seq(from = min(mydata$Var2), to = max(mydata$Var2), length.out = np)
nd = expand.grid(Var1 = nd.x, Var2 = nd.y)
#run lda and predict using new data
new.lda = lda(Group ~ Var1 + Var2, data=mydata)
prd = as.numeric(predict(new.lda, newdata = nd)$class)
#create LD sequences from min - max values
p = predict(new.lda, newdata= nd)
p.x = seq(from = min(p$x[,1]), to = max(p$x[,1]), length.out = np) #LD1 scores
p.y = seq(from = min(p$x[,2]), to = max(p$x[,2]), length.out = np) #LD2 scores
# notice I don't use t.lda for first variable
plot(new.lda, panel = function(x, y, ...) { points(x, y, ...) },
col = c(4,2,3)[factor(mydata$Group)],
pch = c(17,19,15)[factor(mydata$Group)],
ylim=c(-3,3), xlim=c(-5,5))
contour(x = p.x, y = p.y, z = matrix(prd, nrow = np, ncol = np),
levels = c(1, 2, 3), add = TRUE, drawlabels = FALSE)
I get
source to share
Here's an approach too ggplot2
:
library(MASS)
library(ggplot2)
fit <- lda(Species ~ ., data = iris, prior = rep(1, 3)/3)
datPred <- data.frame(Species=predict(fit)$class,predict(fit)$x)
#Create decision boundaries
fit2 <- lda(Species ~ LD1 + LD2, data=datPred, prior = rep(1, 3)/3)
ld1lim <- expand_range(c(min(datPred$LD1),max(datPred$LD1)),mul=0.05)
ld2lim <- expand_range(c(min(datPred$LD2),max(datPred$LD2)),mul=0.05)
ld1 <- seq(ld1lim[[1]], ld1lim[[2]], length.out=300)
ld2 <- seq(ld2lim[[1]], ld1lim[[2]], length.out=300)
newdat <- expand.grid(list(LD1=ld1,LD2=ld2))
preds <-predict(fit2,newdata=newdat)
predclass <- preds$class
postprob <- preds$posterior
df <- data.frame(x=newdat$LD1, y=newdat$LD2, class=predclass)
df$classnum <- as.numeric(df$class)
df <- cbind(df,postprob)
head(df)
x y class classnum setosa versicolor virginica
1 -10.122541 -2.91246 virginica 3 5.417906e-66 1.805470e-10 1
2 -10.052563 -2.91246 virginica 3 1.428691e-65 2.418658e-10 1
3 -9.982585 -2.91246 virginica 3 3.767428e-65 3.240102e-10 1
4 -9.912606 -2.91246 virginica 3 9.934630e-65 4.340531e-10 1
5 -9.842628 -2.91246 virginica 3 2.619741e-64 5.814697e-10 1
6 -9.772650 -2.91246 virginica 3 6.908204e-64 7.789531e-10 1
colorfun <- function(n,l=65,c=100) { hues = seq(15, 375, length=n+1); hcl(h=hues, l=l, c=c)[1:n] } # default ggplot2 colours
colors <- colorfun(3)
colorslight <- colorfun(3,l=90,c=50)
ggplot(datPred, aes(x=LD1, y=LD2) ) +
geom_raster(data=df, aes(x=x, y=y, fill = factor(class)),alpha=0.7,show_guide=FALSE) +
geom_contour(data=df, aes(x=x, y=y, z=classnum), colour="red2", alpha=0.5, breaks=c(1.5,2.5)) +
geom_point(data = datPred, size = 3, aes(pch = Species, colour=Species)) +
scale_x_continuous(limits = ld1lim, expand=c(0,0)) +
scale_y_continuous(limits = ld2lim, expand=c(0,0)) +
scale_fill_manual(values=colorslight,guide=F)
(not entirely sure if this approach for displaying classification boundaries using contours / breaks at points 1.5 and 2.5 is always correct - it is correct for the boundary between views 1 and 2 and views 2 and 3, but not if the area of views 1 is next to view 3, since then I would get two boundaries - perhaps I would have to use the approach used here , where each boundary between each species pair is counted separately)
source to share