• write a function for generating data
  • use melt for rearranging data
  • create a base plot
  • add a boxplot to the base plot
  • add a jiiter plot to the base plot
  • create a figure for explaining the box plot (the fun part of this post)
  • create another dataset
  • add boxplot + jiiter plot
  • what's happening?
#  loading required libraries for this notebook

#loading libraries

library(rstatix, warn.conflicts = FALSE)
#	creating a function for generating a dataset

# function for generating data with custom number of rows, means and sds

simpleDataset <- function(number_of_rows,means,sds)
l <- length(means)
res <- lapply(seq(1:l),function(x) 
dat <- data.frame((sapply(res,c)))
id <- rownames(dat)
dat <-  cbind(id=id,dat)
dt <- data.table(dat)

dat1 <- simpleDataset(number_of_rows=100,
outliers <- simpleDataset(number_of_rows=5,
dato     <-rbind(dat1,outliers) 
dt.melt <- melt(dat1, id.vars="id")
colnames(dt.melt) <- c("id","category","var1")
dt.melt$ncat <- as.numeric(dt.melt$category)

#	Jiitter plots + boxplot + brackets 

#setting up dimensions
options(repr.plot.width=8.9, repr.plot.height=8.9,units="cm")

#adding jiitter plot
p <-  ggplot(dt.melt,aes(x=factor(ncat),y=var1))        +
      geom_jitter(position = position_jitter(0.15),alpha=0.5,size = 3) +
      geom_boxplot(alpha = 0,lwd=0.2) + theme_minimal(base_size =24)

p +theme_light(base_size=24)

So for now everything on track. We created a dataset using a custom function. 10 variables with 100 points each and them we plot them using scatter plots. Before plotting a few more data we need to answer the question How are boxplot constructed? (warning: shameless self-promotion ahead) First of all you can check on my book/ebook Now it starts the fun part we will recreate a plot on the anatomy of a boxplot (see here) using ggplot.

y   <- c(60,63,105,155,rnorm(100,80,25))
box <-  ggplot()                                               +
        theme_void()                                           +
        geom_boxplot(aes(x=0,y=y),width=1,notch = FALSE,lwd=2) +  
        theme(legend.position = "none")                        +
#how can we  get out data? using the function ggplot_build()
#need to change it to a data.frame and rename cols

box_data <- (ggplot_build(box)$data)[[1]]
bdata <- data.frame(t(box_data[c(1,2,3,4,5,14)]))
colnames(bdata) <- c("y")
#we need to transpose the data and convert them to a data frame
#now we extract the ourliers 
outl  <- data.frame(box_data$outliers)
colnames(outl)  <- c("outl")

#now that I got the data I plot everything with labels
p2 <- box + geom_text (data=bdata,aes(
                      label = c("min (no outliers)","Lower Q","Median","Upper Q","max","outliers")),size=8)    +
                      geom_segment(data=bdata, aes(x = 0.8, y = y, xend = 0.6, yend =y),lwd=1)  

#since we have created the dataset WITH outliers we include labels also for them
#if your dataset has no outliers you need to commet this part out
p2 + geom_text_repel(data=outl,aes(x=0.1, y=outl,label=format(round(outl, 2), nsmall = 2)),size=6)
A data.frame: 1 × 26
ymin lower middle upper ymax outliers notchupper notchlower x flipped_aes ... xid newx new_width weight colour fill size alpha shape linetype
<dbl> <dbl> <dbl> <dbl> <dbl> <list> <dbl> <dbl> <dbl> <lgl> ... <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <lgl> <dbl> <chr>
27.14312 61.85538 74.33114 93.25041 127.1763 155 79.19523 69.46705 0 FALSE ... 1 0 1 1 grey20 white 2 NA 19 solid

notes on the code: we create our variable y with rnorm and we add a few outliers by hand then we create the boxplot with an empty theme using theme_void(). The funny part start when we ask ggplot to show how the plot was built with the ggplot_build. We then need to rotate (t) the selected columns c(1,2,3,4,5,14) ,convert the results into a data.frame, rename the columns (colnames) and then use them (our y) to add labvels to our plot using geom_text

So the bound of the box refers to upper and lower quartile. The lower quartile splits off the lowest 25% of the data (also called 25% percentile) while the third quartile splits off the highest 25% of data from the lowest 75% (75% percentile). But what is a quartile? when a set of n measurements of the variable x has been arranged in order of magnitude the pth percentile is the value of x greater than p of the measurements. The 25th and 75th percentile are called the lower and upper quartiles and the 50th percentile is the median of the dataset. the IQR interquartile range (IQR) for a set of measurement is the difference between the upper and lower quartile

Another representation of boxplot can also include notch. the default is not to visualuize them but just adding notch=true to the previous plot we will do the trick

boxnotch <-  ggplot()                                               +
             theme_void()                                           +
             geom_boxplot(aes(x=0,y=y),width=1,notch = TRUE,lwd=2)  +  
             theme(legend.position = "none")                        +
notchdata <- data.frame(t(box_data[c(7,8)]))
colnames(notchdata) <- c("y_notch")
#we need to transpose the data and convert them to a data frame

#now that I got the data I plot everything with labels
p3 <- boxnotch +  geom_segment(data=notchdata, aes(x = 0.8, 
                                                   y = mean(y_notch),
                                                   xend = 0.6, yend = y_notch
p4 <- p3 + annotate(geom="text", x=2.5, y= mean(notchdata$y_notch),
                    label="notch (95% confidence\ninterval of median)",size=10)

so having a look at the the page we can see that the following case can happen. we will load the dataset from the datasauRus package

What's happening?

options(repr.plot.width=8.9, repr.plot.height=8.9,units="cm")

p1 <-ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_jitter(alpha=0.05)                                 +

p2 <- ggplot(stack(box_plots), aes(x = ind, y = values))+
geom_boxplot(lwd=1) +
      left              lines               normal          right       
 Min.   :-9.76964   Min.   :-9.769575   Min.   :-9.76   Min.   :-9.760  
 1st Qu.:-2.68999   1st Qu.:-2.689993   1st Qu.:-2.68   1st Qu.:-2.680  
 Median :-0.00999   Median :-0.007132   Median : 0.00   Median : 0.000  
 Mean   :-1.17780   Mean   :-0.831733   Mean   : 0.00   Mean   : 1.174  
 3rd Qu.: 2.67007   3rd Qu.: 2.670236   3rd Qu.: 2.68   3rd Qu.: 2.680  
 Max.   : 9.75025   Max.   : 9.756001   Max.   : 9.76   Max.   : 9.760  
 Min.   :-9.769886  
 1st Qu.:-2.689989  
 Median :-0.003099  
 Mean   :-0.003060  
 3rd Qu.: 2.680000  
 Max.   : 9.760000  


We can see that plotting the raw points even for hundreds of points works and represent well our data. In this case adding notch does not solve the problem. Other kind of plot get not fooled by our data as it can be seen in the following figure

options(repr.plot.width=12, repr.plot.height=12)
pnotch <- ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_boxplot(notch=TRUE,lwd=1)  +  ggtitle("(notch=TRUE)") +
pjitter <-ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_jitter(alpha=0.05)	 +  ggtitle("geom_jitter") +								   

pviolin <- ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_violin(lwd=1)  +  ggtitle("geom_violin") +

pnotch + pjitter + pviolin

Other packages

  1. beeswarm plot ggbeeswarm [] (and here the things start getting artistic too!) (note: not all representation for this dataset work due to the number of points)

p_qrandom0 <- ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_quasirandom(alpha=0.05)  +  ggtitle("quasi_random") +


p_qrandom1 <- ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_quasirandom(alpha=0.05,method = "tukey")  +  ggtitle("Tukey") +


p_qrandom2 <- ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_quasirandom(alpha=0.05,method = "tukeyDense")   +  ggtitle("Tukey + density") +


p_qrandom3 <- ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_quasirandom(alpha=0.05,method = "tukeyDense")   +  ggtitle("Banded frowns") +


p_qrandom4 <- ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_quasirandom(alpha=0.05,method = "frowney")   +  ggtitle("Banded smiles") +


#too many points
#p_beeswarm <- ggplot(stack(box_plots), aes(x = ind, y = values)) +
#geom_beeswarm(alpha=0.05) +  ggtitle("beeswarm") +

p_qrandom0 + p_qrandom1+p_qrandom2

you can halso mix plot a useful package for that is gghalves that you can find here


point_half <- ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_half_point(alpha=0.05) +theme_void(base_size=20)

violin_half <- ggplot(stack(box_plots), aes(x = ind, y = values)) +
geom_half_violin() +theme_void(base_size=20)

point_half + violin_half

finally a very useful package, also my favorite one for EDA ggstatplotthat you can find here that calculate also a lot of useful stats and combine different kind of plot in one plot


stackbox <- stack(box_plots)

pstack  <- ggbetweenstats(
  data = stackbox,
  x = ind,
  y = values,
pstack +  theme(text = element_text(size = 22),
          plot.subtitle = element_text(size = 20),
          legend.title = element_text(size = 22),
          legend.text = element_text(size = 22))