About different data same box plot
TOC
- write a function for generating random group of data with (almost) same stats
As in my previous post I was fascinated by the "datasaurus" (sets of data having same basic stats and also same boxplot). You can have a look at this page to find all details. Since the plot also looks a lot like a kind of diffraction pattern I tried a simple idea I had in mind. What will happen if I generate distributions with as mean the median, q1,q3,min and max of a "reference" dataset and I merge everything together? If I keep the everything "symmetric enough" (I know this is a term that does not exists) cuttin in half the distibution at the min and max I probably will obtain another "equivalent" dataset.
library(data.table)
library(ggplot2)
library(datasauRus)
library(patchwork)
library(foreach)
summary(box_plots)
in_dat <- summary(box_plots$left)
median_r <- in_dat[3]
q1_r <- in_dat[2]
q3_r <- in_dat[5]
min_r <- in_dat[1]
max_r <- in_dat[6]
sd_r <- abs(max_r-min_r)/20
npoints=100
d_median <- rnorm(npoints,median_r,sd_r)
d_q1 <- rnorm(npoints,q1_r,sd_r)
d_q3 <- rnorm(npoints,q3_r,sd_r)
d_min <- rnorm(npoints,min_r,sd_r)
d_max <- rnorm(npoints,max_r,sd_r)
sim <- c(d_median,d_q1,d_q3,d_min,d_max)
lab_median <- rep("median",npoints)
lab_q1 <- rep("q1" ,npoints)
lab_q3 <- rep("q3" ,npoints)
lab_min <- rep("min" ,npoints)
lab_max <- rep("max" ,npoints)
sim_lab <- c(lab_median,lab_q1,lab_q3,lab_min,lab_max)
df_sim <- data.frame(sim,sim_lab)
df_sim_sel <- df_sim[df_sim$sim < max_r & df_sim$sim>min_r,]
options(repr.plot.width=12, repr.plot.height=8)
options(repr.plot.width=12, repr.plot.height=8)
psim <-ggplot(df_sim_sel, aes(x = 0, y = sim, color=sim_lab)) +
geom_jitter(alpha=0.5,size=3.5) +
theme_light(base_size=24) +
xlim(-1, 1)
bplot <- data.frame(box_plots)
p1 <-ggplot(bplot, aes(x = 0, y = left)) +
geom_jitter(alpha=0.05,size=3.5) +
theme_light(base_size=24) +
xlim(-1, 1)
#summary(df_sim_sel$sim)
p1 + psim
so the plot is almost there but the data are still not ok now we create a function, add points and create a bit of benchmarks and also plots.starting with the function
boxSim <- function(npoints,med = median_r,
q1 = q1_r,
q3 = q3_r,
mmin = min_r,
mmax = max_r,
sdr = c(2,2,2,2,2))
{
d_median <- rnorm(npoints,med,sdr[1])
d_q1 <- rnorm(npoints,q1,sdr[2])
d_q3 <- rnorm(npoints,q3,sdr[3])
d_min <- rnorm(npoints,mmin,sdr[4])
d_max <- rnorm(npoints,mmax,sdr[5])
sim <- c(d_median,d_q1,d_q3,d_min,d_max)
lab_median <- rep("median",npoints)
lab_q1 <- rep("q1" ,npoints)
lab_q3 <- rep("q3" ,npoints)
lab_min <- rep("min" ,npoints)
lab_max <- rep("max" ,npoints)
sim_lab <- c(lab_median,lab_q1,lab_q3,lab_min,lab_max)
df_sim <- data.frame(sim,sim_lab)
df_sim_sel <- df_sim[df_sim$sim < mmax & df_sim$sim>mmin,]
return(df_sim_sel)
}
now everything is more flexible since we can give each of the median
,q1
,q3
,min
,max
its own variance and create
even more combinations (keeping "everything symmetric")
median_r <- in_dat[3]
q1_r <- in_dat[2]
q3_r <- in_dat[5]
min_r <- in_dat[1]
max_r <- in_dat[6]
sd_r <- abs(max_r-min_r)/10
n_points = c(100,1000,50000)
t100 <- boxSim(100,in_dat[3],in_dat[2], in_dat[5],in_dat[1],in_dat[6],c(0.1,0.2,0.2,2,2))
t1k <- boxSim(1000,in_dat[3],in_dat[2], in_dat[5],in_dat[1],in_dat[6],c(0.1,0.2,0.2,2,2))
t10k <- boxSim(10000,in_dat[3],in_dat[2], in_dat[5],in_dat[1],in_dat[6],c(0.1,0.2,0.2,2,2))
t50k <- boxSim(50000,in_dat[3],in_dat[2], in_dat[5],in_dat[1],in_dat[6],c(0.1,0.2,0.2,2,2))
t100_2 <- boxSim(100,in_dat[3],in_dat[2], in_dat[5],in_dat[1],in_dat[6],c(0.2,0.2,0.2,3,3))
t1k_2 <- boxSim(1000,in_dat[3],in_dat[2], in_dat[5],in_dat[1],in_dat[6],c(0.2,0.2,0.2,3,3))
t10k_2 <- boxSim(10000,in_dat[3],in_dat[2], in_dat[5],in_dat[1],in_dat[6],c(0.1,0.2,0.2,2,2))
t50k_2 <- boxSim(50000,in_dat[3],in_dat[2], in_dat[5],in_dat[1],in_dat[6],c(0.2,0.2,0.2,3,3))
ptest <-ggplot(t100, aes(x = 1, y =sim, color=sim_lab)) +
geom_jitter(alpha=0.5) +
theme_void() +
xlim(-1, 3)
legend <- cowplot::get_legend(ptest)
datr <- data.table(box_plots$left)
colnames(datr) <- c("y")
pref <- ggplot(datr, aes(x = 1, y = y )) +
geom_jitter(alpha=0.5,size=3.5) +
theme_light(base_size=24)
pt100 <-ggplot(t100, aes(x = 1, y =sim, color=sim_lab)) +
geom_jitter(alpha=0.5,size=3.5) +
theme_light(base_size=24)
pt1k <-ggplot(t1k, aes(x = 1, y =sim, color=sim_lab)) +
geom_jitter(alpha=0.05,size=3.5) +
theme_light(base_size=24) +
theme(legend.position = "none")
pt10k <-ggplot(t10k, aes(x = 1, y =sim, color=sim_lab)) +
geom_jitter(alpha=0.05,size=3.5) +
theme_light(base_size=24) +
theme(legend.position = "none")
pt50k <-ggplot(t50k, aes(x = 1, y =sim, color=sim_lab)) +
geom_jitter(alpha=0.2,size=3.5) +
theme_light(base_size=24)
pt100_2 <-ggplot(t100_2, aes(x = 1, y =sim, color=sim_lab)) +
geom_jitter(alpha=0.5 ,size=3.5) +
theme_light(base_size=24)
pt1k_2 <-ggplot(t1k_2, aes(x = 1, y =sim, color=sim_lab)) +
geom_jitter(alpha=0.05,size=3.5) +
theme_light(base_size=24) +
theme(legend.position = "none")
pt10k_2 <-ggplot(t10k_2, aes(x = 1, y =sim, color=sim_lab)) +
geom_jitter(alpha=0.05,size=3.5) +
theme_light(base_size=24) +
theme(legend.position = "none")
pt5k_2 <-ggplot(t50k_2, aes(x = 1, y =sim, color=sim_lab)) +
geom_jitter(alpha=0.2) +
theme_light(base_size=24)
library(patchwork)
pt100 + pt1k + pt10k
pt100_2 + pt1k_2 + pt10k_2
pb1 <- pref + geom_boxplot(lwd=1, alpha=0.5)
pb2 <- ggplot(t100, aes(x = 1, y =sim)) + geom_boxplot(lwd=1) + geom_jitter(alpha=0.5,,size=3.5) + theme_light(base_size=24) +
theme(legend.position = "none")
pb3 <- ggplot(t1k, aes(x = 1, y =sim)) + geom_boxplot(lwd=1) + geom_jitter(alpha=0.05,,size=3.5) + theme_light(base_size=24) +
theme(legend.position = "none")
pb1+pb2+pb3