Correlation parameters in the literature (July 1, 2014)

Before generating a figure showing the different correlation values used in the literature, we first want to study the symmetry of the problem: does varying only the row correlation differ from varying the column correlation?

Symmetry of correlation coefficients

n <- 100
m <- n/10
mu <- 10
sigma <- 2

rdist <- function(n, mean, sd) {
    a <- mean - sqrt(12)/2 * sd  #lower
    b <- mean + sqrt(12)/2 * sd  #upper
    return(runif(n, a, b))
}

best_cmax <- function(costs) {
    min(EFT(costs), LPT_unrelated(costs), LPT_unrelated(costs, min), balance_sufferage(costs), 
        balance_EFT(costs))
}
LPT_unrelated_diff_correlation <- NULL
for (i in 1:1000) {
    corr <- runif(1)
    Z1 <- generate_matrix_corr(n, m, rdist, mu, sigma, corr, 0)
    cmax1 <- LPT_unrelated(Z1)
    best1 <- best_cmax(Z1)
    LPT_unrelated_diff_correlation <- rbind.data.frame(LPT_unrelated_diff_correlation, 
        data.frame(corr = corr, type = "row", ratio = cmax1/best1))
    Z2 <- generate_matrix_corr(n, m, rdist, mu, sigma, 0, corr)
    cmax2 <- LPT_unrelated(Z2)
    best2 <- best_cmax(Z2)
    LPT_unrelated_diff_correlation <- rbind.data.frame(LPT_unrelated_diff_correlation, 
        data.frame(corr = corr, type = "column", ratio = cmax2/best2))
}
library(ggplot2)
p <- ggplot(data = LPT_unrelated_diff_correlation, aes(x = corr, y = ratio, 
    group = type, color = type))
p <- p + geom_point() + geom_smooth()
p
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

plot of chunk unnamed-chunk-1

The curve where the row correlation is zero is really similar to the one obtained with Siegel's method (for which the row correlation is also zero).

Correlation coefficients used in the literature

Before generating a figure with the current partial data, we might be interesting by the properties of so-called semi-consistent and consistent matrices.

Intuitively, sorting each row should lead to a correlation close to 1 between rows without changing the correlation between columns.

Let's generate some consistent data.

n <- 100
m <- 100
Siegel_consistent_corr <- NULL
corr <- 0.5
Z <- generate_consistent_matrix_siegel_corr(n, m, rdist, mu, sigma, corr)
summary(as.vector(sapply(1:n, function(i) {
    sapply(1:m, function(j) {
        cor.test(Z[i, ], Z[j, ])$estimate
    })
})))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.967   0.990   0.993   0.992   0.996   1.000
summary(as.vector(sapply(1:n, function(i) {
    sapply(1:m, function(j) {
        cor.test(Z[, i], Z[, j])$estimate
    })
})))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.980   0.985   0.989   0.989   0.993   1.000

Both correlation coefficients are close to 1.

Now let's plot the correlations tested in the literature.

nb_point <- 100 + 1  # odd because of symmetry
Rtasks <- exp(seq(log(1), log(1e+06), length.out = nb_point))
Rmachs <- exp(seq(log(1), log(10000), length.out = nb_point))

correlation_literature <- data.frame()
for (Rtask in Rtasks) for (Rmach in Rmachs) {
    muR <- Rtask/2 + 0.5
    muC <- Rmach/2 + 0.5
    sigmaR <- sqrt((Rtask - 1)^2/12)
    sigmaC <- sqrt((Rmach - 1)^2/12)
    corr <- muR^2 * sigmaC^2/(sigmaC^2 * sigmaR^2 + sigmaC^2 * muR^2 + muC^2 * 
        sigmaR^2)
    correlation_literature <- rbind(correlation_literature, data.frame(Rtask = Rtask, 
        Rmach = Rmach, corr = corr))
}
library(ggplot2)
p <- ggplot(data = correlation_literature, aes(x = Rtask, y = Rmach, fill = corr, 
    z = corr))
p <- p + geom_tile()
p <- p + scale_x_log10()
p <- p + scale_y_log10()
p <- p + stat_contour(breaks = seq(0, 1, 0.1))
p <- p + annotation_logticks()
p <- p + theme_bw()
p <- p + theme(legend.position = "bottom")
p
## Warning: Removed 1 rows containing non-finite values (stat_contour).

plot of chunk unnamed-chunk-2

Let's add four settings used in two publications:

literature <- NULL
literature <- rbind(literature, data.frame(Rtask = 1000, Rmach = 10, paper = "siegel1999a"))
literature <- rbind(literature, data.frame(Rtask = 1000, Rmach = 100, paper = "siegel1999a"))
literature <- rbind(literature, data.frame(Rtask = 3000, Rmach = 10, paper = "siegel1999a"))
literature <- rbind(literature, data.frame(Rtask = 3000, Rmach = 100, paper = "siegel1999a"))
literature <- rbind(literature, data.frame(Rtask = 100, Rmach = 10, paper = "siegel1999c"))
literature <- rbind(literature, data.frame(Rtask = 100, Rmach = 1000, paper = "siegel1999c"))
literature <- rbind(literature, data.frame(Rtask = 3000, Rmach = 10, paper = "siegel1999a"))
literature <- rbind(literature, data.frame(Rtask = 3000, Rmach = 1000, paper = "siegel1999a"))
library(scales)  # for alpha
p + geom_point(data = literature, colour = alpha("black", 1/3))
## Warning: Removed 1 rows containing non-finite values (stat_contour).

plot of chunk unnamed-chunk-3

p + geom_jitter(data = literature)
## Warning: Removed 1 rows containing non-finite values (stat_contour).

plot of chunk unnamed-chunk-3

p + stat_bin2d(data = literature, aes(fill = ..density..))
## Warning: Removed 1 rows containing non-finite values (stat_contour).

plot of chunk unnamed-chunk-3

p + stat_binhex(data = literature, aes(fill = ..density..))
## Warning: Removed 1 rows containing non-finite values (stat_contour).

plot of chunk unnamed-chunk-3

p + stat_density2d(data = literature, geom = "point", aes(alpha = ..density.., 
    size = ..density..), contour = "FALSE") + scale_size_continuous(range = c(1, 
    2))
## Warning: Removed 1 rows containing non-finite values (stat_contour).

plot of chunk unnamed-chunk-3

The first representation may be the most precise (more than the jitter one). The next two are equivalent and could be helpful if there is too much data. The last one is misleading but may be better with more points.