Summarizing range-based and CVB methods with correlation (March 26, 2015)

Let's import the data and compute the correlations (we have closed-form formulas for several cases and we will rely on estimation for the other cases).

literature_range <- read.table("literature_range.txt", na.strings = "", col.names = c("task_range", 
    "mach_range", "task_hetero", "mach_hetero", "row_consis", "col_consis"))
literature_CV <- read.table("literature_CV.txt", na.strings = "", col.names = c("task_range", 
    "mach_range", "task_hetero", "mach_hetero", "row_consis", "col_consis", 
    "mean"))

Correlation computation

Let's compute the values for the range-based method.

result <- NULL
for (i in 1:nrow(literature_range)) {
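    # Convert factor entries to their numeric values (warnings from
    # non-numeric levels are suppressed); numeric entries are kept as-is.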
    suppressWarnings(param <- sapply(literature_range[i, ], function(x) if (is.factor(x[[1]])) 
        as.numeric(levels(x[[1]]))[x[[1]]] else x))
    Rtask <- param["task_range"]
    Rmach <- param["mach_range"]
    if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
        a <- 0
        b <- 0
    } else {
        a <- param["row_consis"]
        b <- param["col_consis"]
    }
    mean_cor_row_v <- a^2 * b
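    # Closed-form expressions for the expected column correlation in the
    # special cases handled below (range-based model).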
    cor_sorted <- (1 + Rmach)^2/(1/3 * (Rmach - 1)^2 + (1 + Rmach)^2 + (Rmach - 
        1)^2 * (1 + Rtask)^2/(Rtask - 1)^2)
    cor_mix <- (1 + Rmach)/sqrt(1/3 * (Rmach - 1)^2 + (1 + Rmach)^2 + (Rmach - 
        1)^2 * (1 + Rtask)^2/(Rtask - 1)^2)
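    # Use the closed formulas when possible; otherwise, estimate the
    # column correlation on a generated 300x100 instance.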
    if (a == 0) {
        mean_cor_col_v <- cor_sorted
    } else if (a == 1) {
        mean_cor_col_v <- b^2 + 2 * b * (1 - b) * cor_mix + (1 - b)^2 * cor_sorted
    } else {
        mat <- generate_matrix_siegel_range(300, 100, Rtask, Rmach, a, b)
        mean_cor_col_v <- mean_cor_col(mat)
    }
    result <- rbind(result, data.frame(mean_cor_row = mean_cor_row_v, mean_cor_col = mean_cor_col_v))
}
literature_range <- cbind(literature_range, result)

Let's perform the same computation for the CV-based method.

result <- NULL
for (i in 1:nrow(literature_CV)) {
    suppressWarnings(param <- sapply(literature_CV[i, ], function(x) if (is.factor(x[[1]])) 
        as.numeric(levels(x[[1]]))[x[[1]]] else x))
    Vtask <- param["task_range"]
    Vmach <- param["mach_range"]
    if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
        a <- 0
        b <- 0
    } else {
        a <- param["row_consis"]
        b <- param["col_consis"]
    }
    mean_cor_row_v <- a^2 * b
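    # Same special cases as for the range-based method, with the CV-based
    # closed-form expressions.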
    cor_sorted <- 1/(Vmach^2 + Vmach^2/Vtask^2 + 1)
    cor_mix <- 1/sqrt(Vmach^2 + Vmach^2/Vtask^2 + 1)
    if (a == 0) {
        mean_cor_col_v <- cor_sorted
    } else if (a == 1) {
        mean_cor_col_v <- b^2 + 2 * b * (1 - b) * cor_mix + (1 - b)^2 * cor_sorted
    } else {
        mat <- generate_matrix_siegel_CV(300, 100, param["mean"], param["mean"], 
            Vtask, Vmach, a, b)
        mean_cor_col_v <- mean_cor_col(mat)
    }
    result <- rbind(result, data.frame(mean_cor_row = mean_cor_row_v, mean_cor_col = mean_cor_col_v))
}
literature_CV <- cbind(literature_CV, result)

Figures

Let's plot the summaries for both methods.

Range-based method

library(ggplot2)
p <- ggplot(data = literature_range, aes(x = mean_cor_row, y = mean_cor_col, 
    shape = interaction(task_hetero, mach_hetero), color = interaction(task_hetero, 
        mach_hetero), fill = interaction(task_hetero, mach_hetero)))
p <- p + geom_point(size = 4, alpha = 1/2)
p <- p + scale_x_continuous(name = expression(rho[task]), breaks = seq(0, 2, 
    0.2), limits = extendrange(c(0, 1)))
p <- p + scale_y_continuous(name = expression(rho[mach]), breaks = seq(0, 2, 
    0.2), limits = extendrange(c(0, 1)))
breaks <- c("high.high", "low.high", "high.low", "low.low", "NA.NA")
labels <- c("hihi", "lohi", "hilo", "lolo", "NA")
p <- p + scale_color_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + scale_shape_manual(name = "Heterogeneity", values = c(21, 24, 25, 23, 
    4), breaks = breaks, labels = labels)
p <- p + scale_fill_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + theme(legend.position = "bottom")
p

[Figure (chunk CV-mean-range-properties-plot): mean row correlation (rho[task]) versus mean column correlation (rho[mach]) for the range-based instances]

CVB method

library(ggplot2)
p <- ggplot(data = literature_CV, aes(x = mean_cor_row, y = mean_cor_col, shape = interaction(task_hetero, 
    mach_hetero), color = interaction(task_hetero, mach_hetero), fill = interaction(task_hetero, 
    mach_hetero)))
p <- p + geom_point(size = 4, alpha = 1/2)
p <- p + scale_x_continuous(name = expression(rho[task]), breaks = seq(0, 2, 
    0.2), limits = extendrange(c(0, 1)))
p <- p + scale_y_continuous(name = expression(rho[mach]), breaks = seq(0, 2, 
    0.2), limits = extendrange(c(0, 1)))
breaks <- c("high.high", "low.high", "high.low", "low.low", "med.med", "NA.NA")
labels <- c("hihi", "lohi", "hilo", "lolo", "medmed", "NA")
p <- p + scale_color_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + scale_shape_manual(name = "Heterogeneity", values = c(21, 24, 25, 23, 
    22, 4), breaks = breaks, labels = labels)
p <- p + scale_fill_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + theme(legend.position = "bottom")
p

[Figure (chunk CV-mean-CV-properties-plot): mean row correlation (rho[task]) versus mean column correlation (rho[mach]) for the CV-based instances]

Validation

Let's test each case empirically:

result <- NULL
for (i in 1:nrow(literature_range)) {
    suppressWarnings(param <- sapply(literature_range[i, ], function(x) if (is.factor(x[[1]])) 
        as.numeric(levels(x[[1]]))[x[[1]]] else x))
    if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
        param["row_consis"] <- 0
        param["col_consis"] <- 0
    }
    mat <- generate_matrix_siegel_range(100, 100, param["task_range"], param["mach_range"], 
        param["row_consis"], param["col_consis"])
    mean_cor_row_v <- mean_cor_row(mat)
    mean_cor_col_v <- mean_cor_col(mat)
    result <- rbind(result, c(mean_cor_row_v - literature_range[i, "mean_cor_row"], 
        mean_cor_col_v - literature_range[i, "mean_cor_col"]))
}
summary(result)
##        V1                  V2          
##  Min.   :-0.023038   Min.   :-0.08172  
##  1st Qu.:-0.007847   1st Qu.:-0.04836  
##  Median :-0.001543   Median :-0.01958  
##  Mean   :-0.004802   Mean   :-0.02099  
##  3rd Qu.: 0.000622   3rd Qu.:-0.00068  
##  Max.   : 0.003960   Max.   : 0.06144

The column correlation seems harder to validate precisely. A test with a 2000x2000 matrix suggests that the deviation is acceptable.
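
As an illustration, such a check could look as follows (a sketch only: the parameter values are hypothetical and only the matrix size differs from the validation loop above):

# With a = b = 0, the expected column correlation is the closed-form
# cor_sorted for Rtask = 100 and Rmach = 10.
mat <- generate_matrix_siegel_range(2000, 2000, 100, 10, 0, 0)
mean_cor_col(mat)

Let's perform the same validation for the CV-based method: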

result <- NULL
for (i in 1:nrow(literature_CV)) {
    suppressWarnings(param <- sapply(literature_CV[i, ], function(x) if (is.factor(x[[1]])) 
        as.numeric(levels(x[[1]]))[x[[1]]] else x))
    if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
        param["row_consis"] <- 0
        param["col_consis"] <- 0
    }
    if (is.na(param["mean"])) 
        param["mean"] <- 1
    mat <- generate_matrix_siegel_CV(100, 100, param["mean"], param["mean"], 
        param["task_range"], param["mach_range"], param["row_consis"], param["col_consis"])
    mean_cor_row_v <- mean_cor_row(mat)
    mean_cor_col_v <- mean_cor_col(mat)
    result <- rbind(result, c(mean_cor_row_v - literature_CV[i, "mean_cor_row"], 
        mean_cor_col_v - literature_CV[i, "mean_cor_col"]))
}
summary(result)
##        V1                 V2          
##  Min.   :-0.03292   Min.   :-0.30411  
##  1st Qu.:-0.01134   1st Qu.:-0.01599  
##  Median :-0.00097   Median :-0.00264  
##  Mean   :-0.00489   Mean   :-0.00802  
##  3rd Qu.: 0.00047   3rd Qu.: 0.00656  
##  Max.   : 0.00402   Max.   : 0.08253

This is acceptable.

SV-related measures

Let's compute and show the values of the TMA and of the CV of the SV:

SV_related_range <- NULL
for (i in 1:nrow(literature_range)) {
    suppressWarnings(param <- sapply(literature_range[i, ], function(x) if (is.factor(x[[1]])) 
        as.numeric(levels(x[[1]]))[x[[1]]] else x))
    if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
        param["row_consis"] <- 0
        param["col_consis"] <- 0
    }
    mat <- generate_matrix_siegel_range(300, 100, param["task_range"], param["mach_range"], 
        param["row_consis"], param["col_consis"])
    SV_related_range <- rbind(SV_related_range, data.frame(TMA1 = TMA1(mat), 
        TMA2 = TMA2(mat), CV_SV = CV_SV(mat)))
}
SV_related_CVB <- NULL
for (i in 1:nrow(literature_CV)) {
    suppressWarnings(param <- sapply(literature_CV[i, ], function(x) if (is.factor(x[[1]])) 
        as.numeric(levels(x[[1]]))[x[[1]]] else x))
    if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
        param["row_consis"] <- 0
        param["col_consis"] <- 0
    }
    if (is.na(param["mean"])) 
        param["mean"] <- 1
    mat <- generate_matrix_siegel_CV(300, 100, param["mean"], param["mean"], 
        param["task_range"], param["mach_range"], param["row_consis"], param["col_consis"])
    SV_related_CVB <- rbind(SV_related_CVB, data.frame(TMA1 = TMA1(mat), TMA2 = TMA2(mat), 
        CV_SV = CV_SV(mat)))
}
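
The measures TMA1, TMA2, and CV_SV are defined elsewhere (the TMA is the measure from alqawashmeh2010 discussed below). As an indication, CV_SV is assumed to compute the coefficient of variation of the singular values, as in this minimal sketch:

# Assumed definition (for illustration only): coefficient of variation of
# the singular values of the cost matrix.
CV_SV <- function(mat) {
    sv <- svd(mat)$d  # singular values
    sd(sv)/mean(sv)
}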

Let's plot the empirical CDF of the TMA:

plot.ecdf(SV_related_CVB$TMA2, log = "x", xlim = c(0.001, 0.5))
plot.ecdf(SV_related_CVB$TMA1, add = TRUE, cex = 0.3, col = 2)
plot.ecdf(SV_related_range$TMA2, add = TRUE)

[Figure (chunk unnamed-chunk-2): empirical CDFs of the TMA for the CV-based and range-based instances]

length(which(SV_related_CVB$TMA2 < 0.02))/length(SV_related_CVB$TMA2)
## [1] 0.422
length(which(SV_related_CVB$TMA2 > 0.2 & SV_related_CVB$TMA2 < 0.5))/length(SV_related_CVB$TMA2)
## [1] 0.07514
max(SV_related_CVB$TMA2)
## [1] 0.7981

Many values are close to 0 (more than 40% are below 0.02) and fewer than 10% lie between 0.2 and 0.5. This is inconsistent with the findings of alqawashmeh2010, which reports no matrix with a TMA between 0.05 and 0.2 and no matrix with a TMA close to zero. The maximum TMA is however consistent (0.7 in their case against 0.8 in ours).

Let's plot the empirical CDF of the CV of the SV:

plot.ecdf(SV_related_CVB$CV_SV)
plot.ecdf(SV_related_range$CV_SV, add = TRUE)

[Figure (chunk unnamed-chunk-3): empirical CDFs of the CV of the SV for the CV-based and range-based instances]

summary(SV_related_CVB$CV_SV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.528   1.690   2.640   3.790   5.260   9.390

The CVs of the SV in the literature are irregularly distributed, mostly between 1 and 9. It would be interesting to have better control over this curve (generating matrices whose CV increases regularly from zero up to a given maximum value, as sketched below). We still do not know which maximum value corresponds to the uniform case. Actually, it is difficult to decide how this parameter should vary, as it is unclear how the SV impact the heuristics. Maybe it was too soon to summarize this vector with a single CV.
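
For instance, a regular sweep could look like the following sketch (hypothetical parameter values, reusing generate_matrix_siegel_CV and CV_SV with the signatures assumed above):

# Illustrative sweep: increase the CV parameters regularly and record the
# resulting CV of the SV to assess how regular the obtained curve is.
V_values <- seq(0.1, 1, by = 0.1)
sapply(V_values, function(V) CV_SV(generate_matrix_siegel_CV(300, 100, 1, 
    1, V, V, 0, 0)))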

Conclusion

We now know how the instances used in the literature are distributed in terms of correlation. We also showed the distribution of their TMA and of the CV of their SV. This last part is less informative, as the effect of the singular values remains unclear.