Let's import the data and compute the correlation (we have closed formulas for several case and we will rely on estimation for the other cases).
literature_range <- read.table("literature_range.txt", na.strings = "", col.names = c("task_range",
"mach_range", "task_hetero", "mach_hetero", "row_consis", "col_consis"))
literature_CV <- read.table("literature_CV.txt", na.strings = "", col.names = c("task_range",
"mach_range", "task_hetero", "mach_hetero", "row_consis", "col_consis",
"mean"))
Let's compute the values for the range-based method.
result <- NULL
for (i in 1:nrow(literature_range)) {
suppressWarnings(param <- sapply(literature_range[i, ], function(x) if (is.factor(x[[1]]))
as.numeric(levels(x[[1]]))[x[[1]]] else x))
Rtask <- param["task_range"]
Rmach <- param["mach_range"]
if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
a <- 0
b <- 0
} else {
a <- param["row_consis"]
b <- param["col_consis"]
}
mean_cor_row_v <- a^2 * b
cor_sorted <- (1 + Rmach)^2/(1/3 * (Rmach - 1)^2 + (1 + Rmach)^2 + (Rmach -
1)^2 * (1 + Rtask)^2/(Rtask - 1)^2)
cor_mix <- (1 + Rmach)/sqrt(1/3 * (Rmach - 1)^2 + (1 + Rmach)^2 + (Rmach -
1)^2 * (1 + Rtask)^2/(Rtask - 1)^2)
if (a == 0) {
mean_cor_col_v <- cor_sorted
} else if (a == 1) {
mean_cor_col_v <- b^2 + 2 * b * (1 - b) * cor_mix + (1 - b)^2 * cor_sorted
} else {
mat <- generate_matrix_siegel_range(300, 100, Rtask, Rmach, a, b)
mean_cor_col_v <- mean_cor_col(mat)
}
result <- rbind(result, data.frame(mean_cor_row = mean_cor_row_v, mean_cor_col = mean_cor_col_v))
}
literature_range <- cbind(literature_range, result)
Let's perform the same computation for the CV-based method.
result <- NULL
for (i in 1:nrow(literature_CV)) {
suppressWarnings(param <- sapply(literature_CV[i, ], function(x) if (is.factor(x[[1]]))
as.numeric(levels(x[[1]]))[x[[1]]] else x))
Vtask <- param["task_range"]
Vmach <- param["mach_range"]
if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
a <- 0
b <- 0
} else {
a <- param["row_consis"]
b <- param["col_consis"]
}
mean_cor_row_v <- a^2 * b
cor_sorted <- 1/(Vmach^2 + Vmach^2/Vtask^2 + 1)
cor_mix <- 1/sqrt(Vmach^2 + Vmach^2/Vtask^2 + 1)
if (a == 0) {
mean_cor_col_v <- cor_sorted
} else if (a == 1) {
mean_cor_col_v <- b^2 + 2 * b * (1 - b) * cor_mix + (1 - b)^2 * cor_sorted
} else {
mat <- generate_matrix_siegel_CV(300, 100, param["mean"], param["mean"],
Vtask, Vmach, a, b)
mean_cor_col_v <- mean_cor_col(mat)
}
result <- rbind(result, data.frame(mean_cor_row = mean_cor_row_v, mean_cor_col = mean_cor_col_v))
}
literature_CV <- cbind(literature_CV, result)
Let's plot both summaries with both methods.
library(ggplot2)
p <- ggplot(data = literature_range, aes(x = mean_cor_row, y = mean_cor_col,
shape = interaction(task_hetero, mach_hetero), color = interaction(task_hetero,
mach_hetero), fill = interaction(task_hetero, mach_hetero)))
p <- p + geom_point(size = 4, alpha = 1/2)
p <- p + scale_x_continuous(name = expression(rho[task]), breaks = seq(0, 2,
0.2), limits = extendrange(c(0, 1)))
p <- p + scale_y_continuous(name = expression(rho[mach]), breaks = seq(0, 2,
0.2), limits = extendrange(c(0, 1)))
breaks <- c("high.high", "low.high", "high.low", "low.low", "NA.NA")
labels <- c("hihi", "lohi", "hilo", "lolo", "NA")
p <- p + scale_color_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + scale_shape_manual(name = "Heterogeneity", values = c(21, 24, 25, 23,
4), breaks = breaks, labels = labels)
p <- p + scale_fill_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + theme(legend.position = "bottom")
p
library(ggplot2)
p <- ggplot(data = literature_CV, aes(x = mean_cor_row, y = mean_cor_col, shape = interaction(task_hetero,
mach_hetero), color = interaction(task_hetero, mach_hetero), fill = interaction(task_hetero,
mach_hetero)))
p <- p + geom_point(size = 4, alpha = 1/2)
p <- p + scale_x_continuous(name = expression(rho[task]), breaks = seq(0, 2,
0.2), limits = extendrange(c(0, 1)))
p <- p + scale_y_continuous(name = expression(rho[mach]), breaks = seq(0, 2,
0.2), limits = extendrange(c(0, 1)))
breaks <- c("high.high", "low.high", "high.low", "low.low", "med.med", "NA.NA")
labels <- c("hihi", "lohi", "hilo", "lolo", "medmed", "NA")
p <- p + scale_color_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + scale_shape_manual(name = "Heterogeneity", values = c(21, 24, 25, 23,
22, 4), breaks = breaks, labels = labels)
p <- p + scale_fill_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + theme(legend.position = "bottom")
p
Let's test each case empirically:
result <- NULL
for (i in 1:nrow(literature_range)) {
suppressWarnings(param <- sapply(literature_range[i, ], function(x) if (is.factor(x[[1]]))
as.numeric(levels(x[[1]]))[x[[1]]] else x))
if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
param["row_consis"] <- 0
param["col_consis"] <- 0
}
mat <- generate_matrix_siegel_range(100, 100, param["task_range"], param["mach_range"],
param["row_consis"], param["col_consis"])
mean_cor_row_v <- mean_cor_row(mat)
mean_cor_col_v <- mean_cor_col(mat)
result <- rbind(result, c(mean_cor_row_v - literature_range[i, "mean_cor_row"],
mean_cor_col_v - literature_range[i, "mean_cor_col"]))
}
summary(result)
## V1 V2
## Min. :-0.023038 Min. :-0.08172
## 1st Qu.:-0.007847 1st Qu.:-0.04836
## Median :-0.001543 Median :-0.01958
## Mean :-0.004802 Mean :-0.02099
## 3rd Qu.: 0.000622 3rd Qu.:-0.00068
## Max. : 0.003960 Max. : 0.06144
The column correlation seems harder to validate precisely. A test with a 2000-square matrix reveals this is acceptable.
result <- NULL
for (i in 1:nrow(literature_CV)) {
suppressWarnings(param <- sapply(literature_CV[i, ], function(x) if (is.factor(x[[1]]))
as.numeric(levels(x[[1]]))[x[[1]]] else x))
if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
param["row_consis"] <- 0
param["col_consis"] <- 0
}
if (is.na(param["mean"]))
param["mean"] <- 1
mat <- generate_matrix_siegel_CV(100, 100, param["mean"], param["mean"],
param["task_range"], param["mach_range"], param["row_consis"], param["col_consis"])
mean_cor_row_v <- mean_cor_row(mat)
mean_cor_col_v <- mean_cor_col(mat)
result <- rbind(result, c(mean_cor_row_v - literature_CV[i, "mean_cor_row"],
mean_cor_col_v - literature_CV[i, "mean_cor_col"]))
}
summary(result)
## V1 V2
## Min. :-0.03292 Min. :-0.30411
## 1st Qu.:-0.01134 1st Qu.:-0.01599
## Median :-0.00097 Median :-0.00264
## Mean :-0.00489 Mean :-0.00802
## 3rd Qu.: 0.00047 3rd Qu.: 0.00656
## Max. : 0.00402 Max. : 0.08253
This is acceptable.
Let's compute and show the values for the TMA and the CVSV:
SV_related_range <- NULL
for (i in 1:nrow(literature_range)) {
suppressWarnings(param <- sapply(literature_range[i, ], function(x) if (is.factor(x[[1]]))
as.numeric(levels(x[[1]]))[x[[1]]] else x))
if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
param["row_consis"] <- 0
param["col_consis"] <- 0
}
mat <- generate_matrix_siegel_range(300, 100, param["task_range"], param["mach_range"],
param["row_consis"], param["col_consis"])
SV_related_range <- rbind(SV_related_range, data.frame(TMA1 = TMA1(mat),
TMA2 = TMA2(mat), CV_SV = CV_SV(mat)))
}
SV_related_CVB <- NULL
for (i in 1:nrow(literature_CV)) {
suppressWarnings(param <- sapply(literature_CV[i, ], function(x) if (is.factor(x[[1]]))
as.numeric(levels(x[[1]]))[x[[1]]] else x))
if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
param["row_consis"] <- 0
param["col_consis"] <- 0
}
if (is.na(param["mean"]))
param["mean"] <- 1
mat <- generate_matrix_siegel_CV(300, 100, param["mean"], param["mean"],
param["task_range"], param["mach_range"], param["row_consis"], param["col_consis"])
SV_related_CVB <- rbind(SV_related_CVB, data.frame(TMA1 = TMA1(mat), TMA2 = TMA2(mat),
CV_SV = CV_SV(mat)))
}
Let's represent the TMA:
plot.ecdf(SV_related_CVB$TMA2, log = "x", xlim = c(0.001, 0.5))
plot.ecdf(SV_related_CVB$TMA1, add = TRUE, cex = 0.3, col = 2)
plot.ecdf(SV_related_range$TMA2, add = TRUE)
length(which(SV_related_CVB$TMA2 < 0.02))/length(SV_related_CVB$TMA2)
## [1] 0.422
length(which(SV_related_CVB$TMA2 > 0.2 & SV_related_CVB$TMA2 < 0.5))/length(SV_related_CVB$TMA2)
## [1] 0.07514
max(SV_related_CVB$TMA2)
## [1] 0.7981
Many values are close to 0 (more than 40% are below 0.02). The largest values are between 0.2 and 0.5 (less than 10%). This is inconsistent with the findings of alqawashmeh2010, which reports that there is no matrix with TMA between 0.05 and 0.2 and no matrix with TMA close to zero. The maximum TMA is however consistent (0.7 in their cases against 0.8 in our).
Let's represent the CV of the SV:
plot.ecdf(SV_related_CVB$CV_SV)
plot.ecdf(SV_related_range$CV_SV, add = TRUE)
summary(SV_related_CVB$CV_SV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.528 1.690 2.640 3.790 5.260 9.390
The CV of the SV in the literature are irregularly distributed mostly between 1 and 9. It would be interesting to have a better control on this curve (generating matrices with a CV starting from zero until a given maximum value with more regularity). We still do not know what is the maximum value that is equivalent to the uniform case. Actually, it is difficult to know how we want to vary this parameter as it is unclear how the SV impact the heuristics. Maybe it was too soon to simplify this vector as a simple CV.
We know how are distributed the instances used in the literature in terms of correlation. We also show the distribution of their TMA and CV of SV. This last part is less informative as the effect of the singular values is still fuzzy.