Relevance of asymptotic assumption (February 7, 2015)

We will study the heterogeneous properties of small matrices generated with the range-based and CVB methods. To have a stronger confidence, we will re-implement those methods.

Reimplementation

range_based <- function(n, m, Rtask, Rmach, a = 0, b = 0) {
    cost <- matrix(nrow = n, ncol = m)
    for (i in 1:n) {
        v <- runif(1, min = 1, max = Rtask)
        for (j in 1:m) cost[i, j] <- v * runif(1, min = 1, max = Rmach)
    }
    for (i in 1:as.integer(a * n)) cost[i, 1:as.integer(b * m)] <- cost[i, order(cost[i, 
        1:as.integer(b * m)])]
    cost
}

CVB <- function(n, m, Vtask, Vmach, mutask, a = 0, b = 0) {
    cost <- matrix(nrow = n, ncol = m)
    alphatask <- 1/Vtask^2
    alphamach <- 1/Vmach^2
    betatask <- mutask/alphatask
    for (i in 1:n) {
        q <- rgamma(1, shape = alphatask, scale = betatask)
        betamach <- q/alphamach
        for (j in 1:m) cost[i, j] <- rgamma(1, shape = alphamach, scale = betamach)
    }
    for (i in 1:as.integer(a * n)) cost[i, 1:as.integer(b * m)] <- cost[i, order(cost[i, 
        1:as.integer(b * m)])]
    cost
}

Let's test them with some known outcomes (n=m=1000, Rtask=Rmach=1e6, Vtask=Vmach=0.1, a=b=1):

rb <- range_based(n = 1000, m = 1000, Rtask = 1e+06, Rmach = 1e+06, a = 1, b = 1)
CV_mean_row(rb)

## [1] 0.5931

CV_mean_col(rb)

## [1] 0.577

cv <- CVB(n = 1000, m = 1000, Vtask = 0.1, Vmach = 0.1, mutask = 10, a = 1, 
    b = 1)
CV_mean_row(cv)

## [1] 0.1013

CV_mean_col(cv)

## [1] 0.09998

OK, this works and this is symmetric. Let's test with no reordering:

rb <- range_based(n = 1000, m = 1000, Rtask = 1e+06, Rmach = 1e+06, a = 0, b = 0)
CV_mean_row(rb)

## [1] 0.5788

CV_mean_col(rb)

## [1] 0.02065

cv <- CVB(n = 1000, m = 1000, Vtask = 0.1, Vmach = 0.1, mutask = 10, a = 0, 
    b = 0)
CV_mean_row(cv)

## [1] 0.09896

CV_mean_col(cv)

## [1] 0.003181

As expected, the machine heterogeneity is zero.

Range-based method

Let's generate the heterogeneity figure of previous studies with only MC for the range-based:

literature_range <- read.table("literature_range.txt", na.strings = "", col.names = c("task_range", 
    "mach_range", "task_hetero", "mach_hetero", "row_consis", "col_consis"))

result <- NULL
for (i in 1:nrow(literature_range)) {
    suppressWarnings(param <- sapply(literature_range[i, ], function(x) if (is.factor(x[[1]])) 
        as.numeric(levels(x[[1]]))[x[[1]]] else x))
    if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
        param["row_consis"] <- 0
        param["col_consis"] <- 0
    }
    mat <- range_based(1000, 1000, param["task_range"], param["mach_range"], 
        param["row_consis"], param["col_consis"])
    result <- rbind(result, data.frame(CV_mean_row = CV_mean_row(mat), CV_mean_col = CV_mean_col(mat), 
        mean_CV_col = mean_CV_col(mat), mean_CV_row = mean_CV_row(mat)))
}
literature_range <- cbind(literature_range, result)

Let's plot these values:

library(ggplot2)
p <- ggplot(data = literature_range, aes(x = CV_mean_row, y = CV_mean_col, shape = interaction(task_hetero, 
    mach_hetero), color = interaction(task_hetero, mach_hetero), fill = interaction(task_hetero, 
    mach_hetero)))
p <- p + geom_point(size = 4, alpha = 1/2)
p <- p + scale_x_continuous(name = expression(V ~ mu[task]), breaks = seq(0, 
    2, 0.2), limits = extendrange(c(0.05, 1.05)))
p <- p + scale_y_continuous(name = expression(V ~ mu[mach]), breaks = seq(0, 
    2, 0.2), limits = extendrange(c(0.05, 1.05)))
breaks <- c("high.high", "low.high", "high.low", "low.low", "NA.NA")
labels <- c("hihi", "lohi", "hilo", "lolo", "NA")
p <- p + scale_color_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + scale_shape_manual(name = "Heterogeneity", values = c(21, 24, 25, 23, 
    4), breaks = breaks, labels = labels)
p <- p + scale_fill_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + theme(legend.position = "bottom")
p

plot of chunk CV-mean-range-properties-MC-plot

CVB method

Now with the CVB method :

literature_CV <- read.table("literature_CV.txt", na.strings = "", col.names = c("task_range", 
    "mach_range", "task_hetero", "mach_hetero", "row_consis", "col_consis", 
    "mean"))

The following computation should take around 20 minutes.

result <- NULL
for (i in 1:nrow(literature_CV)) {
    suppressWarnings(param <- sapply(literature_CV[i, ], function(x) if (is.factor(x[[1]])) 
        as.numeric(levels(x[[1]]))[x[[1]]] else x))
    if (is.na(param["mean"])) 
        param["mean"] <- 1
    if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
        param["row_consis"] <- 0
        param["col_consis"] <- 0
    }
    mat <- CVB(1000, 1000, param["task_range"], param["mach_range"], param["mean"], 
        param["row_consis"], param["col_consis"])
    result <- rbind(result, data.frame(CV_mean_row = CV_mean_row(mat), CV_mean_col = CV_mean_col(mat), 
        mean_CV_col = mean_CV_col(mat), mean_CV_row = mean_CV_row(mat)))
}
literature_CV <- cbind(literature_CV, result)

Let's plot these values:

library(ggplot2)
p <- ggplot(data = literature_CV, aes(x = CV_mean_row, y = CV_mean_col, shape = interaction(task_hetero, 
    mach_hetero), color = interaction(task_hetero, mach_hetero), fill = interaction(task_hetero, 
    mach_hetero)))
p <- p + geom_point(size = 4, alpha = 1/2)
p <- p + scale_x_continuous(name = expression(V ~ mu[task]), breaks = seq(0, 
    2, 0.2), limits = extendrange(c(0.05, 1.05)))
p <- p + scale_y_continuous(name = expression(V ~ mu[mach]), breaks = seq(0, 
    2, 0.2), limits = extendrange(c(0.05, 1.05)))
breaks <- c("high.high", "low.high", "high.low", "low.low", "med.med", "NA.NA")
labels <- c("hihi", "lohi", "hilo", "lolo", "medmed", "NA")
p <- p + scale_color_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + scale_shape_manual(name = "Heterogeneity", values = c(21, 24, 25, 23, 
    22, 4), breaks = breaks, labels = labels)
p <- p + scale_fill_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + theme(legend.position = "bottom")
p

## Warning: Removed 7 rows containing missing values (geom_point).

plot of chunk CV-mean-CVB-properties-MC-plot

This is consistent with what we have generated analytically. The new implementations, however, are far slower than the previous. Let's swith to these:

range_based <- function(n, m, Rtask, Rmach, a = 0, b = 0) {
    generate_matrix_siegel_range(n, m, Rtask, Rmach, a, b)
}

CVB <- function(n, m, Vtask, Vmach, mutask, a = 0, b = 0) {
    generate_matrix_siegel_CV(n, m, mutask, mutask, Vtask, Vmach, a, b)
}

Small matrices properties

Let's generate a density plot by running some MC iterations for each settings with various size:

Range-based method

get_properties <- function(literature, n, m, MC) {
    result <- NULL
    for (i in 1:nrow(literature)) {
        suppressWarnings(param <- sapply(literature[i, ], function(x) if (is.factor(x[[1]])) 
            as.numeric(levels(x[[1]]))[x[[1]]] else x))
        if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
            param["row_consis"] <- 0
            param["col_consis"] <- 0
        }
        for (k in 1:MC) {
            mat <- range_based(n, m, param["task_range"], param["mach_range"], 
                param["row_consis"], param["col_consis"])
            result <- rbind(result, data.frame(literature[i, ], Vtask = CV_mean_row(mat), 
                Vmach = CV_mean_col(mat), measure = "CVmean"))
            result <- rbind(result, data.frame(literature[i, ], Vtask = mean_CV_col(mat), 
                Vmach = mean_CV_row(mat), measure = "meanCV"))
        }
    }
    result
}

M <- 10
settings_small <- get_properties(literature_range, 16, 4, M)

Let's plot the density:

library(ggplot2)
p <- ggplot(data = settings_small, aes(x = Vtask, y = Vmach))
p <- p + geom_point(aes(shape = interaction(task_hetero, mach_hetero), color = interaction(task_hetero, 
    mach_hetero), fill = interaction(task_hetero, mach_hetero)), alpha = 1/2)
p <- p + geom_density2d()
p <- p + scale_x_continuous(name = expression(V ~ mu[task]), breaks = seq(0, 
    2, 0.2), limits = extendrange(c(0.05, 1.05)))
p <- p + scale_y_continuous(name = expression(V ~ mu[mach]), breaks = seq(0, 
    2, 0.2), limits = extendrange(c(0.05, 1.05)))
breaks <- c("high.high", "low.high", "high.low", "low.low", "NA.NA")
labels <- c("hihi", "lohi", "hilo", "lolo", "NA")
p <- p + scale_color_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + scale_shape_manual(name = "Heterogeneity", values = c(21, 24, 25, 23, 
    4), breaks = breaks, labels = labels)
p <- p + scale_fill_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + theme(legend.position = "bottom")
p <- p + facet_grid(. ~ measure)
p

## Warning: Removed 2 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing missing values (geom_point).

plot of chunk CV-mean-range-properties-MC-plot-density-small

Let's start again with a medium size and a large size:

settings_medium <- get_properties(literature_range, 64, 16, M)
settings_large <- get_properties(literature_range, 256, 64, M)
settings <- rbind(cbind(settings_small, size = "16/4"), cbind(settings_medium, 
    size = "64/16"), cbind(settings_large, size = "256/64"))

And plot the result with facets:

library(ggplot2)
p <- ggplot(data = settings, aes(x = Vtask, y = Vmach))
p <- p + geom_point(aes(shape = interaction(task_hetero, mach_hetero), color = interaction(task_hetero, 
    mach_hetero), fill = interaction(task_hetero, mach_hetero)), alpha = 1/2)
p <- p + geom_density2d()
p <- p + scale_x_continuous(name = expression(V ~ mu[task]), breaks = seq(0, 
    2, 0.2), limits = extendrange(c(0.05, 1.05)))
p <- p + scale_y_continuous(name = expression(V ~ mu[mach]), breaks = seq(0, 
    2, 0.2), limits = extendrange(c(0.05, 1.05)))
breaks <- c("high.high", "low.high", "high.low", "low.low", "NA.NA")
labels <- c("hihi", "lohi", "hilo", "lolo", "NA")
p <- p + scale_color_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + scale_shape_manual(name = "Heterogeneity", values = c(21, 24, 25, 23, 
    4), breaks = breaks, labels = labels)
p <- p + scale_fill_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + theme(legend.position = "bottom")
p <- p + facet_grid(measure ~ size)
p

## Warning: Removed 2 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing missing values (geom_point).

plot of chunk CV-mean-range-properties-MC-plot-density-facet

CVB method

get_properties_CVB <- function(literature, n, m, MC) {
    result <- NULL
    for (i in 1:nrow(literature)) {
        suppressWarnings(param <- sapply(literature[i, ], function(x) if (is.factor(x[[1]])) 
            as.numeric(levels(x[[1]]))[x[[1]]] else x))
        if (is.na(param["row_consis"]) && is.na(param["col_consis"])) {
            param["row_consis"] <- 0
            param["col_consis"] <- 0
        }
        if (is.na(param["mean"])) 
            param["mean"] <- 1
        for (k in 1:MC) {
            mat <- CVB(n, m, param["task_range"], param["mach_range"], param["mean"], 
                param["row_consis"], param["col_consis"])
            result <- rbind(result, data.frame(literature[i, ], Vtask = CV_mean_row(mat), 
                Vmach = CV_mean_col(mat), measure = "CVmean"))
            result <- rbind(result, data.frame(literature[i, ], Vtask = mean_CV_col(mat), 
                Vmach = mean_CV_row(mat), measure = "meanCV"))
        }
    }
    result
}

settings_CVB_small <- get_properties_CVB(literature_CV, 16, 4, M)
settings_CVB_medium <- get_properties_CVB(literature_CV, 64, 16, M)
settings_CVB_large <- get_properties_CVB(literature_CV, 256, 64, M)
settings_CVB <- rbind(cbind(settings_CVB_small, size = "16/4"), cbind(settings_CVB_medium, 
    size = "64/16"), cbind(settings_CVB_large, size = "256/64"))

Let's put everyting on the same plot:

settings_complete <- rbind(cbind(settings, method = "Range-based"), cbind(settings_CVB[, 
    colnames(settings)], method = "CVB"))
settings_complete$task_hetero <- factor(settings_complete$task_hetero, levels = levels(literature_CV$task_hetero))
settings_complete$mach_hetero <- factor(settings_complete$mach_hetero, levels = levels(literature_CV$mach_hetero))
levels(settings_complete$measure) <- c(expression(V ~ mu[task]/V ~ mu[mach]), 
    expression(mu ~ V[task]/mu ~ V[mach]))

library(ggplot2)
p <- ggplot(data = settings_complete, aes(x = Vtask, y = Vmach))
p <- p + geom_point(aes(shape = interaction(task_hetero, mach_hetero), color = interaction(task_hetero, 
    mach_hetero), fill = interaction(task_hetero, mach_hetero)), alpha = 1/10)
p <- p + scale_x_log10(name = "", limits = c(0.001, 10))
p <- p + scale_y_log10(name = "", limits = c(0.001, 10))
breaks <- c("high.high", "low.high", "high.low", "low.low", "med.med", "NA.NA")
labels <- c("hihi", "lohi", "hilo", "lolo", "medmed", "NA")
p <- p + scale_color_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + scale_shape_manual(name = "Heterogeneity", values = c(21, 24, 25, 23, 
    22, 4), breaks = breaks, labels = labels)
p <- p + scale_fill_discrete(name = "Heterogeneity", breaks = breaks, labels = labels)
p <- p + theme(legend.position = "bottom")
p <- p + guides(colour = guide_legend(override.aes = list(size = 4, alpha = 1)))
p <- p + facet_grid(method + measure ~ size, labeller = label_parsed)
p <- p + annotation_logticks()
p

plot of chunk CV-mean-all-properties-MC-plot-density-facet