Euro-Par simulations

Edit from 2017-01-27: fix greedy-bal-proved implem. Edit from 2017-02-01: use simpler LB.

Let’s start with a small set of instances to see what can be expected when we have only 4 types of tasks.

n <- 300
m <- 20
k <- 4
ratios <- replicate(10, {
  costs_CPU <- rep_len(rgamma_cost(4, mean = 30, sd = 5), n)
  ratio_GPU <- rep_len(runif(4, min = 2, max = 30), n)
  costs <- matrix(c(costs_CPU, costs_CPU / ratio_GPU), ncol = 2, byrow = FALSE)
  RR <- list(schedule_LPT_balance(costs, m, k, strategy = "relsuff"),
             schedule_DADA(costs, m, k),
             schedule_CLB2C(costs, m, k),
             schedule_greedy_balance(costs, m, k),
             schedule_greedy_balance_proved(costs, m, k),
             schedule_DualHP(costs, m, k),
             schedule_HeteroPrio(costs, m, k))
  names(RR) <- c("LPT_bal", "DADA", "CLB2C", "greedy_bal",
                 "greedy_bal_proved", "DualHP", "HeteroPrio")
  c(map_dbl(RR, ~.$mks), LB = LB_greedy_balance(costs, m, k))
})

Let’s plot this.

data.frame(t(ratios)) %>%
  gather(key = "algo", value = "ratio", -LB) %>%
  ggplot(aes(x = algo, y = ratio / LB)) +
  geom_boxplot() +
  geom_jitter(size = 1)

This shows that greedy_bal is on par with HeteroPrio. However, this is the most unfavorable case for greedy_bal because it has no look-ahead ability on the scheduling phase (in contrast to CLB2C and HeteroPrio which build the schedule step by step). It is not a problem to show the weakness of our proposed algorithm, but if we have to show only one thing, it would be best to also show its strength.

Let’s discard the types of tasks to see if the number of task types has some effect that would allow a more equitable presentation.

ratios <- replicate(10, {
  costs_CPU <- rgamma_cost(n, mean = 30, sd = 5)
  ratio_GPU <- runif(n, min = 2, max = 30)
  costs <- matrix(c(costs_CPU, costs_CPU / ratio_GPU), ncol = 2, byrow = FALSE)
  RR <- list(schedule_LPT_balance(costs, m, k, strategy = "relsuff"),
             schedule_DADA(costs, m, k),
             schedule_CLB2C(costs, m, k),
             schedule_greedy_balance(costs, m, k),
             schedule_greedy_balance_proved(costs, m, k),
             schedule_DualHP(costs, m, k),
             schedule_HeteroPrio(costs, m, k))
  names(RR) <- c("LPT_bal", "DADA", "CLB2C", "greedy_bal",
                 "greedy_bal_proved", "DualHP", "HeteroPrio")
  c(map_dbl(RR, ~.$mks), LB = LB_greedy_balance(costs, m, k))
})

And we plot it.

data.frame(t(ratios)) %>%
  gather(key = "algo", value = "ratio", -LB) %>%
  ggplot(aes(x = algo, y = ratio / LB)) +
  geom_boxplot() +
  geom_jitter(size = 1)

This is more favourable to greedy_bal, but it still does not show when it is better than CLB2C.

Let’s see if a larger variance on the processor costs would be less favorable to CLB2C.

ratios <- replicate(10, {
  costs_CPU <- rgamma_cost(n, mean = 30, sd = 10)
  ratio_GPU <- runif(n, min = 2, max = 30)
  costs <- matrix(c(costs_CPU, costs_CPU / ratio_GPU), ncol = 2, byrow = FALSE)
  RR <- list(schedule_LPT_balance(costs, m, k, strategy = "relsuff"),
             schedule_DADA(costs, m, k),
             schedule_CLB2C(costs, m, k),
             schedule_greedy_balance(costs, m, k),
             schedule_greedy_balance_proved(costs, m, k),
             schedule_DualHP(costs, m, k),
             schedule_HeteroPrio(costs, m, k))
  names(RR) <- c("LPT_bal", "DADA", "CLB2C", "greedy_bal",
                 "greedy_bal_proved", "DualHP", "HeteroPrio")
  c(map_dbl(RR, ~.$mks), LB = LB_greedy_balance(costs, m, k))
})

data.frame(t(ratios)) %>%
  gather(key = "algo", value = "ratio", -LB) %>%
  ggplot(aes(x = algo, y = ratio / LB)) +
  geom_boxplot() +
  geom_jitter(size = 1)

It is. So let’s show how the number of task types affects the relative medians of each method.

ratios <- map_df(seq(4, 36, 8), function(type) {
  replicate(2, {
    costs_CPU <- rep_len(rgamma_cost(type, mean = 30, sd = 10), n)
    ratio_GPU <- rep_len(runif(type, min = 2, max = 30), n)
    costs <- matrix(c(costs_CPU, costs_CPU / ratio_GPU), ncol = 2, byrow = FALSE)
    RR <- list(schedule_LPT_balance(costs, m, k, strategy = "relsuff"),
               schedule_DADA(costs, m, k),
               schedule_CLB2C(costs, m, k),
               schedule_greedy_balance(costs, m, k),
               schedule_greedy_balance_proved(costs, m, k),
               schedule_DualHP(costs, m, k),
               schedule_HeteroPrio(costs, m, k))
    names(RR) <- c("LPT_bal", "DADA", "CLB2C", "greedy_bal",
                   "greedy_bal_proved", "DualHP", "HeteroPrio")
    c(type = type, LB = LB_greedy_balance(costs, m, k), map_dbl(RR, ~.$mks))
  }) %>% t() %>% data.frame()
})

ratios %>%
  gather(key = "algo", value = "ratio", -type, -LB) %>%
  group_by(type, algo) %>%
  summarise(medratio = median(ratio / LB)) %>%
  ggplot(aes(x = factor(type), y = medratio, color = algo, group = algo)) +
  geom_point() +
  geom_line()

This seems promising. Let’s validate this plot with more data.

ratios <- map_df(2^(2:8), function(type) {
  replicate(10, {
    costs_CPU <- rep_len(rgamma_cost(type, mean = 30, sd = 10), n)
    ratio_GPU <- rep_len(runif(type, min = 2, max = 30), n)
    costs <- matrix(c(costs_CPU, costs_CPU / ratio_GPU), ncol = 2, byrow = FALSE)
    RR <- list(schedule_LPT_balance(costs, m, k, strategy = "relsuff"),
               schedule_DADA(costs, m, k),
               schedule_CLB2C(costs, m, k),
               schedule_greedy_balance(costs, m, k),
               schedule_greedy_balance_proved(costs, m, k),
               schedule_DualHP(costs, m, k),
               schedule_HeteroPrio(costs, m, k))
    names(RR) <- c("LPT_bal", "DADA", "CLB2C", "greedy_bal",
                   "greedy_bal_proved", "DualHP", "HeteroPrio")
    c(type = type, LB = LB_greedy_balance(costs, m, k), map_dbl(RR, ~.$mks))
  }) %>% t() %>% data.frame()
})

ratios %>%
  gather(key = "algo", value = "ratio", -type, -LB) %>%
  group_by(type, algo) %>%
  mutate(meanratio = mean(ratio / LB)) %>%
  ggplot(aes(x = type, y = ratio / LB, color = algo)) +
  geom_boxplot(aes(group = interaction(algo, type))) +
  geom_line(aes(y = meanratio), size = 2) +
  scale_x_log10()

This is terrible. We do not see significant variations. Let’s consider a set of instance settings instead:

CPU = rgamma(1, 0.2) & GPU = rgamma(1, 0.2)
CPU = rgamma(1, 1) & GPU = rgamma(1, 0.2)
CPU = rgamma(1, 1) & GPU = rgamma(1, 1)
CPU = 15 * rgamma(1, 0.2) & GPU = rgamma(1, 0.2)
CPU = 15 * rgamma(1, 1) & GPU = rgamma(1, 1)
CPU = 15 * rgamma(1, 1) & GPU = rgamma(1, 0.2)
- similar to CPU = 15 * rgamma(1, 0.2) & GPU = CPU / runif(1, 15)
- maybe most realistic settings for linear algebra
CPU = 15 * rgamma(1, 0.2) & GPU = rgamma(1, 1)
- similar to CPU = runif(1, 15) * GPU & GPU = rgamma(1, 0.2)

cases <- data.frame(rbind(coeff = c(rep(1, 3), rep(15, 4)),
                          cv1 = c(0.2, 1, 1, 0.2, 1, 0.2, 1),
                          cv2 = c(0.2, 0.2, 1, 0.2, 0.2, 1, 1)))
ratios <- map_df(cases, function(case) {
  replicate(5, {
    costs_CPU <- case[1] * rgamma_cost(n, mean = 1, sd = case[2])
    costs_GPU <- rgamma_cost(n, mean = 1, sd = case[3])
    costs <- matrix(c(costs_CPU, costs_GPU), ncol = 2, byrow = FALSE)
    RR <- list(schedule_LPT_balance(costs, m, k, strategy = "relsuff"),
               schedule_DADA(costs, m, k),
               schedule_CLB2C(costs, m, k),
               schedule_greedy_balance(costs, m, k),
               schedule_greedy_balance_proved(costs, m, k),
               schedule_DualHP(costs, m, k),
               schedule_HeteroPrio(costs, m, k))
    names(RR) <- c("LPT_bal", "DADA", "CLB2C", "greedy_bal",
                   "greedy_bal_proved", "DualHP", "HeteroPrio")
    c(list(case = paste(case, collapse = "-"), LB = LB_greedy_balance(costs, m, k)),
      map_dbl(RR, ~.$mks))
  }) %>% t() %>% data.frame() %>% map(unlist) %>% data.frame()
})

## Warning in rbind_all(x, .id): Unequal factor levels: coercing to character

ratios %>%
  gather(key = "algo", value = "ratio", -case, -LB) %>%
  group_by(case, algo) %>%
  ggplot(aes(x = case, y = ratio / LB, color = algo)) +
  geom_boxplot(aes(group = interaction(algo, case))) +
  coord_cartesian(ylim = c(1, 1.2))

This is more interesting. Let’s focus on the four last cases because the symmetrical cases are a bit redundant.

cases <- data.frame(rbind(coeff = rep(15, 4), cv1 = c(0.2, 1, 0.2, 1),
                          cv2 = c(0.2, 0.2, 1, 1)))
ratios <- map_df(cases, function(case) {
  replicate(20, {
    costs_CPU <- case[1] * rgamma_cost(n, mean = 1, sd = case[2])
    costs_GPU <- rgamma_cost(n, mean = 1, sd = case[3])
    costs <- matrix(c(costs_CPU, costs_GPU), ncol = 2, byrow = FALSE)
    RR <- list(schedule_LPT_balance(costs, m, k, strategy = "relsuff"),
               schedule_DADA(costs, m, k),
               schedule_CLB2C(costs, m, k),
               schedule_greedy_balance(costs, m, k),
               schedule_greedy_balance_proved(costs, m, k),
               schedule_DualHP(costs, m, k),
               schedule_HeteroPrio(costs, m, k))
    names(RR) <- c("LPT_bal", "DADA", "CLB2C", "greedy_bal",
                   "greedy_bal_proved", "DualHP", "HeteroPrio")
    c(list(case = paste(case, collapse = "-"), LB = LB_greedy_balance(costs, m, k)),
      map_dbl(RR, ~.$mks))
  }) %>% t() %>% data.frame() %>% map(unlist) %>% data.frame()
})

## Warning in rbind_all(x, .id): Unequal factor levels: coercing to character

ratios %>%
  gather(key = "algo", value = "ratio", -case, -LB) %>%
  group_by(case, algo) %>%
  ggplot(aes(x = case, y = ratio / LB, color = algo)) +
  geom_boxplot(aes(group = interaction(algo, case)), outlier.size = 1) +
  coord_cartesian(ylim = c(1, 1.2))

This shows more information and in particular that no approach is always good. For the Euro-Par figure, we will keep only one greedy_bal algo, increase the number of schedule to 100, reorder algos (LPT first and HeteroPrio last, greedy_bal, DualHP, DADA and CLB2C in the middle), improve labels and adjust the size.

cases <- data.frame(rbind(cv1 = c(0.2, 1, 0.2, 1), cv2 = c(0.2, 0.2, 1, 1)))
ratios <- map_df(cases, function(case) {
  replicate(100, {
    costs_CPU <- 15 * rgamma_cost(n, mean = 1, sd = case[1])
    costs_GPU <- rgamma_cost(n, mean = 1, sd = case[2])
    costs <- matrix(c(costs_CPU, costs_GPU), ncol = 2, byrow = FALSE)
    RR <- list(schedule_LPT_balance(costs, m, k, strategy = "relsuff"),
               schedule_greedy_balance_proved(costs, m, k),
               schedule_DADA(costs, m, k),
               schedule_DualHP(costs, m, k),
               schedule_CLB2C(costs, m, k),
               schedule_HeteroPrio(costs, m, k))
    names(RR) <- c("LPT_bal", "greedy_bal", "DADA", "DualHP", "CLB2C",
                   "HeteroPrio")
    c(list(case = paste(map(case, ~ ifelse(. == 0.2, "low", "high")),
                        collapse = "/"),
           LB = LB_simple(costs, m, k)), map_dbl(RR, ~.$mks))
  }) %>% t() %>% data.frame() %>% map(unlist) %>% data.frame()
})

## Warning in rbind_all(x, .id): Unequal factor levels: coercing to character

names(ratios)[3:ncol(ratios)] <- c("Balance-LPT", "Balance-greedy", "DADA",
                                     "DualHP", "CLB2C", "HeteroPrio")
ratios[,c("case", "LB", "Balance-LPT", "Balance-greedy", "DualHP", "DADA",
          "CLB2C", "HeteroPrio")] %>%
  gather(key = "algo", value = "ratio", -case, -LB) %>%
  ggplot(aes(x = case, y = ratio / LB, color = algo)) +
  geom_boxplot(aes(group = interaction(algo, case)), outlier.size = 1) +
  coord_cartesian(ylim = c(1, 1.2)) +
  scale_color_brewer("Algorithm", palette = "Set1") +
  scale_x_discrete("Coefficient of variation (CPU/GPU)") +
  scale_y_continuous("Makespan / LB")

OK, we have some satisfying figure. It shows both the strengths and the weaknesses of our approach.

Some more data:

map_dbl(ratios[,3:8], ~quantile(. / ratios$LB, probs = 0.025))

##    Balance-LPT Balance-greedy           DADA         DualHP          CLB2C 
##       1.003053       1.003757       1.009893       1.003633       1.037046 
##     HeteroPrio 
##       1.124587

map_dbl(ratios[,3:8], ~quantile(. / ratios$LB, probs = 0.975))

##    Balance-LPT Balance-greedy           DADA         DualHP          CLB2C 
##       1.074413       1.146030       1.140963       1.145505       1.387446 
##     HeteroPrio 
##       1.403260

## R version 3.3.2 (2016-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.1 LTS
## 
## locale:
##  [1] LC_CTYPE=fr_FR.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=fr_FR.UTF-8    
##  [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=fr_FR.UTF-8   
##  [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_2.1.0 dplyr_0.4.3   tidyr_0.2.0   purrr_0.2.0   stringr_1.0.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.7        knitr_1.14         magrittr_1.5      
##  [4] munsell_0.4        colorspace_1.2-2   R6_2.1.2          
##  [7] plyr_1.8.1         tools_3.3.2        parallel_3.3.2    
## [10] grid_3.3.2         gtable_0.1.2       pacman_0.4.1      
## [13] DBI_0.3.1          htmltools_0.3.5    lazyeval_0.1.10   
## [16] yaml_2.1.13        assertthat_0.1     digest_0.6.9      
## [19] tibble_1.2         RColorBrewer_1.0-5 reshape2_1.2.2    
## [22] formatR_1.4        codetools_0.2-15   evaluate_0.10     
## [25] rmarkdown_1.1      labeling_0.1       stringi_0.5-5     
## [28] scales_0.4.0

Euro-Par simulations

L.-C. Canon

January 23, 2017