The previous analysis showed that the singular values have an important impact on the performance of HLPT. The objective is to have a better understanting of the effect if the SV distribution. In particular, summarizing it with the TMA or the CV may not be optimal. Let's generate matrices with specific distributions and plot these distributions when HLPT is either close or far from the optimal.
best_cmax <- function(costs) {
min(balance_sufferage(costs), balance_EFT(costs))
}
LPT_unrelated_perf_CV <- NULL
SV_dists_CV <- NULL
for (i in 1:100) {
cvsv <- 10^runif(1, -2, 1)
SV <- sort(rgamma_cost(50, 1, cvsv), decreasing = TRUE)
Z <- generate_matrix_sv(SV, 200, 50)
cmax <- LPT_unrelated(Z, func = min)
LPT_unrelated_perf_CV <- rbind(LPT_unrelated_perf_CV, data.frame(cvsv = cvsv,
mean_cor_row = mean_cor_row(Z), mean_cor_col = mean_cor_col(Z), TMA1 = TMA1(Z),
TMA2 = TMA2(Z), CVSV = CV_SV(Z), ratio = max(1, cmax/best_cmax(Z))))
SV_dists_CV <- c(SV_dists_CV, list(svd(Z)$d))
}
Let's plot the ecdf of the SV. Red is for the case when HLPT is close to the best case, while blue is when it is far.
plot.ecdf(SV_dists_CV[[1]], xlim = c(max(1e-10, min(unlist(SV_dists_CV))), max(unlist(SV_dists_CV))),
log = "x", cex = 0)
palette <- colorRampPalette(c(rgb(1, 0, 0, 0.2), rgb(0, 0, 1, 0.2)), alpha = TRUE)(nrow(LPT_unrelated_perf_CV))
for (i in 1:length(SV_dists_CV)) {
rank_dist <- which(order(LPT_unrelated_perf_CV$ratio) == i)
rank_dist_ratio <- (rank_dist - 1)/(nrow(LPT_unrelated_perf_CV) - 1)
plot.ecdf(SV_dists_CV[[i]], add = TRUE, cex = 2 * (1 - (1 - rank_dist_ratio) *
rank_dist_ratio * 4), col = palette[rank_dist])
}
Clearly, we can see that the worst schedules are obtained when all the SV are identical and that HLPT performs the best when there is some dispersion. However, the degree of dispersion does not indicate at which point HLPT is good. We may focus on several specific distributions:
In each case, the distribution must contain random values to avoid any structure in the matrix. Another hypothesis is that only the first few value of the SV are important. For that, we could study the case where we control the first k SV with k in [1;5].
Let's perform the same analysis for the TMA:
LPT_unrelated_perf_TMA <- NULL
SV_dists_TMA <- NULL
SV_dists_TMA_inv <- NULL
for (i in 1:100) {
TMA <- 10^runif(1, -2, 0)
width <- min(1 - TMA, TMA)
SV <- c(1, sort(runif(50 - 1, min = TMA - width, max = TMA + width), decreasing = TRUE))
Z <- generate_matrix_sv(SV, 200, 50)
Z <- 1/Z
cmax <- LPT_unrelated(Z, func = min)
LPT_unrelated_perf_TMA <- rbind(LPT_unrelated_perf_TMA, data.frame(TMA = TMA,
mean_cor_row = mean_cor_row(Z), mean_cor_col = mean_cor_col(Z), TMA1 = TMA1(Z),
TMA2 = TMA2(Z), CVSV = CV_SV(Z), ratio = max(1, cmax/best_cmax(Z))))
SV_dists_TMA <- c(SV_dists_TMA, list(svd(Z)$d))
SV_dists_TMA_inv <- c(SV_dists_TMA_inv, list(svd(1/Z)$d))
}
And the figure:
plot.ecdf(SV_dists_TMA[[1]], xlim = c(min(unlist(SV_dists_TMA)), max(unlist(SV_dists_TMA))),
log = "x", cex = 0)
palette <- colorRampPalette(c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), alpha = TRUE)(nrow(LPT_unrelated_perf_TMA))
for (i in 1:length(SV_dists_TMA)) {
rank_dist <- which(order(LPT_unrelated_perf_CV$ratio) == i)
rank_dist_ratio <- (rank_dist - 1)/(nrow(LPT_unrelated_perf_TMA) - 1)
plot.ecdf(SV_dists_TMA[[i]], add = TRUE, cex = 2 * (1 - sqrt((1 - rank_dist_ratio) *
rank_dist_ratio * 4)), col = palette[rank_dist])
}
We do not see a clear structure. Let's see the distribution of the SV of the inverse matrix as it is more closely related to the TMA:
plot.ecdf(SV_dists_TMA_inv[[1]], xlim = c(min(unlist(SV_dists_TMA_inv)), max(unlist(SV_dists_TMA_inv))),
log = "x", cex = 0)
palette <- colorRampPalette(c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), alpha = TRUE)(nrow(LPT_unrelated_perf_TMA))
for (i in 1:length(SV_dists_TMA_inv)) {
rank_dist <- order(LPT_unrelated_perf_TMA$ratio)[i]
rank_dist_ratio <- (rank_dist - 1)/(nrow(LPT_unrelated_perf_TMA) - 1)
plot.ecdf(SV_dists_TMA_inv[[i]], add = TRUE, cex = 2 * (1 - sqrt((1 - rank_dist_ratio) *
rank_dist_ratio * 4)), col = palette[rank_dist])
}
It is again hard to see any tendency.
Let's generate matrices with SV distributed according to a uniform one between k and 1 (with k in [0;1]).
LPT_unrelated_perf_unif <- NULL
SV_dists_unif <- NULL
for (i in 1:100) {
k <- runif(1, 0, 1)
SV <- sort(runif(50, min = k, max = 1), decreasing = TRUE)
Z <- generate_matrix_sv(SV, 200, 50)
cmax <- LPT_unrelated(Z, func = min)
LPT_unrelated_perf_unif <- rbind(LPT_unrelated_perf_unif, data.frame(k = k,
mean_cor_row = mean_cor_row(Z), mean_cor_col = mean_cor_col(Z), TMA1 = TMA1(Z),
TMA2 = TMA2(Z), CVSV = CV_SV(Z), ratio = max(1, cmax/best_cmax(Z))))
SV_dists_unif <- c(SV_dists_unif, list(svd(Z)$d))
}
Let's plot this:
plot.ecdf(SV_dists_unif[[1]], xlim = c(max(1e-10, min(unlist(SV_dists_unif))),
max(unlist(SV_dists_unif))), cex = 0)
palette <- colorRampPalette(c(rgb(1, 0, 0, 0.2), rgb(0, 0, 1, 0.2)), alpha = TRUE)(nrow(LPT_unrelated_perf_unif))
for (i in 1:length(SV_dists_unif)) {
rank_dist <- which(order(LPT_unrelated_perf_unif$ratio) == i)
rank_dist_ratio <- (rank_dist - 1)/(nrow(LPT_unrelated_perf_unif) - 1)
plot.ecdf(SV_dists_unif[[i]], add = TRUE, cex = 2 * (1 - (1 - rank_dist_ratio) *
rank_dist_ratio * 4), col = palette[rank_dist])
}
There are some issues with the first value that may impact the entire performance.
summary(sapply(1:length(SV_dists_unif), function(i) {
SV_dists_unif[[i]][1]/SV_dists_unif[[i]][2]
}))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.14 1.43 1.86 2.23 4.53
This is wrong, the maximum value should be 2. Let's avoid those cases:
plot.ecdf(SV_dists_unif[[1]], xlim = c(0.9, 1), ylim = c(0, 1), cex = 0)
palette <- colorRampPalette(c(rgb(1, 0, 0, 0.2), rgb(0, 0, 1, 0.2)), alpha = TRUE)(nrow(LPT_unrelated_perf_unif))
for (i in 1:length(SV_dists_unif)) if (SV_dists_unif[[i]][1] < (1.1 * SV_dists_unif[[i]][2]/SV_dists_unif[[i]][3]) *
SV_dists_unif[[i]][2]) {
rank_dist <- which(order(LPT_unrelated_perf_unif$ratio) == i)
rank_dist_ratio <- (rank_dist - 1)/(nrow(LPT_unrelated_perf_unif) - 1)
plot.ecdf(SV_dists_unif[[i]], add = TRUE, cex = 2 * (1 - (1 - rank_dist_ratio) *
rank_dist_ratio * 4), col = palette[rank_dist])
}
This removes most of the extreme cases. Let's see if the ratio between the first SV and the second has any relation with the performance:
LPT_unrelated_perf_unif <- cbind(LPT_unrelated_perf_unif, SV_ratio = sapply(1:length(SV_dists_unif),
function(i) {
SV_dists_unif[[i]][1]/SV_dists_unif[[i]][2]
}))
plot(LPT_unrelated_perf_unif$SV_ratio, LPT_unrelated_perf_unif$ratio)
summary(lm(ratio ~ SV_ratio, LPT_unrelated_perf_unif))$adj.r.squared
## [1] 0.0344
There is no relation.
Let's also consider an exponential distribution when the x-scale is logarithmic between k and 1 (with k in [1e-5;1]).
LPT_unrelated_perf_exp <- NULL
SV_dists_exp <- NULL
for (i in 1:100) {
k <- runif(1, -5, 0)
SV <- sort(10^runif(50, min = k, max = 1), decreasing = TRUE)
Z <- generate_matrix_sv(SV, 200, 50)
cmax <- LPT_unrelated(Z, func = min)
LPT_unrelated_perf_exp <- rbind(LPT_unrelated_perf_exp, data.frame(k = k,
mean_cor_row = mean_cor_row(Z), mean_cor_col = mean_cor_col(Z), TMA1 = TMA1(Z),
TMA2 = TMA2(Z), CVSV = CV_SV(Z), ratio = max(1, cmax/best_cmax(Z))))
SV_dists_exp <- c(SV_dists_exp, list(svd(Z)$d))
}
Let's plot this.
plot.ecdf(SV_dists_exp[[1]], xlim = c(max(1e-10, min(unlist(SV_dists_exp))),
max(unlist(SV_dists_exp))), log = "x", cex = 0)
palette <- colorRampPalette(c(rgb(1, 0, 0, 0.2), rgb(0, 0, 1, 0.2)), alpha = TRUE)(nrow(LPT_unrelated_perf_exp))
for (i in 1:length(SV_dists_exp)) {
rank_dist <- which(order(LPT_unrelated_perf_exp$ratio) == i)
rank_dist_ratio <- (rank_dist - 1)/(nrow(LPT_unrelated_perf_exp) - 1)
plot.ecdf(SV_dists_exp[[i]], add = TRUE, cex = 2 * (1 - (1 - rank_dist_ratio) *
rank_dist_ratio * 4), col = palette[rank_dist])
}
There is a clearer relation here: efficient situations (red) occur when the SV are low, whereas difficult instances (blue) are when the SV are the highest. Let's check the linear relation.
plot(LPT_unrelated_perf_exp$k, LPT_unrelated_perf_exp$ratio)
summary(lm(ratio ~ k, LPT_unrelated_perf_exp))$adj.r.squared
## [1] 0.4122
Not really significant.
Let's take a look to a last thing: the ratio between each successive SV.
SV_dists_CV_succ <- NULL
for (i in 1:length(SV_dists_CV)) SV_dists_CV_succ <- c(SV_dists_CV_succ, list(SV_dists_CV[[i]][-length(SV_dists_CV[[i]])]/SV_dists_CV[[i]][-1]))
SV_dists_TMA_succ <- NULL
for (i in 1:length(SV_dists_TMA)) SV_dists_TMA_succ <- c(SV_dists_TMA_succ,
list(SV_dists_TMA[[i]][-length(SV_dists_TMA[[i]])]/SV_dists_TMA[[i]][-1]))
SV_dists_unif_succ <- NULL
for (i in 1:length(SV_dists_unif)) SV_dists_unif_succ <- c(SV_dists_unif_succ,
list(SV_dists_unif[[i]][-length(SV_dists_unif[[i]])]/SV_dists_unif[[i]][-1]))
SV_dists_exp_succ <- NULL
for (i in 1:length(SV_dists_exp)) SV_dists_exp_succ <- c(SV_dists_exp_succ,
list(SV_dists_exp[[i]][-length(SV_dists_exp[[i]])]/SV_dists_exp[[i]][-1]))
Let's plot this for the first set of data:
par(mfrow = c(2, 2))
plot.ecdf(SV_dists_CV_succ[[1]], xlim = c(1, 5), log = "x", cex = 0)
palette <- colorRampPalette(c(rgb(1, 0, 0, 0.2), rgb(0, 0, 1, 0.2)), alpha = TRUE)(nrow(LPT_unrelated_perf_CV))
for (i in 1:length(SV_dists_CV_succ)) {
rank_dist <- which(order(LPT_unrelated_perf_CV$ratio) == i)
rank_dist_ratio <- (rank_dist - 1)/(nrow(LPT_unrelated_perf_CV) - 1)
plot.ecdf(SV_dists_CV_succ[[i]], add = TRUE, cex = 2 * (1 - (1 - rank_dist_ratio) *
rank_dist_ratio * 4), col = palette[rank_dist])
}
plot.ecdf(SV_dists_TMA_succ[[1]], xlim = c(1, 1.1), log = "x", cex = 0)
palette <- colorRampPalette(c(rgb(1, 0, 0, 0.2), rgb(0, 0, 1, 0.2)), alpha = TRUE)(nrow(LPT_unrelated_perf_TMA))
for (i in 1:length(SV_dists_TMA_succ)) {
rank_dist <- which(order(LPT_unrelated_perf_TMA$ratio) == i)
rank_dist_ratio <- (rank_dist - 1)/(nrow(LPT_unrelated_perf_TMA) - 1)
plot.ecdf(SV_dists_TMA_succ[[i]], add = TRUE, cex = 2 * (1 - (1 - rank_dist_ratio) *
rank_dist_ratio * 4), col = palette[rank_dist])
}
plot.ecdf(SV_dists_unif_succ[[1]], xlim = c(1, 1.1), log = "x", cex = 0)
palette <- colorRampPalette(c(rgb(1, 0, 0, 0.2), rgb(0, 0, 1, 0.2)), alpha = TRUE)(nrow(LPT_unrelated_perf_unif))
for (i in 1:length(SV_dists_unif_succ)) {
rank_dist <- which(order(LPT_unrelated_perf_unif$ratio) == i)
rank_dist_ratio <- (rank_dist - 1)/(nrow(LPT_unrelated_perf_unif) - 1)
plot.ecdf(SV_dists_unif_succ[[i]], add = TRUE, cex = 2 * (1 - (1 - rank_dist_ratio) *
rank_dist_ratio * 4), col = palette[rank_dist])
}
plot.ecdf(SV_dists_exp_succ[[1]], xlim = c(1, 1.5), log = "x", cex = 0)
palette <- colorRampPalette(c(rgb(1, 0, 0, 0.2), rgb(0, 0, 1, 0.2)), alpha = TRUE)(nrow(LPT_unrelated_perf_exp))
for (i in 1:length(SV_dists_exp_succ)) {
rank_dist <- which(order(LPT_unrelated_perf_exp$ratio) == i)
rank_dist_ratio <- (rank_dist - 1)/(nrow(LPT_unrelated_perf_exp) - 1)
plot.ecdf(SV_dists_exp_succ[[i]], add = TRUE, cex = 2 * (1 - (1 - rank_dist_ratio) *
rank_dist_ratio * 4), col = palette[rank_dist])
}
There is only a strong relation in the first case (and a bit in the last one).
We inspected the distribution of the SV with 4 different generation method and did not find a specific properties that would accuratly predict the performance of HLPT. We have no choice than to include the following measures in the planned analysis that will account for all possible properties:
We may also consider the SV of the inverted matrix, or the inverse SV.
In tems of generation, we have four global methods. We do not have any intuition on the respective qualities of each of them.