library(knitr)
options(digits = 2)
ifelse can be made significantly faster without loss of backwards-compatibility by using anyNA and falling back to the original code whenever backwards-compatibility cannot be assured.
ifelse:old_ifelse <- function (test, yes, no)
{
if(is.atomic(test)) { # do not lose attributes
if (typeof(test) != "logical")
storage.mode(test) <- "logical"
## quick return for cases where 'new_ifelse(a, x, y)' is used
## instead of 'if (a) x else y'
if (length(test) == 1 && is.null(attributes(test))) {
if (is.na(test)) return(NA)
else if (test) {
if (length(yes) == 1) {
yat <- attributes(yes)
if (is.null(yat) || (is.function(yes) &&
identical(names(yat), "srcref")))
return(yes)
}
}
else if (length(no) == 1) {
nat <- attributes(no)
if (is.null(nat) || (is.function(no) &&
identical(names(nat), "srcref")))
return(no)
}
}
}
else ## typically a "class"; storage.mode<-() typically fails
test <- if(isS4(test)) methods::as(test, "logical") else as.logical(test)
ans <- test
ok <- !is.na(test)
if (any(test[ok]))
ans[test & ok] <- rep(yes, length.out = length(ans))[test & ok]
if (any(!test[ok]))
ans[!test & ok] <- rep(no, length.out = length(ans))[!test & ok]
ans
}
The proposal is to change ifelse to take advantage of the speed of anyNA(), use is.na only once after (and only if needed), and to take advantage of the predictable nature of ifelse when the lengths of test, yes, and no are equal or length-one.
I also correct an apparent bug in ifelse where the attributes of test are not saved if isS4(test).
.new_ifelse <- function(test, yes, no) {
ans <- test
ok <- !is.na(test)
if (any(test[ok]))
ans[test & ok] <- rep(yes, length.out = length(ans))[test & ok]
if (any(!test[ok]))
ans[!test & ok] <- rep(no, length.out = length(ans))[!test & ok]
ans
}
new_ifelse <- function (test, yes, no)
{
attributes_of_test <- attributes(test)
if(is.atomic(test)) { # do not lose attributes
if (typeof(test) != "logical")
storage.mode(test) <- "logical"
## quick return for cases where 'new_ifelse(a, x, y)' is used
## instead of 'if (a) x else y'
if (length(test) == 1 && is.null(attributes_of_test)) {
if (is.na(test)) return(NA)
else if (test) {
if (length(yes) == 1) {
yat <- attributes(yes)
if (is.null(yat) || (is.function(yes) &&
identical(names(yat), "srcref")))
return(yes)
}
}
else if (length(no) == 1) {
nat <- attributes(no)
if (is.null(nat) || (is.function(no) &&
identical(names(nat), "srcref")))
return(no)
}
}
}
else ## typically a "class"; storage.mode<-() typically fails
test <- if(isS4(test)) methods::as(test, "logical") else as.logical(test)
# Give up attempting backwards-compatibility under these conditions:
if (typeof(yes) %in% c("logical", "integer", "double", "character") &&
typeof(no) %in% c("logical", "integer", "double", "character") &&
!is.factor(yes) &&
!is.factor(no) &&
length(no) != 0L &&
length(yes) != 0L) {
if (length(no) == length(test)) {
out <- no
} else if (length(no) == 1L) {
out <- rep_len(no, length(test))
} else {
return(.new_ifelse(test, yes, no))
}
if (length(yes) != 1L && length(yes) != length(test)) {
return(.new_ifelse(test, yes, no))
}
if (anyNA(test)) {
# no benefit to saving the na results
Yes <- which(test)
out[is.na(test)] <- NA
if (length(yes) == 1L) {
out[Yes] <- yes
} else if (length(yes) == length(test)) {
out[Yes] <- yes[Yes]
} else {
return(.new_ifelse(test, yes, no))
}
} else {
# No NAs to deal with
if (length(yes) == 1L) {
out[test] <- yes
} else if (length(yes) == length(test)) {
wtest <- which(test) # faster than test directly
out[wtest] <- yes[wtest]
} else {
return(.new_ifelse(test, yes, no))
}
}
if (!is.null(attributes_of_test)) {
attributes(out) <- attributes_of_test
}
out
} else {
ans <- test
ok <- !is.na(test)
if (any(test[ok]))
ans[test & ok] <- rep(yes, length.out = length(ans))[test & ok]
if (any(!test[ok]))
ans[!test & ok] <- rep(no, length.out = length(ans))[!test & ok]
ans
}
}
new_ifelse100 <- function (test, yes, no)
{
attributes_of_test <- attributes(test)
if(is.atomic(test)) { # do not lose attributes
if (typeof(test) != "logical")
storage.mode(test) <- "logical"
## quick return for cases where 'new_ifelse(a, x, y)' is used
## instead of 'if (a) x else y'
if (length(test) == 1 && is.null(attributes_of_test)) {
if (is.na(test)) return(NA)
else if (test) {
if (length(yes) == 1) {
yat <- attributes(yes)
if (is.null(yat) || (is.function(yes) &&
identical(names(yat), "srcref")))
return(yes)
}
}
else if (length(no) == 1) {
nat <- attributes(no)
if (is.null(nat) || (is.function(no) &&
identical(names(nat), "srcref")))
return(no)
}
}
}
else ## typically a "class"; storage.mode<-() typically fails
test <- if(isS4(test)) methods::as(test, "logical") else as.logical(test)
# Give up attempting backwards-compatibility under these conditions:
if (length(test) > 100L &&
typeof(yes) %in% c("logical", "integer", "double", "character") &&
typeof(no) %in% c("logical", "integer", "double", "character") &&
!is.factor(yes) &&
!is.factor(no) &&
length(no) != 0L &&
length(yes) != 0L) {
if (length(no) == length(test)) {
out <- no
} else if (length(no) == 1L) {
out <- rep_len(no, length(test))
} else {
return(.new_ifelse(test, yes, no))
}
if (length(yes) != 1L && length(yes) != length(test)) {
return(.new_ifelse(test, yes, no))
}
if (anyNA(test)) {
# no benefit to saving the na results
Yes <- which(test)
out[is.na(test)] <- NA
if (length(yes) == 1L) {
out[Yes] <- yes
} else if (length(yes) == length(test)) {
out[Yes] <- yes[Yes]
} else {
return(.new_ifelse(test, yes, no))
}
} else {
# No NAs to deal with
if (length(yes) == 1L) {
out[test] <- yes
} else if (length(yes) == length(test)) {
wtest <- which(test) # faster than test directly
out[wtest] <- yes[wtest]
} else {
return(.new_ifelse(test, yes, no))
}
}
if (!is.null(attributes_of_test)) {
attributes(out) <- attributes_of_test
}
out
} else {
ans <- test
ok <- !is.na(test)
if (any(test[ok]))
ans[test & ok] <- rep(yes, length.out = length(ans))[test & ok]
if (any(!test[ok]))
ans[!test & ok] <- rep(no, length.out = length(ans))[!test & ok]
ans
}
}
test_identicals <- function(test, yes, no) {
old <- old_ifelse(test, yes, no)
new <- new_ifelse(test, yes, no)
if (!identical(old, new)) {
# Will take a few minutes otherwise
if (length(old) + length(new) < 50) {
print(old)
print(new)
}
cat("Non-identical result\n")
return(invisible(list(old = old,
new = new)))
}
}
# Attributes?
test1 <- c(a = FALSE, b = FALSE, c = TRUE, d = TRUE, e = TRUE)
test_identicals(test1, 1:4 + 0.0, 3)
yes1 <- 1:4 + 0
names(yes1) <- "srcref"
test_identicals(test1, yes1, 2:3)
test2 <- sample(test1, size = 1e7, replace = TRUE)
test_identicals(test2, 1:4 + 0.0, 3)
test2[5e5] <- NA
test_identicals(test2, 1:4 + 0.0, 3)
As aformentioned, S4 classes did not retain attributes:
if (requireNamespace("Matrix", quietly = TRUE)) {
library(Matrix)
M <- Matrix(-10 + 1:28, 4, 7)
test_identicals(M, 1, 2)
}
## Non-identical result
These examples simply fall back to the defaults.
test_identicals(c(TRUE, TRUE), "a", raw(0x95))
test_identicals(c(TRUE, TRUE), "a", factor("a"))
test_identicals(c(FALSE, FALSE), "a", factor("a"))
test_identicals(c(FALSE, TRUE), 1 , factor("a"))
test_identicals(c(FALSE, TRUE, NA), 1, factor("a"))
library(data.table)
library(ggplot2)
library(microbenchmark)
library(scales)
get_avg_time <- function(n, yes1 = TRUE, include_NA = FALSE) {
test <- sample(c(TRUE, FALSE, if (include_NA) NA), size = n, replace = TRUE)
if (yes1) {
yes <- -1L
} else {
yes <- sample.int(n)
}
no <- 1L
old <- as.data.table(microbenchmark(old_ifelse(test, yes, no)), times = if (n < 1e3) 100 else 10)
new <- as.data.table(microbenchmark(new_ifelse(test, yes, no)), times = if (n < 1e3) 100 else 10)
new100 <- as.data.table(microbenchmark(new_ifelse100(test, yes, no)), times = if (n < 1e3) 100 else 10)
nmed <- new100[, median(time)]
omed <- old[, median(time)]
list(old[, median(time)],
new[, median(time)],
new100[, median(time)],
prop_faster = abs(new100[, mean(time < omed)] - old[, mean(time < nmed)]))
}
DT <-
CJ(n = c(1:10, 10*(2:10), 50*(3:10), 750, 1000*(1:10), 10e3, 50e3, 100e3),
yes1 = c(TRUE, FALSE),
include_NA = c(TRUE, FALSE))
DT[, I := .I]
DT[, c("old", "new", "new100", "prop_faster") := get_avg_time(n, yes1, include_NA), by = "I"]
DT[, f1 := "without NAs"]
DT[(include_NA), f1 := "with NAs"]
DT[, f2 := "yes length-n"]
DT[(yes1), f2 := "yes length-1"]
ggplot(melt(DT, measure.vars = c("old", "new", "new100"))[, `time (ms)` := value / 1e6],
aes(x = n, y = `time (ms)`, color = variable, alpha = prop_faster)) +
geom_line(size = 1.1) +
facet_wrap(~ f2 + f1) +
scale_x_log10(labels = scales::comma, breaks = c(1, 10, 100, 1000, 100e3)) +
scale_y_log10(labels = scales::comma)