library(lobstr) # to help understand how objects are structured
library(dplyr)
library(microbenchmark)

R reading group notes
This was about the names and values chapter in Advanced R (2nd ed). It’s mainly about understanding how objects are named in R, and what the implications are for ordinary R practitioners.
The first point is about names. We usually think about assignment as making an object called x. But it’s definitely better to think about these separately - first creating an object and then binding it to a name. That means that names have objects, rather than objects having names.
x <- c(1, 2, 3) # create an object
obj_addr(x) # location in memory[1] "0x5634b4562fd8"
y <- x # bind an additional name to the object
obj_addr(x) == obj_addr(y) # it's just one object with two names[1] TRUE
This applies to objects in general, including function definitions:
obj_addr(mean)[1] "0x5634b01cef68"
steve <- mean
obj_addr(steve)[1] "0x5634b01cef68"
A new object is only created when we modify the object through one of its names (copy-on-modify):
y[3] <- 9
obj_addr(x) == obj_addr(y) # different objects now[1] FALSE
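There are a couple of important exceptions to this general principle. First, lists involve an extra step: rather than storing values directly, a list stores references to its elements, so modifying one element only replaces that one reference and the untouched elements remain shared: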
l1 <- list(1, 2, 3)
l2 <- l1
obj_addr(l1)[1] "0x5634b454a688"
obj_addr(l2)[1] "0x5634b454a688"
l2[[3]] <- 99
obj_addr(l1)[1] "0x5634b454a688"
obj_addr(l2)[1] "0x5634b4282518"
ref(l1, l2)█ [1:0x5634b454a688] <list>
├─[2:0x5634b3cac0e0] <dbl>
├─[3:0x5634b3cd34f0] <dbl>
└─[4:0x5634b3cd2fb0] <dbl>
█ [5:0x5634b4282518] <list>
├─[2:0x5634b3cac0e0]
├─[3:0x5634b3cd34f0]
└─[6:0x5634b3d7a7e0] <dbl>
As tibbles (and other tabular data structures in R) are effectively lists, this explains why row-wise operations are so slow compared to operations on columns. As tibbles are lists of columns, updating a column just makes a new reference to the changed column, while the other columns are still shared. Changing a row, on the other hand, touches every column, and so makes a whole new set of objects and references:
mt_changed_col <- mtcars
mt_changed_col$hp <- mtcars$hp*9
mt_changed_row <- mtcars
mt_changed_row[1,] <- mt_changed_row[1,] * 9
ref(mtcars, mt_changed_col, mt_changed_row)█ [1:0x5634b46c1478] <df[,11]>
├─mpg = [2:0x5634b47fb510] <dbl>
├─cyl = [3:0x5634b0852240] <dbl>
├─disp = [4:0x5634ae1850c0] <dbl>
├─hp = [5:0x5634ae30e1a0] <dbl>
├─drat = [6:0x5634b44bfe50] <dbl>
├─wt = [7:0x5634aefeb5c0] <dbl>
├─qsec = [8:0x5634b0cf3e70] <dbl>
├─vs = [9:0x5634b467d710] <dbl>
├─am = [10:0x5634b02e8bd0] <dbl>
├─gear = [11:0x5634b0900b70] <dbl>
└─carb = [12:0x5634afa1fc50] <dbl>
█ [13:0x5634b46c2028] <df[,11]>
├─mpg = [2:0x5634b47fb510]
├─cyl = [3:0x5634b0852240]
├─disp = [4:0x5634ae1850c0]
├─hp = [14:0x5634b12e9510] <dbl>
├─drat = [6:0x5634b44bfe50]
├─wt = [7:0x5634aefeb5c0]
├─qsec = [8:0x5634b0cf3e70]
├─vs = [9:0x5634b467d710]
├─am = [10:0x5634b02e8bd0]
├─gear = [11:0x5634b0900b70]
└─carb = [12:0x5634afa1fc50]
█ [15:0x5634b423a048] <df[,11]>
├─mpg = [16:0x5634b17c1550] <dbl>
├─cyl = [17:0x5634af159250] <dbl>
├─disp = [18:0x5634b17501f0] <dbl>
├─hp = [19:0x5634b14c2180] <dbl>
├─drat = [20:0x5634b4581870] <dbl>
├─wt = [21:0x5634b0c15420] <dbl>
├─qsec = [22:0x5634b438f4b0] <dbl>
├─vs = [23:0x5634b47fb890] <dbl>
├─am = [24:0x5634b1115a30] <dbl>
├─gear = [25:0x5634b18d5010] <dbl>
└─carb = [26:0x5634b0e5a6d0] <dbl>
col_row <- microbenchmark(
{mt_changed_col <- mtcars
mt_changed_col["hp"] <- mtcars["hp"]*9},
{mt_changed_row <- mtcars
mt_changed_row[1,] <- mt_changed_row[1,] * 9}
)
col_row |>
mutate(expr = case_when(stringr::str_detect(expr, "col") ~ "by col",
TRUE ~ "by row")) |>
group_by(expr) |>
summarise(`mean time (μs)` = mean(time)/1000) |>
knitr::kable() # about 4x faster to change the col than the row

| expr | mean time (μs) |
|---|---|
| by col | 161.9112 |
| by row | 688.6733 |
ggplot2::autoplot(col_row)
We also looked briefly at alternative representations (ALTREP). The range operator (`:`) is the best example of a highly compact representation:
obj_size(1:1000000) # approx size?680 B
obj_size(1:2) == obj_size(1:1000000) # the range operator only stores the first and last values[1] TRUE
obj_size(seq(1, 1000000)) # seq will use the same alternative representation...680 B
obj_size(seq(1, 1000000, 1.0)) # unless you ask it to make a sequence with non-1L steps 8.00 MB
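Object size has lots of interesting implications for lists. A list stores references to its elements, so obj_size() counts an object that is referenced several times only once (base object.size(), by contrast, does not account for shared elements). Note that in the code below `mill` holds the result of obj_size() (a small size value), not the million-element vector itself, so the last two calls measure containers of three small size values. Binding the vector itself (`mill <- rnorm(1e6)`) would show the sharing, with the list coming out at roughly 8 MB rather than 24 MB: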
obj_size(rnorm(1e6)) # 8 mb8.00 MB
mill <- obj_size(rnorm(1e6))
obj_size(list(rnorm(1e6), rnorm(1e6), rnorm(1e6)))#??24.00 MB
obj_size(list(mill, mill, mill))#??368 B
obj_size(tibble(a = mill,
b = mill,
c = mill))1.21 kB
All strings are held in a shared area of memory called the global string pool: a character vector stores pointers to unique strings in that pool. This gives rise to a lot of interesting size consequences for vectors with repeated strings:
s1 <- c("the", "cat", "sat", "mat")
s2 <- c("the", "the", "the", "the")
obj_size(s1)304 B
obj_size(s2)136 B
ref(s1, character = TRUE)█ [1:0x5634b6aa5608] <chr>
├─[2:0x5634afd301c0] <string: "the">
├─[3:0x5634ae045258] <string: "cat">
├─[4:0x5634b31d0b88] <string: "sat">
└─[5:0x5634aee46800] <string: "mat">
ref(s2, character = TRUE)█ [1:0x5634b6ab1f88] <chr>
├─[2:0x5634afd301c0] <string: "the">
├─[2:0x5634afd301c0]
├─[2:0x5634afd301c0]
└─[2:0x5634afd301c0]
obj_size(c(1,2,3,4)) # numeric vectors don't behave in the same way80 B
obj_size(c(4,4,4,4))80 B