library(lobstr) # to help understand how objects are structured
library(dplyr)
library(microbenchmark)

R reading group notes
This was about the names and values chapter in Advanced R (2nd ed). It’s mainly about understanding how objects are named in R, and what the implications are for ordinary R practitioners.
The first point is about names. We usually think about assignment as making an object called x. But it’s definitely better to think about these separately - first creating an object and then binding it to a name. That means that names have objects, rather than objects having names.
x <- c(1, 2, 3) # create an object
obj_addr(x) # location in memory[1] "0x55d54d041ad8"
y <- x # bind an additional name to the object
obj_addr(x) == obj_addr(y) # it's just one object with two names[1] TRUE
This applies to objects in general, including function definitions:
obj_addr(mean)[1] "0x55d548ce1e80"
steve <- mean
obj_addr(steve)[1] "0x55d548ce1e80"
We only create a new object when we modify the object through one of its names (copy-on-modify):
y[3] <- 9
obj_addr(x) == obj_addr(y) # different objects now[1] FALSE
There are a couple of important exceptions to this general principle. First, lists have an extra step, in that they store references to their elements, rather than storing the element objects directly:
l1 <- list(1, 2, 3)
l2 <- l1
obj_addr(l1)[1] "0x55d54d6cb4c8"
obj_addr(l2)[1] "0x55d54d6cb4c8"
l2[[3]] <- 99
obj_addr(l1)[1] "0x55d54d6cb4c8"
obj_addr(l2)[1] "0x55d54d6ca1d8"
ref(l1, l2)█ [1:0x55d54d6cb4c8] <list>
├─[2:0x55d54d3578c8] <dbl>
├─[3:0x55d54d357a88] <dbl>
└─[4:0x55d54d357c48] <dbl>
█ [5:0x55d54d6ca1d8] <list>
├─[2:0x55d54d3578c8]
├─[3:0x55d54d357a88]
└─[6:0x55d54c577d38] <dbl>
As tibbles (and other tabular data structures in R) are effectively lists, this explains why row-wise operations are so slow compared to operations on columns. As tibbles are lists of columns, updating a column just makes a new reference. Changing a row, on the other hand, makes a whole new set of objects and references:
mt_changed_col <- mtcars
mt_changed_col$hp <- mtcars$hp*9
mt_changed_row <- mtcars
mt_changed_row[1,] <- mt_changed_row[1,] * 9
ref(mtcars, mt_changed_col, mt_changed_row)█ [1:0x55d54d1fa288] <df[,11]>
├─mpg = [2:0x55d54a37f3a0] <dbl>
├─cyl = [3:0x55d549d497a0] <dbl>
├─disp = [4:0x55d54a6eeec0] <dbl>
├─hp = [5:0x55d54d485310] <dbl>
├─drat = [6:0x55d5490e9440] <dbl>
├─wt = [7:0x55d54d2faa60] <dbl>
├─qsec = [8:0x55d54d156150] <dbl>
├─vs = [9:0x55d54801a860] <dbl>
├─am = [10:0x55d549938a70] <dbl>
├─gear = [11:0x55d548a0c780] <dbl>
└─carb = [12:0x55d549225de0] <dbl>
█ [13:0x55d54d1fa758] <df[,11]>
├─mpg = [2:0x55d54a37f3a0]
├─cyl = [3:0x55d549d497a0]
├─disp = [4:0x55d54a6eeec0]
├─hp = [14:0x55d547abdd30] <dbl>
├─drat = [6:0x55d5490e9440]
├─wt = [7:0x55d54d2faa60]
├─qsec = [8:0x55d54d156150]
├─vs = [9:0x55d54801a860]
├─am = [10:0x55d549938a70]
├─gear = [11:0x55d548a0c780]
└─carb = [12:0x55d549225de0]
█ [15:0x55d54d1f3738] <df[,11]>
├─mpg = [16:0x55d54810bc40] <dbl>
├─cyl = [17:0x55d54d43f2a0] <dbl>
├─disp = [18:0x55d547dbd210] <dbl>
├─hp = [19:0x55d54aa47c20] <dbl>
├─drat = [20:0x55d54a411bf0] <dbl>
├─wt = [21:0x55d54743fd50] <dbl>
├─qsec = [22:0x55d54951e380] <dbl>
├─vs = [23:0x55d54d35b650] <dbl>
├─am = [24:0x55d548659380] <dbl>
├─gear = [25:0x55d549ef8e30] <dbl>
└─carb = [26:0x55d5481de630] <dbl>
col_row <- microbenchmark(
{mt_changed_col <- mtcars
mt_changed_col["hp"] <- mtcars["hp"]*9},
{mt_changed_row <- mtcars
mt_changed_row[1,] <- mt_changed_row[1,] * 9}
)
col_row |>
mutate(expr = case_when(stringr::str_detect(expr, "col") ~ "by col",
TRUE ~ "by row")) |>
group_by(expr) |>
summarise(`mean time (μs)` = mean(time)/1000) |>
knitr::kable() # about 4x faster to change the col than the row

| expr | mean time (μs) |
|---|---|
| by col | 200.3464 |
| by row | 820.9966 |
ggplot2::autoplot(col_row)
We also looked briefly at alternative representations (ALTREP). The range operator `:` is the best example of a highly compact representation:
obj_size(1:1000000) # approx size?680 B
obj_size(1:2) == obj_size(1:1000000) # the range operator only stores the first and last values[1] TRUE
obj_size(seq(1, 1000000)) # seq will use the same alternative representation...680 B
obj_size(seq(1, 1000000, 1.0)) # unless you ask it to make a sequence with non-1L steps 8.00 MB
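Object size has lots of interesting implications for lists, as a list only stores references to its elements rather than the underlying objects themselves. Note that `mill` below holds the size value returned by obj_size(), not the million-element vector itself, which is why the list and tibble built from it are so small: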
obj_size(rnorm(1e6)) # 8 mb8.00 MB
mill <- obj_size(rnorm(1e6))
obj_size(list(rnorm(1e6), rnorm(1e6), rnorm(1e6)))#??24.00 MB
obj_size(list(mill, mill, mill))#??368 B
obj_size(tibble(a = mill,
b = mill,
c = mill))1.21 kB
All strings are held in a common area of memory called the global string pool. This gives rise to a lot of interesting size consequences for vectors with shared strings:
s1 <- c("the", "cat", "sat", "mat")
s2 <- c("the", "the", "the", "the")
obj_size(s1)304 B
obj_size(s2)136 B
ref(s1, character = TRUE)█ [1:0x55d54faa3bb8] <chr>
├─[2:0x55d548d1c900] <string: "the">
├─[3:0x55d547033258] <string: "cat">
├─[4:0x55d54c0fb8e8] <string: "sat">
└─[5:0x55d547e32f40] <string: "mat">
ref(s2, character = TRUE)█ [1:0x55d54fab0678] <chr>
├─[2:0x55d548d1c900] <string: "the">
├─[2:0x55d548d1c900]
├─[2:0x55d548d1c900]
└─[2:0x55d548d1c900]
obj_size(c(1,2,3,4)) # numeric vectors don't behave in the same way80 B
obj_size(c(4,4,4,4))80 B