library(lobstr) # to help understand how objects are structured
library(dplyr)
library(microbenchmark)
R reading group notes
This was about the names and values chapter in Advanced R (2nd ed). It’s mainly about understanding how objects are named in R, and what the implications are for ordinary R practitioners.
The first point is about names. We usually think about assignment as making an object called `x`. But it’s definitely better to think about these separately - first creating an object and then binding it to a name. That means that names have objects, rather than objects having names.
x <- c(1, 2, 3) # create an object and bind the name x to it
obj_addr(x) # location in memory
[1] "0x26c74139378"
y <- x # bind an additional name to the object
obj_addr(x) == obj_addr(y) # it's just one object with two names
[1] TRUE
This applies to objects in general, including function definitions:
obj_addr(mean)
[1] "0x26c6f61eab0"
steve <- mean # bind a second name to the existing function object
obj_addr(steve)
[1] "0x26c6f61eab0"
We only create a new object when we modify the object through one of its names:
y[3] <- 9 # modifying through y
obj_addr(x) == obj_addr(y) # different objects now
[1] FALSE
There are a couple of important exceptions to this general principle. First, lists have an extra step, in that they refer to references, rather than to objects directly:
l1 <- list(1, 2, 3)
l2 <- l1 # second name bound to the same list
obj_addr(l1)
[1] "0x26c75262678"
obj_addr(l2)
[1] "0x26c75262678"
l2[[3]] <- 99 # replace only the third element of l2
obj_addr(l1)
[1] "0x26c75262678"
obj_addr(l2)
[1] "0x26c75462e98"
ref(l1, l2)
█ [1:0x26c75262678] <list>
├─[2:0x26c74f7e938] <dbl>
├─[3:0x26c74f7e900] <dbl>
└─[4:0x26c74f7e8c8] <dbl>
█ [5:0x26c75462e98] <list>
├─[2:0x26c74f7e938]
├─[3:0x26c74f7e900]
└─[6:0x26c74f7e778] <dbl>
As tibbles (and other tabular data structures in R) are effectively lists, this is an explanation as to why row-wise operations are so slow compared to operations on columns. As tibbles are lists of columns, updating a column just makes a new reference. Changing a row, on the other hand, makes a whole new set of objects and references:
# Change one column of a copy of mtcars
mt_changed_col <- mtcars
mt_changed_col$hp <- mtcars$hp * 9

# Change one row of a copy of mtcars
mt_changed_row <- mtcars
mt_changed_row[1, ] <- mt_changed_row[1, ] * 9
ref(mtcars, mt_changed_col, mt_changed_row)
█ [1:0x26c741467c8] <df[,11]>
├─mpg = [2:0x26c703532c0] <dbl>
├─cyl = [3:0x26c70353ea0] <dbl>
├─disp = [4:0x26c703545c0] <dbl>
├─hp = [5:0x26c703533f0] <dbl>
├─drat = [6:0x26c70354820] <dbl>
├─wt = [7:0x26c70355400] <dbl>
├─qsec = [8:0x26c70355530] <dbl>
├─vs = [9:0x26c70354bb0] <dbl>
├─am = [10:0x26c70353060] <dbl>
├─gear = [11:0x26c70353650] <dbl>
└─carb = [12:0x26c70353780] <dbl>
█ [13:0x26c74146d48] <df[,11]>
├─mpg = [2:0x26c703532c0]
├─cyl = [3:0x26c70353ea0]
├─disp = [4:0x26c703545c0]
├─hp = [14:0x26c70353fd0] <dbl>
├─drat = [6:0x26c70354820]
├─wt = [7:0x26c70355400]
├─qsec = [8:0x26c70355530]
├─vs = [9:0x26c70354bb0]
├─am = [10:0x26c70353060]
├─gear = [11:0x26c70353650]
└─carb = [12:0x26c70353780]
█ [15:0x26c72e7d748] <df[,11]>
├─mpg = [16:0x26c70354100] <dbl>
├─cyl = [17:0x26c70354230] <dbl>
├─disp = [18:0x26c70354360] <dbl>
├─hp = [19:0x26c70354950] <dbl>
├─drat = [20:0x26c703546f0] <dbl>
├─wt = [21:0x26c70356700] <dbl>
├─qsec = [22:0x26c70355eb0] <dbl>
├─vs = [23:0x26c70355fe0] <dbl>
├─am = [24:0x26c6eed38b0] <dbl>
├─gear = [25:0x26c6eed4360] <dbl>
└─carb = [26:0x26c6eed3780] <dbl>
# Time a one-column update against a one-row update, each on a copy of mtcars
col_row <- microbenchmark(
  {
    mt_changed_col <- mtcars
    mt_changed_col["hp"] <- mtcars["hp"] * 9
  },
  {
    mt_changed_row <- mtcars
    mt_changed_row[1, ] <- mt_changed_row[1, ] * 9
  }
)
# Label each benchmarked expression, then average its timings.
# microbenchmark reports `time` in nanoseconds, so / 1000 gives microseconds.
col_row |>
  mutate(expr = case_when(
    stringr::str_detect(expr, "col") ~ "by col",
    TRUE ~ "by row"
  )) |>
  group_by(expr) |>
  summarise(`mean time (μs)` = mean(time) / 1000) |>
  knitr::kable() # roughly 3x faster to change the col than the row in this run
expr | mean time (μs) |
---|---|
by col | 472.550 |
by row | 1393.922 |
ggplot2::autoplot(col_row) # plot the timing distribution for each expression
We also looked briefly at alternative representation. The range operator is the best example of highly compact representations:
obj_size(1:1000000) # approx size?
680 B
obj_size(1:2) == obj_size(1:1000000) # the range operator only stores the first and last values
[1] TRUE
obj_size(seq(1, 1000000)) # seq will use the same alternative representation...
680 B
obj_size(seq(1, 1000000, 1.0)) # unless you ask it to make a sequence with non-1L steps
8.00 MB
`object.size` has lots of interesting implications for lists as it only describes the size of the references, rather than the underlying objects:
obj_size(rnorm(1e6)) # 8 mb
8.00 MB
# NOTE(review): mill holds the *result* of obj_size() (a small size value), not
# the 1e6-element vector itself, which is why the list and tibble built from it
# below are only a few hundred bytes. Use `mill <- rnorm(1e6)` if the intent is
# to show three references to one shared 8 MB vector - TODO confirm intent.
mill <- obj_size(rnorm(1e6))
obj_size(list(rnorm(1e6), rnorm(1e6), rnorm(1e6)))#??
24.00 MB
obj_size(list(mill, mill, mill))#??
368 B
obj_size(tibble(a = mill,
b = mill,
c = mill))
1.21 kB
All strings are held in a common area of memory called the global string pool. This gives rise to a lot of interesting size consequences for vectors with shared strings:
s1 <- c("the", "cat", "sat", "mat") # four distinct strings
s2 <- c("the", "the", "the", "the") # one string repeated four times
obj_size(s1)
304 B
obj_size(s2)
136 B
ref(s1, character = TRUE)
█ [1:0x26c77e629b8] <chr>
├─[2:0x26c6f69dd48] <string: "the">
├─[3:0x26c67a59378] <string: "cat">
├─[4:0x26c749bebe0] <string: "sat">
└─[5:0x26c6e575880] <string: "mat">
ref(s2, character = TRUE)
█ [1:0x26c77e61158] <chr>
├─[2:0x26c6f69dd48] <string: "the">
├─[2:0x26c6f69dd48]
├─[2:0x26c6f69dd48]
└─[2:0x26c6f69dd48]
obj_size(c(1,2,3,4)) # numeric vectors don't behave in the same way
80 B
obj_size(c(4,4,4,4))
80 B