library(lobstr) # to help understand how objects are structured
library(dplyr)
library(microbenchmark)
R reading group notes
This was about the names and values chapter in Advanced R (2nd ed). It’s mainly about understanding how objects are named in R, and what the implications are for ordinary R practitioners.
The first point is about names. We usually think about assignment as making an object called `x`. But it’s definitely better to think about these separately - first creating an object and then binding it to a name. That means that names have objects, rather than objects having names.
# Create a numeric vector and bind the name `x` to it, then look up where
# that object lives in memory.
x <- c(1, 2, 3) # create an object
obj_addr(x) # location in memory
[1] "0x14ba179f718"
# Binding a second name does not copy: both names report the same address.
y <- x # bind an additional name to the object
obj_addr(x) == obj_addr(y) # it's just one object with two names
[1] TRUE
This applies to objects in general, including function definitions:
obj_addr(mean)
[1] "0x14b9bba8068"
# Functions are objects too: `steve` is just another name for `mean`.
steve <- mean
obj_addr(steve)
[1] "0x14b9bba8068"
A new object is only created when we modify the value through one of its names (copy-on-modify):
# Modifying through `y` triggers a copy, so `x` and `y` now point at
# different objects.
y[3] <- 9
obj_addr(x) == obj_addr(y) # different objects now
[1] FALSE
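There are a couple of important exceptions to this general principle. First, lists add an extra level of indirection: a list stores references to its elements rather than the element values themselves, so modifying one element copies only the list and that element, while the untouched elements are still shared: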
# Two names bound to the same list object.
l1 <- list(1, 2, 3)
l2 <- l1

obj_addr(l1)
[1] "0x14ba0bb7358"
obj_addr(l2)
[1] "0x14ba0bb7358"
# Replace the third element via `l2`; `l1` must be left untouched.
l2[[3]] <- 99

obj_addr(l1)
[1] "0x14ba0bb7358"
obj_addr(l2)
[1] "0x14ba0da9948"
ref(l1, l2)
█ [1:0x14ba0bb7358] <list>
├─[2:0x14b9e987040] <dbl>
├─[3:0x14b9e987008] <dbl>
└─[4:0x14b9e986fd0] <dbl>
█ [5:0x14ba0da9948] <list>
├─[2:0x14b9e987040]
├─[3:0x14b9e987008]
└─[6:0x14b9e986e80] <dbl>
As tibbles (and other tabular data structures in R) are effectively lists, this explains why row-wise operations are so slow compared to operations on columns. Because tibbles are lists of columns, updating a column just creates one new column vector and a new reference to it, while all the other columns stay shared. Changing a row, on the other hand, touches every column, so a whole new set of objects and references is made:
# Column update: only the `hp` column is replaced.
mt_changed_col <- mtcars
mt_changed_col$hp <- mtcars$hp * 9

# Row update: the first row cuts across every column.
mt_changed_row <- mtcars
mt_changed_row[1, ] <- mt_changed_row[1, ] * 9

ref(mtcars, mt_changed_col, mt_changed_row)
█ [1:0x14ba216fca8] <df[,11]>
├─mpg = [2:0x14b980d5650] <dbl>
├─cyl = [3:0x14b980d5780] <dbl>
├─disp = [4:0x14b9bef4490] <dbl>
├─hp = [5:0x14b9bef38b0] <dbl>
├─drat = [6:0x14b9bef45c0] <dbl>
├─wt = [7:0x14b9bef4360] <dbl>
├─qsec = [8:0x14b9bef4f40] <dbl>
├─vs = [9:0x14b9bef46f0] <dbl>
├─am = [10:0x14b9bef4820] <dbl>
├─gear = [11:0x14b9bef4950] <dbl>
└─carb = [12:0x14b9bef3780] <dbl>
█ [13:0x14ba216f728] <df[,11]>
├─mpg = [2:0x14b980d5650]
├─cyl = [3:0x14b980d5780]
├─disp = [4:0x14b9bef4490]
├─hp = [14:0x14b9bef4a80] <dbl>
├─drat = [6:0x14b9bef45c0]
├─wt = [7:0x14b9bef4360]
├─qsec = [8:0x14b9bef4f40]
├─vs = [9:0x14b9bef46f0]
├─am = [10:0x14b9bef4820]
├─gear = [11:0x14b9bef4950]
└─carb = [12:0x14b9bef3780]
█ [15:0x14ba2362b18] <df[,11]>
├─mpg = [16:0x14b9bef39e0] <dbl>
├─cyl = [17:0x14b9bef5400] <dbl>
├─disp = [18:0x14b9bef3ea0] <dbl>
├─hp = [19:0x14b9bef4230] <dbl>
├─drat = [20:0x14b9bef3060] <dbl>
├─wt = [21:0x14b9bef4bb0] <dbl>
├─qsec = [22:0x14b9bef5070] <dbl>
├─vs = [23:0x14b9bef4ce0] <dbl>
├─am = [24:0x14b9bef4100] <dbl>
├─gear = [25:0x14b9bef33f0] <dbl>
└─carb = [26:0x14b9bef3520] <dbl>
# Time a column update against a row update on fresh copies of mtcars.
# Each braced expression is one benchmarked unit, so the copy of mtcars is
# included in both timings.
col_row <- microbenchmark(
  {
    mt_changed_col <- mtcars
    mt_changed_col["hp"] <- mtcars["hp"] * 9
  },
  {
    mt_changed_row <- mtcars
    mt_changed_row[1, ] <- mt_changed_row[1, ] * 9
  }
)
# Summarise the benchmark: label each expression by whether it mentions
# "col", then report the mean time per label. `time` is divided by 1000 to
# give the microseconds named in the column header.
col_row |>
  mutate(expr = case_when(
    stringr::str_detect(expr, "col") ~ "by col",
    TRUE ~ "by row"
  )) |>
  group_by(expr) |>
  summarise(`mean time (μs)` = mean(time) / 1000) |>
  knitr::kable() # roughly 3.5x faster to change the col than the row
expr | mean time (μs) |
---|---|
by col | 297.377 |
by row | 1051.283 |
# Plot the distribution of timings for the two benchmarked expressions.
ggplot2::autoplot(col_row)
We also looked briefly at alternative representations (ALTREP). The range operator `:` is the best example of a highly compact representation:
obj_size(1:1000000) # approx size?
680 B
obj_size(1:2) == obj_size(1:1000000) # the range operator only stores the first and last values
[1] TRUE
obj_size(seq(1, 1000000)) # seq will use the same alternative representation...
680 B
obj_size(seq(1, 1000000, 1.0)) # unless you ask it to make a sequence with non-1L steps
8.00 MB
`obj_size()` has lots of interesting implications for lists, as a list only stores references to its elements, so an object that appears several times in a list is only counted once:
obj_size(rnorm(1e6)) # 8 mb
8.00 MB
# NOTE(review): `mill` is bound to the *result* of obj_size() (a tiny size
# value), not to the 1e6-element vector. The list/tibble sizes reported
# below therefore measure three small scalars, not shared references to an
# 8 MB vector. To demonstrate reference sharing, bind the vector itself:
# mill <- rnorm(1e6)
mill <- obj_size(rnorm(1e6))

obj_size(list(rnorm(1e6), rnorm(1e6), rnorm(1e6)))#??
24.00 MB
obj_size(list(mill, mill, mill))#??
368 B
obj_size(tibble(a = mill,
b = mill,
c = mill))
1.21 kB
All strings are held in a shared area of memory called the global string pool. This gives rise to a lot of interesting size consequences for vectors with repeated strings:
# Four distinct strings versus the same string repeated four times.
s1 <- c("the", "cat", "sat", "mat")
s2 <- c("the", "the", "the", "the")

obj_size(s1)
304 B
obj_size(s2)
136 B
ref(s1, character = TRUE)
█ [1:0x14ba2d059b8] <chr>
├─[2:0x14b98f15d48] <string: "the">
├─[3:0x14b9425a378] <string: "cat">
├─[4:0x14ba1c13c48] <string: "sat">
└─[5:0x14b97784880] <string: "mat">
ref(s2, character = TRUE)
█ [1:0x14ba5444378] <chr>
├─[2:0x14b98f15d48] <string: "the">
├─[2:0x14b98f15d48]
├─[2:0x14b98f15d48]
└─[2:0x14b98f15d48]
obj_size(c(1,2,3,4)) # numeric vectors don't behave in the same way
80 B
obj_size(c(4,4,4,4))
80 B