We ❤️ janitor

R
intermediate
Authors
Affiliations

Susanna Kirk

NHS Fife

Brendan Clarke

NHS Education for Scotland

Published

May 24, 2024

No feedback found for this session

Headline

  • Janitor is a toolkit for tidying data
  • both input-focused and output-focused
  • excellent balance between diligence and laziness

Setup

library(janitor)
library(readxl)
library(dplyr)

test_df <- as.data.frame(matrix(ncol = 6))
names(test_df) <- c("firstName", "ábc@!*", "% successful (2009)",
                    "REPEAT VALUE", "REPEAT VALUE", "")

test_df |>
  clean_names(case = "upper_lower") |>
  select(FIRSTname)
  FIRSTname
1        NA

clean_names

clean_names allows you to convert ugly column names into standardised, syntactic names. You can also specfiy the case, or bulk-tweak the names.

clean_names(test_df)
  first_name abc percent_successful_2009 repeat_value repeat_value_2  x
1         NA  NA                      NA           NA             NA NA
clean_names(test_df, case = "screaming_snake")
  FIRST_NAME ABC PERCENT_SUCCESSFUL_2009 REPEAT_VALUE REPEAT_VALUE_2  X
1         NA  NA                      NA           NA             NA NA
clean_names(test_df, case = "big_camel")
  FirstName Abc PercentSuccessful2009 RepeatValue RepeatValue_2  X
1        NA  NA                    NA          NA            NA NA
test_df %>%
  clean_names(prefix = "a_prefix")
  a_prefixfirst_name a_prefixabc a_prefixpercent_successful_2009
1                 NA          NA                              NA
  a_prefixrepeat_value a_prefixrepeat_value_2 a_prefixx
1                   NA                     NA        NA

compare_df_cols

library(palmerpenguins)

compare_df_cols(penguins,penguins_raw)
           column_name penguins penguins_raw
1        bill_depth_mm  numeric         <NA>
2       bill_length_mm  numeric         <NA>
3        Body Mass (g)     <NA>      numeric
4          body_mass_g  integer         <NA>
5    Clutch Completion     <NA>    character
6             Comments     <NA>    character
7    Culmen Depth (mm)     <NA>      numeric
8   Culmen Length (mm)     <NA>      numeric
9             Date Egg     <NA>         Date
10   Delta 13 C (o/oo)     <NA>      numeric
11   Delta 15 N (o/oo)     <NA>      numeric
12 Flipper Length (mm)     <NA>      numeric
13   flipper_length_mm  integer         <NA>
14       Individual ID     <NA>    character
15              island   factor         <NA>
16              Island     <NA>    character
17              Region     <NA>    character
18       Sample Number     <NA>      numeric
19                 sex   factor         <NA>
20                 Sex     <NA>    character
21             species   factor         <NA>
22             Species     <NA>    character
23               Stage     <NA>    character
24           studyName     <NA>    character
25                year  integer         <NA>

warnings

Data1 <- readxl::read_excel("data/JanitorDataExample.xlsx", .name_repair = make_clean_names)

# warnings switched off, otherwise we'd have literally hundreds of warnings here!

make_clean_names(c("1thing", "some opther thign"))
[1] "x1thing"           "some_opther_thign"
glimpse(Data1)
Rows: 503
Columns: 7
$ id               <dbl> 100507, 104088, 105279, 105459, 108894, 109204, 10962…
$ gender           <chr> "Male", "Male", "Female", "Male", "Male", "Male", "Ma…
$ date_of_event_1  <dbl> 44572, 44216, 44172, 44496, 44973, 45099, 44350, 4466…
$ date_of_event_2  <dbl> 44535, 44089, 45239, 43872, 45239, 44236, 44981, 4386…
$ location         <chr> "Hospital", "Hospital", "Hospital", "Home", "Home", "…
$ diagnosis_code_1 <chr> "T848", "G992", "J81X", "R55X", "J459", "K851", "J441…
$ attendances      <dbl> 3, 2, 4, 2, 2, 4, 2, 3, 1, 3, 3, 2, 3, 4, 3, 4, 4, 4,…

clean_names again

Data1_clean <- Data1 |>
  clean_names() |>
  mutate(across(contains("date"),
         excel_numeric_to_date)) |>
  mutate(Year = lubridate::year(date_of_event_1))

Data1_clean
# A tibble: 503 × 8
       id gender date_of_event_1 date_of_event_2 location diagnosis_code_1
    <dbl> <chr>  <date>          <date>          <chr>    <chr>           
 1 100507 Male   2022-01-11      2021-12-05      Hospital T848            
 2 104088 Male   2021-01-20      2020-09-15      Hospital G992            
 3 105279 Female 2020-12-07      2023-11-09      Hospital J81X            
 4 105459 Male   2021-10-27      2020-02-11      Home     R55X            
 5 108894 Male   2023-02-16      2023-11-09      Home     J459            
 6 109204 Male   2023-06-22      2021-02-09      Home     K851            
 7 109624 Male   2021-06-03      2023-02-24      Home     J441            
 8 111454 Male   2022-04-11      2020-02-07      Home     E162            
 9 113892 Male   2020-04-26      2023-03-06      Home     I279            
10 114110 Male   2021-04-04      2021-05-06      Hospital M798            
# ℹ 493 more rows
# ℹ 2 more variables: attendances <dbl>, Year <dbl>

get_dupes

Data1_clean |>
  get_dupes(contains("date"))
# A tibble: 6 × 9
  date_of_event_1 date_of_event_2 dupe_count     id gender location
  <date>          <date>               <int>  <dbl> <chr>  <chr>   
1 2021-10-09      2021-08-20               2 400760 Female Hospital
2 2021-10-09      2021-08-20               2 400760 Female Hospital
3 2022-04-04      2021-11-08               2 450284 Female Hospital
4 2022-04-04      2021-11-08               2 450284 Female Hospital
5 2023-06-29      2020-03-23               2 823894 Male   Home    
6 2023-06-29      2020-03-23               2 823894 Male   Home    
# ℹ 3 more variables: diagnosis_code_1 <chr>, attendances <dbl>, Year <dbl>

distinct

Data1_clean_distinct <- distinct(Data1_clean)

Data1_clean_distinct
# A tibble: 500 × 8
       id gender date_of_event_1 date_of_event_2 location diagnosis_code_1
    <dbl> <chr>  <date>          <date>          <chr>    <chr>           
 1 100507 Male   2022-01-11      2021-12-05      Hospital T848            
 2 104088 Male   2021-01-20      2020-09-15      Hospital G992            
 3 105279 Female 2020-12-07      2023-11-09      Hospital J81X            
 4 105459 Male   2021-10-27      2020-02-11      Home     R55X            
 5 108894 Male   2023-02-16      2023-11-09      Home     J459            
 6 109204 Male   2023-06-22      2021-02-09      Home     K851            
 7 109624 Male   2021-06-03      2023-02-24      Home     J441            
 8 111454 Male   2022-04-11      2020-02-07      Home     E162            
 9 113892 Male   2020-04-26      2023-03-06      Home     I279            
10 114110 Male   2021-04-04      2021-05-06      Hospital M798            
# ℹ 490 more rows
# ℹ 2 more variables: attendances <dbl>, Year <dbl>

tabyl / adorn

Data1_clean_distinct |>
  tabyl(gender)
 gender   n percent valid_percent
 Female 285   0.570     0.5711423
   Male 214   0.428     0.4288577
   <NA>   1   0.002            NA
Data1_clean_distinct |>
  tabyl(gender, show_na = F)
 gender   n   percent
 Female 285 0.5711423
   Male 214 0.4288577
Data1_clean_distinct |>
  tabyl(gender, location, Year, show_na = F) # excel pivot
$`2020`
 gender Home Hospital
 Female   34       33
   Male   24       25

$`2021`
 gender Home Hospital
 Female   24       48
   Male   28       22

$`2022`
 gender Home Hospital
 Female   35       35
   Male   22       30

$`2023`
 gender Home Hospital
 Female   37       39
   Male   25       38
Data1_clean_distinct |>
  tabyl(gender) |>
  class()
[1] "tabyl"      "data.frame"
Data1_clean_distinct |>
  tabyl(Year,gender, show_na = F) |>
  adorn_totals("row") |>
  adorn_percentages("row") |>
  adorn_pct_formatting() |>
  adorn_ns() |>
  adorn_title("combined")
 Year/gender      Female        Male
        2020 57.8%  (67) 42.2%  (49)
        2021 59.0%  (72) 41.0%  (50)
        2022 57.4%  (70) 42.6%  (52)
        2023 54.7%  (76) 45.3%  (63)
       Total 57.1% (285) 42.9% (214)
Data1_clean_distinct |>
  tabyl(Year,gender,show_na = F) |>
  adorn_totals("row") |>
  adorn_percentages("row") |>
  adorn_pct_formatting() |>
  adorn_ns() |>
  # adorn_title("combined") |>
  mutate(Year = paste("The year was", Year))
               Year      Female        Male
  The year was 2020 57.8%  (67) 42.2%  (49)
  The year was 2021 59.0%  (72) 41.0%  (50)
  The year was 2022 57.4%  (70) 42.6%  (52)
  The year was 2023 54.7%  (76) 45.3%  (63)
 The year was Total 57.1% (285) 42.9% (214)
Data1_clean_distinct |>
  tabyl(Year,gender,show_na = F) |>
  adorn_totals("row") |>
  adorn_percentages("row") |>
  adorn_pct_formatting() |>
  adorn_ns(position = "front") |>
  adorn_title("combined")
 Year/gender      Female        Male
        2020  67 (57.8%)  49 (42.2%)
        2021  72 (59.0%)  50 (41.0%)
        2022  70 (57.4%)  52 (42.6%)
        2023  76 (54.7%)  63 (45.3%)
       Total 285 (57.1%) 214 (42.9%)

group_by with tabyl

Data1_clean_distinct |>
  group_by(Year,gender) |>
  summarise(Attendances = n()) |>
  filter(!is.na(Year)) |>
  tidyr::pivot_wider(names_from = gender, values_from = Attendances) |>
  adorn_totals(c("row","col"))
  Year Female Male Total
  2020     67   49   116
  2021     72   50   122
  2022     70   52   122
  2023     76   63   139
 Total    285  214   499

chisq.test

Data1_clean_distinct |>
  tabyl(attendances,location,show_na = F) |>
  chisq.test()

    Pearson's Chi-squared test

data:  tabyl(Data1_clean_distinct, attendances, location, show_na = F)
X-squared = 1.9866, df = 3, p-value = 0.5752

chisq.test

titanic <- data.frame(Died  = c(122,167,528,673),
                  Survived  = c(203,118,178,212),
                  row.names = c("1st class", "2nd class", "3rd class", "crew"))

chisq.test(titanic)

    Pearson's Chi-squared test

data:  titanic
X-squared = 190.4, df = 3, p-value < 2.2e-16