No feedback found for this session
We ❤️ janitor
R
intermediate
Headline
- Janitor is a toolkit for tidying data
- both input-focused and output-focused
- excellent balance between diligence and laziness
Setup
library(janitor)
library(readxl)
library(dplyr)
<- as.data.frame(matrix(ncol = 6))
test_df names(test_df) <- c("firstName", "ábc@!*", "% successful (2009)",
"REPEAT VALUE", "REPEAT VALUE", "")
|>
test_df clean_names(case = "upper_lower") |>
select(FIRSTname)
FIRSTname
1 NA
clean_names
clean_names
allows you to convert ugly column names into standardised, syntactic names. You can also specfiy the case, or bulk-tweak the names.
clean_names(test_df)
first_name abc percent_successful_2009 repeat_value repeat_value_2 x
1 NA NA NA NA NA NA
clean_names(test_df, case = "screaming_snake")
FIRST_NAME ABC PERCENT_SUCCESSFUL_2009 REPEAT_VALUE REPEAT_VALUE_2 X
1 NA NA NA NA NA NA
clean_names(test_df, case = "big_camel")
FirstName Abc PercentSuccessful2009 RepeatValue RepeatValue_2 X
1 NA NA NA NA NA NA
%>%
test_df clean_names(prefix = "a_prefix")
a_prefixfirst_name a_prefixabc a_prefixpercent_successful_2009
1 NA NA NA
a_prefixrepeat_value a_prefixrepeat_value_2 a_prefixx
1 NA NA NA
compare_df_cols
library(palmerpenguins)
compare_df_cols(penguins,penguins_raw)
column_name penguins penguins_raw
1 bill_depth_mm numeric <NA>
2 bill_length_mm numeric <NA>
3 Body Mass (g) <NA> numeric
4 body_mass_g integer <NA>
5 Clutch Completion <NA> character
6 Comments <NA> character
7 Culmen Depth (mm) <NA> numeric
8 Culmen Length (mm) <NA> numeric
9 Date Egg <NA> Date
10 Delta 13 C (o/oo) <NA> numeric
11 Delta 15 N (o/oo) <NA> numeric
12 Flipper Length (mm) <NA> numeric
13 flipper_length_mm integer <NA>
14 Individual ID <NA> character
15 island factor <NA>
16 Island <NA> character
17 Region <NA> character
18 Sample Number <NA> numeric
19 sex factor <NA>
20 Sex <NA> character
21 species factor <NA>
22 Species <NA> character
23 Stage <NA> character
24 studyName <NA> character
25 year integer <NA>
warnings
<- readxl::read_excel("data/JanitorDataExample.xlsx", .name_repair = make_clean_names)
Data1
# warnings switched off, otherwise we'd have literally hundreds of warnings here!
make_clean_names(c("1thing", "some opther thign"))
[1] "x1thing" "some_opther_thign"
glimpse(Data1)
Rows: 503
Columns: 7
$ id <dbl> 100507, 104088, 105279, 105459, 108894, 109204, 10962…
$ gender <chr> "Male", "Male", "Female", "Male", "Male", "Male", "Ma…
$ date_of_event_1 <dbl> 44572, 44216, 44172, 44496, 44973, 45099, 44350, 4466…
$ date_of_event_2 <dbl> 44535, 44089, 45239, 43872, 45239, 44236, 44981, 4386…
$ location <chr> "Hospital", "Hospital", "Hospital", "Home", "Home", "…
$ diagnosis_code_1 <chr> "T848", "G992", "J81X", "R55X", "J459", "K851", "J441…
$ attendances <dbl> 3, 2, 4, 2, 2, 4, 2, 3, 1, 3, 3, 2, 3, 4, 3, 4, 4, 4,…
clean_names
again
<- Data1 |>
Data1_clean clean_names() |>
mutate(across(contains("date"),
|>
excel_numeric_to_date)) mutate(Year = lubridate::year(date_of_event_1))
Data1_clean
# A tibble: 503 × 8
id gender date_of_event_1 date_of_event_2 location diagnosis_code_1
<dbl> <chr> <date> <date> <chr> <chr>
1 100507 Male 2022-01-11 2021-12-05 Hospital T848
2 104088 Male 2021-01-20 2020-09-15 Hospital G992
3 105279 Female 2020-12-07 2023-11-09 Hospital J81X
4 105459 Male 2021-10-27 2020-02-11 Home R55X
5 108894 Male 2023-02-16 2023-11-09 Home J459
6 109204 Male 2023-06-22 2021-02-09 Home K851
7 109624 Male 2021-06-03 2023-02-24 Home J441
8 111454 Male 2022-04-11 2020-02-07 Home E162
9 113892 Male 2020-04-26 2023-03-06 Home I279
10 114110 Male 2021-04-04 2021-05-06 Hospital M798
# ℹ 493 more rows
# ℹ 2 more variables: attendances <dbl>, Year <dbl>
get_dupes
|>
Data1_clean get_dupes(contains("date"))
# A tibble: 6 × 9
date_of_event_1 date_of_event_2 dupe_count id gender location
<date> <date> <int> <dbl> <chr> <chr>
1 2021-10-09 2021-08-20 2 400760 Female Hospital
2 2021-10-09 2021-08-20 2 400760 Female Hospital
3 2022-04-04 2021-11-08 2 450284 Female Hospital
4 2022-04-04 2021-11-08 2 450284 Female Hospital
5 2023-06-29 2020-03-23 2 823894 Male Home
6 2023-06-29 2020-03-23 2 823894 Male Home
# ℹ 3 more variables: diagnosis_code_1 <chr>, attendances <dbl>, Year <dbl>
distinct
<- distinct(Data1_clean)
Data1_clean_distinct
Data1_clean_distinct
# A tibble: 500 × 8
id gender date_of_event_1 date_of_event_2 location diagnosis_code_1
<dbl> <chr> <date> <date> <chr> <chr>
1 100507 Male 2022-01-11 2021-12-05 Hospital T848
2 104088 Male 2021-01-20 2020-09-15 Hospital G992
3 105279 Female 2020-12-07 2023-11-09 Hospital J81X
4 105459 Male 2021-10-27 2020-02-11 Home R55X
5 108894 Male 2023-02-16 2023-11-09 Home J459
6 109204 Male 2023-06-22 2021-02-09 Home K851
7 109624 Male 2021-06-03 2023-02-24 Home J441
8 111454 Male 2022-04-11 2020-02-07 Home E162
9 113892 Male 2020-04-26 2023-03-06 Home I279
10 114110 Male 2021-04-04 2021-05-06 Hospital M798
# ℹ 490 more rows
# ℹ 2 more variables: attendances <dbl>, Year <dbl>
tabyl
/ adorn
|>
Data1_clean_distinct tabyl(gender)
gender n percent valid_percent
Female 285 0.570 0.5711423
Male 214 0.428 0.4288577
<NA> 1 0.002 NA
|>
Data1_clean_distinct tabyl(gender, show_na = F)
gender n percent
Female 285 0.5711423
Male 214 0.4288577
|>
Data1_clean_distinct tabyl(gender, location, Year, show_na = F) # excel pivot
$`2020`
gender Home Hospital
Female 34 33
Male 24 25
$`2021`
gender Home Hospital
Female 24 48
Male 28 22
$`2022`
gender Home Hospital
Female 35 35
Male 22 30
$`2023`
gender Home Hospital
Female 37 39
Male 25 38
|>
Data1_clean_distinct tabyl(gender) |>
class()
[1] "tabyl" "data.frame"
|>
Data1_clean_distinct tabyl(Year,gender, show_na = F) |>
adorn_totals("row") |>
adorn_percentages("row") |>
adorn_pct_formatting() |>
adorn_ns() |>
adorn_title("combined")
Year/gender Female Male
2020 57.8% (67) 42.2% (49)
2021 59.0% (72) 41.0% (50)
2022 57.4% (70) 42.6% (52)
2023 54.7% (76) 45.3% (63)
Total 57.1% (285) 42.9% (214)
|>
Data1_clean_distinct tabyl(Year,gender,show_na = F) |>
adorn_totals("row") |>
adorn_percentages("row") |>
adorn_pct_formatting() |>
adorn_ns() |>
# adorn_title("combined") |>
mutate(Year = paste("The year was", Year))
Year Female Male
The year was 2020 57.8% (67) 42.2% (49)
The year was 2021 59.0% (72) 41.0% (50)
The year was 2022 57.4% (70) 42.6% (52)
The year was 2023 54.7% (76) 45.3% (63)
The year was Total 57.1% (285) 42.9% (214)
|>
Data1_clean_distinct tabyl(Year,gender,show_na = F) |>
adorn_totals("row") |>
adorn_percentages("row") |>
adorn_pct_formatting() |>
adorn_ns(position = "front") |>
adorn_title("combined")
Year/gender Female Male
2020 67 (57.8%) 49 (42.2%)
2021 72 (59.0%) 50 (41.0%)
2022 70 (57.4%) 52 (42.6%)
2023 76 (54.7%) 63 (45.3%)
Total 285 (57.1%) 214 (42.9%)
group_by with tabyl
|>
Data1_clean_distinct group_by(Year,gender) |>
summarise(Attendances = n()) |>
filter(!is.na(Year)) |>
::pivot_wider(names_from = gender, values_from = Attendances) |>
tidyradorn_totals(c("row","col"))
Year Female Male Total
2020 67 49 116
2021 72 50 122
2022 70 52 122
2023 76 63 139
Total 285 214 499
chisq.test
|>
Data1_clean_distinct tabyl(attendances,location,show_na = F) |>
chisq.test()
Pearson's Chi-squared test
data: tabyl(Data1_clean_distinct, attendances, location, show_na = F)
X-squared = 1.9866, df = 3, p-value = 0.5752
chisq.test
<- data.frame(Died = c(122,167,528,673),
titanic Survived = c(203,118,178,212),
row.names = c("1st class", "2nd class", "3rd class", "crew"))
chisq.test(titanic)
Pearson's Chi-squared test
data: titanic
X-squared = 190.4, df = 3, p-value < 2.2e-16