Concept of tidy data, vectors and pivoting

CVEN 5999 - Summer 2025

Lars Schöbitz

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(knitr)

ggplot2::theme_set(ggplot2::theme_gray(base_size = 16))

waste_data_lord1 <- read_csv(
  here::here("slides/data/waste-characterisation-lord-of-the-bins-sheet1.csv"))

Rows: 5 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): location, day_of_collection
dbl (10): objid, bin_id, bin_id_2, non_recyclables_ Kg, pet_Kg, metal_conten...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

waste_data_lord2 <- read_csv(
  here::here("slides/data/waste-characterisation-lord-of-the-bins-sheet2.csv"))

Rows: 5 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): location, day_of_collection
dbl (10): objid, bin_id, bin_id_2, non_recyclables_ Kg, pet_Kg, metal_conten...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

survey_data <- read_csv(here::here("slides/data/survey-data-partners-in-grime.csv")) |> 
  mutate(id = seq(1:n()))

Rows: 22 Columns: 15
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (14): date_started, age, job, residence_situation, residence_type, locat...
dbl  (1): residence_distance

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

survey_data_small <- survey_data |> 
  select(id, job, price_glass)

survey_data_tidy <- survey_data |> 
  select(id, job, residence_situation, starts_with("price")) |> 
  pivot_longer(cols = starts_with("price"),
               names_to = "waste_category",
               values_to = "price") |> 
  mutate(waste_category = str_remove(waste_category, pattern = "price_")) |> 
  mutate(price_new = case_when(
    price == "5 to 10" ~ "7.5",
    price == "05-Oct" ~ "7.5",
    str_detect(price, pattern = "20") == TRUE ~ "20",
    str_detect(price, pattern = "See comment") == TRUE ~ NA_character_,
    TRUE ~ price
  )) |> 
  mutate(price = as.numeric(price_new)) |> 
  select(-price_new)

# survey_data_tidy |> 
#   count(job, residence_situation, waste_category, price) |> 
#   ggplot(aes(x = factor(price), y = n, fill = job)) +
#   geom_col()
#

waste_data <- bind_rows(waste_data_lord1, waste_data_lord2)

waste_data_untidy <- waste_data |> 
  filter(!is.na(objid)) |> 
  rename(non_recyclable_Kg = `non_recyclables_ Kg`,
         recyclable_Kg = recyclables_Kg) |>
  relocate(c(recyclable_Kg, non_recyclable_Kg), .before = weight_total_kg) |> 
  select(objid, location, pet_Kg:weight_total_kg) |> 
  mutate(objid = factor(objid)) |>
  rename_with(~str_remove(.x, "_Kg|_kg")) |> 
  rename_with(~str_remove(.x, "_content")) |> 
  rename(total = weight_total)


waste_category_levels <- c("glass", "metal_alu", "paper", "pet", "other")

waste_data_tidy <- waste_data_untidy |> 
  select(objid:paper, non_recyclable) |> 
  rename(other = non_recyclable) |> 
  mutate(objid = factor(objid)) |>
  pivot_longer(cols = pet:other,
               names_to = "waste_category",
               values_to = "weight") |> 
  mutate(waste_category = factor(waste_category, levels = waste_category_levels)) |> 
  mutate(type = case_when(
    waste_category == "other" ~ "non_recyclable",
    TRUE ~ "recyclable")) |> 
  relocate(type, .before = weight) |> 
  group_by(objid) |> 
  mutate(percent = weight / sum(weight) * 100)

waste_data_tidy |> 
  write_rds(here::here("slides/data/intermediate_data/waste-characterisation-lord-of-the-bins-tidy.rds"))

waste_data_tidy |> 
  group_by(location, waste_category) |> 
  summarise(weight = mean(weight)) |> 
  group_by(location) |> 
  mutate(percent = weight / sum(weight) * 100) 

waste_data_tidy |> 
  
  ggplot(mapping = aes(x = waste_category, y = weight, color = type)) +
  geom_boxplot() +
  geom_jitter(width = 0.2) +
  facet_wrap(~location)

waste_data_tidy |> 
  ggplot(mapping = aes(x = objid, y = weight)) +
  geom_col() 

waste_data_tidy |> 
  ggplot(mapping = aes(x = objid, y = weight, fill = waste_category)) +
  geom_col() 

waste_data_tidy |> 
  ggplot(mapping = aes(x = waste_category, y = percent, color = type)) +
  geom_boxplot() +
  geom_jitter(width = 0.2) +
  facet_wrap(~location)


waste_data_tidy |> 
  ggplot(aes(x = objid, y = percent, fill = waste_category)) +
  geom_col() 

waste_data_tidy |> 
  ggplot(aes(x = objid, y = percent, fill = type)) +
  geom_col() 

waste_data_tidy |> 
  ggplot(aes(x = objid, y = percent, fill = location)) +
  geom_col() +
  facet_wrap(~waste_category, ncol = 5)

Learning Objectives (for this week)

lobj1 <- readr::read_csv(here::here("data/tbl-02-learning-objectives.csv")) |>
  dplyr::filter(week == params$week) |>
  dplyr::pull(learning_objectives)

Rows: 26 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): title, learning_objectives
dbl (1): week
lgl (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Learners can apply functions from the dplyr R Package to transform their data from a wide to a long format and vice versa.
Learners can list the four main atomic vector types in R.
Learners can explain the three characteristics of tidy data.

Part 1: Data types and vectors

Why care about data types?

via GIPHY

Example: survey data

id	job	price_glass
1	Student	0
2	Retired	0
3	Other	0
4	Employed	10
5	Employed	See comment
6	Student	05-Oct
7	Student	0
8	Retired	0
9	Student	10
10	Employed	0
11	Employed	20 (2chf per person with 10 people in the WG)
12	Student	10
13	Student	10
14	Employed	0
15	Student	10
16	Student	0
17	Employed	5 to 10
18	Other	0
19	Student	0
20	Employed	10
21	Employed	0
22	Employed	5

Oh why won’t you work?!

survey_data_small |> 
  summarise(mean_price_glass = mean(price_glass))

Warning: There was 1 warning in `summarise()`.
ℹ In argument: `mean_price_glass = mean(price_glass)`.
Caused by warning in `mean.default()`:
! argument is not numeric or logical: returning NA

# A tibble: 1 × 1
  mean_price_glass
             <dbl>
1               NA

Oh why won’t you still work??!!

survey_data_small |> 
  summarise(mean_price_glass = mean(price_glass, na.rm = TRUE))

Warning: There was 1 warning in `summarise()`.
ℹ In argument: `mean_price_glass = mean(price_glass, na.rm = TRUE)`.
Caused by warning in `mean.default()`:
! argument is not numeric or logical: returning NA

# A tibble: 1 × 1
  mean_price_glass
             <dbl>
1               NA

Take a breath and look at your data

id	job	price_glass
1	Student	0
2	Retired	0
3	Other	0
4	Employed	10
5	Employed	See comment
6	Student	05-Oct
7	Student	0
8	Retired	0
9	Student	10
10	Employed	0
11	Employed	20 (2chf per person with 10 people in the WG)
12	Student	10
13	Student	10
14	Employed	0
15	Student	10
16	Student	0
17	Employed	5 to 10
18	Other	0
19	Student	0
20	Employed	10
21	Employed	0
22	Employed	5

Very common data tidying step!

survey_data_small |> 
  mutate(price_glass_new = case_when(
    price_glass == "5 to 10" ~ "7.5",
    price_glass == "05-Oct" ~ "7.5",
    str_detect(price_glass, pattern = "2chf") == TRUE ~ "20",
    str_detect(price_glass, pattern = "See comment") == TRUE ~ NA_character_,
    TRUE ~ price_glass
  ))

Very common data tidying step!

id	job	price_glass_new	price_glass
1	Student	0	0
2	Retired	0	0
3	Other	0	0
4	Employed	10	10
5	Employed	NA	See comment
6	Student	7.5	05-Oct
7	Student	0	0
8	Retired	0	0
9	Student	10	10
10	Employed	0	0
11	Employed	20	20 (2chf per person with 10 people in the WG)
12	Student	10	10
13	Student	10	10
14	Employed	0	0
15	Student	10	10
16	Student	0	0
17	Employed	7.5	5 to 10
18	Other	0	0
19	Student	0	0
20	Employed	10	10
21	Employed	0	0
22	Employed	5	5

Sumamrise? Argh!!!!

survey_data_small |> 
  mutate(price_glass_new = case_when(
    price_glass == "5 to 10" ~ "7.5",
    price_glass == "05-Oct" ~ "7.5",
    str_detect(price_glass, pattern = "20") == TRUE ~ "20",
    str_detect(price_glass, pattern = "See comment") == TRUE ~ NA_character_,
    TRUE ~ price_glass
  )) |> 
  summarise(mean_price_glass = mean(price_glass_new, na.rm = TRUE))

Warning: There was 1 warning in `summarise()`.
ℹ In argument: `mean_price_glass = mean(price_glass_new, na.rm = TRUE)`.
Caused by warning in `mean.default()`:
! argument is not numeric or logical: returning NA

# A tibble: 1 × 1
  mean_price_glass
             <dbl>
1               NA

Always respect your data types!

Taking the mean of vector with type “character” is not possible.

survey_data_small |> 
  mutate(price_glass_new = case_when(
    price_glass == "5 to 10" ~ "7.5",
    price_glass == "05-Oct" ~ "7.5",
    str_detect(price_glass, pattern = "20") == TRUE ~ "20",
    str_detect(price_glass, pattern = "See comment") == TRUE ~ NA_character_,
    TRUE ~ price_glass
  ))

# A tibble: 22 × 4
      id job      price_glass price_glass_new
   <int> <chr>    <chr>       <chr>          
 1     1 Student  0           0              
 2     2 Retired  0           0              
 3     3 Other    0           0              
 4     4 Employed 10          10             
 5     5 Employed See comment <NA>           
 6     6 Student  05-Oct      7.5            
 7     7 Student  0           0              
 8     8 Retired  0           0              
 9     9 Student  10          10             
10    10 Employed 0           0              
# ℹ 12 more rows

Always respect your data types!

survey_data_small |> 
  mutate(price_glass_new = case_when(
    price_glass == "5 to 10" ~ "7.5",
    price_glass == "05-Oct" ~ "7.5",
    str_detect(price_glass, pattern = "20") == TRUE ~ "20",
    str_detect(price_glass, pattern = "See comment") == TRUE ~ NA_character_,
    TRUE ~ price_glass
  )) |> 
  mutate(price_glass_new = as.numeric(price_glass_new)) |> 
  summarise(mean_price_glass = mean(price_glass_new, na.rm = TRUE))

# A tibble: 1 × 1
  mean_price_glass
             <dbl>
1             4.76

Live Coding Exercise: Vectors

live-vectors

Follow along on the screen

Open the GitHub organisation for the course: https://github.com/cven5999-ss25
You will find a repository titled: wk-05-USERNAME (with your GitHub Username)
You will “clone” this repository to Posit Cloud

Break One

countdown::countdown(
  minutes = 10, seconds = 0,
  left = 0, right = 0,
  padding = "10px", margin = "5%",
  font_size = "2em"
)

10:00

Part 2: tidyr - long and wide formats

.

A grammar of data tidying

The goal of tidyr is to help you tidy your data via

pivoting for going between wide and long data
splitting and combining character columns
nesting and unnesting columns
clarifying how NAs should be treated

Pivoting data

knitr::include_graphics("img/lec-04/tidyr-longer-wider.gif")

Waste characterisation data

waste_data_untidy |> 
  select(-recyclable) |> 
  rename(other = non_recyclable) |> 
  knitr::kable(digits = 2)

objid	location	pet	metal_alu	glass	paper	other	total
900	eth	0.06	0.06	0.58	0.21	1.14	2.05
899	eth	0.14	0.01	0.18	0.28	3.04	3.64
921	old_town	0.00	0.00	0.00	0.41	1.57	1.99
916	old_town	0.17	0.04	0.80	0.55	0.62	2.19
900	eth	0.10	0.04	0.00	0.40	0.58	1.12
899	eth	0.08	0.03	0.00	0.05	0.34	0.50
921	old_town	0.08	0.03	0.30	0.40	1.52	2.33
916	old_town	0.11	0.04	0.92	1.01	1.99	4.07

How would you plot this?

ggplot(data = waste_data_untidy,
       mapping = aes(x = objid, y = total, fill = location)) +
  geom_col() + 
  scale_fill_brewer(type = "qual")

Three variables

waste_data_untidy |> 
  select(objid, location, total) |> 
  knitr::kable(digits = 2)

objid	location	total
900	eth	2.05
899	eth	3.64
921	old_town	1.99
916	old_town	2.19
900	eth	1.12
899	eth	0.50
921	old_town	2.33
916	old_town	4.07

Three variables -> three aesthetics

ggplot(data = waste_data_untidy,
       mapping = aes(x = objid, 
                     y = total, 
                     fill = location)) +
  geom_col() + 
  scale_fill_brewer(type = "qual")

And how to plot this?

ggplot(data = waste_data_tidy,
       mapping = aes(x = objid, 
                     y = weight, 
                     fill = waste_category)) +
  geom_col() + 
  scale_fill_brewer(type = "qual")

Reminder: Data (in wide format)

waste_data_untidy |> 
  select(-recyclable, -total) |> 
  rename(other = non_recyclable) |> 
  knitr::kable(digits = 2)

objid	location	pet	metal_alu	glass	paper	other
900	eth	0.06	0.06	0.58	0.21	1.14
899	eth	0.14	0.01	0.18	0.28	3.04
921	old_town	0.00	0.00	0.00	0.41	1.57
916	old_town	0.17	0.04	0.80	0.55	0.62
900	eth	0.10	0.04	0.00	0.40	0.58
899	eth	0.08	0.03	0.00	0.05	0.34
921	old_town	0.08	0.03	0.30	0.40	1.52
916	old_town	0.11	0.04	0.92	1.01	1.99

You need: A long format

waste_data_tidy |> 
  select(-percent, -type) |> 
  knitr::kable(digits = 2)

objid	location	waste_category	weight
900	eth	pet	0.06
900	eth	metal_alu	0.06
900	eth	glass	0.58
900	eth	paper	0.21
900	eth	other	1.14
899	eth	pet	0.14
899	eth	metal_alu	0.01
899	eth	glass	0.18
899	eth	paper	0.28
899	eth	other	3.04
921	old_town	pet	0.00
921	old_town	metal_alu	0.00
921	old_town	glass	0.00
921	old_town	paper	0.41
921	old_town	other	1.57
916	old_town	pet	0.17
916	old_town	metal_alu	0.04
916	old_town	glass	0.80
916	old_town	paper	0.55
916	old_town	other	0.62
900	eth	pet	0.10
900	eth	metal_alu	0.04
900	eth	glass	0.00
900	eth	paper	0.40
900	eth	other	0.58
899	eth	pet	0.08
899	eth	metal_alu	0.03
899	eth	glass	0.00
899	eth	paper	0.05
899	eth	other	0.34
921	old_town	pet	0.08
921	old_town	metal_alu	0.03
921	old_town	glass	0.30
921	old_town	paper	0.40
921	old_town	other	1.52
916	old_town	pet	0.11
916	old_town	metal_alu	0.04
916	old_town	glass	0.92
916	old_town	paper	1.01
916	old_town	other	1.99

Three variables -> three aesthetics

ggplot(data = waste_data_tidy,
       mapping = aes(x = objid, 
                     y = weight, 
                     fill = waste_category)) +
  geom_col() + 
  scale_fill_brewer(type = "qual")

How to

waste_data_untidy <- waste_data_untidy |> 
  select(-recyclable, -total) |> 
  rename(other = non_recyclable)

waste_data_untidy

objid	location	pet	metal_alu	glass	paper	other
900	eth	0.06	0.06	0.58	0.21	1.14
899	eth	0.14	0.01	0.18	0.28	3.04
921	old_town	0.00	0.00	0.00	0.41	1.57
916	old_town	0.17	0.04	0.80	0.55	0.62
900	eth	0.10	0.04	0.00	0.40	0.58
899	eth	0.08	0.03	0.00	0.05	0.34
921	old_town	0.08	0.03	0.30	0.40	1.52
916	old_town	0.11	0.04	0.92	1.01	1.99

How to

waste_data_untidy |> 
  pivot_longer(cols = pet:other,
               names_to = "waste_category",
               values_to = "weight")

objid	location	waste_category	weight
900	eth	pet	0.06
900	eth	metal_alu	0.06
900	eth	glass	0.58
900	eth	paper	0.21
900	eth	other	1.14
899	eth	pet	0.14
899	eth	metal_alu	0.01
899	eth	glass	0.18
899	eth	paper	0.28
899	eth	other	3.04
921	old_town	pet	0.00
921	old_town	metal_alu	0.00
921	old_town	glass	0.00
921	old_town	paper	0.41
921	old_town	other	1.57
916	old_town	pet	0.17
916	old_town	metal_alu	0.04
916	old_town	glass	0.80
916	old_town	paper	0.55
916	old_town	other	0.62
900	eth	pet	0.10
900	eth	metal_alu	0.04
900	eth	glass	0.00
900	eth	paper	0.40
900	eth	other	0.58
899	eth	pet	0.08
899	eth	metal_alu	0.03
899	eth	glass	0.00
899	eth	paper	0.05
899	eth	other	0.34
921	old_town	pet	0.08
921	old_town	metal_alu	0.03
921	old_town	glass	0.30
921	old_town	paper	0.40
921	old_town	other	1.52
916	old_town	pet	0.11
916	old_town	metal_alu	0.04
916	old_town	glass	0.92
916	old_town	paper	1.01
916	old_town	other	1.99

How to

waste_category_levels <- c("glass", "metal_alu", "paper", "pet", "other")

waste_data_untidy |> 
  pivot_longer(cols = pet:other,
               names_to = "waste_category",
               values_to = "weight") |> 
  mutate(waste_category = factor(waste_category, levels = waste_category_levels))

objid	location	waste_category	weight
900	eth	pet	0.06
900	eth	metal_alu	0.06
900	eth	glass	0.58
900	eth	paper	0.21
900	eth	other	1.14
899	eth	pet	0.14
899	eth	metal_alu	0.01
899	eth	glass	0.18
899	eth	paper	0.28
899	eth	other	3.04
921	old_town	pet	0.00
921	old_town	metal_alu	0.00
921	old_town	glass	0.00
921	old_town	paper	0.41
921	old_town	other	1.57
916	old_town	pet	0.17
916	old_town	metal_alu	0.04
916	old_town	glass	0.80
916	old_town	paper	0.55
916	old_town	other	0.62
900	eth	pet	0.10
900	eth	metal_alu	0.04
900	eth	glass	0.00
900	eth	paper	0.40
900	eth	other	0.58
899	eth	pet	0.08
899	eth	metal_alu	0.03
899	eth	glass	0.00
899	eth	paper	0.05
899	eth	other	0.34
921	old_town	pet	0.08
921	old_town	metal_alu	0.03
921	old_town	glass	0.30
921	old_town	paper	0.40
921	old_town	other	1.52
916	old_town	pet	0.11
916	old_town	metal_alu	0.04
916	old_town	glass	0.92
916	old_town	paper	1.01
916	old_town	other	1.99

Three variables -> three aesthetics

ggplot(data = waste_data_tidy,
       mapping = aes(x = objid, 
                     y = weight, 
                     fill = waste_category)) +
  geom_col() + 
  scale_fill_brewer(type = "qual")

Live Coding Exercise: Pivoting

live-05a-tidyr-pivoting

Head over to posit.cloud
Open the workspace for the course cven5999-ss25
Open “Content”
Open your project
Follow along with me

Homework week 5

Homework due dates

All material on course website
Homework assignment & learning reflection due: 2025-07-04

Thanks! 🌻

Slides created via revealjs and Quarto: https://quarto.org/docs/presentations/revealjs/

Access slides as PDF on GitHub

All material is licensed under Creative Commons Attribution Share Alike 4.0 International.

References

Ben Aleya, Ali, Daniel Biek, Lin Boynton, Julia Jaeggi, Sebastian Camilo Loos, Chiara Meyer-Piening, Jonathan Olal Ogwang, et al. 2022. “Research Beyond the Lab, Spring Term 2022, Global Health Engineering, ETH Zurich. Raw Data and Analysis-Ready Derived Data on Waste Management in Public Spaces in Zurich, Switzerland.” Zenodo. https://doi.org/10.5281/ZENODO.7331119.