Concept of tidy data, vectors and pivoting

CVEN 5999 - Summer 2025

Lars Schöbitz
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)

ggplot2::theme_set(ggplot2::theme_gray(base_size = 16))
waste_data_lord1 <- read_csv(
  here::here("slides/data/waste-characterisation-lord-of-the-bins-sheet1.csv"))
Rows: 5 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): location, day_of_collection
dbl (10): objid, bin_id, bin_id_2, non_recyclables_ Kg, pet_Kg, metal_conten...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
waste_data_lord2 <- read_csv(
  here::here("slides/data/waste-characterisation-lord-of-the-bins-sheet2.csv"))
Rows: 5 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): location, day_of_collection
dbl (10): objid, bin_id, bin_id_2, non_recyclables_ Kg, pet_Kg, metal_conten...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
survey_data <- read_csv(here::here("slides/data/survey-data-partners-in-grime.csv")) |> 
  mutate(id = seq(1:n()))
Rows: 22 Columns: 15
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (14): date_started, age, job, residence_situation, residence_type, locat...
dbl  (1): residence_distance

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
survey_data_small <- survey_data |> 
  select(id, job, price_glass)

survey_data_tidy <- survey_data |> 
  select(id, job, residence_situation, starts_with("price")) |> 
  pivot_longer(cols = starts_with("price"),
               names_to = "waste_category",
               values_to = "price") |> 
  mutate(waste_category = str_remove(waste_category, pattern = "price_")) |> 
  mutate(price_new = case_when(
    price == "5 to 10" ~ "7.5",
    price == "05-Oct" ~ "7.5",
    str_detect(price, pattern = "20") == TRUE ~ "20",
    str_detect(price, pattern = "See comment") == TRUE ~ NA_character_,
    TRUE ~ price
  )) |> 
  mutate(price = as.numeric(price_new)) |> 
  select(-price_new)

# survey_data_tidy |> 
#   count(job, residence_situation, waste_category, price) |> 
#   ggplot(aes(x = factor(price), y = n, fill = job)) +
#   geom_col()
# 
waste_data <- bind_rows(waste_data_lord1, waste_data_lord2)

waste_data_untidy <- waste_data |> 
  filter(!is.na(objid)) |> 
  rename(non_recyclable_Kg = `non_recyclables_ Kg`,
         recyclable_Kg = recyclables_Kg) |>
  relocate(c(recyclable_Kg, non_recyclable_Kg), .before = weight_total_kg) |> 
  select(objid, location, pet_Kg:weight_total_kg) |> 
  mutate(objid = factor(objid)) |>
  rename_with(~str_remove(.x, "_Kg|_kg")) |> 
  rename_with(~str_remove(.x, "_content")) |> 
  rename(total = weight_total)


waste_category_levels <- c("glass", "metal_alu", "paper", "pet", "other")

waste_data_tidy <- waste_data_untidy |> 
  select(objid:paper, non_recyclable) |> 
  rename(other = non_recyclable) |> 
  mutate(objid = factor(objid)) |>
  pivot_longer(cols = pet:other,
               names_to = "waste_category",
               values_to = "weight") |> 
  mutate(waste_category = factor(waste_category, levels = waste_category_levels)) |> 
  mutate(type = case_when(
    waste_category == "other" ~ "non_recyclable",
    TRUE ~ "recyclable")) |> 
  relocate(type, .before = weight) |> 
  group_by(objid) |> 
  mutate(percent = weight / sum(weight) * 100)

waste_data_tidy |> 
  write_rds(here::here("slides/data/intermediate_data/waste-characterisation-lord-of-the-bins-tidy.rds"))
waste_data_tidy |> 
  group_by(location, waste_category) |> 
  summarise(weight = mean(weight)) |> 
  group_by(location) |> 
  mutate(percent = weight / sum(weight) * 100) 

waste_data_tidy |> 
  
  ggplot(mapping = aes(x = waste_category, y = weight, color = type)) +
  geom_boxplot() +
  geom_jitter(width = 0.2) +
  facet_wrap(~location)

waste_data_tidy |> 
  ggplot(mapping = aes(x = objid, y = weight)) +
  geom_col() 

waste_data_tidy |> 
  ggplot(mapping = aes(x = objid, y = weight, fill = waste_category)) +
  geom_col() 

waste_data_tidy |> 
  ggplot(mapping = aes(x = waste_category, y = percent, color = type)) +
  geom_boxplot() +
  geom_jitter(width = 0.2) +
  facet_wrap(~location)


waste_data_tidy |> 
  ggplot(aes(x = objid, y = percent, fill = waste_category)) +
  geom_col() 

waste_data_tidy |> 
  ggplot(aes(x = objid, y = percent, fill = type)) +
  geom_col() 

waste_data_tidy |> 
  ggplot(aes(x = objid, y = percent, fill = location)) +
  geom_col() +
  facet_wrap(~waste_category, ncol = 5)

Learning Objectives (for this week)

lobj1 <- readr::read_csv(here::here("data/tbl-02-learning-objectives.csv")) |>
  dplyr::filter(week == params$week) |>
  dplyr::pull(learning_objectives)
Rows: 26 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): title, learning_objectives
dbl (1): week
lgl (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  1. Learners can apply functions from the dplyr R Package to transform their data from a wide to a long format and vice versa.
  2. Learners can list the four main atomic vector types in R.
  3. Learners can explain the three characteristics of tidy data.

Part 1: Data types and vectors

Why care about data types?

via GIPHY

Example: survey data

id job price_glass
1 Student 0
2 Retired 0
3 Other 0
4 Employed 10
5 Employed See comment
6 Student 05-Oct
7 Student 0
8 Retired 0
9 Student 10
10 Employed 0
11 Employed 20 (2chf per person with 10 people in the WG)
12 Student 10
13 Student 10
14 Employed 0
15 Student 10
16 Student 0
17 Employed 5 to 10
18 Other 0
19 Student 0
20 Employed 10
21 Employed 0
22 Employed 5

Oh why won’t you work?!

survey_data_small |> 
  summarise(mean_price_glass = mean(price_glass))
Warning: There was 1 warning in `summarise()`.
ℹ In argument: `mean_price_glass = mean(price_glass)`.
Caused by warning in `mean.default()`:
! argument is not numeric or logical: returning NA
# A tibble: 1 × 1
  mean_price_glass
             <dbl>
1               NA

Oh why won’t you still work??!!

survey_data_small |> 
  summarise(mean_price_glass = mean(price_glass, na.rm = TRUE))
Warning: There was 1 warning in `summarise()`.
ℹ In argument: `mean_price_glass = mean(price_glass, na.rm = TRUE)`.
Caused by warning in `mean.default()`:
! argument is not numeric or logical: returning NA
# A tibble: 1 × 1
  mean_price_glass
             <dbl>
1               NA

Take a breath and look at your data

id job price_glass
1 Student 0
2 Retired 0
3 Other 0
4 Employed 10
5 Employed See comment
6 Student 05-Oct
7 Student 0
8 Retired 0
9 Student 10
10 Employed 0
11 Employed 20 (2chf per person with 10 people in the WG)
12 Student 10
13 Student 10
14 Employed 0
15 Student 10
16 Student 0
17 Employed 5 to 10
18 Other 0
19 Student 0
20 Employed 10
21 Employed 0
22 Employed 5

Very common data tidying step!

survey_data_small |> 
  mutate(price_glass_new = case_when(
    price_glass == "5 to 10" ~ "7.5",
    price_glass == "05-Oct" ~ "7.5",
    str_detect(price_glass, pattern = "2chf") == TRUE ~ "20",
    str_detect(price_glass, pattern = "See comment") == TRUE ~ NA_character_,
    TRUE ~ price_glass
  )) 

Very common data tidying step!

id job price_glass_new price_glass
1 Student 0 0
2 Retired 0 0
3 Other 0 0
4 Employed 10 10
5 Employed NA See comment
6 Student 7.5 05-Oct
7 Student 0 0
8 Retired 0 0
9 Student 10 10
10 Employed 0 0
11 Employed 20 20 (2chf per person with 10 people in the WG)
12 Student 10 10
13 Student 10 10
14 Employed 0 0
15 Student 10 10
16 Student 0 0
17 Employed 7.5 5 to 10
18 Other 0 0
19 Student 0 0
20 Employed 10 10
21 Employed 0 0
22 Employed 5 5

Sumamrise? Argh!!!!

survey_data_small |> 
  mutate(price_glass_new = case_when(
    price_glass == "5 to 10" ~ "7.5",
    price_glass == "05-Oct" ~ "7.5",
    str_detect(price_glass, pattern = "20") == TRUE ~ "20",
    str_detect(price_glass, pattern = "See comment") == TRUE ~ NA_character_,
    TRUE ~ price_glass
  )) |> 
  summarise(mean_price_glass = mean(price_glass_new, na.rm = TRUE))
Warning: There was 1 warning in `summarise()`.
ℹ In argument: `mean_price_glass = mean(price_glass_new, na.rm = TRUE)`.
Caused by warning in `mean.default()`:
! argument is not numeric or logical: returning NA
# A tibble: 1 × 1
  mean_price_glass
             <dbl>
1               NA

Always respect your data types!

Taking the mean of vector with type “character” is not possible.

survey_data_small |> 
  mutate(price_glass_new = case_when(
    price_glass == "5 to 10" ~ "7.5",
    price_glass == "05-Oct" ~ "7.5",
    str_detect(price_glass, pattern = "20") == TRUE ~ "20",
    str_detect(price_glass, pattern = "See comment") == TRUE ~ NA_character_,
    TRUE ~ price_glass
  )) 
# A tibble: 22 × 4
      id job      price_glass price_glass_new
   <int> <chr>    <chr>       <chr>          
 1     1 Student  0           0              
 2     2 Retired  0           0              
 3     3 Other    0           0              
 4     4 Employed 10          10             
 5     5 Employed See comment <NA>           
 6     6 Student  05-Oct      7.5            
 7     7 Student  0           0              
 8     8 Retired  0           0              
 9     9 Student  10          10             
10    10 Employed 0           0              
# ℹ 12 more rows

Always respect your data types!

survey_data_small |> 
  mutate(price_glass_new = case_when(
    price_glass == "5 to 10" ~ "7.5",
    price_glass == "05-Oct" ~ "7.5",
    str_detect(price_glass, pattern = "20") == TRUE ~ "20",
    str_detect(price_glass, pattern = "See comment") == TRUE ~ NA_character_,
    TRUE ~ price_glass
  )) |> 
  mutate(price_glass_new = as.numeric(price_glass_new)) |> 
  summarise(mean_price_glass = mean(price_glass_new, na.rm = TRUE))
# A tibble: 1 × 1
  mean_price_glass
             <dbl>
1             4.76

Live Coding Exercise: Vectors

live-vectors

Follow along on the screen

  1. Open the GitHub organisation for the course: https://github.com/cven5999-ss25
  2. You will find a repository titled: wk-05-USERNAME (with your GitHub Username)
  3. You will “clone” this repository to Posit Cloud

Break One

countdown::countdown(
  minutes = 10, seconds = 0,
  left = 0, right = 0,
  padding = "10px", margin = "5%",
  font_size = "2em"
)
10:00

Part 2: tidyr - long and wide formats

.

.

.

A grammar of data tidying

The goal of tidyr is to help you tidy your data via

  • pivoting for going between wide and long data
  • splitting and combining character columns
  • nesting and unnesting columns
  • clarifying how NAs should be treated

Pivoting data

knitr::include_graphics("img/lec-04/tidyr-longer-wider.gif")

Waste characterisation data

waste_data_untidy |> 
  select(-recyclable) |> 
  rename(other = non_recyclable) |> 
  knitr::kable(digits = 2)
objid location pet metal_alu glass paper other total
900 eth 0.06 0.06 0.58 0.21 1.14 2.05
899 eth 0.14 0.01 0.18 0.28 3.04 3.64
921 old_town 0.00 0.00 0.00 0.41 1.57 1.99
916 old_town 0.17 0.04 0.80 0.55 0.62 2.19
900 eth 0.10 0.04 0.00 0.40 0.58 1.12
899 eth 0.08 0.03 0.00 0.05 0.34 0.50
921 old_town 0.08 0.03 0.30 0.40 1.52 2.33
916 old_town 0.11 0.04 0.92 1.01 1.99 4.07

How would you plot this?

ggplot(data = waste_data_untidy,
       mapping = aes(x = objid, y = total, fill = location)) +
  geom_col() + 
  scale_fill_brewer(type = "qual")

Three variables

waste_data_untidy |> 
  select(objid, location, total) |> 
  knitr::kable(digits = 2)
objid location total
900 eth 2.05
899 eth 3.64
921 old_town 1.99
916 old_town 2.19
900 eth 1.12
899 eth 0.50
921 old_town 2.33
916 old_town 4.07

Three variables -> three aesthetics

ggplot(data = waste_data_untidy,
       mapping = aes(x = objid, 
                     y = total, 
                     fill = location)) +
  geom_col() + 
  scale_fill_brewer(type = "qual")

And how to plot this?

ggplot(data = waste_data_tidy,
       mapping = aes(x = objid, 
                     y = weight, 
                     fill = waste_category)) +
  geom_col() + 
  scale_fill_brewer(type = "qual")

Reminder: Data (in wide format)

waste_data_untidy |> 
  select(-recyclable, -total) |> 
  rename(other = non_recyclable) |> 
  knitr::kable(digits = 2)
objid location pet metal_alu glass paper other
900 eth 0.06 0.06 0.58 0.21 1.14
899 eth 0.14 0.01 0.18 0.28 3.04
921 old_town 0.00 0.00 0.00 0.41 1.57
916 old_town 0.17 0.04 0.80 0.55 0.62
900 eth 0.10 0.04 0.00 0.40 0.58
899 eth 0.08 0.03 0.00 0.05 0.34
921 old_town 0.08 0.03 0.30 0.40 1.52
916 old_town 0.11 0.04 0.92 1.01 1.99

You need: A long format

waste_data_tidy |> 
  select(-percent, -type) |> 
  knitr::kable(digits = 2)
objid location waste_category weight
900 eth pet 0.06
900 eth metal_alu 0.06
900 eth glass 0.58
900 eth paper 0.21
900 eth other 1.14
899 eth pet 0.14
899 eth metal_alu 0.01
899 eth glass 0.18
899 eth paper 0.28
899 eth other 3.04
921 old_town pet 0.00
921 old_town metal_alu 0.00
921 old_town glass 0.00
921 old_town paper 0.41
921 old_town other 1.57
916 old_town pet 0.17
916 old_town metal_alu 0.04
916 old_town glass 0.80
916 old_town paper 0.55
916 old_town other 0.62
900 eth pet 0.10
900 eth metal_alu 0.04
900 eth glass 0.00
900 eth paper 0.40
900 eth other 0.58
899 eth pet 0.08
899 eth metal_alu 0.03
899 eth glass 0.00
899 eth paper 0.05
899 eth other 0.34
921 old_town pet 0.08
921 old_town metal_alu 0.03
921 old_town glass 0.30
921 old_town paper 0.40
921 old_town other 1.52
916 old_town pet 0.11
916 old_town metal_alu 0.04
916 old_town glass 0.92
916 old_town paper 1.01
916 old_town other 1.99

Three variables -> three aesthetics

ggplot(data = waste_data_tidy,
       mapping = aes(x = objid, 
                     y = weight, 
                     fill = waste_category)) +
  geom_col() + 
  scale_fill_brewer(type = "qual")

How to

waste_data_untidy <- waste_data_untidy |> 
  select(-recyclable, -total) |> 
  rename(other = non_recyclable)
waste_data_untidy 
objid location pet metal_alu glass paper other
900 eth 0.06 0.06 0.58 0.21 1.14
899 eth 0.14 0.01 0.18 0.28 3.04
921 old_town 0.00 0.00 0.00 0.41 1.57
916 old_town 0.17 0.04 0.80 0.55 0.62
900 eth 0.10 0.04 0.00 0.40 0.58
899 eth 0.08 0.03 0.00 0.05 0.34
921 old_town 0.08 0.03 0.30 0.40 1.52
916 old_town 0.11 0.04 0.92 1.01 1.99

How to

waste_data_untidy |> 
  pivot_longer(cols = pet:other,
               names_to = "waste_category",
               values_to = "weight") 
objid location waste_category weight
900 eth pet 0.06
900 eth metal_alu 0.06
900 eth glass 0.58
900 eth paper 0.21
900 eth other 1.14
899 eth pet 0.14
899 eth metal_alu 0.01
899 eth glass 0.18
899 eth paper 0.28
899 eth other 3.04
921 old_town pet 0.00
921 old_town metal_alu 0.00
921 old_town glass 0.00
921 old_town paper 0.41
921 old_town other 1.57
916 old_town pet 0.17
916 old_town metal_alu 0.04
916 old_town glass 0.80
916 old_town paper 0.55
916 old_town other 0.62
900 eth pet 0.10
900 eth metal_alu 0.04
900 eth glass 0.00
900 eth paper 0.40
900 eth other 0.58
899 eth pet 0.08
899 eth metal_alu 0.03
899 eth glass 0.00
899 eth paper 0.05
899 eth other 0.34
921 old_town pet 0.08
921 old_town metal_alu 0.03
921 old_town glass 0.30
921 old_town paper 0.40
921 old_town other 1.52
916 old_town pet 0.11
916 old_town metal_alu 0.04
916 old_town glass 0.92
916 old_town paper 1.01
916 old_town other 1.99

How to

waste_category_levels <- c("glass", "metal_alu", "paper", "pet", "other")

waste_data_untidy |> 
  pivot_longer(cols = pet:other,
               names_to = "waste_category",
               values_to = "weight") |> 
  mutate(waste_category = factor(waste_category, levels = waste_category_levels)) 
objid location waste_category weight
900 eth pet 0.06
900 eth metal_alu 0.06
900 eth glass 0.58
900 eth paper 0.21
900 eth other 1.14
899 eth pet 0.14
899 eth metal_alu 0.01
899 eth glass 0.18
899 eth paper 0.28
899 eth other 3.04
921 old_town pet 0.00
921 old_town metal_alu 0.00
921 old_town glass 0.00
921 old_town paper 0.41
921 old_town other 1.57
916 old_town pet 0.17
916 old_town metal_alu 0.04
916 old_town glass 0.80
916 old_town paper 0.55
916 old_town other 0.62
900 eth pet 0.10
900 eth metal_alu 0.04
900 eth glass 0.00
900 eth paper 0.40
900 eth other 0.58
899 eth pet 0.08
899 eth metal_alu 0.03
899 eth glass 0.00
899 eth paper 0.05
899 eth other 0.34
921 old_town pet 0.08
921 old_town metal_alu 0.03
921 old_town glass 0.30
921 old_town paper 0.40
921 old_town other 1.52
916 old_town pet 0.11
916 old_town metal_alu 0.04
916 old_town glass 0.92
916 old_town paper 1.01
916 old_town other 1.99

Three variables -> three aesthetics

ggplot(data = waste_data_tidy,
       mapping = aes(x = objid, 
                     y = weight, 
                     fill = waste_category)) +
  geom_col() + 
  scale_fill_brewer(type = "qual")

Live Coding Exercise: Pivoting

live-05a-tidyr-pivoting

  1. Head over to posit.cloud
  2. Open the workspace for the course cven5999-ss25
  3. Open “Content”
  4. Open your project
  5. Follow along with me

Homework week 5

Homework due dates

  • All material on course website
  • Homework assignment & learning reflection due: 2025-07-04

Thanks! 🌻

Slides created via revealjs and Quarto: https://quarto.org/docs/presentations/revealjs/

Access slides as PDF on GitHub

All material is licensed under Creative Commons Attribution Share Alike 4.0 International.

References

Ben Aleya, Ali, Daniel Biek, Lin Boynton, Julia Jaeggi, Sebastian Camilo Loos, Chiara Meyer-Piening, Jonathan Olal Ogwang, et al. 2022. “Research Beyond the Lab, Spring Term 2022, Global Health Engineering, ETH Zurich. Raw Data and Analysis-Ready Derived Data on Waste Management in Public Spaces in Zurich, Switzerland.” Zenodo. https://doi.org/10.5281/ZENODO.7331119.