1. Data Manipulations

Selecting

select(litters_data, group, litter_number, starts_with("pups"))

## # A tibble: 49 x 5
##   group litter_number pups_born_alive pups_dead_birth pups_survive
##   <chr> <chr>                   <int>           <int>        <int>
## 1 Con7  #85                         3               4            3
## 2 Con7  #1/2/95/2                   8               0            7
## 3 Con7  #5/5/3/83/3-3               6               0            5
## # … with 46 more rows

select(litters_data, litter_number, group, everything())

## # A tibble: 49 x 8
##   litter_number group gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr>         <chr>      <dbl>       <dbl>       <int>           <int>
## 1 #85           Con7        19.7        34.7          20               3
## 2 #1/2/95/2     Con7        27          42            19               8
## 3 #5/5/3/83/3-3 Con7        26          41.4          19               6
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## #   pups_survive <int>

# remove variable
select(litters_data, -group)

## # A tibble: 49 x 7
##   litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr>              <dbl>       <dbl>       <int>           <int>
## 1 #85                 19.7        34.7          20               3
## 2 #1/2/95/2           27          42            19               8
## 3 #5/5/3/83/3-3       26          41.4          19               6
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## #   pups_survive <int>

select(litters_data, litter_number, gd0_weight:pups_born_alive)

## # A tibble: 49 x 5
##   litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr>              <dbl>       <dbl>       <int>           <int>
## 1 #85                 19.7        34.7          20               3
## 2 #1/2/95/2           27          42            19               8
## 3 #5/5/3/83/3-3       26          41.4          19               6
## # … with 46 more rows

# renmae the variable
select(litters_data, GROUP = group, litter_number)

## # A tibble: 49 x 2
##   GROUP litter_number
##   <chr> <chr>        
## 1 Con7  #85          
## 2 Con7  #1/2/95/2    
## 3 Con7  #5/5/3/83/3-3
## # … with 46 more rows

# just rename the variable but no selection
rename(litters_data, GROUP = group)

## # A tibble: 49 x 8
##   GROUP litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 Con7  #85                 19.7        34.7          20               3
## 2 Con7  #1/2/95/2           27          42            19               8
## 3 Con7  #5/5/3/83/3-3       26          41.4          19               6
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## #   pups_survive <int>

Filtering

filter(litters_data, group == 'Con7')

## # A tibble: 7 x 8
##   group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 Con7  #85                 19.7        34.7          20               3
## 2 Con7  #1/2/95/2           27          42            19               8
## 3 Con7  #5/5/3/83/3-3       26          41.4          19               6
## 4 Con7  #5/4/2/95/2         28.5        44.1          19               5
## 5 Con7  #4/2/95/3-3         NA          NA            20               6
## 6 Con7  #2/2/95/3-2         NA          NA            20               6
## 7 Con7  #1/5/3/83/3-…       NA          NA            20               9
## # … with 2 more variables: pups_dead_birth <int>, pups_survive <int>

filter(litters_data, pups_born_alive < 6)

## # A tibble: 8 x 8
##   group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 Con7  #85                 19.7        34.7          20               3
## 2 Con7  #5/4/2/95/2         28.5        44.1          19               5
## 3 Con8  #2/2/95/2           NA          NA            19               5
## 4 Mod7  #3/82/3-2           28          45.9          20               5
## 5 Mod7  #5/3/83/5-2         22.6        37            19               5
## 6 Mod7  #106                21.7        37.8          20               5
## 7 Low7  #111                25.5        44.6          20               3
## 8 Low8  #4/84               21.8        35.2          20               4
## # … with 2 more variables: pups_dead_birth <int>, pups_survive <int>

# group == con7 and/or con8
filter(litters_data, group %in% c("Con7", "Con8"))

## # A tibble: 15 x 8
##    group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##    <chr> <chr>              <dbl>       <dbl>       <int>           <int>
##  1 Con7  #85                 19.7        34.7          20               3
##  2 Con7  #1/2/95/2           27          42            19               8
##  3 Con7  #5/5/3/83/3-3       26          41.4          19               6
##  4 Con7  #5/4/2/95/2         28.5        44.1          19               5
##  5 Con7  #4/2/95/3-3         NA          NA            20               6
##  6 Con7  #2/2/95/3-2         NA          NA            20               6
##  7 Con7  #1/5/3/83/3-…       NA          NA            20               9
##  8 Con8  #3/83/3-3           NA          NA            20               9
##  9 Con8  #2/95/3             NA          NA            20               8
## 10 Con8  #3/5/2/2/95         28.5        NA            20               8
## 11 Con8  #5/4/3/83/3         28          NA            19               9
## 12 Con8  #1/6/2/2/95-2       NA          NA            20               7
## 13 Con8  #3/5/3/83/3-…       NA          NA            20               8
## 14 Con8  #2/2/95/2           NA          NA            19               5
## 15 Con8  #3/6/2/2/95-3       NA          NA            20               7
## # … with 2 more variables: pups_dead_birth <int>, pups_survive <int>

filter(litters_data, group == 'Con7' | group == 'Con8')

## # A tibble: 15 x 8
##    group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##    <chr> <chr>              <dbl>       <dbl>       <int>           <int>
##  1 Con7  #85                 19.7        34.7          20               3
##  2 Con7  #1/2/95/2           27          42            19               8
##  3 Con7  #5/5/3/83/3-3       26          41.4          19               6
##  4 Con7  #5/4/2/95/2         28.5        44.1          19               5
##  5 Con7  #4/2/95/3-3         NA          NA            20               6
##  6 Con7  #2/2/95/3-2         NA          NA            20               6
##  7 Con7  #1/5/3/83/3-…       NA          NA            20               9
##  8 Con8  #3/83/3-3           NA          NA            20               9
##  9 Con8  #2/95/3             NA          NA            20               8
## 10 Con8  #3/5/2/2/95         28.5        NA            20               8
## 11 Con8  #5/4/3/83/3         28          NA            19               9
## 12 Con8  #1/6/2/2/95-2       NA          NA            20               7
## 13 Con8  #3/5/3/83/3-…       NA          NA            20               8
## 14 Con8  #2/2/95/2           NA          NA            19               5
## 15 Con8  #3/6/2/2/95-3       NA          NA            20               7
## # … with 2 more variables: pups_dead_birth <int>, pups_survive <int>

filter(litters_data, gd0_weight < gd18_weight)

## # A tibble: 31 x 8
##   group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 Con7  #85                 19.7        34.7          20               3
## 2 Con7  #1/2/95/2           27          42            19               8
## 3 Con7  #5/5/3/83/3-3       26          41.4          19               6
## # … with 28 more rows, and 2 more variables: pups_dead_birth <int>,
## #   pups_survive <int>

# don't do this:
filter(litters_data, !is.na(gd0_weight))

## # A tibble: 34 x 8
##   group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 Con7  #85                 19.7        34.7          20               3
## 2 Con7  #1/2/95/2           27          42            19               8
## 3 Con7  #5/5/3/83/3-3       26          41.4          19               6
## # … with 31 more rows, and 2 more variables: pups_dead_birth <int>,
## #   pups_survive <int>

# recommend:
drop_na(litters_data, gd0_weight)

## # A tibble: 34 x 8
##   group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 Con7  #85                 19.7        34.7          20               3
## 2 Con7  #1/2/95/2           27          42            19               8
## 3 Con7  #5/5/3/83/3-3       26          41.4          19               6
## # … with 31 more rows, and 2 more variables: pups_dead_birth <int>,
## #   pups_survive <int>

Mutate the data

mutate(
  litters_data, 
  wt_gain = gd18_weight - gd0_weight,
  group = str_to_lower(group),
  group = str_to_upper(group))

## # A tibble: 49 x 9
##   group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 CON7  #85                 19.7        34.7          20               3
## 2 CON7  #1/2/95/2           27          42            19               8
## 3 CON7  #5/5/3/83/3-3       26          41.4          19               6
## # … with 46 more rows, and 3 more variables: pups_dead_birth <int>,
## #   pups_survive <int>, wt_gain <dbl>

Arrange

arrange(litters_data, pups_born_alive)

## # A tibble: 49 x 8
##   group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 Con7  #85                 19.7        34.7          20               3
## 2 Low7  #111                25.5        44.6          20               3
## 3 Low8  #4/84               21.8        35.2          20               4
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## #   pups_survive <int>

arrange(litters_data, pups_born_alive, gd0_weight)

## # A tibble: 49 x 8
##   group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 Con7  #85                 19.7        34.7          20               3
## 2 Low7  #111                25.5        44.6          20               3
## 3 Low8  #4/84               21.8        35.2          20               4
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## #   pups_survive <int>

An Example

litters_data = 
  read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>%
  janitor::clean_names() %>%
  select(-pups_survive) %>%
  mutate(
    wt_gain = gd18_weight - gd0_weight,
    group = str_to_lower(group)) %>% 
  drop_na(wt_gain)

litters_data

## # A tibble: 31 x 8
##   group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 con7  #85                 19.7        34.7          20               3
## 2 con7  #1/2/95/2           27          42            19               8
## 3 con7  #5/5/3/83/3-3       26          41.4          19               6
## # … with 28 more rows, and 2 more variables: pups_dead_birth <int>,
## #   wt_gain <dbl>

litters_data %>% view()

2. Tidy the Data

pulse_data = 
  haven::read_sas("./data/public_pulse_data.sas7bdat") %>%
  janitor::clean_names() %>%
  pivot_longer(
    bdi_score_bl:bdi_score_12m,
    names_to = 'visit',
    names_prefix = 'bdi_score_',
    values_to = 'bdi'
  ) %>%
  mutate(
    visit = recode(visit, 'bl' = '00m')
  )

pulse_data

## # A tibble: 4,348 x 5
##      id   age sex   visit   bdi
##   <dbl> <dbl> <chr> <chr> <dbl>
## 1 10003  48.0 male  00m       7
## 2 10003  48.0 male  01m       1
## 3 10003  48.0 male  06m       2
## # … with 4,345 more rows

litters_data = 
  read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>% 
   janitor::clean_names() %>%
   separate(col = group, into = c('dose','day_of_tx'), sep = 3)

Untidy the data

analysis_result = tibble(
  group = c("treatment", "treatment", "placebo", "placebo"),
  time = c("pre", "post", "pre", "post"),
  mean = c(4, 8, 3.5, 4)
)

analysis_result

## # A tibble: 4 x 3
##   group     time   mean
##   <chr>     <chr> <dbl>
## 1 treatment pre     4  
## 2 treatment post    8  
## 3 placebo   pre     3.5
## 4 placebo   post    4

pivot_wider(
  analysis_result, 
  names_from = "time", 
  values_from = "mean")

## # A tibble: 2 x 3
##   group       pre  post
##   <chr>     <dbl> <dbl>
## 1 treatment   4       8
## 2 placebo     3.5     4

fellowship_ring = 
  readxl::read_excel("./data/LotR_Words.xlsx", range = "B3:D6") %>%
  mutate(movie = "fellowship_ring")

two_towers = 
  readxl::read_excel("./data/LotR_Words.xlsx", range = "F3:H6") %>%
  mutate(movie = "two_towers")

return_king = 
  readxl::read_excel("./data/LotR_Words.xlsx", range = "J3:L6") %>%
  mutate(movie = "return_king")

lotr_tidy = 
  bind_rows(fellowship_ring, two_towers, return_king) %>%
  janitor::clean_names() %>%
  pivot_longer(
    female:male,
    names_to = "sex", 
    values_to = "words") %>%
  mutate(race = str_to_lower(race)) %>% 
  select(movie, everything()) 

lotr_tidy

## # A tibble: 18 x 4
##    movie           race   sex    words
##    <chr>           <chr>  <chr>  <dbl>
##  1 fellowship_ring elf    female  1229
##  2 fellowship_ring elf    male     971
##  3 fellowship_ring hobbit female    14
##  4 fellowship_ring hobbit male    3644
##  5 fellowship_ring man    female     0
##  6 fellowship_ring man    male    1995
##  7 two_towers      elf    female   331
##  8 two_towers      elf    male     513
##  9 two_towers      hobbit female     0
## 10 two_towers      hobbit male    2463
## 11 two_towers      man    female   401
## 12 two_towers      man    male    3589
## 13 return_king     elf    female   183
## 14 return_king     elf    male     510
## 15 return_king     hobbit female     2
## 16 return_king     hobbit male    2673
## 17 return_king     man    female   268
## 18 return_king     man    male    2459

Joining datasets

pup_data = 
  read_csv("./data/FAS_pups.csv", col_types = "ciiiii") %>%
  janitor::clean_names() %>%
  mutate(sex = recode(sex, `1` = "male", `2` = "female")) 

pup_data

## # A tibble: 313 x 6
##   litter_number sex   pd_ears pd_eyes pd_pivot pd_walk
##   <chr>         <chr>   <int>   <int>    <int>   <int>
## 1 #85           male        4      13        7      11
## 2 #85           male        4      13        7      12
## 3 #1/2/95/2     male        5      13        7       9
## # … with 310 more rows

litter_data = 
  read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>%
  janitor::clean_names() %>%
  select(-pups_survive) %>%
  mutate(
    wt_gain = gd18_weight - gd0_weight,
    group = str_to_lower(group))

litter_data

## # A tibble: 49 x 8
##   group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
##   <chr> <chr>              <dbl>       <dbl>       <int>           <int>
## 1 con7  #85                 19.7        34.7          20               3
## 2 con7  #1/2/95/2           27          42            19               8
## 3 con7  #5/5/3/83/3-3       26          41.4          19               6
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## #   wt_gain <dbl>

fas_data = 
  left_join(pup_data, litter_data, by = "litter_number") # by = c('xx', 'xx') or by = c('a' = 'b')

fas_data

## # A tibble: 313 x 13
##   litter_number sex   pd_ears pd_eyes pd_pivot pd_walk group gd0_weight
##   <chr>         <chr>   <int>   <int>    <int>   <int> <chr>      <dbl>
## 1 #85           male        4      13        7      11 con7        19.7
## 2 #85           male        4      13        7      12 con7        19.7
## 3 #1/2/95/2     male        5      13        7       9 con7        27  
## # … with 310 more rows, and 5 more variables: gd18_weight <dbl>,
## #   gd_of_birth <int>, pups_born_alive <int>, pups_dead_birth <int>,
## #   wt_gain <dbl>

full_join(pup_data, litter_data, by = 'litter_number')

## # A tibble: 315 x 13
##   litter_number sex   pd_ears pd_eyes pd_pivot pd_walk group gd0_weight
##   <chr>         <chr>   <int>   <int>    <int>   <int> <chr>      <dbl>
## 1 #85           male        4      13        7      11 con7        19.7
## 2 #85           male        4      13        7      12 con7        19.7
## 3 #1/2/95/2     male        5      13        7       9 con7        27  
## # … with 312 more rows, and 5 more variables: gd18_weight <dbl>,
## #   gd_of_birth <int>, pups_born_alive <int>, pups_dead_birth <int>,
## #   wt_gain <dbl>

Data Manipulation with dplyr

1. Data Manipulations

Selecting

Filtering

Mutate the data

Arrange

An Example

2. Tidy the Data

Untidy the data

Joining datasets