1. Data Manipulations
Selecting
select(litters_data, group, litter_number, starts_with("pups"))
## # A tibble: 49 x 5
## group litter_number pups_born_alive pups_dead_birth pups_survive
## <chr> <chr> <int> <int> <int>
## 1 Con7 #85 3 4 3
## 2 Con7 #1/2/95/2 8 0 7
## 3 Con7 #5/5/3/83/3-3 6 0 5
## # … with 46 more rows
select(litters_data, litter_number, group, everything())
## # A tibble: 49 x 8
## litter_number group gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 #85 Con7 19.7 34.7 20 3
## 2 #1/2/95/2 Con7 27 42 19 8
## 3 #5/5/3/83/3-3 Con7 26 41.4 19 6
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## # pups_survive <int>
# remove variable
select(litters_data, -group)
## # A tibble: 49 x 7
## litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <dbl> <dbl> <int> <int>
## 1 #85 19.7 34.7 20 3
## 2 #1/2/95/2 27 42 19 8
## 3 #5/5/3/83/3-3 26 41.4 19 6
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## # pups_survive <int>
select(litters_data, litter_number, gd0_weight:pups_born_alive)
## # A tibble: 49 x 5
## litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <dbl> <dbl> <int> <int>
## 1 #85 19.7 34.7 20 3
## 2 #1/2/95/2 27 42 19 8
## 3 #5/5/3/83/3-3 26 41.4 19 6
## # … with 46 more rows
# renmae the variable
select(litters_data, GROUP = group, litter_number)
## # A tibble: 49 x 2
## GROUP litter_number
## <chr> <chr>
## 1 Con7 #85
## 2 Con7 #1/2/95/2
## 3 Con7 #5/5/3/83/3-3
## # … with 46 more rows
# just rename the variable but no selection
rename(litters_data, GROUP = group)
## # A tibble: 49 x 8
## GROUP litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 Con7 #85 19.7 34.7 20 3
## 2 Con7 #1/2/95/2 27 42 19 8
## 3 Con7 #5/5/3/83/3-3 26 41.4 19 6
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## # pups_survive <int>
Filtering
filter(litters_data, group == 'Con7')
## # A tibble: 7 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 Con7 #85 19.7 34.7 20 3
## 2 Con7 #1/2/95/2 27 42 19 8
## 3 Con7 #5/5/3/83/3-3 26 41.4 19 6
## 4 Con7 #5/4/2/95/2 28.5 44.1 19 5
## 5 Con7 #4/2/95/3-3 NA NA 20 6
## 6 Con7 #2/2/95/3-2 NA NA 20 6
## 7 Con7 #1/5/3/83/3-… NA NA 20 9
## # … with 2 more variables: pups_dead_birth <int>, pups_survive <int>
filter(litters_data, pups_born_alive < 6)
## # A tibble: 8 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 Con7 #85 19.7 34.7 20 3
## 2 Con7 #5/4/2/95/2 28.5 44.1 19 5
## 3 Con8 #2/2/95/2 NA NA 19 5
## 4 Mod7 #3/82/3-2 28 45.9 20 5
## 5 Mod7 #5/3/83/5-2 22.6 37 19 5
## 6 Mod7 #106 21.7 37.8 20 5
## 7 Low7 #111 25.5 44.6 20 3
## 8 Low8 #4/84 21.8 35.2 20 4
## # … with 2 more variables: pups_dead_birth <int>, pups_survive <int>
# group == con7 and/or con8
filter(litters_data, group %in% c("Con7", "Con8"))
## # A tibble: 15 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 Con7 #85 19.7 34.7 20 3
## 2 Con7 #1/2/95/2 27 42 19 8
## 3 Con7 #5/5/3/83/3-3 26 41.4 19 6
## 4 Con7 #5/4/2/95/2 28.5 44.1 19 5
## 5 Con7 #4/2/95/3-3 NA NA 20 6
## 6 Con7 #2/2/95/3-2 NA NA 20 6
## 7 Con7 #1/5/3/83/3-… NA NA 20 9
## 8 Con8 #3/83/3-3 NA NA 20 9
## 9 Con8 #2/95/3 NA NA 20 8
## 10 Con8 #3/5/2/2/95 28.5 NA 20 8
## 11 Con8 #5/4/3/83/3 28 NA 19 9
## 12 Con8 #1/6/2/2/95-2 NA NA 20 7
## 13 Con8 #3/5/3/83/3-… NA NA 20 8
## 14 Con8 #2/2/95/2 NA NA 19 5
## 15 Con8 #3/6/2/2/95-3 NA NA 20 7
## # … with 2 more variables: pups_dead_birth <int>, pups_survive <int>
filter(litters_data, group == 'Con7' | group == 'Con8')
## # A tibble: 15 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 Con7 #85 19.7 34.7 20 3
## 2 Con7 #1/2/95/2 27 42 19 8
## 3 Con7 #5/5/3/83/3-3 26 41.4 19 6
## 4 Con7 #5/4/2/95/2 28.5 44.1 19 5
## 5 Con7 #4/2/95/3-3 NA NA 20 6
## 6 Con7 #2/2/95/3-2 NA NA 20 6
## 7 Con7 #1/5/3/83/3-… NA NA 20 9
## 8 Con8 #3/83/3-3 NA NA 20 9
## 9 Con8 #2/95/3 NA NA 20 8
## 10 Con8 #3/5/2/2/95 28.5 NA 20 8
## 11 Con8 #5/4/3/83/3 28 NA 19 9
## 12 Con8 #1/6/2/2/95-2 NA NA 20 7
## 13 Con8 #3/5/3/83/3-… NA NA 20 8
## 14 Con8 #2/2/95/2 NA NA 19 5
## 15 Con8 #3/6/2/2/95-3 NA NA 20 7
## # … with 2 more variables: pups_dead_birth <int>, pups_survive <int>
filter(litters_data, gd0_weight < gd18_weight)
## # A tibble: 31 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 Con7 #85 19.7 34.7 20 3
## 2 Con7 #1/2/95/2 27 42 19 8
## 3 Con7 #5/5/3/83/3-3 26 41.4 19 6
## # … with 28 more rows, and 2 more variables: pups_dead_birth <int>,
## # pups_survive <int>
# don't do this:
filter(litters_data, !is.na(gd0_weight))
## # A tibble: 34 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 Con7 #85 19.7 34.7 20 3
## 2 Con7 #1/2/95/2 27 42 19 8
## 3 Con7 #5/5/3/83/3-3 26 41.4 19 6
## # … with 31 more rows, and 2 more variables: pups_dead_birth <int>,
## # pups_survive <int>
# recommend:
drop_na(litters_data, gd0_weight)
## # A tibble: 34 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 Con7 #85 19.7 34.7 20 3
## 2 Con7 #1/2/95/2 27 42 19 8
## 3 Con7 #5/5/3/83/3-3 26 41.4 19 6
## # … with 31 more rows, and 2 more variables: pups_dead_birth <int>,
## # pups_survive <int>
Mutate the data
mutate(
litters_data,
wt_gain = gd18_weight - gd0_weight,
group = str_to_lower(group),
group = str_to_upper(group))
## # A tibble: 49 x 9
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 CON7 #85 19.7 34.7 20 3
## 2 CON7 #1/2/95/2 27 42 19 8
## 3 CON7 #5/5/3/83/3-3 26 41.4 19 6
## # … with 46 more rows, and 3 more variables: pups_dead_birth <int>,
## # pups_survive <int>, wt_gain <dbl>
Arrange
arrange(litters_data, pups_born_alive)
## # A tibble: 49 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 Con7 #85 19.7 34.7 20 3
## 2 Low7 #111 25.5 44.6 20 3
## 3 Low8 #4/84 21.8 35.2 20 4
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## # pups_survive <int>
arrange(litters_data, pups_born_alive, gd0_weight)
## # A tibble: 49 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 Con7 #85 19.7 34.7 20 3
## 2 Low7 #111 25.5 44.6 20 3
## 3 Low8 #4/84 21.8 35.2 20 4
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## # pups_survive <int>
An Example
litters_data =
read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>%
janitor::clean_names() %>%
select(-pups_survive) %>%
mutate(
wt_gain = gd18_weight - gd0_weight,
group = str_to_lower(group)) %>%
drop_na(wt_gain)
litters_data
## # A tibble: 31 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 con7 #85 19.7 34.7 20 3
## 2 con7 #1/2/95/2 27 42 19 8
## 3 con7 #5/5/3/83/3-3 26 41.4 19 6
## # … with 28 more rows, and 2 more variables: pups_dead_birth <int>,
## # wt_gain <dbl>
litters_data %>% view()
2. Tidy the Data
pulse_data =
haven::read_sas("./data/public_pulse_data.sas7bdat") %>%
janitor::clean_names() %>%
pivot_longer(
bdi_score_bl:bdi_score_12m,
names_to = 'visit',
names_prefix = 'bdi_score_',
values_to = 'bdi'
) %>%
mutate(
visit = recode(visit, 'bl' = '00m')
)
pulse_data
## # A tibble: 4,348 x 5
## id age sex visit bdi
## <dbl> <dbl> <chr> <chr> <dbl>
## 1 10003 48.0 male 00m 7
## 2 10003 48.0 male 01m 1
## 3 10003 48.0 male 06m 2
## # … with 4,345 more rows
litters_data =
read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>%
janitor::clean_names() %>%
separate(col = group, into = c('dose','day_of_tx'), sep = 3)
Untidy the data
analysis_result = tibble(
group = c("treatment", "treatment", "placebo", "placebo"),
time = c("pre", "post", "pre", "post"),
mean = c(4, 8, 3.5, 4)
)
analysis_result
## # A tibble: 4 x 3
## group time mean
## <chr> <chr> <dbl>
## 1 treatment pre 4
## 2 treatment post 8
## 3 placebo pre 3.5
## 4 placebo post 4
pivot_wider(
analysis_result,
names_from = "time",
values_from = "mean")
## # A tibble: 2 x 3
## group pre post
## <chr> <dbl> <dbl>
## 1 treatment 4 8
## 2 placebo 3.5 4
fellowship_ring =
readxl::read_excel("./data/LotR_Words.xlsx", range = "B3:D6") %>%
mutate(movie = "fellowship_ring")
two_towers =
readxl::read_excel("./data/LotR_Words.xlsx", range = "F3:H6") %>%
mutate(movie = "two_towers")
return_king =
readxl::read_excel("./data/LotR_Words.xlsx", range = "J3:L6") %>%
mutate(movie = "return_king")
lotr_tidy =
bind_rows(fellowship_ring, two_towers, return_king) %>%
janitor::clean_names() %>%
pivot_longer(
female:male,
names_to = "sex",
values_to = "words") %>%
mutate(race = str_to_lower(race)) %>%
select(movie, everything())
lotr_tidy
## # A tibble: 18 x 4
## movie race sex words
## <chr> <chr> <chr> <dbl>
## 1 fellowship_ring elf female 1229
## 2 fellowship_ring elf male 971
## 3 fellowship_ring hobbit female 14
## 4 fellowship_ring hobbit male 3644
## 5 fellowship_ring man female 0
## 6 fellowship_ring man male 1995
## 7 two_towers elf female 331
## 8 two_towers elf male 513
## 9 two_towers hobbit female 0
## 10 two_towers hobbit male 2463
## 11 two_towers man female 401
## 12 two_towers man male 3589
## 13 return_king elf female 183
## 14 return_king elf male 510
## 15 return_king hobbit female 2
## 16 return_king hobbit male 2673
## 17 return_king man female 268
## 18 return_king man male 2459
Joining datasets
pup_data =
read_csv("./data/FAS_pups.csv", col_types = "ciiiii") %>%
janitor::clean_names() %>%
mutate(sex = recode(sex, `1` = "male", `2` = "female"))
pup_data
## # A tibble: 313 x 6
## litter_number sex pd_ears pd_eyes pd_pivot pd_walk
## <chr> <chr> <int> <int> <int> <int>
## 1 #85 male 4 13 7 11
## 2 #85 male 4 13 7 12
## 3 #1/2/95/2 male 5 13 7 9
## # … with 310 more rows
litter_data =
read_csv("./data/FAS_litters.csv", col_types = "ccddiiii") %>%
janitor::clean_names() %>%
select(-pups_survive) %>%
mutate(
wt_gain = gd18_weight - gd0_weight,
group = str_to_lower(group))
litter_data
## # A tibble: 49 x 8
## group litter_number gd0_weight gd18_weight gd_of_birth pups_born_alive
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 con7 #85 19.7 34.7 20 3
## 2 con7 #1/2/95/2 27 42 19 8
## 3 con7 #5/5/3/83/3-3 26 41.4 19 6
## # … with 46 more rows, and 2 more variables: pups_dead_birth <int>,
## # wt_gain <dbl>
fas_data =
left_join(pup_data, litter_data, by = "litter_number") # by = c('xx', 'xx') or by = c('a' = 'b')
fas_data
## # A tibble: 313 x 13
## litter_number sex pd_ears pd_eyes pd_pivot pd_walk group gd0_weight
## <chr> <chr> <int> <int> <int> <int> <chr> <dbl>
## 1 #85 male 4 13 7 11 con7 19.7
## 2 #85 male 4 13 7 12 con7 19.7
## 3 #1/2/95/2 male 5 13 7 9 con7 27
## # … with 310 more rows, and 5 more variables: gd18_weight <dbl>,
## # gd_of_birth <int>, pups_born_alive <int>, pups_dead_birth <int>,
## # wt_gain <dbl>
full_join(pup_data, litter_data, by = 'litter_number')
## # A tibble: 315 x 13
## litter_number sex pd_ears pd_eyes pd_pivot pd_walk group gd0_weight
## <chr> <chr> <int> <int> <int> <int> <chr> <dbl>
## 1 #85 male 4 13 7 11 con7 19.7
## 2 #85 male 4 13 7 12 con7 19.7
## 3 #1/2/95/2 male 5 13 7 9 con7 27
## # … with 312 more rows, and 5 more variables: gd18_weight <dbl>,
## # gd_of_birth <int>, pups_born_alive <int>, pups_dead_birth <int>,
## # wt_gain <dbl>