com um clique
tidyverse-patterns
// Modern tidyverse patterns for R including pipes, joins, grouping, purrr, and stringr. Use when writing tidyverse R code.
// Modern tidyverse patterns for R including pipes, joins, grouping, purrr, and stringr. Use when writing tidyverse R code.
R style guide covering naming conventions, spacing, layout, and function design best practices. Use when writing R code.
Patterns for Bayesian inference in R using brms, including multilevel models, DAG validation, and marginal effects. Use when performing Bayesian analysis.
Test-driven development workflow for R using testthat. Use when writing new features, fixing bugs, or refactoring code. Enforces test-first development with 80%+ coverage.
R object-oriented programming guide for S7, S3, S4, and vctrs. Use when designing R classes or choosing an OOP system.
R package development guide covering dependencies, API design, testing, and documentation. Use when developing R packages.
R performance best practices including profiling, benchmarking, vctrs, and optimization strategies. Use when optimizing R code.
| name | tidyverse-patterns |
| description | Modern tidyverse patterns for R including pipes, joins, grouping, purrr, and stringr. Use when writing tidyverse R code. |
Best practices for modern tidyverse development with dplyr 1.1+ and R 4.3+
Native pipe (|> not %>%)
- Use |> instead of magrittr %>%
# Good - Modern native pipe
data |>
filter(year >= 2020) |>
summarise(mean_value = mean(value))
# Avoid - Legacy magrittr pipe
data %>%
filter(year >= 2020) %>%
summarise(mean_value = mean(value))
- Use join_by() instead of character vectors for joins
# Good - Modern join syntax
transactions |>
inner_join(companies, by = join_by(company == id))
# Good - Inequality joins
transactions |>
inner_join(companies, join_by(company == id, year >= since))
# Good - Rolling joins (closest match)
transactions |>
inner_join(companies, join_by(company == id, closest(year >= since)))
# Avoid - Old character vector syntax
transactions |>
inner_join(companies, by = c("company" = "id"))
- Use relationship to validate join assumptions
- Use unmatched = "error" to catch unexpected non-matches
- Use na_matches = "never" to prevent silent NA joins
- Use the tidylog:: prefix interactively to verify join results
# Validate 1:1 relationship — errors if violated
inner_join(x, y, by = join_by(id),
relationship = "one-to-one")
# Validate many-to-one (left has duplicates, right does not)
left_join(transactions, companies, by = join_by(company == id),
relationship = "many-to-one")
# Ensure all rows from left match something in right
inner_join(x, y, by = join_by(id),
unmatched = "error")
# Prevent NA values from matching each other silently
left_join(x, y, by = join_by(id),
na_matches = "never")
# Combine for strict joins
inner_join(x, y, by = join_by(id),
relationship = "one-to-one",
unmatched = "error",
na_matches = "never")
# Interactive verification with tidylog
# tidylog prints a summary of rows matched/dropped
tidylog::inner_join(x, y, by = join_by(id))
- Use {{}} (embrace) for function arguments
- Use .data[[]] for character vectors
# Data masking functions: arrange(), filter(), mutate(), summarise()
# Tidy selection functions: select(), relocate(), across()
# Function arguments - embrace with {{}}
#' Summarise the mean of one column within groups of another
#'
#' @param data A data frame (or tibble).
#' @param group_var Unquoted grouping column, forwarded with `{{ }}`.
#' @param summary_var Unquoted column to average, forwarded with `{{ }}`.
#' @return A data frame with the grouping column and `mean_val`, one row per
#'   group. `mean()` is called without `na.rm`, so a group containing `NA`
#'   yields `NA`.
my_summary <- function(data, group_var, summary_var) {
  grouped <- group_by(data, {{ group_var }})
  summarise(grouped, mean_val = mean({{ summary_var }}))
}
# Character vectors - use .data[[]]
for (var in names(mtcars)) {
mtcars |> count(.data[[var]]) |> print()
}
# Multiple columns - use across()
data |>
summarise(across({{ summary_vars }}, ~ mean(.x, na.rm = TRUE)))
- Use .by for per-operation grouping (dplyr 1.1+)
- Use pick() for column selection inside data-masking functions
- Use across() for applying functions to multiple columns
- Use reframe() for multi-row summaries
# Good - Per-operation grouping (always returns ungrouped)
data |>
summarise(mean_value = mean(value), .by = category)
# Good - Multiple grouping variables
data |>
summarise(total = sum(revenue), .by = c(company, year))
# Good - pick() for column selection
data |>
summarise(
n_x_cols = ncol(pick(starts_with("x"))),
n_y_cols = ncol(pick(starts_with("y")))
)
# Good - across() for applying functions
data |>
summarise(across(where(is.numeric), mean, .names = "mean_{.col}"), .by = group)
# Good - reframe() for multi-row results
data |>
reframe(quantiles = quantile(x, c(0.25, 0.5, 0.75)), .by = group)
# Avoid - Old persistent grouping pattern
data |>
group_by(category) |>
summarise(mean_value = mean(value)) |>
ungroup()
- Use filter_out() instead of negating conditions — negation (!condition) silently drops NAs
- Use when_any() and when_all() for multi-column OR/AND filters (dplyr 1.2+)
# Problem: negation silently drops rows where condition is NA
filter(data, !(value < 0)) # drops rows where value is NA — silent!
# Good - filter_out() passes NAs through safely
filter_out(data, value < 0) # rows where value is NA are kept
# Good - when_any() for OR across columns (dplyr 1.2+)
# when_any() takes logical vectors and combines them elementwise (like |),
# so each column condition is written out explicitly.
filter(data, when_any(x > 0, y > 0, z > 0)) # any column > 0
# Good - when_all() for AND across columns
# when_all() takes logical vectors and combines them elementwise (like &).
filter(data, when_all(!is.na(x), !is.na(y), !is.na(z))) # no NAs in any
# Avoid - verbose base patterns
filter(data, !(value < 0) | is.na(value)) # workaround, not idiomatic
- Use replace_when() for in-place conditional updates — avoids case_when() with .default = x
- Use case_when() with .unmatched = "error" when all cases should be handled
# Good - replace_when() for in-place updates (type-stable, NAs unaffected)
mutate(data, status = replace_when(status,
value < 0 ~ "negative",
value == 0 ~ "zero"
))
# Avoid - case_when() requires restating the variable in .default
mutate(data, status = case_when(
value < 0 ~ "negative",
value == 0 ~ "zero",
.default = status # repetitive
))
# Good - case_when() with strict exhaustiveness check
mutate(data, grade = case_when(
score >= 90 ~ "A",
score >= 80 ~ "B",
score >= 70 ~ "C",
.unmatched = "error" # error if any row falls through
))
- Use qs2 for fast serialization — successor to qs, not backwards-compatible
# Good - qs2 (use .qs2 extension)
qs2::qs_save(object, "data/results.qs2")
object <- qs2::qs_read("data/results.qs2")
# Avoid - older qs package
qs::qsave(object, "data/results.qs") # outdated
- Use map() |> list_rbind() instead of superseded map_dfr()
- Use walk() for side effects (file writing, plotting)
- Use in_parallel() for scaling across cores
# Modern data frame row binding (purrr 1.0+)
models <- data_splits |>
map(\(split) train_model(split)) |>
list_rbind() # Replaces map_dfr()
# Column binding
summaries <- data_list |>
map(\(df) get_summary_stats(df)) |>
list_cbind() # Replaces map_dfc()
# Side effects with walk()
# walk2() is called only for its side effect (saving each plot). It invisibly
# returns its first input, so the result is not assigned to a "plots" object.
walk2(data_list, plot_names, \(df, name) {
  p <- ggplot(df, aes(x, y)) + geom_point()
  ggsave(name, p)
})
# Parallel processing (purrr 1.1.0+)
library(mirai)
daemons(4) # start 4 background worker processes
# in_parallel() expects a function created inside the call. It runs in a
# separate session on each daemon, so anything it references must be passed
# explicitly through `...`.
results <- large_datasets |>
  map(in_parallel(
    \(x) expensive_computation(x),
    expensive_computation = expensive_computation
  ))
daemons(0) # shut the workers down again
- stringr functions share a consistent str_ prefix and string-first argument order
# Good - stringr (consistent, pipe-friendly)
text |>
str_to_lower() |>
str_trim() |>
str_replace_all("pattern", "replacement") |>
str_extract("\\d+")
# Common patterns
str_detect(text, "pattern") # vs grepl("pattern", text)
str_extract(text, "pattern") # vs complex regmatches()
str_replace_all(text, "a", "b") # vs gsub("a", "b", text)
str_split(text, ",") # vs strsplit(text, ",")
str_length(text) # vs nchar(text)
str_sub(text, 1, 5) # vs substr(text, 1, 5)
# String combination and formatting
str_c("a", "b", "c") # vs paste0()
str_glue("Hello {name}!") # templating
str_pad(text, 10, "left") # padding
str_wrap(text, width = 80) # text wrapping
# Case conversion
str_to_lower(text) # vs tolower()
str_to_upper(text) # vs toupper()
str_to_title(text) # vs tools::toTitleCase()
# Pattern helpers for clarity
str_detect(text, fixed("$")) # literal match
str_detect(text, regex("\\d+")) # explicit regex
str_detect(text, coll("e", locale = "fr")) # collation
# Avoid - inconsistent base R functions
grepl("pattern", text) # argument order varies
regmatches(text, regexpr(...)) # complex extraction
gsub("a", "b", text) # different arg order
# Good - vectorized operations
result <- x + y
# Good - Type-stable purrr functions
map_dbl(data, mean) # always returns double
map_chr(data, class) # always returns character
# Avoid - Type-unstable base functions
sapply(data, mean) # might return list or vector
# Avoid - explicit loops for simple operations
result <- numeric(length(x))
for(i in seq_along(x)) {
result[i] <- x[i] + y[i]
}
# Avoid - Old pipe
data %>% function()
# Avoid - Old join syntax
inner_join(x, y, by = c("a" = "b"))
# Avoid - Implicit type conversion
sapply() # Use map_*() instead
# Avoid - String manipulation in data masking
mutate(data, !!paste0("new_", var) := value)
# Use across() or other approaches instead
# Avoid - Growing objects in loops
result <- c()
for(i in 1:n) {
result <- c(result, compute(i)) # Slow!
}
# Good - Pre-allocate
# seq_len(n) is empty when n is 0, whereas 1:n would iterate over c(1, 0).
result <- vector("list", n)
for (i in seq_len(n)) {
  result[[i]] <- compute(i)
}
# Better - Use purrr
result <- map(seq_len(n), compute)
# Data manipulation
subset(data, condition) -> filter(data, condition)
data[order(data$x), ] -> arrange(data, x)
aggregate(x ~ y, data, mean) -> summarise(data, mean(x), .by = y)
# Functional programming
sapply(x, f) -> map_dbl(x, f) # or map_chr()/map_int()/map_lgl(); type-stable
lapply(x, f) -> map(x, f)
# String manipulation
grepl("pattern", text) -> str_detect(text, "pattern")
gsub("old", "new", text) -> str_replace_all(text, "old", "new")
substr(text, 1, 5) -> str_sub(text, 1, 5)
nchar(text) -> str_length(text)
strsplit(text, ",") -> str_split(text, ",")
paste0(a, b) -> str_c(a, b)
tolower(text) -> str_to_lower(text)
# Pipes
data %>% function() -> data |> function()
# Grouping (dplyr 1.1+)
group_by(data, x) |>
summarise(mean(y)) |>
ungroup() -> summarise(data, mean(y), .by = x)
# Column selection
across(starts_with("x")) -> pick(starts_with("x")) # for selection only
# Joins
by = c("a" = "b") -> by = join_by(a == b)
# Multi-row summaries
summarise(data, x, .groups = "drop") -> reframe(data, x)
# Data reshaping
gather()/spread() -> pivot_longer()/pivot_wider()
# String separation (tidyr 1.3+)
separate(col, into = c("a", "b")) -> separate_wider_delim(col, delim = "_", names = c("a", "b"))
extract(col, into = "x", regex) -> separate_wider_regex(col, patterns = c(x = regex))
map_dfr(x, f) -> map(x, f) |> list_rbind()
map_dfc(x, f) -> map(x, f) |> list_cbind()
map2_dfr(x, y, f) -> map2(x, y, f) |> list_rbind()
pmap_dfr(list, f) -> pmap(list, f) |> list_rbind()
imap_dfr(x, f) -> imap(x, f) |> list_rbind()
# For side effects
walk(x, write_file) # instead of for loops
walk2(data, paths, write_csv) # multiple arguments