Title: | Additional Tools for Splitting and Cleaning Data |
---|---|
Description: | Move between data frames and lists more efficiently with precision splitting via 'dplyr' verbs. Easily cast variables to different data types. Keep rows with NAs. Shift row values. |
Authors: | David Ranzolin [aut, cre, cph] |
Maintainer: | David Ranzolin <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.0.2 |
Built: | 2025-02-08 04:07:24 UTC |
Source: | https://github.com/daranzolin/hacksaw |
Cast columns to a specified data type
cast_character(.data, ...) cast_numeric(.data, ...) cast_logical(.data, ...)
cast_character(.data, ...) cast_numeric(.data, ...) cast_logical(.data, ...)
.data |
a table of data. |
... |
A selection of columns. |
a data frame.
library(dplyr) df <- tibble(x = 1:3, y = as.character(1:3), z = c(0, 0, 1)) df %>% cast_character(x) df %>% cast_numeric(y) df %>% cast_logical(z)
library(dplyr) df <- tibble(x = 1:3, y = as.character(1:3), z = c(0, 0, 1)) df %>% cast_character(x) df %>% cast_numeric(y) df %>% cast_logical(z)
Grep and filter a data frame by pattern
filter_pattern(.data, col, pattern, ...)
filter_pattern(.data, col, pattern, ...)
.data |
a table of data. |
col |
a variable. |
pattern |
string containing a regular expression to be matched in the given character vector. |
... |
additional arguments passed to grepl. |
a data frame.
library(dplyr) starwars %>% filter_pattern(homeworld, "oo")
library(dplyr) starwars %>% filter_pattern(homeworld, "oo")
Evaluate expressions over a data frame, resulting in a list.
filter_split(.data, ...) select_split(.data, ...) count_split(.data, ...) rolling_count_split(.data, ...) mutate_split(.data, ...) distinct_split(.data, ..., simplify = TRUE) transmute_split(.data, ..., simplify = TRUE) slice_split(.data, ...) pull_split(.data, ...) group_by_split(.data, ...) rolling_group_by_split(.data, ...) nest_by_split(.data, ...) rolling_nest_by_split(.data, ...) eval_split(.data, ...) precision_split(.data, ...)
filter_split(.data, ...) select_split(.data, ...) count_split(.data, ...) rolling_count_split(.data, ...) mutate_split(.data, ...) distinct_split(.data, ..., simplify = TRUE) transmute_split(.data, ..., simplify = TRUE) slice_split(.data, ...) pull_split(.data, ...) group_by_split(.data, ...) rolling_group_by_split(.data, ...) nest_by_split(.data, ...) rolling_nest_by_split(.data, ...) eval_split(.data, ...) precision_split(.data, ...)
.data |
A table of data. |
... |
Expressions to be evaluated. |
simplify |
Boolean, whether to unlist the returned split. |
A list.
library(dplyr) mtcars %>% filter_split(cyl == 4, cyl == 6) iris %>% select_split(starts_with("Sepal"), starts_with("Petal")) mtcars %>% count_split(gear, carb, across(c(cyl, gear))) mtcars %>% rolling_count_split(gear, carb, gear) mtcars %>% mutate_split(mpg2 = mpg^2, mpg3 = mpg^3) mtcars %>% distinct_split(cyl, carb) mtcars %>% transmute_split(mpg^2, sqrt(mpg)) mtcars %>% slice_split(1:10, 11:20) mtcars %>% pull_split(mpg, hp) mtcars %>% group_by_split(cyl, gear, across(c(cyl, gear))) mtcars %>% rolling_group_by_split(cyl, gear, am) mtcars %>% nest_by_split(cyl, gear, am) mtcars %>% rolling_nest_by_split(cyl, gear, am) mtcars %>% eval_split(select(mpg, hp), filter(mpg>25), mutate(mpg2 = mpg^2)) mtcars %>% precision_split(mpg > 25)
library(dplyr) mtcars %>% filter_split(cyl == 4, cyl == 6) iris %>% select_split(starts_with("Sepal"), starts_with("Petal")) mtcars %>% count_split(gear, carb, across(c(cyl, gear))) mtcars %>% rolling_count_split(gear, carb, gear) mtcars %>% mutate_split(mpg2 = mpg^2, mpg3 = mpg^3) mtcars %>% distinct_split(cyl, carb) mtcars %>% transmute_split(mpg^2, sqrt(mpg)) mtcars %>% slice_split(1:10, 11:20) mtcars %>% pull_split(mpg, hp) mtcars %>% group_by_split(cyl, gear, across(c(cyl, gear))) mtcars %>% rolling_group_by_split(cyl, gear, am) mtcars %>% nest_by_split(cyl, gear, am) mtcars %>% rolling_nest_by_split(cyl, gear, am) mtcars %>% eval_split(select(mpg, hp), filter(mpg>25), mutate(mpg2 = mpg^2)) mtcars %>% precision_split(mpg > 25)
Keep rows containing missing values
keep_na(.data, ..., .logic = c("AND", "OR"))
keep_na(.data, ..., .logic = c("AND", "OR"))
.data |
A table of data. |
... |
A selection of columns. If empty, all columns are selected. |
.logic |
a string, either 'AND' or 'OR': the logic used to combine the selected columns when keeping NAs. |
A data frame.
library(dplyr) df <- tibble(x = c(1, 2, NA, NA), y = c("a", NA, "b", NA)) df %>% keep_na() df %>% keep_na(x) vars <- "y" df %>% keep_na(x, any_of(vars))
library(dplyr) df <- tibble(x = c(1, 2, NA, NA), y = c("a", NA, "b", NA)) df %>% keep_na() df %>% keep_na(x) vars <- "y" df %>% keep_na(x, any_of(vars))
Grep, keep or discard a list or vector by pattern
keep_pattern(x, pattern, ...) discard_pattern(x, pattern, ...)
keep_pattern(x, pattern, ...) discard_pattern(x, pattern, ...)
x |
a list or vector. |
pattern |
string containing a regular expression to be matched in the given character vector. |
... |
additional arguments passed to grepl. |
A list.
l <- list("David", "Daniel", "Damien", "Eric", "Jared", "Zach") l %>% keep_pattern("^D") l %>% discard_pattern("^D")
l <- list("David", "Daniel", "Damien", "Eric", "Jared", "Zach") l %>% keep_pattern("^D") l %>% discard_pattern("^D")
These joins will coerce key columns to a common atomic type.
left_join2( x, y, by = NULL, coerce_on_conflict = c("character", "numeric"), suffix = c(".x", ".y"), ..., keep = FALSE ) inner_join2( x, y, by = NULL, coerce_on_conflict = c("character", "numeric"), suffix = c(".x", ".y"), ..., keep = FALSE ) right_join2( x, y, by = NULL, coerce_on_conflict = c("character", "numeric"), suffix = c(".x", ".y"), ..., keep = FALSE ) full_join2( x, y, by = NULL, coerce_on_conflict = c("character", "numeric"), suffix = c(".x", ".y"), ..., keep = FALSE )
left_join2( x, y, by = NULL, coerce_on_conflict = c("character", "numeric"), suffix = c(".x", ".y"), ..., keep = FALSE ) inner_join2( x, y, by = NULL, coerce_on_conflict = c("character", "numeric"), suffix = c(".x", ".y"), ..., keep = FALSE ) right_join2( x, y, by = NULL, coerce_on_conflict = c("character", "numeric"), suffix = c(".x", ".y"), ..., keep = FALSE ) full_join2( x, y, by = NULL, coerce_on_conflict = c("character", "numeric"), suffix = c(".x", ".y"), ..., keep = FALSE )
x |
A data frame |
y |
A data frame |
by |
A character vector of variables to join by. Can be NULL. |
coerce_on_conflict |
The type to which key columns are coerced when their types conflict; either 'character' or 'numeric'. |
suffix |
If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. |
... |
Other parameters passed on to methods |
keep |
Should the join keys from both x and y be preserved in the output? |
a data frame
df1 <- data.frame(x = 1:10, b = 1:10, y = letters[1:10]) df2 <- data.frame(x = as.character(1:10), z = letters[11:20]) left_join2(df1, df2)
df1 <- data.frame(x = 1:10, b = 1:10, y = letters[1:10]) df2 <- data.frame(x = as.character(1:10), z = letters[11:20]) left_join2(df1, df2)
Pluck a value based on other criteria
pluck_when(.x, .p, .i = 1, .else = NA)
pluck_when(.x, .p, .i = 1, .else = NA)
.x |
Vector from which to select value. |
.p |
Logical expression. |
.i |
Which of the TRUE matches of .p to return, by position (the default, 1, returns the first match). |
.else |
If no matches from .p, value to return. |
A vector of length 1.
library(dplyr) df <- tibble( id = c(1, 1, 1, 2, 2, 2, 3, 3), tested = c("no", "no", "yes", "no", "no", "no", "yes", "yes"), year = c(2015:2017, 2010:2012, 2019:2020) ) df %>% group_by(id) %>% mutate(year_first_tested = pluck_when(year, tested == "yes"))
library(dplyr) df <- tibble( id = c(1, 1, 1, 2, 2, 2, 3, 3), tested = c("no", "no", "yes", "no", "no", "no", "yes", "yes"), year = c(2015:2017, 2010:2012, 2019:2020) ) df %>% group_by(id) %>% mutate(year_first_tested = pluck_when(year, tested == "yes"))
Shift row values left or right
shift_row_values(.data, .dir = "left", at = NULL)
shift_row_values(.data, .dir = "left", at = NULL)
.data |
a table of data. |
.dir |
the shift direction as a string, one of "left" or "right". |
at |
the row indices at which to shift. |
a data frame.
library(dplyr) df <- tibble( s = c(NA, 1, NA, NA), t = c(NA, NA, 1, NA), u = c(NA, NA, 2, 5), v = c(5, 1, 9, 2), x = c(1, 5, 6, 7), y = c(NA, NA, 8, NA), z = 1:4 ) df %>% shift_row_values() df %>% shift_row_values(at = 1:3) df %>% shift_row_values(at = 1:2, .dir = "right")
library(dplyr) df <- tibble( s = c(NA, 1, NA, NA), t = c(NA, NA, 1, NA), u = c(NA, NA, 2, 5), v = c(5, 1, 9, 2), x = c(1, 5, 6, 7), y = c(NA, NA, 8, NA), z = 1:4 ) df %>% shift_row_values() df %>% shift_row_values(at = 1:3) df %>% shift_row_values(at = 1:2, .dir = "right")
Return the indices of the n largest values of a variable
var_max(var, n = 6, value = FALSE)
var_max(var, n = 6, value = FALSE)
var |
the variable to use. |
n |
number of indices (or values, if value is TRUE) to return. |
value |
if FALSE, a vector containing the (integer) indices is returned, and if TRUE, a vector containing the elements themselves is returned. |
var_max(1:10)
var_max(1:10)
Return the indices of the n smallest values of a variable
var_min(var, n = 6, value = FALSE)
var_min(var, n = 6, value = FALSE)
var |
the variable to use. |
n |
number of indices (or values, if value is TRUE) to return. |
value |
if FALSE, a vector containing the (integer) indices is returned, and if TRUE, a vector containing the elements themselves is returned. |
var_min(1:10)
var_min(1:10)