library(dplyr) library(nycflights13)
There are several verbs most commonly used in dplyr
to modify datasets.
Select tailnum
, type
, model
variables from the dataframe planes
:
select(planes, tailnum, type, model)
## # A tibble: 3,322 × 3
## tailnum type model
## <chr> <chr> <chr>
## 1 N10156 Fixed wing multi engine EMB-145XR
## 2 N102UW Fixed wing multi engine A320-214
## 3 N103US Fixed wing multi engine A320-214
## 4 N104UW Fixed wing multi engine A320-214
## 5 N10575 Fixed wing multi engine EMB-145LR
## 6 N105UW Fixed wing multi engine A320-214
## 7 N107US Fixed wing multi engine A320-214
## 8 N108UW Fixed wing multi engine A320-214
## 9 N109UW Fixed wing multi engine A320-214
## 10 N110UW Fixed wing multi engine A320-214
## # ... with 3,312 more rows
Rewrite the statement above with the forward-pipe operator (%>%
) from the magrittr package:
planes %>% select(tailnum, type, model)
## # A tibble: 3,322 × 3
## tailnum type model
## <chr> <chr> <chr>
## 1 N10156 Fixed wing multi engine EMB-145XR
## 2 N102UW Fixed wing multi engine A320-214
## 3 N103US Fixed wing multi engine A320-214
## 4 N104UW Fixed wing multi engine A320-214
## 5 N10575 Fixed wing multi engine EMB-145LR
## 6 N105UW Fixed wing multi engine A320-214
## 7 N107US Fixed wing multi engine A320-214
## 8 N108UW Fixed wing multi engine A320-214
## 9 N109UW Fixed wing multi engine A320-214
## 10 N110UW Fixed wing multi engine A320-214
## # ... with 3,312 more rows
filter
rows based on crieria.
Return a dataset where manufacturer
is "EMBRAER":
planes %>% filter(manufacturer == "EMBRAER")
## # A tibble: 299 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## 3 N11106 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 4 N11107 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 5 N11109 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 6 N11113 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 7 N11119 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 8 N11121 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 9 N11127 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 10 N11137 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## # ... with 289 more rows, and 3 more variables: seats <int>, speed <int>,
## # engine <chr>
Return a dataset where manufacturer
is "EMBRAER" and model
is "EMB-145XR":
planes %>% filter(manufacturer == "EMBRAER", model == "EMB-145XR")
## # A tibble: 104 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N11106 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 3 N11107 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 4 N11109 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 5 N11113 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 6 N11119 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 7 N11121 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 8 N11127 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 9 N11137 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 10 N11140 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## # ... with 94 more rows, and 3 more variables: seats <int>, speed <int>,
## # engine <chr>
The statement above is the same as writing an "AND" condition.
planes %>% filter(manufacturer == "EMBRAER" & model == "EMB-145XR")
## # A tibble: 104 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N11106 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 3 N11107 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 4 N11109 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 5 N11113 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 6 N11119 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 7 N11121 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 8 N11127 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 9 N11137 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 10 N11140 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## # ... with 94 more rows, and 3 more variables: seats <int>, speed <int>,
## # engine <chr>
Use the pipe (|) character for "OR" conditions:
planes %>% filter(manufacturer == "EMBRAER" | model == "EMB-145XR")
## # A tibble: 299 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## 3 N11106 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 4 N11107 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 5 N11109 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 6 N11113 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 7 N11119 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 8 N11121 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 9 N11127 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 10 N11137 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## # ... with 289 more rows, and 3 more variables: seats <int>, speed <int>,
## # engine <chr>
Use grepl
in combination with filter
for pattern-matching conditions.
planes %>% filter(grepl("^172.", model))
## # A tibble: 3 × 9
## tailnum year type manufacturer model engines seats
## <chr> <int> <chr> <chr> <chr> <int> <int>
## 1 N378AA 1963 Fixed wing single engine CESSNA 172E 1 4
## 2 N621AA 1975 Fixed wing single engine CESSNA 172M 1 4
## 3 N737MQ 1977 Fixed wing single engine CESSNA 172N 1 4
## # ... with 2 more variables: speed <int>, engine <chr>
Return all rows where year
is between
2004 and 2005:
planes %>% filter(between(year, 2004, 2005))
## # A tibble: 354 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N11155 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 3 N11164 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 4 N11165 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 5 N11176 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 6 N11181 2005 Fixed wing multi engine EMBRAER EMB-145XR 2
## 7 N11184 2005 Fixed wing multi engine EMBRAER EMB-145XR 2
## 8 N11187 2005 Fixed wing multi engine EMBRAER EMB-145XR 2
## 9 N11189 2005 Fixed wing multi engine EMBRAER EMB-145XR 2
## 10 N11191 2005 Fixed wing multi engine EMBRAER EMB-145XR 2
## # ... with 344 more rows, and 3 more variables: seats <int>, speed <int>,
## # engine <chr>
slice
returns only rows by the given index.
Return the first five rows of data (same as the base head
function):
planes %>% slice(1:5)
## # A tibble: 5 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## # ... with 3 more variables: seats <int>, speed <int>, engine <chr>
Return the 1st, 3rd, and 5th rows of data:
planes %>% slice(c(1, 3, 5))
## # A tibble: 3 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 3 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## # ... with 3 more variables: seats <int>, speed <int>, engine <chr>
Return the first and last rows:
planes %>% slice(c(1, nrow(planes)))
## # A tibble: 2 × 9
## tailnum year type manufacturer
## <chr> <int> <chr> <chr>
## 1 N10156 2004 Fixed wing multi engine EMBRAER
## 2 N999DN 1992 Fixed wing multi engine MCDONNELL DOUGLAS CORPORATION
## # ... with 5 more variables: model <chr>, engines <int>, seats <int>,
## # speed <int>, engine <chr>
mutate
can add new variables or modify existing variables.
Add a dummy variable, engine.dummy
with a default value of 0:
planes %>% mutate(engine.dummy = 0) %>% select(engine, engine.dummy)
## # A tibble: 3,322 × 2
## engine engine.dummy
## <chr> <dbl>
## 1 Turbo-fan 0
## 2 Turbo-fan 0
## 3 Turbo-fan 0
## 4 Turbo-fan 0
## 5 Turbo-fan 0
## 6 Turbo-fan 0
## 7 Turbo-fan 0
## 8 Turbo-fan 0
## 9 Turbo-fan 0
## 10 Turbo-fan 0
## # ... with 3,312 more rows
Using dplyr::if_else
, add engine.dummy
set to 1 if engine
== "Turbo-fan", otherwise set engine.dummy
to 0:
planes %>% mutate(engine.dummy = if_else(engine == "Turbo-fan", 1, 0)) %>% select(engine, engine.dummy)
## # A tibble: 3,322 × 2
## engine engine.dummy
## <chr> <dbl>
## 1 Turbo-fan 1
## 2 Turbo-fan 1
## 3 Turbo-fan 1
## 4 Turbo-fan 1
## 5 Turbo-fan 1
## 6 Turbo-fan 1
## 7 Turbo-fan 1
## 8 Turbo-fan 1
## 9 Turbo-fan 1
## 10 Turbo-fan 1
## # ... with 3,312 more rows
Convert planes$engine
to a factor.
planes %>% mutate(engine = as.factor(engine)) %>% select(engine)
## # A tibble: 3,322 × 1
## engine
## <fctr>
## 1 Turbo-fan
## 2 Turbo-fan
## 3 Turbo-fan
## 4 Turbo-fan
## 5 Turbo-fan
## 6 Turbo-fan
## 7 Turbo-fan
## 8 Turbo-fan
## 9 Turbo-fan
## 10 Turbo-fan
## # ... with 3,312 more rows
Use arrange
to sort your dataframe.
Arrange planes
by year
:
planes %>% arrange(year)
## # A tibble: 3,322 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N381AA 1956 Fixed wing multi engine DOUGLAS DC-7BF 4
## 2 N201AA 1959 Fixed wing single engine CESSNA 150 1
## 3 N567AA 1959 Fixed wing single engine DEHAVILLAND OTTER DHC-3 1
## 4 N378AA 1963 Fixed wing single engine CESSNA 172E 1
## 5 N575AA 1963 Fixed wing single engine CESSNA 210-5(205) 1
## 6 N14629 1965 Fixed wing multi engine BOEING 737-524 2
## 7 N615AA 1967 Fixed wing multi engine BEECH 65-A90 2
## 8 N425AA 1968 Fixed wing single engine PIPER PA-28-180 1
## 9 N383AA 1972 Fixed wing multi engine BEECH E-90 2
## 10 N364AA 1973 Fixed wing multi engine CESSNA 310Q 2
## # ... with 3,312 more rows, and 3 more variables: seats <int>,
## # speed <int>, engine <chr>
arrange
planes
by year
desc
:
planes %>% arrange(desc(year))
## # A tibble: 3,322 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N150UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 2 N151UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 3 N152UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 4 N153UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 5 N154UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 6 N155UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 7 N156UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 8 N157UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 9 N198UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 10 N199UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## # ... with 3,312 more rows, and 3 more variables: seats <int>,
## # speed <int>, engine <chr>
group_by
allows you to perform operations on a dataframe by subsets without extracting the subset.
df <- planes %>% group_by(manufacturer, model)
The returned dataframe may not appear grouped. However, the class
and attributes
of the dataframe will confirm it is.
class(df)
## [1] "grouped_df" "tbl_df" "tbl" "data.frame"
attributes(df)$vars
## [[1]]
## manufacturer
##
## [[2]]
## model
head(attributes(df)$labels, n = 5L)
## manufacturer model
## 1 AGUSTA SPA A109E
## 2 AIRBUS A319-112
## 3 AIRBUS A319-114
## 4 AIRBUS A319-115
## 5 AIRBUS A319-131
If you wish to add grouping elements to the dataframe without removing existing grouping elements, use the add
parameter set to TRUE (set to FALSE by default):
df <- df %>% group_by(type, year, add = TRUE)
class(df)
## [1] "grouped_df" "tbl_df" "tbl" "data.frame"
attributes(df)$vars
## [[1]]
## manufacturer
##
## [[2]]
## model
##
## [[3]]
## type
##
## [[4]]
## year
head(attributes(df)$labels, n = 5L)
## manufacturer model type year
## 1 AGUSTA SPA A109E Rotorcraft 2001
## 2 AIRBUS A319-112 Fixed wing multi engine 2002
## 3 AIRBUS A319-112 Fixed wing multi engine 2005
## 4 AIRBUS A319-112 Fixed wing multi engine 2006
## 5 AIRBUS A319-112 Fixed wing multi engine 2007
If you want to remove grouping use ungroup
.
df <- df %>% ungroup()
class(df)
## [1] "tbl_df" "tbl" "data.frame"
attributes(df)$vars
## NULL
attributes(df)$labels
## NULL
summarise
is used to perform calculations on a dataset either as a whole or by groups.
Find the mean
number of of seats
per manufacturer
?
planes %>% group_by(manufacturer) %>% summarise(Mean = mean(seats))
## # A tibble: 35 × 2
## manufacturer Mean
## <chr> <dbl>
## 1 AGUSTA SPA 8.0000
## 2 AIRBUS 221.2024
## 3 AIRBUS INDUSTRIE 187.4025
## 4 AMERICAN AIRCRAFT INC 2.0000
## 5 AVIAT AIRCRAFT INC 2.0000
## 6 AVIONS MARCEL DASSAULT 12.0000
## 7 BARKER JACK L 2.0000
## 8 BEECH 9.5000
## 9 BELL 8.0000
## 10 BOEING 175.1877
## # ... with 25 more rows
summarise
will not return variables that are not explicitly grouped or included in summary functions. If you want to add another variable you must pass it as a predicate to group_by
or summarise
.
planes %>% group_by(year, manufacturer) %>% summarise(Mean = mean(seats))
## Source: local data frame [164 x 3]
## Groups: year [?]
##
## year manufacturer Mean
## <int> <chr> <dbl>
## 1 1956 DOUGLAS 102
## 2 1959 CESSNA 2
## 3 1959 DEHAVILLAND 16
## 4 1963 CESSNA 5
## 5 1965 BOEING 149
## 6 1967 BEECH 9
## 7 1968 PIPER 4
## 8 1972 BEECH 10
## 9 1973 CESSNA 6
## 10 1974 CANADAIR LTD 2
## # ... with 154 more rows
rename
a variable:
planes %>% rename(Mfr = manufacturer) %>% names()
## [1] "tailnum" "year" "type" "Mfr" "model" "engines" "seats"
## [8] "speed" "engine"