library(dplyr) library(nycflights13)
There are several verbs most commonly used in dplyr to modify datasets.
Select tailnum, type, model variables from the dataframe planes:
select(planes, tailnum, type, model)
## # A tibble: 3,322 × 3
## tailnum type model
## <chr> <chr> <chr>
## 1 N10156 Fixed wing multi engine EMB-145XR
## 2 N102UW Fixed wing multi engine A320-214
## 3 N103US Fixed wing multi engine A320-214
## 4 N104UW Fixed wing multi engine A320-214
## 5 N10575 Fixed wing multi engine EMB-145LR
## 6 N105UW Fixed wing multi engine A320-214
## 7 N107US Fixed wing multi engine A320-214
## 8 N108UW Fixed wing multi engine A320-214
## 9 N109UW Fixed wing multi engine A320-214
## 10 N110UW Fixed wing multi engine A320-214
## # ... with 3,312 more rows
Rewrite the statement above with the forward-pipe operator (%>%) from the magrittr package:
planes %>% select(tailnum, type, model)
## # A tibble: 3,322 × 3
## tailnum type model
## <chr> <chr> <chr>
## 1 N10156 Fixed wing multi engine EMB-145XR
## 2 N102UW Fixed wing multi engine A320-214
## 3 N103US Fixed wing multi engine A320-214
## 4 N104UW Fixed wing multi engine A320-214
## 5 N10575 Fixed wing multi engine EMB-145LR
## 6 N105UW Fixed wing multi engine A320-214
## 7 N107US Fixed wing multi engine A320-214
## 8 N108UW Fixed wing multi engine A320-214
## 9 N109UW Fixed wing multi engine A320-214
## 10 N110UW Fixed wing multi engine A320-214
## # ... with 3,312 more rows
filter rows based on crieria.
Return a dataset where manufacturer is "EMBRAER":
planes %>% filter(manufacturer == "EMBRAER")
## # A tibble: 299 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## 3 N11106 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 4 N11107 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 5 N11109 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 6 N11113 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 7 N11119 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 8 N11121 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 9 N11127 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 10 N11137 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## # ... with 289 more rows, and 3 more variables: seats <int>, speed <int>,
## # engine <chr>
Return a dataset where manufacturer is "EMBRAER" and model is "EMB-145XR":
planes %>% filter(manufacturer == "EMBRAER", model == "EMB-145XR")
## # A tibble: 104 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N11106 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 3 N11107 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 4 N11109 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 5 N11113 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 6 N11119 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 7 N11121 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 8 N11127 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 9 N11137 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 10 N11140 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## # ... with 94 more rows, and 3 more variables: seats <int>, speed <int>,
## # engine <chr>
The statement above is the same as writing an "AND" condition.
planes %>% filter(manufacturer == "EMBRAER" & model == "EMB-145XR")
## # A tibble: 104 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N11106 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 3 N11107 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 4 N11109 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 5 N11113 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 6 N11119 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 7 N11121 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 8 N11127 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 9 N11137 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 10 N11140 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## # ... with 94 more rows, and 3 more variables: seats <int>, speed <int>,
## # engine <chr>
Use the pipe (|) character for "OR" conditions:
planes %>% filter(manufacturer == "EMBRAER" | model == "EMB-145XR")
## # A tibble: 299 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## 3 N11106 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 4 N11107 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 5 N11109 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 6 N11113 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 7 N11119 2002 Fixed wing multi engine EMBRAER EMB-145XR 2
## 8 N11121 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 9 N11127 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## 10 N11137 2003 Fixed wing multi engine EMBRAER EMB-145XR 2
## # ... with 289 more rows, and 3 more variables: seats <int>, speed <int>,
## # engine <chr>
Use grepl in combination with filter for pattern-matching conditions.
planes %>% filter(grepl("^172.", model))
## # A tibble: 3 × 9
## tailnum year type manufacturer model engines seats
## <chr> <int> <chr> <chr> <chr> <int> <int>
## 1 N378AA 1963 Fixed wing single engine CESSNA 172E 1 4
## 2 N621AA 1975 Fixed wing single engine CESSNA 172M 1 4
## 3 N737MQ 1977 Fixed wing single engine CESSNA 172N 1 4
## # ... with 2 more variables: speed <int>, engine <chr>
Return all rows where year is between 2004 and 2005:
planes %>% filter(between(year, 2004, 2005))
## # A tibble: 354 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N11155 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 3 N11164 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 4 N11165 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 5 N11176 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 6 N11181 2005 Fixed wing multi engine EMBRAER EMB-145XR 2
## 7 N11184 2005 Fixed wing multi engine EMBRAER EMB-145XR 2
## 8 N11187 2005 Fixed wing multi engine EMBRAER EMB-145XR 2
## 9 N11189 2005 Fixed wing multi engine EMBRAER EMB-145XR 2
## 10 N11191 2005 Fixed wing multi engine EMBRAER EMB-145XR 2
## # ... with 344 more rows, and 3 more variables: seats <int>, speed <int>,
## # engine <chr>
slice returns only rows by the given index.
Return the first five rows of data (same as the base head function):
planes %>% slice(1:5)
## # A tibble: 5 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## # ... with 3 more variables: seats <int>, speed <int>, engine <chr>
Return the 1st, 3rd, and 5th rows of data:
planes %>% slice(c(1, 3, 5))
## # A tibble: 3 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 3 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## # ... with 3 more variables: seats <int>, speed <int>, engine <chr>
Return the first and last rows:
planes %>% slice(c(1, nrow(planes)))
## # A tibble: 2 × 9
## tailnum year type manufacturer
## <chr> <int> <chr> <chr>
## 1 N10156 2004 Fixed wing multi engine EMBRAER
## 2 N999DN 1992 Fixed wing multi engine MCDONNELL DOUGLAS CORPORATION
## # ... with 5 more variables: model <chr>, engines <int>, seats <int>,
## # speed <int>, engine <chr>
mutate can add new variables or modify existing variables.
Add a dummy variable, engine.dummy with a default value of 0:
planes %>% mutate(engine.dummy = 0) %>% select(engine, engine.dummy)
## # A tibble: 3,322 × 2
## engine engine.dummy
## <chr> <dbl>
## 1 Turbo-fan 0
## 2 Turbo-fan 0
## 3 Turbo-fan 0
## 4 Turbo-fan 0
## 5 Turbo-fan 0
## 6 Turbo-fan 0
## 7 Turbo-fan 0
## 8 Turbo-fan 0
## 9 Turbo-fan 0
## 10 Turbo-fan 0
## # ... with 3,312 more rows
Using dplyr::if_else, add engine.dummy set to 1 if engine == "Turbo-fan", otherwise set engine.dummy to 0:
planes %>% mutate(engine.dummy = if_else(engine == "Turbo-fan", 1, 0)) %>% select(engine, engine.dummy)
## # A tibble: 3,322 × 2
## engine engine.dummy
## <chr> <dbl>
## 1 Turbo-fan 1
## 2 Turbo-fan 1
## 3 Turbo-fan 1
## 4 Turbo-fan 1
## 5 Turbo-fan 1
## 6 Turbo-fan 1
## 7 Turbo-fan 1
## 8 Turbo-fan 1
## 9 Turbo-fan 1
## 10 Turbo-fan 1
## # ... with 3,312 more rows
Convert planes$engine to a factor.
planes %>% mutate(engine = as.factor(engine)) %>% select(engine)
## # A tibble: 3,322 × 1
## engine
## <fctr>
## 1 Turbo-fan
## 2 Turbo-fan
## 3 Turbo-fan
## 4 Turbo-fan
## 5 Turbo-fan
## 6 Turbo-fan
## 7 Turbo-fan
## 8 Turbo-fan
## 9 Turbo-fan
## 10 Turbo-fan
## # ... with 3,312 more rows
Use arrange to sort your dataframe.
Arrange planes by year:
planes %>% arrange(year)
## # A tibble: 3,322 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N381AA 1956 Fixed wing multi engine DOUGLAS DC-7BF 4
## 2 N201AA 1959 Fixed wing single engine CESSNA 150 1
## 3 N567AA 1959 Fixed wing single engine DEHAVILLAND OTTER DHC-3 1
## 4 N378AA 1963 Fixed wing single engine CESSNA 172E 1
## 5 N575AA 1963 Fixed wing single engine CESSNA 210-5(205) 1
## 6 N14629 1965 Fixed wing multi engine BOEING 737-524 2
## 7 N615AA 1967 Fixed wing multi engine BEECH 65-A90 2
## 8 N425AA 1968 Fixed wing single engine PIPER PA-28-180 1
## 9 N383AA 1972 Fixed wing multi engine BEECH E-90 2
## 10 N364AA 1973 Fixed wing multi engine CESSNA 310Q 2
## # ... with 3,312 more rows, and 3 more variables: seats <int>,
## # speed <int>, engine <chr>
arrange planes by year desc:
planes %>% arrange(desc(year))
## # A tibble: 3,322 × 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N150UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 2 N151UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 3 N152UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 4 N153UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 5 N154UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 6 N155UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 7 N156UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 8 N157UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 9 N198UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## 10 N199UW 2013 Fixed wing multi engine AIRBUS A321-211 2
## # ... with 3,312 more rows, and 3 more variables: seats <int>,
## # speed <int>, engine <chr>
group_by allows you to perform operations on a dataframe by subsets without extracting the subset.
df <- planes %>% group_by(manufacturer, model)
The returned dataframe may not appear grouped. However, the class and attributes of the dataframe will confirm it is.
class(df)
## [1] "grouped_df" "tbl_df" "tbl" "data.frame"
attributes(df)$vars
## [[1]]
## manufacturer
##
## [[2]]
## model
head(attributes(df)$labels, n = 5L)
## manufacturer model
## 1 AGUSTA SPA A109E
## 2 AIRBUS A319-112
## 3 AIRBUS A319-114
## 4 AIRBUS A319-115
## 5 AIRBUS A319-131
If you wish to add grouping elements to the dataframe without removing existing grouping elements, use the add parameter set to TRUE (set to FALSE by default):
df <- df %>% group_by(type, year, add = TRUE)
class(df)
## [1] "grouped_df" "tbl_df" "tbl" "data.frame"
attributes(df)$vars
## [[1]]
## manufacturer
##
## [[2]]
## model
##
## [[3]]
## type
##
## [[4]]
## year
head(attributes(df)$labels, n = 5L)
## manufacturer model type year
## 1 AGUSTA SPA A109E Rotorcraft 2001
## 2 AIRBUS A319-112 Fixed wing multi engine 2002
## 3 AIRBUS A319-112 Fixed wing multi engine 2005
## 4 AIRBUS A319-112 Fixed wing multi engine 2006
## 5 AIRBUS A319-112 Fixed wing multi engine 2007
If you want to remove grouping use ungroup.
df <- df %>% ungroup()
class(df)
## [1] "tbl_df" "tbl" "data.frame"
attributes(df)$vars
## NULL
attributes(df)$labels
## NULL
summarise is used to perform calculations on a dataset either as a whole or by groups.
Find the mean number of of seats per manufacturer?
planes %>% group_by(manufacturer) %>% summarise(Mean = mean(seats))
## # A tibble: 35 × 2
## manufacturer Mean
## <chr> <dbl>
## 1 AGUSTA SPA 8.0000
## 2 AIRBUS 221.2024
## 3 AIRBUS INDUSTRIE 187.4025
## 4 AMERICAN AIRCRAFT INC 2.0000
## 5 AVIAT AIRCRAFT INC 2.0000
## 6 AVIONS MARCEL DASSAULT 12.0000
## 7 BARKER JACK L 2.0000
## 8 BEECH 9.5000
## 9 BELL 8.0000
## 10 BOEING 175.1877
## # ... with 25 more rows
summarise will not return variables that are not explicitly grouped or included in summary functions. If you want to add another variable you must pass it as a predicate to group_by or summarise.
planes %>% group_by(year, manufacturer) %>% summarise(Mean = mean(seats))
## Source: local data frame [164 x 3]
## Groups: year [?]
##
## year manufacturer Mean
## <int> <chr> <dbl>
## 1 1956 DOUGLAS 102
## 2 1959 CESSNA 2
## 3 1959 DEHAVILLAND 16
## 4 1963 CESSNA 5
## 5 1965 BOEING 149
## 6 1967 BEECH 9
## 7 1968 PIPER 4
## 8 1972 BEECH 10
## 9 1973 CESSNA 6
## 10 1974 CANADAIR LTD 2
## # ... with 154 more rows
rename a variable:
planes %>% rename(Mfr = manufacturer) %>% names()
## [1] "tailnum" "year" "type" "Mfr" "model" "engines" "seats"
## [8] "speed" "engine"