Setting up my environment

Notes: setting up my R environment by loading the ‘tidyverse’, ‘lubridate’, ‘ggplot2’, ‘dplyr’, and ‘readr’ packages

library(tidyverse)  
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)  
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)  
library(dplyr)
library(readr)

displays my working directory

getwd() 
## [1] "/cloud/project/Google_Data_Analytics/Member_vs_Casual_Riders/CSV"

=====================

STEP 1: COLLECT DATA

=====================

Upload Divvy datasets (csv files) here
q1_2020 <- read_csv("Divvy_Trips_2020_Q1.csv")
## Rows: 215737 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, started_at, ended_at, start_station_name, e...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, en...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q4_2019 <- read_csv("Divvy_Trips_2019_Q4.csv")
## Rows: 224770 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): start_time, end_time, from_station_name, to_station_name, usertype,...
## dbl (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num (1): tripduration
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q3_2019 <- read_csv("Divvy_Trips_2019_Q3.csv")
## Rows: 301024 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): start_time, end_time, from_station_name, to_station_name, usertype,...
## dbl (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num (1): tripduration
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q2_2019 <- read_csv("Divvy_Trips_2019_Q2.csv")
## Rows: 306200 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): 01 - Rental Details Local Start Time, 01 - Rental Details Local End...
## dbl (5): 01 - Rental Details Rental ID, 01 - Rental Details Bike ID, 03 - Re...
## num (1): 01 - Rental Details Duration In Seconds Uncapped
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

====================================================

STEP 2: WRANGLE DATA AND COMBINE INTO A SINGLE FILE

====================================================

Compare column names each of the files
While the names don’t have to be in the same order, they DO need to match perfectly before we can use a command to join them into one file
colnames(q1_2020)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(q2_2019)
##  [1] "01 - Rental Details Rental ID"                   
##  [2] "01 - Rental Details Local Start Time"            
##  [3] "01 - Rental Details Local End Time"              
##  [4] "01 - Rental Details Bike ID"                     
##  [5] "01 - Rental Details Duration In Seconds Uncapped"
##  [6] "03 - Rental Start Station ID"                    
##  [7] "03 - Rental Start Station Name"                  
##  [8] "02 - Rental End Station ID"                      
##  [9] "02 - Rental End Station Name"                    
## [10] "User Type"                                       
## [11] "Member Gender"                                   
## [12] "05 - Member Details Member Birthday Year"
colnames(q3_2019)
##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"
colnames(q4_2019)
##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"
remove extra columns from all quarter
q1_2020 <- q1_2020 %>%  
  select(-c(start_lat, start_lng, end_lat, end_lng))

q4_2019 <- q4_2019 %>%  
  select(-c(birthyear, gender,"tripduration"))

q3_2019 <- q3_2019 %>%  
  select(-c(birthyear, gender,"tripduration"))

q2_2019 <- q2_2019 %>%  
  select(-c("01 - Rental Details Duration In Seconds Uncapped", "Member Gender","05 - Member Details Member Birthday Year"))
Rename columns to make them consistent with q1_2020 (as this will be the supposed going-forward table design for Divvy)
(q4_2019 <- rename(q4_2019
                   ,ride_id = trip_id
                   ,rideable_type = bikeid 
                   ,started_at = start_time  
                   ,ended_at = end_time  
                   ,start_station_name = from_station_name 
                   ,start_station_id = from_station_id 
                   ,end_station_name = to_station_name 
                   ,end_station_id = to_station_id 
                   ,member_casual = usertype))
## # A tibble: 224,770 × 9
##     ride_id started_at   ended…¹ ridea…² start…³ start…⁴ end_s…⁵ end_s…⁶ membe…⁷
##       <dbl> <chr>        <chr>     <dbl>   <dbl> <chr>     <dbl> <chr>   <chr>  
##  1 25223640 10/1/2019 0… 10/1/2…    2215      20 Sheffi…     309 Leavit… Subscr…
##  2 25223641 10/1/2019 0… 10/1/2…    6328      19 Throop…     241 Morgan… Subscr…
##  3 25223642 10/1/2019 0… 10/1/2…    3003      84 Milwau…     199 Wabash… Subscr…
##  4 25223643 10/1/2019 0… 10/1/2…    3275     313 Lakevi…     290 Kedzie… Subscr…
##  5 25223644 10/1/2019 0… 10/1/2…    5294     210 Ashlan…     382 Wester… Subscr…
##  6 25223645 10/1/2019 0… 10/1/2…    1891     156 Clark …     226 Racine… Subscr…
##  7 25223646 10/1/2019 0… 10/1/2…    1061      84 Milwau…     142 McClur… Subscr…
##  8 25223647 10/1/2019 0… 10/1/2…    1274     156 Clark …     463 Clark … Subscr…
##  9 25223648 10/1/2019 0… 10/1/2…    6011     156 Clark …     463 Clark … Subscr…
## 10 25223649 10/1/2019 0… 10/1/2…    2957     336 Cottag…     336 Cottag… Custom…
## # … with 224,760 more rows, and abbreviated variable names ¹​ended_at,
## #   ²​rideable_type, ³​start_station_id, ⁴​start_station_name, ⁵​end_station_id,
## #   ⁶​end_station_name, ⁷​member_casual
(q3_2019 <- rename(q3_2019
                   ,ride_id = trip_id
                   ,rideable_type = bikeid 
                   ,started_at = start_time  
                   ,ended_at = end_time  
                   ,start_station_name = from_station_name 
                   ,start_station_id = from_station_id 
                   ,end_station_name = to_station_name 
                   ,end_station_id = to_station_id 
                   ,member_casual = usertype))
## # A tibble: 301,024 × 9
##     ride_id started_at   ended…¹ ridea…² start…³ start…⁴ end_s…⁵ end_s…⁶ membe…⁷
##       <dbl> <chr>        <chr>     <dbl>   <dbl> <chr>     <dbl> <chr>   <chr>  
##  1 23479388 7/1/2019 0:… 7/1/20…    3591     117 Wilton…     497 Kimbal… Subscr…
##  2 23479389 7/1/2019 0:… 7/1/20…    5353     381 Wester…     203 Wester… Custom…
##  3 23479390 7/1/2019 0:… 7/1/20…    6180     313 Lakevi…     144 Larrab… Custom…
##  4 23479391 7/1/2019 0:… 7/1/20…    5540     313 Lakevi…     144 Larrab… Custom…
##  5 23479392 7/1/2019 0:… 7/1/20…    6014     168 Michig…      62 McCorm… Custom…
##  6 23479393 7/1/2019 0:… 7/1/20…    4941     300 Broadw…     232 Pine G… Subscr…
##  7 23479394 7/1/2019 0:… 7/1/20…    3770     168 Michig…      62 McCorm… Custom…
##  8 23479395 7/1/2019 0:… 7/1/20…    5442     313 Lakevi…     144 Larrab… Custom…
##  9 23479396 7/1/2019 0:… 7/1/20…    2957      43 Michig…     195 Columb… Custom…
## 10 23479397 7/1/2019 0:… 7/1/20…    6091      43 Michig…     195 Columb… Custom…
## # … with 301,014 more rows, and abbreviated variable names ¹​ended_at,
## #   ²​rideable_type, ³​start_station_id, ⁴​start_station_name, ⁵​end_station_id,
## #   ⁶​end_station_name, ⁷​member_casual
(q2_2019 <- rename(q2_2019
                   ,ride_id = "01 - Rental Details Rental ID"
                   ,rideable_type = "01 - Rental Details Bike ID" 
                   ,started_at = "01 - Rental Details Local Start Time"  
                   ,ended_at = "01 - Rental Details Local End Time"  
                   ,start_station_name = "03 - Rental Start Station Name" 
                   ,start_station_id = "03 - Rental Start Station ID"
                   ,end_station_name = "02 - Rental End Station Name" 
                   ,end_station_id = "02 - Rental End Station ID"
                   ,member_casual = "User Type"))
## # A tibble: 306,200 × 9
##     ride_id started_at   ended…¹ ridea…² start…³ start…⁴ end_s…⁵ end_s…⁶ membe…⁷
##       <dbl> <chr>        <chr>     <dbl>   <dbl> <chr>     <dbl> <chr>   <chr>  
##  1 22178529 4/1/2019 0:… 4/1/20…    6251      81 Daley …      56 Despla… Subscr…
##  2 22178530 4/1/2019 0:… 4/1/20…    6226     317 Wood S…      59 Wabash… Subscr…
##  3 22178531 4/1/2019 0:… 4/1/20…    5649     283 LaSall…     174 Canal … Subscr…
##  4 22178532 4/1/2019 0:… 4/1/20…    4151      26 McClur…     133 Kingsb… Subscr…
##  5 22178533 4/1/2019 0:… 4/1/20…    3270     202 Halste…     129 Blue I… Subscr…
##  6 22178534 4/1/2019 0:… 4/1/20…    3123     420 Ellis …     426 Ellis … Subscr…
##  7 22178535 4/1/2019 0:… 4/1/20…    6418     503 Drake …     500 Centra… Subscr…
##  8 22178536 4/1/2019 0:… 4/1/20…    4513     260 Kedzie…     499 Kosciu… Subscr…
##  9 22178537 4/1/2019 0:… 4/1/20…    3280     211 St. Cl…     211 St. Cl… Custom…
## 10 22178538 4/1/2019 0:… 4/1/20…    5534     211 St. Cl…     211 St. Cl… Custom…
## # … with 306,190 more rows, and abbreviated variable names ¹​ended_at,
## #   ²​rideable_type, ³​start_station_id, ⁴​start_station_name, ⁵​end_station_id,
## #   ⁶​end_station_name, ⁷​member_casual
Inspect the dataframes and look for incongruencies
str(q1_2020)
## tibble [215,737 × 9] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:215737] "EACB19130B0CDA4A" "8FED874C809DC021" "789F3C21E472CA96" "C9A388DAC6ABF313" ...
##  $ rideable_type     : chr [1:215737] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ started_at        : chr [1:215737] "1/21/2020 20:06" "1/30/2020 14:22" "1/9/2020 19:29" "1/6/2020 16:17" ...
##  $ ended_at          : chr [1:215737] "1/21/2020 20:14" "1/30/2020 14:26" "1/9/2020 19:32" "1/6/2020 16:25" ...
##  $ start_station_name: chr [1:215737] "Western Ave & Leland Ave" "Clark St & Montrose Ave" "Broadway & Belmont Ave" "Clark St & Randolph St" ...
##  $ start_station_id  : num [1:215737] 239 234 296 51 66 212 96 96 212 38 ...
##  $ end_station_name  : chr [1:215737] "Clark St & Leland Ave" "Southport Ave & Irving Park Rd" "Wilton Ave & Belmont Ave" "Fairbanks Ct & Grand Ave" ...
##  $ end_station_id    : num [1:215737] 326 318 117 24 212 96 212 212 96 100 ...
##  $ member_casual     : chr [1:215737] "member" "member" "member" "member" ...
str(q4_2019)
## tibble [224,770 × 9] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : num [1:224770] 25223640 25223641 25223642 25223643 25223644 ...
##  $ started_at        : chr [1:224770] "10/1/2019 0:01" "10/1/2019 0:02" "10/1/2019 0:04" "10/1/2019 0:04" ...
##  $ ended_at          : chr [1:224770] "10/1/2019 0:17" "10/1/2019 0:06" "10/1/2019 0:18" "10/1/2019 0:43" ...
##  $ rideable_type     : num [1:224770] 2215 6328 3003 3275 5294 ...
##  $ start_station_id  : num [1:224770] 20 19 84 313 210 156 84 156 156 336 ...
##  $ start_station_name: chr [1:224770] "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
##  $ end_station_id    : num [1:224770] 309 241 199 290 382 226 142 463 463 336 ...
##  $ end_station_name  : chr [1:224770] "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
##  $ member_casual     : chr [1:224770] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
str(q3_2019)
## tibble [301,024 × 9] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : num [1:301024] 23479388 23479389 23479390 23479391 23479392 ...
##  $ started_at        : chr [1:301024] "7/1/2019 0:00" "7/1/2019 0:01" "7/1/2019 0:01" "7/1/2019 0:02" ...
##  $ ended_at          : chr [1:301024] "7/1/2019 0:20" "7/1/2019 0:18" "7/1/2019 0:27" "7/1/2019 0:27" ...
##  $ rideable_type     : num [1:301024] 3591 5353 6180 5540 6014 ...
##  $ start_station_id  : num [1:301024] 117 381 313 313 168 300 168 313 43 43 ...
##  $ start_station_name: chr [1:301024] "Wilton Ave & Belmont Ave" "Western Ave & Monroe St" "Lakeview Ave & Fullerton Pkwy" "Lakeview Ave & Fullerton Pkwy" ...
##  $ end_station_id    : num [1:301024] 497 203 144 144 62 232 62 144 195 195 ...
##  $ end_station_name  : chr [1:301024] "Kimball Ave & Belmont Ave" "Western Ave & 21st St" "Larrabee St & Webster Ave" "Larrabee St & Webster Ave" ...
##  $ member_casual     : chr [1:301024] "Subscriber" "Customer" "Customer" "Customer" ...
str(q2_2019)
## tibble [306,200 × 9] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : num [1:306200] 22178529 22178530 22178531 22178532 22178533 ...
##  $ started_at        : chr [1:306200] "4/1/2019 0:02" "4/1/2019 0:03" "4/1/2019 0:11" "4/1/2019 0:13" ...
##  $ ended_at          : chr [1:306200] "4/1/2019 0:09" "4/1/2019 0:20" "4/1/2019 0:15" "4/1/2019 0:18" ...
##  $ rideable_type     : num [1:306200] 6251 6226 5649 4151 3270 ...
##  $ start_station_id  : num [1:306200] 81 317 283 26 202 420 503 260 211 211 ...
##  $ start_station_name: chr [1:306200] "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : num [1:306200] 56 59 174 133 129 426 500 499 211 211 ...
##  $ end_station_name  : chr [1:306200] "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ member_casual     : chr [1:306200] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
Convert ride_id and rideable_type to character so that they can stack correctly
q4_2019 <-  mutate(q4_2019, ride_id = as.character(ride_id)
                   ,rideable_type = as.character(rideable_type)) 

q3_2019 <-  mutate(q3_2019, ride_id = as.character(ride_id)
                   ,rideable_type = as.character(rideable_type)) 

q2_2019 <-  mutate(q2_2019, ride_id = as.character(ride_id)
                   ,rideable_type = as.character(rideable_type)) 
Stack quarterly data frames into one big data frame
all_trips <- bind_rows(q2_2019, q3_2019, q4_2019, q1_2020)
remove quarterly data frames to clear up space in the environment there is memory allocation limitation in the RStudio free version
remove(q1_2020, q2_2019, q4_2019, q3_2019)

======================================================

STEP 3: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS

======================================================

List of column names
colnames(all_trips) 
## [1] "ride_id"            "started_at"         "ended_at"          
## [4] "rideable_type"      "start_station_id"   "start_station_name"
## [7] "end_station_id"     "end_station_name"   "member_casual"
How many rows are in data frame?
nrow(all_trips) 
## [1] 2546189
Dimensions of the data frame?
dim(all_trips) 
## [1] 2546189       9
See the first 6 rows of data frame. Also tail(all_trips)
head(all_trips) 
## # A tibble: 6 × 9
##   ride_id  started_at    ended…¹ ridea…² start…³ start…⁴ end_s…⁵ end_s…⁶ membe…⁷
##   <chr>    <chr>         <chr>   <chr>     <dbl> <chr>     <dbl> <chr>   <chr>  
## 1 22178529 4/1/2019 0:02 4/1/20… 6251         81 Daley …      56 Despla… Subscr…
## 2 22178530 4/1/2019 0:03 4/1/20… 6226        317 Wood S…      59 Wabash… Subscr…
## 3 22178531 4/1/2019 0:11 4/1/20… 5649        283 LaSall…     174 Canal … Subscr…
## 4 22178532 4/1/2019 0:13 4/1/20… 4151         26 McClur…     133 Kingsb… Subscr…
## 5 22178533 4/1/2019 0:19 4/1/20… 3270        202 Halste…     129 Blue I… Subscr…
## 6 22178534 4/1/2019 0:19 4/1/20… 3123        420 Ellis …     426 Ellis … Subscr…
## # … with abbreviated variable names ¹​ended_at, ²​rideable_type,
## #   ³​start_station_id, ⁴​start_station_name, ⁵​end_station_id, ⁶​end_station_name,
## #   ⁷​member_casual
See list of columns and data types (numeric, character, etc)
str(all_trips) 
## tibble [1,047,731 × 9] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:2546189] "22178529" "22178530" "22178531" "22178532" ...
##  $ started_at        : chr [1:2546189] "4/1/2019 0:02" "4/1/2019 0:03" "4/1/2019 0:11" "4/1/2019 0:13" ...
##  $ ended_at          : chr [1:2546189] "4/1/2019 0:09" "4/1/2019 0:20" "4/1/2019 0:15" "4/1/2019 0:18" ...
##  $ rideable_type     : chr [1:2546189] "6251" "6226" "5649" "4151" ...
##  $ start_station_id  : num [1:2546189] 81 317 283 26 202 420 503 260 211 211 ...
##  $ start_station_name: chr [1:2546189] "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : num [1:2546189] 56 59 174 133 129 426 500 499 211 211 ...
##  $ end_station_name  : chr [1:2546189] "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ member_casual     : chr [1:2546189] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
Statistical summary of data. Mainly for numerics
summary(all_trips) 
##   started_at          ended_at         member_casual     
##  Length:2546189     Length:2546189     Length:2546189    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character

There are a few problems we will need to fix:

(1) In the “member_casual” column, there are two names for members (“member” and “Subscriber”) and two names for casual riders (“Customer” and “casual”). We will need to consolidate that from four to two labels.
(2) The data can only be aggregated at the ride-level, which is too granular. We will want to add some additional columns of data – such as day, month, year – that provide additional opportunities to aggregate the data.
(3) We will want to add a calculated field for length of ride since the 2020Q1 data did not have the “tripduration” column. We will add “ride_length” to the entire dataframe for consistency.
(4) There are some rides where tripduration shows up as negative, including several hundred rides where Divvy took bikes out of circulation for Quality Control reasons. We will want to delete these rides.
In the “member_casual” column, replace “Subscriber” with “member” and “Customer” with “casual”
Before 2020, Divvy used different labels for these two types of riders … we will want to make our dataframe consistent with their current nomenclature
N.B.: “Level” is a special property of a column that is retained even if a subset does not contain any values from a specific level
Begin by seeing how many observations fall under each usertype
table(all_trips$member_casual)
## 
##     casual   Customer     member Subscriber 
##      183728     345675     378407     1638379
Check to make sure the proper number of observations were reassigned
all_trips <-  all_trips %>% 
  mutate(member_casual = recode(member_casual
                                ,"Subscriber" = "member"
                                ,"Customer" = "casual"))
Check to make sure the proper number of observations were reassigned
table(all_trips$member_casual)
## 
## casual member 
## 529403 2016786
create columns date, month, day, and year of each ride and then derived them from started_at attribute.
all_trips$date <- as.Date(all_trips$started_at) #The default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
Convert started_at and ended_at to datetime in order to later calculate the ride length
all_trips <-  mutate(all_trips, started_at = mdy_hm(started_at)
                   , ended_at = mdy_hm(ended_at)) 
Create the column ride_length and calculate it by subtracting ended_at from started_at and then convert it to minutes
all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at, units='mins')
Inspect the structure of the columns
str(all_trips)
## tibble [1,047,731 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:2546189] "22178529" "22178530" "22178531" "22178532" ...
##  $ started_at        : POSIXct[1:2546189], format: "2019-04-01 00:02:00" "2019-04-01 00:03:00" ...
##  $ ended_at          : POSIXct[1:2546189], format: "2019-04-01 00:09:00" "2019-04-01 00:20:00" ...
##  $ rideable_type     : chr [1:2546189] "6251" "6226" "5649" "4151" ...
##  $ start_station_id  : num [1:2546189] 81 317 283 26 202 420 503 260 211 211 ...
##  $ start_station_name: chr [1:2546189] "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : num [1:2546189] 56 59 174 133 129 426 500 499 211 211 ...
##  $ end_station_name  : chr [1:2546189] "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ member_casual     : chr [1:2546189] "member" "member" "member" "member" ...
##  $ date              : Date[1:2546189], format: "4-01-20" "4-01-20" ...
##  $ month             : chr [1:2546189] "01" "01" "01" "01" ...
##  $ day               : chr [1:2546189] "20" "20" "20" "20" ...
##  $ year              : chr [1:2546189] "4" "4" "4" "4" ...
##  $ day_of_week       : chr [1:2546189] "Tuesday" "Tuesday" "Tuesday" "Tuesday" ...
##  $ ride_length       : 'difftime' num [1:2546189] 7 17 4 5 ...
##   ..- attr(*, "units")= chr "mins"

clean the data

Check number of rows before cleaning data
nrow(all_trips) 
## [1] 2546189
remove rows with NA values
all_trips <- na.omit(all_trips) 
remove duplicate rows
all_trips <- distinct(all_trips) 
remove where ride_length is 0 or negative
all_trips <- all_trips[!(all_trips$ride_length <=0),] 
Check number of rows after cleaning data
nrow(all_trips) 
## [1] 2540092
================================

STEP 4: CONDUCT DESCRIPTIVE ANALYSIS

=====================================
Descriptive analysis on ride_length (all figures in minutes)
mean(all_trips$ride_length) #straight average (total ride length / rides)
## Time difference of 23.86972 mins
median(all_trips$ride_length) #midpoint number in the ascending array of ride lengths
## Time difference of 11 mins
max(all_trips$ride_length) #longest ride
## Time difference of 156450 mins
min(all_trips$ride_length) #shortest ride
## Time difference of 1 mins
You can condense the four lines above to one line using summary() on the specific attribute
summary(all_trips$ride_length)
##   Length    Class     Mode 
##   2540092 difftime  numeric
Total ride by user type
table(all_trips$member_casual)
## 
## casual member 
## 525712  2014380
Compare members and casual users
aggregate(all_trips$ride_length ~ all_trips$member_casual, FUN = mean) #straight average (total ride length / rides)
##   all_trips$member_casual all_trips$ride_length
## 1                  casual         62.26152 mins
## 2                  member         13.85024 mins
aggregate(all_trips$ride_length ~ all_trips$member_casual, FUN = median) #midpoint number in the ascending array of ride lengths
##   all_trips$member_casual all_trips$ride_length
## 1                  casual               26 mins
## 2                  member               10 mins
aggregate(all_trips$ride_length ~ all_trips$member_casual, FUN = max) #longest ride
##   all_trips$member_casual all_trips$ride_length
## 1                  casual           156450 mins
## 2                  member           150944 mins
aggregate(all_trips$ride_length ~ all_trips$member_casual, FUN = min) #shortest ride
##   all_trips$member_casual all_trips$ride_length
## 1                  casual                1 mins
## 2                  member                1 mins
See the average ride time by each day for members vs casual users
aggregate(all_trips$ride_length ~ all_trips$member_casual + all_trips$day_of_week, FUN = mean)
##    all_trips$member_casual all_trips$day_of_week all_trips$ride_length
## 1                        casual                Friday         52.70968 mins
## 2                        member                Friday         13.68401 mins
## 3                       casual               Monday         53.82897 mins
## 4                       member               Monday         13.48617 mins
## 5                       casual             Saturday         70.13204 mins
## 6                       member             Saturday         12.78588 mins
## 7                       casual               Sunday         67.81096 mins
## 8                       member               Sunday         13.49104 mins
## 9                       casual             Thursday         76.56711 mins
## 10                      member             Thursday         13.87723 mins
## 11                      casual              Tuesday         54.19205 mins
## 12                      member              Tuesday         14.01073 mins
## 13                      casual            Wednesday         67.37661 mins
## 14                      member            Wednesday         13.83719 mins
Notice that the days of the week are out of order. Let’s fix that.
all_trips$day_of_week <- ordered(all_trips$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
Now, let’s run the average ride time by each day for members vs casual users
aggregate(all_trips$ride_length ~ all_trips$member_casual + all_trips$day_of_week, FUN = mean)
##    all_trips$member_casual all_trips$day_of_week all_trips$ride_length
## 1                       casual               Sunday         67.81096 mins
## 2                       member               Sunday         13.49104 mins
## 3                       casual               Monday         53.82897 mins
## 4                       member               Monday         13.48617 mins
## 5                       casual              Tuesday         54.19205 mins
## 6                       member              Tuesday         14.01073 mins
## 7                       casual            Wednesday         67.37661 mins
## 8                       member            Wednesday         13.83719 mins
## 9                       casual             Thursday         76.56711 mins
## 10                      member             Thursday         13.87723 mins
## 11                      casual               Friday         52.70968 mins
## 12                      member               Friday         13.68401 mins
## 13                      casual             Saturday         70.13204 mins
## 14                      member             Saturday         12.78588 mins
analyze ridership data by type and weekday
all_trips %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%  #creates weekday field using wday()
  group_by(member_casual, weekday) %>%  #groups by usertype and weekday
  summarise(number_of_rides = n()                           #calculates the number of rides and average duration 
            ,average_duration = mean(ride_length)) %>%      # calculates the average duration
  arrange(member_casual, weekday)   
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 4
## # Groups:   member_casual [2]
##    member_casual weekday number_of_rides average_duration
##    <chr>         <ord>             <int> <drtn>          
##  1 casual            Sun              116201 59.26285 mins   
##  2 casual            Mon               61917 57.25147 mins   
##  3 casual            Tue               54226 64.46007 mins   
##  4 casual            Wed               60343 67.73447 mins   
##  5 casual            Thu               56161 67.83439 mins   
##  6 casual            Fri               65851 68.38865 mins   
##  7 casual            Sat              111013 57.69203 mins   
##  8 member            Sun              183539 15.36367 mins   
##  9 member            Mon              331667 13.80016 mins   
## 10 member            Tue              362604 13.48410 mins   
## 11 member            Wed              355027 13.07827 mins   
## 12 member            Thu              309712 13.52449 mins   
## 13 member            Fri              292576 13.34896 mins   
## 14 member            Sat              179255 16.04394 mins
Let’s visualize the number of rides by rider type
all_trips %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot() +
  geom_col(aes(x = weekday, y = number_of_rides, fill = member_casual), position = "dodge")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Let’s create a visualization for average duration
all_trips %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration_in_minutes = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot() +
  geom_col(aes(x = weekday, y = average_duration_in_minutes, fill = member_casual), position = "dodge")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.

=================================================

STEP 5: EXPORT SUMMARY FILE FOR FURTHER ANALYSIS

=================================================
Create a csv file that we will visualize in Excel, Tableau, or my presentation software
counts <- aggregate(all_trips$ride_length ~ all_trips$member_casual + all_trips$day_of_week, FUN = mean)
write.csv(counts, file = '/cloud/project/Google_Data_Analytics/Member_vs_Casual_Riders/CSV/avg_ride_length.csv')

I am done! Congratulations!