-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathsysdata.Rmd
122 lines (105 loc) · 5.1 KB
/
sysdata.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# The sysdata.rda object
## DC station locations
The function `bike_get_dc_stations()` in `R/stations.R` contains code to extract
and process DC stations. The data can be obtained from
http://opendata.dc.gov/datasets/capital-bike-share-locations/, using
Download->Spreadsheet. The code is reproduced here:
```{r}
# Read the spreadsheet export and lower-case its column names so the fields
# can be addressed uniformly below.
dc_raw <- read.csv ("Capital_Bike_Share_Locations.csv")
names (dc_raw) <- tolower (names (dc_raw))

# Station names are the street addresses with apostrophes stripped and any
# trailing white space trimmed.
station_name <- gsub ("'", "", dc_raw$address) #nolint
station_name <- trimws (noquote (station_name), which = "right")

# Keep only the four columns used to describe a station.
stations_dc <- data.frame (id = dc_raw$terminal_number,
                           name = station_name,
                           lon = dc_raw$longitude,
                           lat = dc_raw$latitude,
                           stringsAsFactors = FALSE)
```
## Bike Header Field Names
The fields stored in the `bikedata` database are:
| number | field |
| ---- | ----------------------- |
| 1 | duration |
| 2 | start_time |
| 3 | end_time |
| 4 | start_station_id |
| 5 | start_station_name |
| 6 | start_station_latitude |
| 7 | start_station_longitude |
| 8 | end_station_id |
| 9 | end_station_name |
| 10 | end_station_latitude |
| 11 | end_station_longitude |
| 12 | bike_id |
| 13 | user_type |
| 14 | birth_year |
| 15 | gender |
Each file has at least some of these fields, but different systems naturally use
different nomenclatures. The `header_names` structure maps different system
names for these fields onto the above names. All names are converted to lower
case and all white space and underscores removed, so entries here should be all
lower case with no white space or underscores.
Old DC files had "Duration (ms)", but no longer do.
LA has "passholder_type", which can be "Flex Pass" = annual, or "Monthly Pass".
PH has "passholder_type", which can be "IndegoFlex" or "Indego30".
Note that an extra city column is needed because LA has "start_station" and
"end_station" for the ID columns, while MN has these for the station name
columns.
```{r}
# Map each canonical field name onto every header spelling known to occur in
# the source files. All spellings are lower case with white space and
# underscores removed. The order of this list defines the numeric `index` of
# each field.
variations <- list (
    duration = c ("duration", "tripduration", "totalduration", "durationsec",
                  "durationseconds", "totalduration(ms)"),
    starttime = c ("starttime", "startdate", "iniciodelviaje"),
    endtime = c ("endtime", "enddate", "stoptime", "findelviaje"),
    startstationid = c ("startstationid", "startstationnumber",
                        "fromstationid", "startterminal", "startstation",
                        "startstationcode", "origenid"),
    startstationname = c ("startstationname", "fromstationname",
                          "startstation"),
    startstationlatitude = c ("startstationlatitude", "startlat"),
    startstationlongitude = c ("startstationlongitude", "startlon"),
    endstationid = c ("endstationid", "endstationnumber", "tostationid",
                      "endstation", "endterminal", "endstationcode",
                      "destinoid"),
    endstationname = c ("endstationname", "tostationname", "endstation"),
    endstationlatitude = c ("endstationlatitude", "endlat"),
    endstationlongitude = c ("endstationlongitude", "endlon"),
    bikeid = c ("bikeid", "bikenumber", "bike#"),
    usertype = c ("usertype", "membertype", "type", "subscribertype",
                  "subscriptiontype", "accounttype", "passholdertype",
                  "ismember", "usuarioid"),
    birthyear = c ("birthyear", "birthday", "memberbirthyear",
                   "edad", "anodenacimento"),
    gender = c ("gender", "membergender", "genero")
)
fields <- names (variations)

# One row per (field, variation) pair, built in a single step rather than by
# growing a data frame inside a loop. Columns are kept as plain character
# vectors regardless of the R version in use.
field_names <- data.frame (
    field = rep (fields, times = lengths (variations)),
    variation = unlist (variations, use.names = FALSE),
    stringsAsFactors = FALSE
)

# Position of each field in `fields`. This is derived with `match` instead of
# via factor levels, because the columns are character (not factor) in
# R >= 4.0, where converting them through `levels<-` / `as.numeric` gives NA.
field_names$index <- as.numeric (match (field_names$field, fields))

# "startstation" and "endstation" are ambiguous: they are station IDs for LA
# but station names for MN, so those rows are tagged with the city to which
# they apply. Every other variation applies to all cities.
field_names$city <- "all"
is_start <- field_names$variation == "startstation"
is_end <- field_names$variation == "endstation"
field_names$city [field_names$field == "startstationid" & is_start] <- "la"
field_names$city [field_names$field == "endstationid" & is_end] <- "la"
field_names$city [field_names$field == "startstationname" & is_start] <- "mn"
field_names$city [field_names$field == "endstationname" & is_end] <- "mn"
```
And this then saves the corresponding `data.frame` to the package data:
```{r}
# Bundle the station table and the header-name mappings into the package's
# internal data file, `R/sysdata.rda`, located relative to the project root.
data_dir <- file.path (here::here (), "R")
f <- file.path (data_dir, "sysdata.rda")
# Re-use the previously stored DC stations so the raw CSV download is not
# needed on every run. The file is loaded from the same path that is written
# below, so this does not depend on the current working directory. If no file
# exists yet, the `stations_dc` built by the code above is used instead.
if (file.exists (f)) {
    load (f)
    stations_dc <- sysdata$stations_dc # comment out to refresh using above code
}
sysdata <- list (stations_dc = stations_dc, field_names = field_names)
save (sysdata, file = f, compress = "xz")
```