-
Notifications
You must be signed in to change notification settings - Fork 0
/
rf-analysis.R
157 lines (123 loc) · 6.27 KB
/
rf-analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Predicting Brook Trout Streams with Random Forest
# Dakota Benjamin
# Case Western Reserve University
# Last Updated 9/24/2014
# Version 0.1
# Intro ----
#
# Initialization ----
rm(list=ls())
setwd("../brooktrout")
#Libraries
library(randomForest) #for the random forest
library(sp) #Classes and methods for spatial data
library(rgdal) #bindings for GDAL
library(raster)
library(RODBC)
# Load data ----
# Load the basic terrain analyses
# #catchment <- raster("tifs/repr/catchment.tif")
# tpi2k <- raster("Predictors/tpi2k.tif")
# tpi200 <- raster("Predictors/tpi200.tif")
# aspect <- raster("Predictors/aspect.tif")
# channel_altitude <- raster("Predictors/channel_altitude.tif")
# channel_base <- raster("Predictors/channel_base.tif")
# convergence <- raster("Predictors/convergence.tif")
# hcurv <- raster("Predictors/hcurv.tif")
# vcurv <- raster("Predictors/vcurv.tif")
# # There is something wrong with ls-factor ls_factor <- raster("Predictors/ls_factor.tif")
# relative_slope_position <- raster("Predictors/relative_slope_position.tif")
# shade <- raster("Predictors/shade.tif")
# #sinks <- raster("Predictors/sinks.tif")
# slope <- raster("Predictors/slope.tif")
# twi <- raster("Predictors/twi.tif")
# valley_depth <- raster("Predictors/valley_depth.tif")
# historicalforest <- raster("Predictors/historicalforest.tif")
# predictors <- addLayer(tpi2k, tpi200, aspect, channel_altitude, channel_base, convergence, hcurv, vcurv, relative_slope_position, shade, slope, valley_depth, twi, historicalforest)
#Soils
# need soils for both counties first.
soils <- readOGR("Predictors/soils/merge/", "soilmu_a_merged")
#Sandstone
sandstone <- readOGR("Predictors//geology//PennsylvanianShape//Maps//", "ohbedpm-s")
sandstone <- subset(sandstone, AQUA= "Pm-s")
#crs(sandstone) <- "+proj=lcc +lat_1=40.03333333333333 +lat_2=38.73333333333333 +lat_0=38 +lon_0=-82.5 +x_0=600000 +y_0=0 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs "
crs(sandstone) <- "+proj=lcc +lat_1=38.73333333333333 +lat_2=40.03333333333333 +lat_0=38 +lon_0=-82.5 +x_0=609601.2192024384 +y_0=0 +ellps=clrk66 +datum=NAD27 +to_meter=0.3048006096012192 +no_defs "
#DEM for now, need twi next
dem <- raster("cuy_gea_dem.tif")
#Training Data -----
# conn<-odbcConnect("oldfield\\388_Fall2008\\fwdbrooktroutdata\\Brook Trout HSI 12-5-2006.mdb")
# train.tables <- sqlTables(conn)
# #info on streams, lat/long, etc.
# train.streams <- sqlFetch(conn,"tblStreamDetail")
# #Habitat- depth types, substrate
# train.habitat <- sqlFetch(conn,"tblHabitat")
# #Temperature- all, i think.
# train.temps <- sqlFetch(conn,"tblTemperature")
# #Chemical data
# train.chem <- sqlFetch(conn,"tblChemical")
train.streams <- read.csv("streams/tblStreamDetail.txt")
#Change train.stream lat/long to State Plane
coordinates(train.streams) <- c("Longitude", "Latitude")
proj4string(train.streams) <- CRS("+proj=longlat +datum=WGS84 +ellps=WGS84")
train.streams <- spTransform(train.streams, CRS("+proj=lcc +lat_1=41.7 +lat_2=40.43333333333333 +lat_0=39.66666666666666 +lon_0=-82.5 +x_0=600000 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs"))
#Add to the train.streams table the average temps, etc.
#extract raster values for the points
ras <- raster::extract(dem,train.streams) # c(train.streams$Latitude, train.streams$Longitude)
train.streams@data = data.frame(train.streams, ras)
#train.streams <- na.omit(train.streams@data)
#Put all the data together ----
# sandstone
#transform to same crs as training
sandstone <- spTransform(sandstone, CRS("+proj=lcc +lat_1=41.7 +lat_2=40.43333333333333 +lat_0=39.66666666666666 +lon_0=-82.5 +x_0=600000 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs +towgs84=0,0,0"))
points.sand <- as.vector(over(x=train.streams, y=sandstone)$AQUA)
points.sand[is.na(points.sand)] <- "na"
# soils
# #gea
# #sptransform
# soils.gea <- spTransform(soils.gea, CRS("+proj=lcc +lat_1=41.7 +lat_2=40.43333333333333 +lat_0=39.66666666666666 +lon_0=-82.5 +x_0=600000 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs +towgs84=0,0,0"))
# points.soils.gea <- over(x=train.streams, y=soils.gea)
#
# #cuy
# soils.cuy <- spTransform(soils.cuy, CRS("+proj=lcc +lat_1=41.7 +lat_2=40.43333333333333 +lat_0=39.66666666666666 +lon_0=-82.5 +x_0=600000 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs +towgs84=0,0,0"))
#
# #lake
# soils.lake <- spTransform(soils.lake, CRS("+proj=lcc +lat_1=41.7 +lat_2=40.43333333333333 +lat_0=39.66666666666666 +lon_0=-82.5 +x_0=600000 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs +towgs84=0,0,0"))
# points.soils.lake <- over(x=train.streams, y=soils.lake)
#
# #merge the two tables where there are no NAs
# for(i in points.soils.gea) {
# if (is.na(i)) {
# points.soils.up <- points.soils.lake[as.integer(rownames(i))]
# }
# }
soils <- spTransform(soils, CRS("+proj=lcc +lat_1=41.7 +lat_2=40.43333333333333 +lat_0=39.66666666666666 +lon_0=-82.5 +x_0=600000 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs +towgs84=0,0,0"))
points.soils <- as.vector(over(x=train.streams, y=soils))
#all together now
train.streams@data <- data.frame(train.streams, points.sand, points.soils$MUSYM)
# Prediction Streams -----
#load streams
#add characteristics
# Random Forest Training ----
training <- data.frame(train.streams$bkt.Outlook, train.streams$ras, train.streams$points.sand, as.character(train.streams$points.soils.MUSYM))
names(training) <- c("outlook", "dem", "sandstone", "soils")
# set the seed
set.seed(241)
train.rf <- randomForest(outlook ~ ., data=training, importance=T, ntree=1500, do.trace=100, proximity=T, na.action=na.exclude) # apply the proper mtry and ntree
print(train.rf)
# Variable Importance
par(mfrow=c(1,4))
for (i in 1:4) {
barplot(sort(train.rf$importance[,i], dec=T),
main=attributes(train.rf$importance)$dimnames[[2]][i], cex.names=0.6)
}
#Look at just Mean Decrease in Accuracy:
par(mfrow=c(1,1))
barplot(sort(train.rf$importance[,4], dec=T),main="Mean Decrease in Accuracy", cex.names=0.6)
#Outliers
outlier <- outlier(train.rf)
par(mfcol=c(1,1))
plot(outlier, type="h", main="Outlier data points in the RF")
# Predict classification for other streams ----
#Load Streams
cm.streams <- readOGR("streams/", "headwater_points")
rf.pred <- clusterR(xvars, predict, args=list(model=train.rf), progress='text')