Random Forest
This commit is contained in:
parent
8a846d5dc9
commit
b6c4bcbb4b
128
Dropout RF.R
Normal file
128
Dropout RF.R
Normal file
@ -0,0 +1,128 @@
|
||||
library(tidyverse)
|
||||
library(caret)
|
||||
library(randomForest)
|
||||
|
||||
# Load the cleaned dropout data set.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this exact file location exists.
dropout_clean <- read.csv(
  "C:/Users/Mark/Desktop/Grad School/PDAT630/dropout_clean.csv"
)

# Columns to be treated as categorical (converted to factors below).
factor_cols <- c(
  "Marital.status",
  "Scholarship.holder",
  "Tuition.fees.up.to.date",
  "Gender",
  "Displaced",
  "Daytime.evening.attendance",
  "Course",
  "Previous.qualification",
  "Mother.s.qualification",
  "Father.s.qualification",
  "Mother.s.occupation",
  "Father.s.occupation",
  "International",
  "Target"
)

# Convert every listed column to a factor, leaving the other columns as read.
dropout_clean <- mutate(dropout_clean, across(all_of(factor_cols), factor))

# Inspect the resulting column types.
str(dropout_clean)
|
||||
|
||||
## Random Forest 1 ----
# Three-class forest on Target (Dropout / Enrolled / Graduate) using
# randomForest's default mtry.
dropout_clean_rf1 <- dropout_clean
str(dropout_clean_rf1)

# 85/15 train/test split drawn by row index. Splitting by index (instead of
# dplyr::setdiff() on the rows) keeps every held-out row: setdiff() removes
# duplicate rows and also drops any row identical to one in the training set.
# The seed makes the split and the fitted forest reproducible.
set.seed(630)
n_rows <- nrow(dropout_clean_rf1)
in_train <- seq_len(n_rows) %in% sample(n_rows, size = round(0.85 * n_rows))
df.train1 <- dropout_clean_rf1[in_train, ]
df.test1 <- dropout_clean_rf1[!in_train, ]

# Fit the forest; importance = TRUE stores the variable importance measures.
dropout.rf1 <- randomForest(
  Target ~ .,
  data = df.train1,
  importance = TRUE
  # proximity = TRUE
)
print(dropout.rf1)

# Hold-out accuracy. caret::confusionMatrix() expects the predictions as
# `data` and the observed classes as `reference`.
pred1 <- predict(dropout.rf1, df.test1, type = "response")
print(
  confusionMatrix(data = pred1, reference = df.test1$Target)$overall["Accuracy"]
)

# Reshape the per-tree error matrix (overall OOB + one column per class) into
# long format: one row per (tree count, error type).
n_trees <- nrow(dropout.rf1$err.rate)
err_cols <- c("OOB", "Dropout", "Enrolled", "Graduate")
oob.error.data <- data.frame(
  Trees = rep(seq_len(n_trees), times = length(err_cols)),
  Type = rep(err_cols, each = n_trees),
  Error = as.vector(dropout.rf1$err.rate[, err_cols])
)

# Error rate as a function of the number of trees, one line per error type.
ggplot(data = oob.error.data, aes(x = Trees, y = Error)) +
  geom_line(aes(color = Type)) +
  ggtitle("Random Forest #1 Error vs Number Of Trees")
|
||||
|
||||
|
||||
## Random Forest 2 ----
# Same three-class problem as Random Forest 1, but with mtry fixed at 2
# (two candidate predictors tried at each split).
dropout_clean_rf2 <- dropout_clean
str(dropout_clean_rf2)

# 85/15 train/test split drawn by row index. Splitting by index (instead of
# dplyr::setdiff() on the rows) keeps every held-out row: setdiff() removes
# duplicate rows and also drops any row identical to one in the training set.
# The seed makes the split and the fitted forest reproducible.
set.seed(630)
n_rows <- nrow(dropout_clean_rf2)
in_train <- seq_len(n_rows) %in% sample(n_rows, size = round(0.85 * n_rows))
df.train2 <- dropout_clean_rf2[in_train, ]
df.test2 <- dropout_clean_rf2[!in_train, ]

# Fit the forest; importance = TRUE stores the variable importance measures.
dropout.rf2 <- randomForest(
  Target ~ .,
  data = df.train2,
  importance = TRUE,
  # proximity = TRUE,
  mtry = 2
)
print(dropout.rf2)

# Hold-out accuracy. caret::confusionMatrix() expects the predictions as
# `data` and the observed classes as `reference`. The original printed
# $overall[2] (Kappa) here while the other two models print Accuracy; report
# Accuracy so the three models are compared on the same metric.
pred2 <- predict(dropout.rf2, df.test2, type = "response")
print(
  confusionMatrix(data = pred2, reference = df.test2$Target)$overall["Accuracy"]
)

# Reshape the per-tree error matrix (overall OOB + one column per class) into
# long format: one row per (tree count, error type).
n_trees <- nrow(dropout.rf2$err.rate)
err_cols <- c("OOB", "Dropout", "Enrolled", "Graduate")
oob.error.data <- data.frame(
  Trees = rep(seq_len(n_trees), times = length(err_cols)),
  Type = rep(err_cols, each = n_trees),
  Error = as.vector(dropout.rf2$err.rate[, err_cols])
)

# Error rate as a function of the number of trees, one line per error type.
ggplot(data = oob.error.data, aes(x = Trees, y = Error)) +
  geom_line(aes(color = Type)) +
  ggtitle("Random Forest #2 Error vs Number Of Trees")
|
||||
|
||||
|
||||
|
||||
## Random Forest 3 ----
# Binary version of the problem: Target is collapsed to
# "Dropout" vs "Did Not Dropout" (everything that is not "Dropout").
dropout_clean_rf3 <- mutate(
  dropout_clean,
  Target = factor(ifelse(Target == "Dropout", "Dropout", "Did Not Dropout"))
)
str(dropout_clean_rf3)

# Class counts before and after collapsing, to check the recode.
table(dropout_clean$Target)
table(dropout_clean_rf3$Target)

# 85/15 train/test split drawn by row index. Splitting by index (instead of
# dplyr::setdiff() on the rows) keeps every held-out row: setdiff() removes
# duplicate rows and also drops any row identical to one in the training set.
# The seed makes the split and the fitted forest reproducible.
set.seed(630)
n_rows <- nrow(dropout_clean_rf3)
in_train <- seq_len(n_rows) %in% sample(n_rows, size = round(0.85 * n_rows))
df.train3 <- dropout_clean_rf3[in_train, ]
df.test3 <- dropout_clean_rf3[!in_train, ]

# Fit the forest with mtry = 2; importance = TRUE stores the variable
# importance measures used by varImpPlot() below.
dropout.rf3 <- randomForest(
  Target ~ .,
  data = df.train3,
  importance = TRUE,
  # proximity = TRUE,
  # keep.forest = TRUE,
  mtry = 2
)
print(dropout.rf3)

# Hold-out accuracy. caret::confusionMatrix() expects the predictions as
# `data` and the observed classes as `reference`.
pred3 <- predict(dropout.rf3, df.test3, type = "response")
print(
  confusionMatrix(data = pred3, reference = df.test3$Target)$overall["Accuracy"]
)

# Reshape the per-tree error matrix (overall OOB + one column per class) into
# long format: one row per (tree count, error type).
n_trees <- nrow(dropout.rf3$err.rate)
err_cols <- c("OOB", "Dropout", "Did Not Dropout")
oob.error.data <- data.frame(
  Trees = rep(seq_len(n_trees), times = length(err_cols)),
  Type = rep(err_cols, each = n_trees),
  Error = as.vector(dropout.rf3$err.rate[, err_cols])
)

# Error rate as a function of the number of trees, one line per error type.
ggplot(data = oob.error.data, aes(x = Trees, y = Error)) +
  geom_line(aes(color = Type)) +
  ggtitle("Random Forest #3 Error vs Number Of Trees")

# Variable importance for the binary model.
varImpPlot(dropout.rf3, main = "Variable Importance Plot")
|
||||
|
||||
|
||||
# mtry sweep for the three-class problem: fit one forest per candidate mtry
# on df.train1 and record the OOB error after the final tree.
mtry_grid <- 1:10
oob.values <- numeric(length(mtry_grid))  # numeric, not logical, preallocation
for (i in seq_along(mtry_grid)) {
  # importance = TRUE is not requested here: only err.rate is read from the
  # fit, and the sweep on df.train3 does not request it either.
  temp.model <- randomForest(Target ~ ., data = df.train1, mtry = mtry_grid[i])
  oob.values[i] <- temp.model$err.rate[nrow(temp.model$err.rate), "OOB"]
}
oob.values

## find the minimum error
min(oob.values)
## ... and the mtry value that achieves it
mtry_grid[which.min(oob.values)]
## Author's note from the original run: 2
|
||||
|
||||
# mtry sweep for the binary problem: fit one forest per candidate mtry on
# df.train3 and record the OOB error after the final tree.
mtry_grid <- 1:10
oob.values <- numeric(length(mtry_grid))  # numeric, not logical, preallocation
for (i in seq_along(mtry_grid)) {
  temp.model <- randomForest(Target ~ ., data = df.train3, mtry = mtry_grid[i])
  oob.values[i] <- temp.model$err.rate[nrow(temp.model$err.rate), "OOB"]
}
oob.values

## find the minimum error
min(oob.values)
## ... and the mtry value that achieves it
mtry_grid[which.min(oob.values)]
## Author's note from the original run: 2
|
||||
Loading…
Reference in New Issue
Block a user