library(tidyverse)
# NOTE(review): nothing in this section calls caret; the load is kept in case
# code outside this section relies on it.
library(caret)

# Clean the dropout data set: read the raw CSV, merge sparsely used category
# codes into one catch-all code per column, keep a fixed subset of columns,
# and write the result to a new CSV.

input_path <- "C:/Users/Mark/Desktop/Grad School/PDAT630/dropout.csv"
output_path <- "C:/Users/Mark/Desktop/Grad School/PDAT630/dropout_clean.csv"

#' Replace every value of `codes` that appears in `rare_codes` by `bucket_code`.
#'
#' @param codes Vector of category codes (one data frame column).
#' @param rare_codes Codes that should be merged into the bucket.
#' @param bucket_code Single code that stands in for all `rare_codes`.
#' @return A vector the same length as `codes`. Values that are not in
#'   `rare_codes` keep their value; `NA` stays `NA`.
collapse_codes <- function(codes, rare_codes, bucket_code) {
  # `%in%` replaces a long chain of `==` comparisons joined with `|`.
  ifelse(codes %in% rare_codes, bucket_code, codes)
}

dropout <- read.csv(input_path)
dropout_clean <- dropout

# Force a known name onto the first column.
# NOTE(review): presumably the first header arrives mangled (for example by a
# byte-order mark in the CSV) -- confirm against the raw file.
colnames(dropout_clean)[1] <- "Marital.status"

## Combining under-represented factor levels into buckets of at least 10.
# Frequency tables of the columns that are recoded below.
table(dropout_clean$Marital.status)
table(dropout_clean$Application.mode)
table(dropout_clean$Previous.qualification)
table(dropout_clean$Mother.s.qualification)
table(dropout_clean$Father.s.qualification)
table(dropout_clean$Mother.s.occupation)
table(dropout_clean$Father.s.occupation)

# Each column gets its own list of codes to merge and its own bucket code.
# NOTE(review): Application.mode is recoded here but is not part of the
# column subset selected further down, so the recode never reaches the output
# file -- confirm whether the column should be kept.
dropout_clean <- mutate(
  dropout_clean,
  Marital.status = collapse_codes(Marital.status, c(3, 6), 7),
  Application.mode = collapse_codes(
    Application.mode,
    c(2, 5, 10, 11, 18),
    19
  ),
  Previous.qualification = collapse_codes(
    Previous.qualification,
    c(4, 5, 8, 10, 11, 13, 17),
    18
  ),
  Mother.s.qualification = collapse_codes(
    Mother.s.qualification,
    c(6:9, 11, 12, 14:18, 20, 21, 24:29),
    35
  ),
  Father.s.qualification = collapse_codes(
    Father.s.qualification,
    c(6:8, 11:13, 15:23, 25, 26, 30:34),
    35
  ),
  Mother.s.occupation = collapse_codes(
    Mother.s.occupation,
    c(11, 14:28, 30, 31),
    47
  ),
  Father.s.occupation = collapse_codes(
    Father.s.occupation,
    c(14:43, 45, 46),
    47
  )
)

# Disabled in the original script: conversion of the categorical columns to
# factors and a per-column frequency dump.
# factor_cols <- c(
#   "Marital.status", "Scholarship.holder", "Tuition.fees.up.to.date",
#   "Gender", "Displaced", "Daytime.evening.attendance", "Application.mode",
#   "Course", "Previous.qualification", "Mother.s.qualification",
#   "Father.s.qualification", "Mother.s.occupation", "Father.s.occupation",
#   "International", "Target"
# )
# dropout_clean[factor_cols] <- lapply(dropout_clean[factor_cols], factor)
# sapply(dropout_clean, table)

# Columns written to the cleaned file, in output order.
keep_cols <- c(
  "Gender", "Displaced", "Tuition.fees.up.to.date", "Scholarship.holder",
  "Marital.status", "Daytime.evening.attendance", "Course",
  "Previous.qualification", "Mother.s.qualification",
  "Father.s.qualification", "Mother.s.occupation", "Father.s.occupation",
  "Age.at.enrollment", "International", "Target"
)
dropout_clean <- dropout_clean[keep_cols]

# str(dropout_clean)

write.csv(dropout_clean, output_path, row.names = FALSE)