################################################################################################## ##--------------------------------------------------------------------------------------------- # Functions for Classes by Prof. Jin Man Lee # DePaul University # # To use any functions here, you need to run the following code to read into your program # source("http://condor.depaul.edu/jlee141/econdata/R/func_lib.R") # ################################################################################################## # descriptive2 # Descriptive Statistics by Class or Group # Function Example V1: main var V2: Class or Factor descriptive2 <- function(v1,v2) { n <- table(v2) mean <- tapply(v1,v2,mean) med <- tapply(v1,v2,median) sd <- tapply(v1,v2,sd) min <- tapply(v1,v2,min) max <- tapply(v1,v2,max) desc <- as.data.frame(cbind(n,mean,med,sd,min,max)) desc$name <- row.names(max) return(desc) } ##----------------------------------------------------------------------------------------------## ################################################################################################## # fit_eval # fit_eval function to evaluate model goodness-to-fit # Calculating to return answer which are 7 criterion to evaluate forecasting models, which are # ME (Mean error), MAE (Mean absolute error), MPE (Mean percentage error), # MAPE (Mean absolute percentage error), # MSE (Mean squared error), RMSE (Root of mean square error). # # fit_eval(yhat,yorg,name) # example: fit_level(yhat,y,"OLS") # yhat and y are vector # created by Lee fit_eval <- function(yhat,yorg,name) { ei = yorg - yhat n = length(ei) ME = sum(ei)/n MAE = sum(abs(ei))/n MPE = sum((ei/yorg)*100)/n MAPE = sum((abs(ei)/yorg)*100)/n MSE = sum(ei*ei)/n RMSE = sqrt(sum(ei*ei)/n) stat = cbind(n,ME,MAE,MPE,MAPE,MSE,RMSE) rownames(stat) <- name return(stat) } ##----------------------------------------------------------------------------------------------## ################################################################################################## # Logistic Regression with ROC and AUC # input required: indata=data.frame name data needs to have y as dependent variable to calculate # ROC AUC # model= dep~ind1+ind2 # prob_level = threshold value to determine 1 logit_test <- function(indata,model,prob_level) { indata <- indata[complete.cases(indata),] set.seed(1234) # Create Train and Test Data nobs = nrow(indata) sampsize = round(nobs*0.7) train_idx = sample(nobs,size=sampsize) train <- indata[train_idx,] test <- indata[-train_idx,] # Logistic Regression logit1 <- glm(formula = model, family = binomial(link = logit), data = train) summary(logit1) # Check Multicollinearity vif(logit1) # Predicting the model using test data prob1 <-predict(logit1,newdata=test,type ="response") # setting the cutoff for probability values results <- ifelse(prob1 > prob_level,1,0) #Display the confusion matrix or classification table test_y <- test$y conttable <- table(test_y,results) print(conttable) #Calculating the error rate misclasificationerror <- mean(results != test_y) # Calculating the accuracy rate accuracyrate <- 1-misclasificationerror print( c("ACCURACY RATE:",round(accuracyrate,digits=5))) # conducting Receiver operating characteristic (ROC) test and Area under curve (AUC) # Compute AUC for predicting Default with the model if(any(grepl("package:neuralnet", search()))) detach("package:neuralnet") else (reloadnnt <- 0) if(!(require(ROCR))) install.packages("ROCR") pred <- prediction(prob1, test$y) pmf <- performance(pred, measure = "tpr", x.measure = "fpr") auc <- performance(pred, measure = "auc") auc <- auc@y.values[[1]] auc <- round(auc,digits=5) print(c("AUC",auc)) plot(pmf,col= "red", print.cutoffs.at=seq(0,1,by=0.1),text.adj=c(-0.2,1.7), main=paste("Receiver Operating Characteristic (ROC)", "\n Logistic Reg AUC:",auc)) text(0.5,0.65,"Area Under Curve (AUC) ",col="blue") text(0.5,0.6,auc,col="blue") abline(0,1) if(reloadnnt == 1) library(neuralnet) #save(logit1,file="logit1.rda") } ##----------------------------------------------------------------------------------------------## ################################################################################################## # Random Forest with ROC and AUC # input required: indata=data.frame data needs to have y as dependent variable to calculate # ROC AUC # model= dep~ind1+ind2 # prob_level = threshold value to determine 1 # mtry = number of parameters to be randomly selected # ntree = number of trees randforest_test <- function(indata,model,prob_leve,mtry,ntree) { indata <- indata[complete.cases(indata),] set.seed(1234) # Create Train and Test Data nobs = nrow(indata) sampsize = round(nobs*0.7) train_idx = sample(nobs,size=sampsize) train <- indata[train_idx,] test <- indata[-train_idx,] # Random Forest randomf1 <- randomForest(formula(model), data=train,mtry=mtry,ntree=ntree, importance=TRUE) summary(randomf1) # Predicting the model using test data prob1 <-predict(randomf1,newdata=test) # setting the cutoff for probability values results <- ifelse(prob1 > prob_level,1,0) #Display the confusion matrix or classification table test_y <- test$y conttable <- table(test_y,results) print(conttable) #Calculating the error rate misclasificationerror <- mean(results != test_y) # Calculating the accuracy rate accuracyrate <- 1-misclasificationerror print( c("ACCURACY RATE:",round(accuracyrate,digits=5))) # conducting Receiver operating characteristic (ROC) test and Area under curve (AUC) # Compute AUC for predicting Default with the model pred <- prediction(prob1, test$y) pmf <- performance(pred, measure = "tpr", x.measure = "fpr") auc <- performance(pred, measure = "auc") auc <- auc@y.values[[1]] auc <- round(auc,digits=5) print(c("AUC",auc)) plot(pmf,col= "red", print.cutoffs.at=seq(0,1,by=0.1),text.adj=c(-0.2,1.7), main=paste("Receiver Operating Characteristic (ROC)", "\nRandom Forest AUC:",auc)) text(0.5,0.65,"Area Under Curve (AUC) ",col="blue") text(0.5,0.6,auc,col="blue") abline(0,1) #save(logit1,file="logit1.rda") } ##----------------------------------------------------------------------------------------------## ################################################################################################## ##--------------------------------------------------------------------------------------------- # conducting Receiver operating characteristic (ROC) test and Area under curve (AUC) # There are conflicts with neuralnet so unload it before use ROCR # auc_plot(prob3,testDefault,"Logistic Regression Model") # install.packages("ROCR") auc_plot <- function(predp,true,estlabel) { reloadnnt <- 1 if(any(grepl("package:neuralnet", search()))) detach("package:neuralnet") else (reloadnnt <- 0) if(!(require(ROCR))) install.packages("ROCR") # Compute AUC for predicting Default with the model pred <- prediction(predp, true) pmf <- performance(pred, measure = "tpr", x.measure = "fpr") auc <- performance(pred, measure = "auc") auc <- auc@y.values[[1]] auc <- round(auc,digits=5) auc plot(pmf,col= "red" , print.cutoffs.at=seq(0,1,by=0.1),text.adj=c(-0.2,1.7), main=paste(estlabel,"\nReceiver Operating Characteristic (ROC)", "\nAUC:",auc)) text(0.7,0.35,"Area Under Curve (AUC) ",col="blue") text(0.7,0.25,auc,col="blue") abline(0,1) if(reloadnnt == 1) library(neuralnet) } ##----------------------------------------------------------------------------------------------## ################################################################################################## # Confusion Tables # conf_table(predp,true) # predp = the prediected probability of y =1 conf_table <- function(predp,ytest,estname) { predp <- as.numeric(predp) ytest <- as.numeric(ytest) ytest <- (ytest - min(ytest))/(max(ytest)-min(ytest)) for (iter in 1:9) { prob <- iter / 10 out <- ifelse(predp > prob, 1, 0) truepos <- sum(ifelse(ytest == 1 & out ==1, 1, 0)) trueneg <- sum(ifelse(ytest == 0 & out ==0, 1, 0)) falspos <- sum(ifelse(ytest == 0 & out ==1, 1, 0)) falsneg <- sum(ifelse(ytest == 1 & out ==0, 1, 0)) true_total <- truepos + falsneg false_total <- trueneg + falspos detection_rate <- round(truepos / (truepos + falsneg), digits = 4) false_pos_rate <- round(falspos / (trueneg + falspos), digits = 4) sumtable <- as.data.frame(cbind(estname,prob,true_total,truepos,falsneg,detection_rate,false_total, falspos,trueneg,false_pos_rate)) ifelse(iter == 1,out_conftable <-sumtable, out_conftable <- rbind(out_conftable, sumtable)) } # print(out_conftable) return(out_conftable) } ##----------------------------------------------------------------------------------------------## ################################################################################################## # Confusion Tables # conf_table(predp,true) # predp = the prediected probability of y =1 # conf_table_old <- function(predp,true,estname) { for (iter in 1:9) { prob <- iter/10 if(!(require(SDMTools)))install.packages("SDMTools") t1 <- confusion.matrix(true,predp, threshold= prob ) truepos <- t1[2,2] trueneg <- t1[1,1] falspos <- t1[2,1] falsneg <- t1[1,2] true_total <- truepos+falsneg false_total <- trueneg+falspos detection_rate <- round(truepos/(truepos+falsneg),digits=4) false_pos_rate <- round(falspos/(trueneg+falspos),digits=4) sumtable <- as.data.frame(cbind(estname,prob,true_total,truepos,falsneg,detection_rate, false_total,falspos,trueneg,false_pos_rate)) ifelse(iter ==1, out_conftable <- sumtable, out_conftable <- rbind(out_conftable,sumtable)) } # print(out_conftable) return(out_conftable) } ##----------------------------------------------------------------------------------------------## ################################################################################################## # Min Max Normalization # ML data Normalization # min_max_nor(data) min_max_nor <- function(data) { nk <- ncol(data) zdata <- data for (iter in 1:nk) { x <- data[,iter] zdata[,iter] <- (x - min(x))/(max(x) - min(x)) } return(zdata) } ##----------------------------------------------------------------------------------------------## ################################################################################################## # Neural Network Regression Analysis # nnnt_reg(zindata,seednn,hid) # nnnt_reg(seednn=12345,hid=c(5,2),zindata=df) # seednn = seed number for sampling # hid = number of hidden layers # zindata = scaled data using min_max (assumes y variable as dependent) # make sure remove the original y variable from the data nnet_reg <- function(zindata,seednn,hid) { # Train and Test Data set.seed(seednn) train_ind <- sample(nrow(zindata),round(0.7*nrow(zindata))) ztrain <- zindata[train_ind,] ztest <- zindata[-train_ind,] # Neural Network Model if (!(require(neuralnet))) install.packages ("neuralnet") nnet1 <- neuralnet(y ~., data=ztrain, hidden=hid, linear.output = FALSE, stepmax = 1000000) # Confusion Matrix & Misclassification Error - test data plot(nnet1) nnetp <- as.vector(predict(nnet1,ztest)) truey <- as.vector(ztest$y) rmse <- sqrt(mean((truey-nnetp)^2)) mape <- mean(abs(truey-nnetp)/truey) rsq <- 1 - (sum((truey-nnetp)^2) / sum((truey-mean(truey))^2)) # Output Setting and Return ifelse(is.na(hid[2])==TRUE, hid[2] <- 0, 0 ) ybinary <- 0 out <- cbind(ybinary,seednn,hid[1],hid[2],rmse,rsq) colnames(out)[3] <- "Hidden1" colnames(out)[4] <- "Hidden2" colnames(out)[5] <- "RMSE" colnames(out)[6] <- "RSQ" print(out) return(out) } ##--------------------------------------------------------------------------------------------## ################################################################################################## ##----------------------------------------------------------------------------------------------## # Neural Network Binary Regression Analysis # nnet_binary(zindata,seednn,hid) # nnet_binaryt(seednn=12345,hid=c(5,2),zindata=df) # seednn = seed number for sampling # hid = number of hidden layers # zindata = scaled data using min_max (assumes y variable as dependent) # make sure remove the original y variable from the data # y has to be covered to a factor variable nnet_binary <- function(zindata,seednn,hid) { # Train and Test Data set.seed(seednn) zindata$y <- as.factor(zindata$y) train_ind <- sample(nrow(zindata),round(0.7*nrow(zindata))) ztrain <- zindata[train_ind,] ztest <- zindata[-train_ind,] print("Neural Network Model Hidden Layer:") print(hid) # Neural Network Model if (!(require(neuralnet))) install.packages ("neuralnet") nnet1 <- neuralnet(y ~., data=ztrain, hidden=hid, linear.output = FALSE, stepmax = 1000000,threshold=0.0001) plot(nnet1) # Confusion Matrix & Misclassification Error - test data nnetp <- predict(nnet1,ztest) nnetp <- nnetp[,2] detach(package:neuralnet,unload = T) # detach due to conflicts with ROCR in prediction # AUC calculation if (!(require(ROCR))) install.packages ("ROCR") library(ROCR) pred <- prediction(nnetp, ztest$y) auc <- performance(pred, measure = "auc") auc <- auc@y.values[[1]] auc <- round(auc,digits=5) # Output Setting and Return ifelse(is.na(hid[2])==TRUE, hid[2] <- 0, 0 ) detach(package:ROCR,unload = T) ybinary <- 1 out <- cbind(ybinary,seednn,hid[1],hid[2],auc,0) colnames(out)[3] <- "Hidden1" colnames(out)[4] <- "Hidden2" colnames(out)[5] <- "AUC" print(out) return(out) } ##--------------------------------------------------------------------------------------------## ################################################################################################## ##--------------------------------------------------------------------------------------------## # Neural Network batch looping for searching the best hidden layers # nnet_search(zindata,seed0,niter,maxh1,binary) # zindata = scaled data with y variable without the orginal dependent variable # seed0 = starding seed number # niter = number of iteration of test # maxh1 = maximum number of hidden layers for NN # binary = 1 : binary choice model 0 : regular regression model nnet_search <- function(zindata,seed0,niter,maxh1,binary) { all_out <- matrix(nrow=1,ncol=6,c(0,0,0,0,0,0)) begintime <- Sys.time() for (iter in 1:niter) { tryCatch({ seednn <- round(rnorm(mean=seed0,sd=10000,n=1)) set.seed(seednn) print(iter) for (i in 2:maxh1) { i1 <- i for (j in 0:i1) { j1 <- j ifelse(j1 > 0,hid <- c(i1,j1), hid <-c(i1)) ifelse(binary==1, out <- nnet_binary(zindata,seednn,hid), out <- nnet_reg(zindata,seednn,hid)) all_out <- rbind(all_out,out) } } }, error=function(e){}) } all_out <- all_out[-1,] ifelse(binary==1, all_out <- all_out[order(all_out[,5],decreasing=TRUE),], all_out <- all_out[order(all_out[,5],decreasing=FALSE),]) all_out <- as.data.frame(all_out) endtime <- Sys.time() Duration <- endtime - begintime print(Duration) return(all_out) } ##--------------------------------------------------------------------------------------------## ################################################################################################## # Neural Network Binary Estimation using neuralnet package # nnet_bin_est(zindata,randseed,hid) # randseed = seed number for sampling # hid = number of hidden layers # zindata = scaled data using min_max (assumes y variable as dependent) # make sure remove the original y variable from the data # y has to be covered to a factor variable # nnet_bin_est <- function(zindata,randseed,hid) { set.seed(randseed) train_ind <- sample(nrow(zindata),round(0.7*nrow(zindata))) ztrain <- zindata[train_ind,] ztest <- zindata[-train_ind,] if (!(require(neuralnet))) install.packages ("neuralnet") nnet1 <- neuralnet(y ~., data=ztrain, hidden=hid, linear.output = FALSE, stepmax = 1000000) plot(nnet1) # Confusion Matrix & Misclassification Error - test data nnetp <- predict(nnet1,ztest) nnetp <- nnetp[,2] conf <- conf_table(nnetp,ztest$y,"NeuralNet") print(conf) detach(package:neuralnet,unload = T) if (!(require(ROCR))) install.packages ("ROCR") pred <- prediction(nnetp, ztest$y) auc <- performance(pred, measure = "auc") auc <- auc@y.values[[1]] auc <- round(auc,digits=5) auc_plot(nnetp,ztest$y,"Neural Network Model") ifelse(is.na(hid[2])==TRUE, hid[2] <- 0, 0 ) detach(package:ROCR,unload = T) return(nnet1) } ##--------------------------------------------------------------------------------------------##