##################################################################################################

##---------------------------------------------------------------------------------------------
# Functions for Classes by Prof. Jin Man Lee
# DePaul University
#
# To use any functions here, you need to run the following code to read into your program
# source("http://condor.depaul.edu/jlee141/econdata/R/func_lib.R")
#

##################################################################################################
# descriptive2 
# Descriptive Statistics by Class or Group 
# Function Example V1: main var V2: Class or Factor 
descriptive2  <- function(v1,v2) 
    {    n    <- table(v2)
         mean <- tapply(v1,v2,mean)
         med  <- tapply(v1,v2,median)
         sd   <- tapply(v1,v2,sd)
         min  <- tapply(v1,v2,min)
         max  <- tapply(v1,v2,max)
         desc <- as.data.frame(cbind(n,mean,med,sd,min,max))
         desc$name <- row.names(max)
         return(desc) 
     }
##----------------------------------------------------------------------------------------------##

##################################################################################################
# fit_eval    
# fit_eval function to evaluate model goodness-to-fit
# Calculating to return answer which are 7 criterion to evaluate forecasting models, which are 
# ME (Mean error), MAE (Mean absolute error), MPE (Mean percentage error), 
# MAPE (Mean absolute percentage error), 
# MSE (Mean squared error), RMSE (Root of mean square error).
# 
# fit_eval(yhat,yorg,name)
# example: fit_level(yhat,y,"OLS")
# yhat and y are vector 
#  created by Lee 

fit_eval <- function(yhat,yorg,name) 
   {
      ei   = yorg - yhat 
      n    = length(ei)
      ME   = sum(ei)/n
      MAE  = sum(abs(ei))/n
      MPE  = sum((ei/yorg)*100)/n
      MAPE = sum((abs(ei)/yorg)*100)/n
      MSE  = sum(ei*ei)/n
      RMSE = sqrt(sum(ei*ei)/n)
      stat = cbind(n,ME,MAE,MPE,MAPE,MSE,RMSE) 
      rownames(stat) <- name
      return(stat)
   } 
##----------------------------------------------------------------------------------------------##

##################################################################################################
# Logistic Regression with ROC and AUC  
#  input required: indata=data.frame name data needs to have y as dependent variable to calculate
#                         ROC AUC
#                   model= dep~ind1+ind2 
#                   prob_level = threshold value to determine 1   

logit_test <- function(indata,model,prob_level) {

indata <- indata[complete.cases(indata),]
set.seed(1234)

# Create Train and Test Data 
nobs = nrow(indata)
sampsize = round(nobs*0.7)
train_idx = sample(nobs,size=sampsize)
train <- indata[train_idx,]
test  <- indata[-train_idx,]

# Logistic Regression 
logit1 <- glm(formula = model,
              family = binomial(link = logit), data = train)
summary(logit1)

# Check Multicollinearity
vif(logit1)

# Predicting the model using test data
prob1 <-predict(logit1,newdata=test,type ="response")

# setting the cutoff for probability values
results <- ifelse(prob1 > prob_level,1,0)

#Display the confusion matrix or classification table
test_y <- test$y
conttable <- table(test_y,results)
print(conttable)

#Calculating the error rate
misclasificationerror <- mean(results != test_y)

# Calculating the accuracy rate
accuracyrate <- 1-misclasificationerror
print( c("ACCURACY RATE:",round(accuracyrate,digits=5)))

# conducting Receiver operating characteristic (ROC) test and Area under curve (AUC)
# Compute AUC for predicting Default with the model
if(any(grepl("package:neuralnet", search()))) detach("package:neuralnet") else (reloadnnt <- 0)  
if(!(require(ROCR))) install.packages("ROCR")
pred <- prediction(prob1, test$y)
pmf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
auc <- round(auc,digits=5)
print(c("AUC",auc))
plot(pmf,col= "red", print.cutoffs.at=seq(0,1,by=0.1),text.adj=c(-0.2,1.7),
      main=paste("Receiver Operating Characteristic (ROC)", "\n Logistic Reg AUC:",auc))
text(0.5,0.65,"Area Under Curve (AUC) ",col="blue")
text(0.5,0.6,auc,col="blue")
abline(0,1)
if(reloadnnt == 1) library(neuralnet) 
#save(logit1,file="logit1.rda") 
}
##----------------------------------------------------------------------------------------------##

##################################################################################################
# Random Forest with ROC and AUC  
#  input required: indata=data.frame data needs to have y as dependent variable to calculate 
#                         ROC AUC
#                   model= dep~ind1+ind2 
#                   prob_level = threshold value to determine 1 
#                   mtry = number of parameters to be randomly selected
#                   ntree = number of trees   

randforest_test <- function(indata,model,prob_leve,mtry,ntree) {

indata <- indata[complete.cases(indata),]
set.seed(1234)

# Create Train and Test Data 
nobs = nrow(indata)
sampsize = round(nobs*0.7)
train_idx = sample(nobs,size=sampsize)
train <- indata[train_idx,]
test  <- indata[-train_idx,]

# Random Forest
randomf1 <- randomForest(formula(model), data=train,mtry=mtry,ntree=ntree, importance=TRUE)
summary(randomf1)

# Predicting the model using test data
prob1 <-predict(randomf1,newdata=test)

# setting the cutoff for probability values
results <- ifelse(prob1 > prob_level,1,0)

#Display the confusion matrix or classification table
test_y <- test$y
conttable <- table(test_y,results)
print(conttable)

#Calculating the error rate
misclasificationerror <- mean(results != test_y)


# Calculating the accuracy rate
accuracyrate <- 1-misclasificationerror
print( c("ACCURACY RATE:",round(accuracyrate,digits=5)))

# conducting Receiver operating characteristic (ROC) test and Area under curve (AUC)
# Compute AUC for predicting Default with the model
pred <- prediction(prob1, test$y)
pmf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
auc <- round(auc,digits=5)
print(c("AUC",auc))
plot(pmf,col= "red", print.cutoffs.at=seq(0,1,by=0.1),text.adj=c(-0.2,1.7),
      main=paste("Receiver Operating Characteristic (ROC)", "\nRandom Forest AUC:",auc))
text(0.5,0.65,"Area Under Curve (AUC) ",col="blue")
text(0.5,0.6,auc,col="blue")
abline(0,1)
#save(logit1,file="logit1.rda") 
}
##----------------------------------------------------------------------------------------------##


##################################################################################################
##---------------------------------------------------------------------------------------------
# conducting Receiver operating characteristic (ROC) test and Area under curve (AUC)
# There are conflicts with neuralnet so unload it before use ROCR 
# auc_plot(prob3,testDefault,"Logistic Regression Model")
# install.packages("ROCR")

auc_plot <- function(predp,true,estlabel) {
reloadnnt <- 1 
if(any(grepl("package:neuralnet", search()))) detach("package:neuralnet") else (reloadnnt <- 0)  
if(!(require(ROCR))) install.packages("ROCR")

# Compute AUC for predicting Default with the model
pred <- prediction(predp, true)
pmf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
auc <- round(auc,digits=5)
auc
plot(pmf,col= "red" ,  print.cutoffs.at=seq(0,1,by=0.1),text.adj=c(-0.2,1.7),
     main=paste(estlabel,"\nReceiver Operating Characteristic (ROC)", "\nAUC:",auc))
text(0.7,0.35,"Area Under Curve (AUC) ",col="blue")
text(0.7,0.25,auc,col="blue")
abline(0,1)
if(reloadnnt == 1) library(neuralnet) 

}
##----------------------------------------------------------------------------------------------##

##################################################################################################
# Confusion Tables
# conf_table(predp,true)
# predp = the prediected probability of y =1 

conf_table <- function(predp,ytest,estname) {
  predp <- as.numeric(predp)
  ytest  <- as.numeric(ytest)
  ytest <- (ytest - min(ytest))/(max(ytest)-min(ytest))
  for (iter in 1:9) {
    prob    <- iter / 10
    out     <- ifelse(predp > prob, 1, 0)
    truepos <- sum(ifelse(ytest == 1 & out ==1, 1, 0))
    trueneg <- sum(ifelse(ytest == 0 & out ==0, 1, 0))
    falspos <- sum(ifelse(ytest == 0 & out ==1, 1, 0))
    falsneg <- sum(ifelse(ytest == 1 & out ==0, 1, 0))
    true_total <- truepos + falsneg
    false_total <- trueneg + falspos
    detection_rate  <- round(truepos / (truepos + falsneg), digits = 4)
    false_pos_rate  <- round(falspos / (trueneg + falspos), digits = 4)
    sumtable <-
      as.data.frame(cbind(estname,prob,true_total,truepos,falsneg,detection_rate,false_total,
                          falspos,trueneg,false_pos_rate))
    ifelse(iter == 1,out_conftable <-sumtable,
                     out_conftable <- rbind(out_conftable, sumtable))
      }
  #  print(out_conftable)
   return(out_conftable)
}
##----------------------------------------------------------------------------------------------##


##################################################################################################
# Confusion Tables
# conf_table(predp,true)
# predp = the prediected probability of y =1 
#
conf_table_old <- function(predp,true,estname) {
  for (iter in 1:9) {
  prob    <- iter/10
  if(!(require(SDMTools)))install.packages("SDMTools")
  t1 <- confusion.matrix(true,predp, threshold= prob ) 
  truepos <- t1[2,2]
  trueneg <- t1[1,1]
  falspos <- t1[2,1]
  falsneg <- t1[1,2]
  true_total <- truepos+falsneg
  false_total <- trueneg+falspos
  detection_rate  <- round(truepos/(truepos+falsneg),digits=4)
  false_pos_rate  <- round(falspos/(trueneg+falspos),digits=4)
  sumtable <- as.data.frame(cbind(estname,prob,true_total,truepos,falsneg,detection_rate,
                    false_total,falspos,trueneg,false_pos_rate))
  ifelse(iter ==1, out_conftable <- sumtable, out_conftable <- rbind(out_conftable,sumtable))
  }
#  print(out_conftable)
  return(out_conftable)
}
##----------------------------------------------------------------------------------------------##

##################################################################################################
# Min Max Normalization
# ML data Normalization
# min_max_nor(data)

min_max_nor <- function(data) {
  nk <- ncol(data)
  zdata <- data
  for (iter in 1:nk) {
    x  <- data[,iter]
    zdata[,iter] <- (x - min(x))/(max(x) - min(x))
  }
  return(zdata)
}
##----------------------------------------------------------------------------------------------##

##################################################################################################
#  Neural Network Regression Analysis 
#  nnnt_reg(zindata,seednn,hid) 
#  nnnt_reg(seednn=12345,hid=c(5,2),zindata=df)
#   seednn = seed number for sampling
#   hid = number of hidden layers 
#   zindata = scaled data using min_max (assumes y variable as dependent)
#             make sure remove the original y variable from the data

nnet_reg <- function(zindata,seednn,hid) {
  # Train and Test Data
  set.seed(seednn)
  train_ind <- sample(nrow(zindata),round(0.7*nrow(zindata)))
  ztrain    <- zindata[train_ind,]
  ztest     <- zindata[-train_ind,]
  
  # Neural Network Model 
  if (!(require(neuralnet))) install.packages ("neuralnet")
  nnet1 <- neuralnet(y ~., data=ztrain, hidden=hid, linear.output = FALSE, stepmax = 1000000)
  # Confusion Matrix & Misclassification Error - test data
  plot(nnet1)
  nnetp   <- as.vector(predict(nnet1,ztest))
  truey    <-  as.vector(ztest$y)
  
  rmse    <- sqrt(mean((truey-nnetp)^2))
  mape    <- mean(abs(truey-nnetp)/truey)
  rsq     <- 1 - (sum((truey-nnetp)^2) / sum((truey-mean(truey))^2))
 
  # Output Setting and Return 
  ifelse(is.na(hid[2])==TRUE, hid[2] <- 0, 0 )
  ybinary  <- 0
  out <- cbind(ybinary,seednn,hid[1],hid[2],rmse,rsq)
  colnames(out)[3] <- "Hidden1" 
  colnames(out)[4] <- "Hidden2"  
  colnames(out)[5] <- "RMSE"  
  colnames(out)[6] <- "RSQ"  
  print(out)
  return(out)
}
##--------------------------------------------------------------------------------------------##

##################################################################################################
##----------------------------------------------------------------------------------------------##
#  Neural Network Binary Regression Analysis 
#  nnet_binary(zindata,seednn,hid) 
#  nnet_binaryt(seednn=12345,hid=c(5,2),zindata=df)
#   seednn = seed number for sampling
#   hid = number of hidden layers 
#   zindata = scaled data using min_max (assumes y variable as dependent)
#             make sure remove the original y variable from the data
#             y has to be covered to a factor variable

nnet_binary <- function(zindata,seednn,hid) {
  # Train and Test Data
  set.seed(seednn)
  zindata$y   <- as.factor(zindata$y) 
  train_ind <- sample(nrow(zindata),round(0.7*nrow(zindata)))
  ztrain    <- zindata[train_ind,]
  ztest     <- zindata[-train_ind,]
  print("Neural Network Model Hidden Layer:") 
  print(hid)
  # Neural Network Model 
  if (!(require(neuralnet))) install.packages ("neuralnet")
  nnet1 <- neuralnet(y ~., data=ztrain, hidden=hid, linear.output = FALSE, stepmax = 1000000,threshold=0.0001)
  plot(nnet1)
  # Confusion Matrix & Misclassification Error - test data
  nnetp   <- predict(nnet1,ztest)
  nnetp   <- nnetp[,2]
  detach(package:neuralnet,unload = T)             # detach due to conflicts with ROCR in prediction
  
  # AUC calculation  
  if (!(require(ROCR))) install.packages ("ROCR")
  library(ROCR)
  pred <- prediction(nnetp, ztest$y)
  auc <- performance(pred, measure = "auc")
  auc <- auc@y.values[[1]]
  auc <- round(auc,digits=5)
  
  # Output Setting and Return 
  ifelse(is.na(hid[2])==TRUE, hid[2] <- 0, 0 )
  detach(package:ROCR,unload = T)
  ybinary  <- 1
  out <- cbind(ybinary,seednn,hid[1],hid[2],auc,0)
  colnames(out)[3] <- "Hidden1" 
  colnames(out)[4] <- "Hidden2"  
  colnames(out)[5] <- "AUC"  
  print(out)
  return(out)
}
##--------------------------------------------------------------------------------------------##

##################################################################################################
##--------------------------------------------------------------------------------------------##
# Neural Network batch looping for searching the best hidden layers
# nnet_search(zindata,seed0,niter,maxh1,binary)
#           zindata =  scaled data with y variable without the orginal dependent variable
#           seed0   =  starding seed number
#           niter   =  number of iteration of test
#           maxh1   =  maximum number of hidden layers for NN
#           binary  =  1 : binary choice model  0 : regular regression model 
  
nnet_search <- function(zindata,seed0,niter,maxh1,binary) {
all_out <- matrix(nrow=1,ncol=6,c(0,0,0,0,0,0)) 
  
begintime <- Sys.time()
for (iter in 1:niter) {
 tryCatch({
  seednn <- round(rnorm(mean=seed0,sd=10000,n=1)) 
  set.seed(seednn)
  print(iter)
  for (i in 2:maxh1) {
    i1 <- i
    for (j in 0:i1) {
      j1 <- j
      ifelse(j1 > 0,hid <- c(i1,j1), hid <-c(i1))
         ifelse(binary==1, 
                          out <-  nnet_binary(zindata,seednn,hid),
                          out <-  nnet_reg(zindata,seednn,hid))
      all_out <- rbind(all_out,out)
    }
   }
 }, error=function(e){})
}
all_out <- all_out[-1,]
ifelse(binary==1,
   all_out <- all_out[order(all_out[,5],decreasing=TRUE),],
   all_out <- all_out[order(all_out[,5],decreasing=FALSE),])

all_out <- as.data.frame(all_out)
endtime <- Sys.time()
Duration <- endtime - begintime
print(Duration)

return(all_out)
}
##--------------------------------------------------------------------------------------------##


##################################################################################################
#  Neural Network Binary Estimation using neuralnet package 
#  nnet_bin_est(zindata,randseed,hid) 
#   randseed = seed number for sampling
#   hid = number of hidden layers 
#   zindata = scaled data using min_max (assumes y variable as dependent)
#             make sure remove the original y variable from the data
#             y has to be covered to a factor variable
# 
nnet_bin_est <- function(zindata,randseed,hid) {
set.seed(randseed)
train_ind <- sample(nrow(zindata),round(0.7*nrow(zindata)))
ztrain    <- zindata[train_ind,]
ztest     <- zindata[-train_ind,]
if (!(require(neuralnet))) install.packages ("neuralnet")
nnet1 <- neuralnet(y ~., data=ztrain, hidden=hid, linear.output = FALSE, stepmax = 1000000)
plot(nnet1)

# Confusion Matrix & Misclassification Error - test data
nnetp   <- predict(nnet1,ztest)
nnetp   <- nnetp[,2]
conf <- conf_table(nnetp,ztest$y,"NeuralNet")
print(conf)
detach(package:neuralnet,unload = T)
if (!(require(ROCR))) install.packages ("ROCR")
pred <- prediction(nnetp, ztest$y)
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
auc <- round(auc,digits=5)
auc_plot(nnetp,ztest$y,"Neural Network Model")
ifelse(is.na(hid[2])==TRUE, hid[2] <- 0, 0 )
detach(package:ROCR,unload = T)
return(nnet1) 
}
##--------------------------------------------------------------------------------------------##