##################################################################################################
###                                 Supplementary material to                                  ###
### A new confidence interval based on the theory of U-statistics for the area under the curve ###
###                 by J. Kampf, L. Vogel, I. Dykun, T. Rassaf and A. Mahabadi                 ###
##################################################################################################


### Here we simulate the AUC for the 2nd LASSO model (the one with "skew" beta_0)
### (Table 12, 13 and 14 of the article)

setwd("C:/Users/juekampf/Documents/AUC/Modern_Stochastics")
source("AUC_CI.R")

library(glmnet)

p=c(100,100)
m=c(800, 8000)
n=c(200, 2000)

N_sim_A=10^8          # Number of simulations for A_2 
N_sim_fitted = 10^6   # Number of simulations for A_1
N_conf =10^4          # Number of different realizations of the confidence interval

covered     = array(0, dim=c(4,2))   # Array to store the coverage probabilities for A_2
covered_f   = array(0, dim=c(4,2))   # Array to store the coverage probabilities for A_1
mean_length = array(0, dim=c(4,2))   # Array to store the mean length of the
                                     # confidence intervals


set.seed(2)

### Let us simulate the theoretical AUC A_1 = P(Y_1<Y_2 | I_1=0, I_2=1 )
random_numbers= cbind( array(rnorm(2*N_sim_A), dim=c(N_sim_A,2)),
                       array(runif(2*N_sim_A), dim=c(N_sim_A,2)))
I_1=((exp(random_numbers[,1])/(exp(random_numbers[,1])+1))<random_numbers[,3])
I_2=((exp(random_numbers[,2])/(exp(random_numbers[,2])+1))>random_numbers[,4])
I_3= I_1 * I_2 * (random_numbers[,1]<random_numbers[,2])
AUC_theoretical = mean(I_3)/(mean(I_1*I_2))


# Now simulate the confidence intervals

for(k in (1:2)){
  
  beta_pre = ((1:p[k])-p[k]/2)
  beta_0 = beta_pre/sqrt(sum(beta_pre^2))
  
  interval =array(0, dim=c(4,N_conf,2)) # Array to store the confidence intervals
                                        # First component:  Which algorithm?
                                        # Second component: Which simulation run?
                                        # Third component: Which endpoint?
  AUC_fitted = rep(0, N_conf)           # Array to store the "fitted" AUC A_1
                                        # (Notice that unlike A_2 this depends on i)
  
  for(i in (1:N_conf)){
    
    set.seed(i)    
    
    # Simulate multivariate normal distributed design points and a logistic
    # regression model based on them
    design_training = array(rnorm(p[k]*m[k]), dim=c(m[k],p[k]))
    design_test     = array(rnorm(p[k]*n[k]), dim=c(n[k],p[k]))
    
    U_training = runif(m[k])
    U_test     = runif(n[k])
    
    Y_training = exp((design_training%*%beta_0)[,1])/(exp((design_training%*%beta_0)[,1])+1)
    Y_test     = exp((design_test%*%beta_0)    [,1])/(exp((design_test%*%beta_0)    [,1])+1)
    
    target_training = as.double(U_training<Y_training)
    target_test     = as.double(U_test    <Y_test)
    
    # Estimate the parameters of the model just simulated
    model1 <- glmnet(x=as.matrix(design_training), y=target_training, family=binomial,
                     lambda=0.05)
    fitted_test <- predict(model1, newx=as.matrix(design_test))
    
    # Calculate the "intermediate" AUC A_2
    design_calc_1  = array( rnorm(2*N_sim_fitted), dim=c(N_sim_fitted, 2) )
    design_calc_2  = array( rnorm(2*N_sim_fitted), dim=c(N_sim_fitted, 2) )
    random_numbers = array( runif(2*N_sim_fitted), dim=c(N_sim_fitted, 2) )
    I_1 = ((exp(design_calc_1[,1])/(exp(design_calc_1[,1])+1))<random_numbers[,1])
    I_2 = ((exp(design_calc_2[,1])/(exp(design_calc_2[,1])+1))>random_numbers[,2])
    hat_beta_1 <- sum(as.double(model1$beta)*beta_0)
    hat_beta_2 <- sqrt(sum(as.double(model1$beta)^2)-hat_beta_1^2)
    fitted_calc_1 <- hat_beta_1 * design_calc_1[,1] + hat_beta_2 *design_calc_1[,2]
    fitted_calc_2 <- hat_beta_1 * design_calc_2[,1] + hat_beta_2 *design_calc_2[,2]
    I_3= I_1 * I_2 * (fitted_calc_1<fitted_calc_2)
    AUC_fitted[i] = mean(I_3)/(mean(I_1*I_2))
    
    
    interval[1,i,]=AUC_new2(target_test, fitted_test[,1])
    interval[2,i,]=AUC_new3(target_test, fitted_test[,1])
    interval[3,i,]=AUC_DeLong(target_test, fitted_test[,1])
    interval[4,i,]=AUC_MWald(target_test, fitted_test[,1])
  }
  
  # Test, whether the programm works alright
  print(k)
  if(sum(is.na(interval))>0){
    stop(paste(k,i,which(is.na(interval))))
  }
  
  
  covered[,k]  = apply( (interval[,,1]<AUC_theoretical) & (interval[,,2]>AUC_theoretical),
                        1, mean)
  
  for(M in (1:4)){
    covered_f[M,k]  = mean( (interval[M,,1]<AUC_fitted) & (interval[M,,2]>AUC_fitted))
  }
  
  mean_length[,k] =apply(interval[,,2]-interval[,,1], 1, mean)
}#k



# Storing the results
# Store the results for A_2
covered_df=as.data.frame( cbind (c(" ", " ", "Corollary 2", "Corollary 3",
                                   "DeLong", "Modified Wald"),
                                 rbind(p, m+n, format(covered, digits=3))))
# Store the results for A_1
covered_f_df=as.data.frame( cbind (c(" ", " ", "Corollary 2", "Corollary 3",
                                     "DeLong", "Modified Wald"),
                                   rbind(p, m+n, format(covered_f, digits=3))))
# Store the mean length of the confidence intervals
mean_length_df=as.data.frame( cbind (c(" ", " ", "Corollary 2", "Corollary 3",
                                       "DeLong", "Modified Wald"),
                                     rbind(p, m+n, format(mean_length, digits=3))))

library(xtable)

xtable(covered_df)

xtable(covered_f_df)

xtable(mean_length_df)


