##################################################################################################
###                                 Supplementary material to                                  ###
### A new confidence interval based on the theory of U-statistics for the area under the curve ###
###                 by J. Kampf, L. Vogel, I. Dykun, T. Rassaf and A. Mahabadi                 ###
##################################################################################################


### Here we present the simulations for the logistic regression model 
### (Table 3, 4 and 5 of the article)


######################################################
####                    WARNING!                  ####
#### The following program will run for 19 hours  ####
#### on a 2.5 GHz computer.                       ####
#### If you do not have that much time, choose    ####
#### a smaller value for the parameter N_conf.    ####
######################################################

source("AUC_CI.R")

p=c(10,10,10,100,100)
m=c(80, 800, 8000, 800, 8000)
n=c(20, 200, 2000, 200, 2000)

N_sim_A=10^8          # Number of simulations for A_2 
N_sim_fitted = 10^6   # Number of simulations for A_1
N_conf =10^4          # Number of different realizations of the confidence interval

covered     = array(0, dim=c(4,5))   # Array to store the coverage probabilities for A_2
covered_f   = array(0, dim=c(4,5))   # Array to store the coverage probabilities for A_1
mean_length = array(0, dim=c(4,5))   # Array to store the mean length of the
                                     # confidence intervals


set.seed(2)

### Let us simulate the theoretical AUC A_1 = P(Y_1<Y_2 | I_1=0, I_2=1 )
random_numbers= cbind( array(rnorm(2*N_sim_A), dim=c(N_sim_A,2)),
                       array(runif(2*N_sim_A), dim=c(N_sim_A,2)))
I_1=((exp(random_numbers[,1])/(exp(random_numbers[,1])+1))<random_numbers[,3])
I_2=((exp(random_numbers[,2])/(exp(random_numbers[,2])+1))>random_numbers[,4])
I_3= I_1 * I_2 * (random_numbers[,1]<random_numbers[,2])
AUC_theoretical = mean(I_3)/(mean(I_1*I_2))


# Now simulate the confidence intervals

for(k in (1:5)){
  
  interval =array(0, dim=c(4,N_conf,2)) # Array to store the confidence intervals
                                        # First component:  Which algorithm?
                                        # Second component: Which simulation run?
                                        # Third component: Which endpoint?
  AUC_fitted = rep(0, N_conf)           # Array to store the "fitted" AUC A_1
                                        # (Notice that unlike A_2 this depends on i)
  
  for(i in (1:N_conf)){
    
    set.seed(i)    
    
    # Simulate multivariate normal distributed design points and a logistic
    # regression model based on them
    design_training = data.frame(array(rnorm(p[k]*m[k]), dim=c(m[k],p[k])))
    design_test     = data.frame(array(rnorm(p[k]*n[k]), dim=c(n[k],p[k])))
    
    U_training = runif(m[k])
    U_test     = runif(n[k])
    
    Y_training = exp(design_training[,1])/(exp(design_training[,1])+1)
    Y_test     = exp(design_test    [,1])/(exp(design_test    [,1])+1)
    
    target_training = as.double(U_training<Y_training)
    target_test     = as.double(U_test    <Y_test)
    
    # Estimate the parameters of the model just simulated
    model1 <- glm(target_training~0+., data=design_training, family=binomial)
    fitted_test <- predict(model1, newdata=data.frame(design_test))
    
    # Calculate the "intermediate" AUC A_2
    design_calc_1  = array( rnorm(2*N_sim_fitted), dim=c(N_sim_fitted, 2) )
    design_calc_2  = array( rnorm(2*N_sim_fitted), dim=c(N_sim_fitted, 2) )
    random_numbers = array( runif(2*N_sim_fitted), dim=c(N_sim_fitted, 2) )
    I_1 = ((exp(design_calc_1[,1])/(exp(design_calc_1[,1])+1))<random_numbers[,1])
    I_2 = ((exp(design_calc_2[,1])/(exp(design_calc_2[,1])+1))>random_numbers[,2])
    hat_beta_1 <- model1$coefficients[1]
    hat_beta_2 <- sqrt(sum(model1$coefficients[-1]^2))
    fitted_calc_1 <- hat_beta_1 * design_calc_1[,1] + hat_beta_2 *design_calc_1[,2]
    fitted_calc_2 <- hat_beta_1 * design_calc_2[,1] + hat_beta_2 *design_calc_2[,2]
    I_3= I_1 * I_2 * (fitted_calc_1<fitted_calc_2)
    AUC_fitted[i] = mean(I_3)/(mean(I_1*I_2))
    
    
    interval[1,i,]=AUC_new2(target_test, fitted_test)
    interval[2,i,]=AUC_new3(target_test, fitted_test)
    interval[3,i,]=AUC_DeLong(target_test, fitted_test)
    interval[4,i,]=AUC_MWald(target_test, fitted_test)
  }
  
  # Test, whether the programm works alright
  print(k)
  if(sum(is.na(interval))>0){
    stop(paste(k,i,which(is.na(interval))))
  }
  
  
  covered[,k]  = apply( (interval[,,1]<AUC_theoretical) & (interval[,,2]>AUC_theoretical),
                        1, mean)
  
  for(M in (1:4)){
    covered_f[M,k]  = mean( (interval[M,,1]<AUC_fitted) & (interval[M,,2]>AUC_fitted))
  }
  
  mean_length[,k] =apply(interval[,,2]-interval[,,1], 1, mean)
}#k



# Storing the results
# Store the results for A_2
covered_df=as.data.frame( cbind (c(" ", " ", "Corollary 2", "Corollary 3",
                                   "DeLong", "Modified Wald"),
                                 rbind(p, m+n, format(covered, digits=3))))
# Store the results for A_1
covered_f_df=as.data.frame( cbind (c(" ", " ", "Corollary 2", "Corollary 3",
                                     "DeLong", "Modified Wald"),
                                   rbind(p, m+n, format(covered_f, digits=3))))
# Store the mean length of the confidence intervals
mean_length_df=as.data.frame( cbind (c(" ", " ", "Corollary 2", "Corollary 3",
                                       "DeLong", "Modified Wald"),
                                     rbind(p, m+n, format(mean_length, digits=3))))

library(xtable)

xtable(covered_df)

xtable(covered_f_df)

xtable(mean_length_df)
