##################################################################################################
###                                 Supplementary material to                                  ###
### A new confidence interval based on the theory of U-statistics for the area under the curve ###
###                 by J. Kampf, L. Vogel, I. Dykun, T. Rassaf and A. Mahabadi                 ###
##################################################################################################


### Here we present the simulations for the logistic regression model 
### (bias and standard deviation for the AUC)
### (Table 16 of the article)

library(pROC)

p=c(10,10,10,100,100)
m=c(80, 800, 8000, 800, 8000)
n=c(20, 200, 2000, 200, 2000)

N_sim_A=10^8          # Number of simulations for A_2 
N_sim_fitted = 10^6   # Number of simulations for A_1
N_conf =10^4          # Number of different realizations of the AUC

A_hat_mean = rep(0,5) # array to store the biases
A1 = rep(0,5)
stdv = rep(0,5) # array to store the standard deviations


set.seed(2)

### Let us simulate the theoretical AUC A_1 = P(Y_1<Y_2 | I_1=0, I_2=1 )
random_numbers= cbind( array(rnorm(2*N_sim_A), dim=c(N_sim_A,2)),
                       array(runif(2*N_sim_A), dim=c(N_sim_A,2)))
I_1=((exp(random_numbers[,1])/(exp(random_numbers[,1])+1))<random_numbers[,3])
I_2=((exp(random_numbers[,2])/(exp(random_numbers[,2])+1))>random_numbers[,4])
I_3= I_1 * I_2 * (random_numbers[,1]<random_numbers[,2])
AUC_theoretical = mean(I_3)/(mean(I_1*I_2))


# Now simulate the confidence intervals

for(k in (1:5)){
  
  AUC_empirical = rep(0, N_conf) # Array to store the confidence empirical AUC 

    AUC_fitted = rep(0, N_conf)           # Array to store the "fitted" AUC A_1
                                        # (Notice that unlike A_2 this depends on i)
  
  for(i in (1:N_conf)){
    
    set.seed(i)    
    
    # Simulate multivariate normal distributed design points and a logistic
    # regression model based on them
    design_training = data.frame(array(rnorm(p[k]*m[k]), dim=c(m[k],p[k])))
    design_test     = data.frame(array(rnorm(p[k]*n[k]), dim=c(n[k],p[k])))
    
    U_training = runif(m[k])
    U_test     = runif(n[k])
    
    Y_training = exp(design_training[,1])/(exp(design_training[,1])+1)
    Y_test     = exp(design_test    [,1])/(exp(design_test    [,1])+1)
    
    target_training = as.double(U_training<Y_training)
    target_test     = as.double(U_test    <Y_test)
    
    # Estimate the parameters of the model just simulated
    model1 <- glm(target_training~0+., data=design_training, family=binomial)
    fitted_test <- predict(model1, newdata=data.frame(design_test))
    
    # Calculate the "intermediate" AUC A_1
    design_calc_1  = array( rnorm(2*N_sim_fitted), dim=c(N_sim_fitted, 2) )
    design_calc_2  = array( rnorm(2*N_sim_fitted), dim=c(N_sim_fitted, 2) )
    random_numbers = array( runif(2*N_sim_fitted), dim=c(N_sim_fitted, 2) )
    I_1 = ((exp(design_calc_1[,1])/(exp(design_calc_1[,1])+1))<random_numbers[,1])
    I_2 = ((exp(design_calc_2[,1])/(exp(design_calc_2[,1])+1))>random_numbers[,2])
    hat_beta_1 <- model1$coefficients[1]
    hat_beta_2 <- sqrt(sum(model1$coefficients[-1]^2))
    fitted_calc_1 <- hat_beta_1 * design_calc_1[,1] + hat_beta_2 *design_calc_1[,2]
    fitted_calc_2 <- hat_beta_1 * design_calc_2[,1] + hat_beta_2 *design_calc_2[,2]
    I_3= I_1 * I_2 * (fitted_calc_1<fitted_calc_2)
    AUC_fitted[i] = mean(I_3)/(mean(I_1*I_2))
    
    
    AUC_empirical[i]=auc(target_test, fitted_test)
  }
  
  # Test, whether the programm works alright
  print(k)
  
  A_hat_mean[k] = mean(AUC_empirical)
  A1[k] = mean(AUC_fitted)
  stdv[k] = sd(AUC_empirical)
}#k



# Storing the results
df=as.data.frame( cbind (c(" ", " ", "mean of A" , "A1", "A2", 
                           "bias to target A1", "bias to target A2",
                                   "standard deviation"),
                         rbind(p, m+n, format(A_hat_mean, digits=3), 
                               format(A1, digits=3), 
                               format(AUC_theoretical, digits=3),
                               format(abs(A_hat_mean-A1), digits=3),
                               format(abs(A_hat_mean-AUC_theoretical), digits=3), 
                               format(stdv, digits = 3))))

library(xtable)

xtable(df)
