stat_data_analysis_rcode

Statistical Data Analysis - Fall 2018
Log | Files | Refs | README

R-IC10.R (13263B)


      1 #*******************************************************#
      2 #*******************************************************#
      3 # 		         STAT320 - IC10 Solutions	      	  	    #
      4 #*******************************************************#
      5 #*******************************************************#
      6 #***************************************************#
      7 #           Type I/Power for Two Sample Means       #  
      8 #***************************************************#
      9 
     10 #***************************************************#
     11 #***************************************************#
     12 #***************************************************#
     13 #***************************************************#
     14 #***************************************************#
     15 
     16 #***************************************************#
     17 # Means P-value
     18 #***************************************************#
     19 set.seed(3697)
     20 #***************************************************#
     21 
     22 two.mean.perm.pval.f = function(xsampA,xsampB,LowTail = TRUE,NSim=100000)
     23 {
     24   # Compute Sizes of each sample
     25   nA = length(xsampA)
     26   nB = length(xsampB)
     27   
     28   xobs = mean(xsampA)-mean(xsampB)
     29   
     30   # Combine Samples to one super sample
     31   supersample = c(xsampA,xsampB)
     32   
     33   # Create object where simulated DIFFERENCES of averages will be stored
     34   ODiffBars = numeric(NSim)
     35   
     36   # Repeat simulations NSim times
     37   for(i in 1:NSim)
     38   {
     39     shuffledsupersample = sample(supersample)
     40     
     41     # Generate sample 1 by resampling super Sample WITHOUT replacement
     42     # in practice we select the first nA elements of the shuffled supersample
     43     NewSampleA = shuffledsupersample[1:nA]
     44     
     45     # Assign remaining values in supersample to Sample B
     46     # that is the elements (nA+1) through (nA+nB)
     47     NewSampleB  = shuffledsupersample[nA+1:nB]
     48 
     49     # Save difference in means simulated samples
     50     ODiffBars[i] = mean(NewSampleA) - mean(NewSampleB )
     51   }
     52   # Compute p-value
     53   if(LowTail==TRUE)
     54   {
     55     # if alternative Ha is mu < mu0
     56     p = sum(ODiffBars<=xobs)/NSim
     57   }else
     58   {
     59     # if alternative Ha is mu > mu0
     60     p = sum(ODiffBars>=xobs)/NSim
     61   }
     62   return(p) 
     63 }
     64 
     65 
     66 two.mean.perm.recenter.pval.f = function(xsampA,xsampB,LowTail = TRUE,NSim=100000)
     67 {
     68   # Compute Sizes of each sample
     69   nA = length(xsampA)
     70   nB = length(xsampB)
     71   
     72   xobs = mean(xsampA)-mean(xsampB)
     73   
     74   # center samples at 0 by subtracting the means 
     75   xsampA = xsampA - mean(xsampA)
     76   
     77   xsampB = xsampB - mean(xsampB)
     78   
     79   # Combine Samples to one super sample
     80   supersample = c(xsampA,xsampB)
     81   
     82   # Create object where simulated DIFFERENCES of averages will be stored
     83   ODiffBars = numeric(NSim)
     84   
     85   # Repeat simulations NSim times
     86   for(i in 1:NSim)
     87   {
     88     shuffledsupersample = sample(supersample)
     89     
     90     # Generate sample 1 by resampling super Sample WITHOUT replacement
     91     # in practice we select the first nA elements of the shuffled supersample
     92     NewSampleA = shuffledsupersample[1:nA]
     93     
     94     # Assign remaining values in supersample to Sample B
     95     # that is the elements (nA+1) through (nA+nB)
     96     NewSampleB  = shuffledsupersample[nA+1:nB]
     97     
     98     # Save difference in means simulated samples
     99     ODiffBars[i] = mean(NewSampleA) - mean(NewSampleB )
    100   }
    101   # Compute p-value
    102   if(LowTail==TRUE)
    103   {
    104     # if alternative Ha is mu < mu0
    105     p = sum(ODiffBars<=xobs)/NSim
    106   }else
    107   {
    108     # if alternative Ha is mu > mu0
    109     p = sum(ODiffBars>=xobs)/NSim
    110   }
    111   return(p) 
    112 }
    113 
    114 
    115 two.mean.boot.pval.f = function(xsampA,xsampB,LowTail = TRUE,NSim=100000)
    116 {
    117   # Compute Sizes of each sample
    118   nA = length(xsampA)
    119   nB = length(xsampB)
    120   
    121   xobs = mean(xsampA)-mean(xsampB)
    122   
    123   # Combine Samples to one super sample
    124   supersample = c(xsampA,xsampB)
    125   
    126   # Create object where simulated DIFFERENCES of averages will be stored
    127   ODiffBars = numeric(NSim)
    128   
    129   # Repeat simulations NSim times
    130   for(i in 1:NSim)
    131   {
    132     shuffledsupersample = sample(supersample)
    133     
    134     # Generate sample 1 by resampling super Sample WITH replacement
    135     NewSampleA = sample(supersample,size = nA,replace = TRUE) 
    136     
    137     # Generate sample 2 by resampling super Sample WITH replacement
    138     NewSampleB = sample(supersample,size = nB,replace = TRUE) 
    139     
    140     # Save difference in means simulated samples
    141     ODiffBars[i] = mean(NewSampleA) - mean(NewSampleB )
    142   }
    143   # Compute p-value
    144   if(LowTail==TRUE)
    145   {
    146     # if alternative Ha is mu < mu0
    147     p = sum(ODiffBars<=xobs)/NSim
    148   }else
    149   {
    150     # if alternative Ha is mu > mu0
    151     p = sum(ODiffBars>=xobs)/NSim
    152   }
    153   return(p) 
    154 }
    155 
    156 
    157 two.mean.boot.recenter.pval.f = function(xsampA,xsampB,LowTail = TRUE,NSim=100000)
    158 {
    159   # Compute Sizes of each sample
    160   nA = length(xsampA)
    161   nB = length(xsampB)
    162   
    163   xobs = mean(xsampA)-mean(xsampB)
    164   
    165   # center samples at 0 by subtracting the means 
    166   xsampA = xsampA - mean(xsampA)
    167   
    168   xsampB = xsampB - mean(xsampB)
    169   
    170   # Combine Samples to one super sample
    171   supersample = c(xsampA,xsampB)
    172   
    173   # Create object where simulated DIFFERENCES of averages will be stored
    174   ODiffBars = numeric(NSim)
    175   
    176   # Repeat simulations NSim times
    177   for(i in 1:NSim)
    178   {
    179     shuffledsupersample = sample(supersample)
    180     
    181     # Generate sample 1 by resampling super Sample WITH replacement
    182     NewSampleA = sample(supersample,size = nA,replace = TRUE) 
    183     
    184     # Generate sample 2 by resampling super Sample WITH replacement
    185     NewSampleB = sample(supersample,size = nB,replace = TRUE) 
    186     
    187     # Save difference in means simulated samples
    188     ODiffBars[i] = mean(NewSampleA) - mean(NewSampleB )
    189   }
    190   # Compute p-value
    191   if(LowTail==TRUE)
    192   {
    193     # if alternative Ha is mu < mu0
    194     p = sum(ODiffBars<=xobs)/NSim
    195   }else
    196   {
    197     # if alternative Ha is mu > mu0
    198     p = sum(ODiffBars>=xobs)/NSim
    199   }
    200   return(p) 
    201 }
    202 
    203 
    204 #***************************************************#
    205 # Means CI
    206 #***************************************************#
    207 
    208 two.mean.CI.f = function(xsampA,xsampB,alpha=.05,NSim=100000)
    209 {
    210   # Compute Sizes of each sample
    211   nA = length(xsampA)
    212   nB = length(xsampB)
    213   
    214   # Create object where simulated DIFFERENCES of averages will be stored
    215   ODiffBars = numeric(NSim)
    216   
    217   # Repeat simulations NSim times
    218   for(i in 1:NSim)
    219   {
    220     # Generate sample 1 by resampling original sample WITH replacement 
    221     NewSampleA = sample(x= xsampA,size = nA,replace = TRUE) 
    222     
    223     # Generate sample 2 by resampling original sample WITH replacement 
    224     NewSampleB  = sample(x= xsampB,size = nB,replace = TRUE) 
    225     
    226     # Save difference in means simulated samples
    227     ODiffBars[i] = mean(NewSampleA) - mean(NewSampleB)
    228   }
    229   
    230   # Compute CI
    231   ci =quantile(ODiffBars,probs = c(alpha/2,1-alpha/2))
    232   return(ci) 
    233 }
    234 
    235 
    236 
    237 #***************************************************#
    238 #***************************************************#
    239 #***************************************************#
    240 #***************************************************#
    241 #***************************************************#
    242 #***************************************************#
    243 # Type I Simulation Study - Case 1: Normal
    244 #***************************************************#
    245 #***************************************************#
    246 #***************************************************#
    247 #***************************************************#
    248 #***************************************************#
    249 #***************************************************#
    250 
    251 NSTudyReps = 10000
    252 NSim = 10000
    253 
    254 nA = 35
    255 nB = 35
    256 LowTail = TRUE
    257 
    258 PermNullPVals = numeric(NSTudyReps)
    259 PermRecNullPVals = numeric(NSTudyReps)
    260 BootNullPVals = numeric(NSTudyReps)
    261 BootRecNullPVals = numeric(NSTudyReps)
    262 
    263 #***************************************************#
    264 # The true distributionin the Common Population:
    265 # We randomly select this. We should explore different 
    266 # options. These options will address two components:
    267 # 1) Shape of distribution (Normal, Uniform, Exponential, etc.)
    268 # 2) Parameters of distribution (Mean, SD, lambda, etc.)
    269 #***************************************************#
    270 
    271 
    272 #***************************************************#
    273 
    274 start_time <- Sys.time()
    275 
    276 for(s in 1:NSTudyReps)
    277 {
    278   # Generating Samples A and B Under the Null Using the given distribution
    279   
    280 
    281   
    282   
    283   
    284   
    285   
    286   
    287   # p-val using permutation method
    288   PermNullPVals[s] = two.mean.perm.pval.f(SampA,SampB,LowTail = LowTail,NSim=NSim)
    289   
    290   # p-val using recentering permutation method
    291   PermRecNullPVals[s] = two.mean.perm.recenter.pval.f(SampA,SampB,LowTail = LowTail,NSim=NSim)
    292 
    293     # p-val using bootstrap method
    294   BootNullPVals[s] = two.mean.boot.pval.f(SampA,SampB,LowTail = LowTail,NSim=NSim)
    295   
    296   # p-val using recentering bootstrap method
    297   BootRecNullPVals[s] = two.mean.boot.recenter.pval.f(SampA,SampB,LowTail = LowTail,NSim=NSim)
    298   
    299   cat(s,"\n")
    300 }
    301 
    302 end_time <- Sys.time()
    303 
    304 end_time - start_time
    305 
    306 # Compute the percentage of replicates that p-value less than
    307 # given alpha
    308 
    309 a=.05
    310 sum(PermNullPVals<=a)/NSTudyReps
    311 sum(PermRecNullPVals<=a)/NSTudyReps
    312 sum(BootNullPVals<=a)/NSTudyReps
    313 sum(BootRecNullPVals<=a)/NSTudyReps
    314 
    315 
    316 #***************************************************#
    317 #***************************************************#
    318 #***************************************************#
    319 #***************************************************#
    320 #***************************************************#
    321 # Power Simulation Study 
    322 #***************************************************#
    323 #***************************************************#
    324 #***************************************************#
    325 #***************************************************#
    326 #***************************************************#
    327 #***************************************************#
    328 
    329 
    330 PermPowPVals = numeric(NSTudyReps)
    331 PermRecPowPVals = numeric(NSTudyReps)
    332 BootPowPVals = numeric(NSTudyReps)
    333 BootRecPowPVals = numeric(NSTudyReps)
    334 
    335 #***************************************************#
    336 # The true distributionin the Two Populations:
    337 # We randomly select this. We should explore different 
    338 # options. These options will address two components:
    339 # 1) Shape of distribution (Normal, Uniform, Exponential, etc.)
    340 # 2) Parameters of distribution (Mean, SD, lambda, etc.)
    341 #***************************************************#
    342 
    343 # Case 1: Normal
    344 
    345 
    346 #***************************************************#
    347 
    348 start_time <- Sys.time()
    349 
    350 for(s in 1:NSTudyReps)
    351 {
    352   # Generating Samples A and B Under the Alternative 
    353 
    354   
    355   
    356   
    357     
    358   # p-val using permutation method
    359   PermPowPVals[s] = two.mean.perm.pval.f(SampA,SampB,LowTail = LowTail,NSim=NSim)
    360   
    361   # p-val using permutation method
    362   PermRecPowPVals[s] = two.mean.perm.recenter.pval.f(SampA,SampB,LowTail = LowTail,NSim=NSim)
    363 
    364     # p-val using bootstrap method
    365   BootPowPVals[s] = two.mean.boot.pval.f(SampA,SampB,LowTail = LowTail,NSim=NSim)
    366   
    367   # p-val using recentering bootstrap method
    368   BootRecPowPVals[s] = two.mean.boot.recenter.pval.f(SampA,SampB,LowTail = LowTail,NSim=NSim)
    369   
    370   cat(s,"\n")
    371 }
    372 
    373 end_time <- Sys.time()
    374 
    375 end_time - start_time
    376 
    377 # Compute the percentage of replicates that p-value less than
    378 # given alpha. This will give us the power of the method
    379 
    380 a=.05
    381 sum(PermPowPVals<=a)/NSTudyReps
    382 sum(PermRecPowPVals<=a)/NSTudyReps
    383 sum(BootPowPVals<=a)/NSTudyReps
    384 sum(BootRecPowPVals<=a)/NSTudyReps
    385 
    386 
    387 #***************************************************#
    388 #***************************************************#
    389 #***************************************************#
    390 #***************************************************#
    391 #***************************************************#
    392 # CI Study 
    393 #***************************************************#
    394 #***************************************************#
    395 #***************************************************#
    396 #***************************************************#
    397 #***************************************************#
    398 #***************************************************#
    399 
    400 
    401 #***************************************************#
    402 # The true distributionin the Two Populations:
    403 # We randomly select this. We should explore different 
    404 # options. These options will address two components:
    405 # 1) Shape of distribution (Normal, Uniform, Exponential, etc.)
    406 # 2) Parameters of distribution (Mean, SD, lambda, etc.)
    407 #***************************************************#
    408 
    409 # Case 1: Normal
    410 
    411 
    412 a = .05 
    413 
    414 
    415 # The CI is for the difference in the two proportions 
    416 TrueDiff = muA - muB
    417 
    418 CIIncludeTrueDiff = numeric(NSTudyReps)
    419 
    420 
    421 #***************************************************#
    422 
    423 start_time <- Sys.time()
    424 
    425 for(s in 1:NSTudyReps)
    426 {
    427   # Generating Samples A and B Under the Alternative 
    428 
    429   
    430   
    431   
    432   
    433   
    434   
    435     
    436   # CI using bootstrap method
    437   CI = two.mean.CI.f(SampA,SampB,alpha=a,NSim=NSim)
    438   
    439   # Check if CI included true difference
    440   if(CI[1] <= TrueDiff && TrueDiff <= CI[2])
    441   {
    442     CIIncludeTrueDiff[s]=1
    443   }else
    444   {
    445     CIIncludeTrueDiff[s]=0
    446   }
    447   cat(s,"\n")
    448 }
    449 
    450 end_time <- Sys.time()
    451 
    452 end_time - start_time
    453 
    454 # Compute the percentage of replicates that CI included truediff
    455 
    456 sum(CIIncludeTrueDiff)/NSTudyReps
    457