Setup, Constants and Functions, Load data

suppressMessages(require(rms, quietly = TRUE, warn.conflicts = FALSE))
require(splines, quietly = TRUE)
require(plotly, quietly = TRUE, warn.conflicts = FALSE)
require(Hmisc, quietly = TRUE)
require(e1071, quietly = TRUE)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute
require(caret, quietly = TRUE)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
require(BiodiversityR, quietly = TRUE)
## This is vegan 2.4-3
## 
## Attaching package: 'vegan'
## The following object is masked from 'package:caret':
## 
##     tolerance
## The following object is masked from 'package:rms':
## 
##     calibrate
## BiodiversityR 2.8-3: Use command BiodiversityRGUI() to launch the Graphical User Interface and to learn about backward compatibility
require(logistf, quietly = TRUE)
require(rmarkdown, quietly = TRUE)
## Warning: package 'rmarkdown' was built under R version 3.4.2

Constants and Functions

# Global thresholds read by the helper functions.
# NOTE(review): NA_THRESHOLD is presumably the tolerated share of missing
# values and CORR_THRESHOLD the correlation cutoff (the reduction step prints
# "CorrelationCutoff: 0.7") -- confirm against the sourced helper scripts.
NA_THRESHOLD <- 0.03
CORR_THRESHOLD <- 0.7

# Columns for which correlation is not relevant and that must not be used as
# predictors: the identification variables and the dependent variable.
# Warning: this vector is redefined before each model construction!
# Each name is listed exactly once ("Table.Name" used to appear twice).
dropToPredict <- c(
  "File.Path", "Project", "Language", "Table.Name", "Name", "Kind",
  "X..Bugs.Post", "File", "Distinct.count.of.Issue.Key.POST",
  "X..Catch", "X..Throws"
)

source(file = "construction_functions.R")
source(file = "analysis_functions.R")

Load combined data with no missing

# Restore the objects stored in the .RData file into the current environment.
# NOTE(review): `all_no_missing` (used by the repacking step) is presumably
# restored here; `projects`, `catch_names`, `try_names` and `throws_names`
# are also used later without a visible definition -- confirm whether they
# come from this file or from the sourced helper scripts.
load(file ="0-all_no_missing.RData")

Models

In total we build a number of models: one per project and per group of files (i.e. all files, files with catch blocks, files with throws, and files with both). For each group we first build a base model for reference, and then we include exception handling features according to the model construction analysis.

1- Preliminary analysis and results - All files including zero catch and zero throws

# Repack the per-project data into a list with one entry per project, each of
# the form list(project = <project name>, data = <data frame>).
# `all_no_missing` and `projects` are expected to exist already; they are not
# defined in this section.
all_list_omitted_1 <- all_no_missing

# Preallocate the result instead of growing it with c() on every iteration.
all_list_omitted <- vector("list", length(projects))

# seq_along() yields an empty sequence when `projects` is empty, whereas
# 1:length(projects) would iterate over c(1, 0).
for (i in seq_along(projects)) {
  print(paste("Project:", projects[i]))

  # Same conversion as before: single-bracket subset wrapped in
  # as.data.frame(), so the resulting column names do not change.
  temp_data <- as.data.frame(all_list_omitted_1[i])
  print(paste("nrow:", nrow(temp_data), "ncol:", ncol(temp_data)))

  # Don't do anything in here, repack only.
  all_list_omitted[[i]] <- list(project = projects[i], data = temp_data)
}
## [1] "Project: hadoop-2.6"
## [1] "nrow: 3662 ncol: 159"
## [1] "Project: hibernate-5.0"
## [1] "nrow: 3450 ncol: 159"
## [1] "Project: umbraco-7.6"
## [1] "nrow: 3083 ncol: 159"

Predictor budget estimation (MC2) and normality adjustment (MC3)

# Predictor budget estimation (MC2) and normality adjustment (MC3) on the
# repacked project list. Per project, the call prints a degrees-of-freedom
# budget followed by skewness and kurtosis values.
# NOTE(review): the return value is discarded. If the helper returns adjusted
# data rather than only printing diagnostics, that result is lost here --
# confirm against the helper's definition.
modelSelectionAndNormalityAdjustment(all_list_omitted)
## [1] "Project: hadoop-2.6 D.F. Budget: 244"
## [1] "Project: hibernate-5.0 D.F. Budget: 230"
## [1] "Project: umbraco-7.6 D.F. Budget: 205"
## [1] "Project: hadoop-2.6 skewness, 11.2075144591477"
## [1] "Project: hadoop-2.6 kurtosis, 206.311714751217"
## [1] "Project: hibernate-5.0 skewness, 5.65031532192437"
## [1] "Project: hibernate-5.0 kurtosis, 47.2644938004385"
## [1] "Project: umbraco-7.6 skewness, 7.94132944771468"
## [1] "Project: umbraco-7.6 kurtosis, 75.8810831241314"

Model 0 - BASE Only

Before looking into models that include catch-block or throws-block data, we also consider models that only have base metrics. We aim to understand the difference between BASE-only metrics and BASE + EH metrics.

Drop variables

# Model 0 (BASE only): for every project, remove all columns named in
# `catch_names`, `try_names` or `throws_names`, so that only the remaining
# (base) columns are left. The result is a list with one data frame per
# project, in the order of `projects`.
all_list_omitted_m0 <- vector("list", length(projects))

# seq_along() is safe for an empty `projects`; 1:length() is not.
for (i in seq_along(projects)) {
  print(paste("Project:", projects[i]))
  temp_data <- all_list_omitted[[i]]$data

  # One filter over the union of the three name sets is equivalent to the
  # three successive filters. drop = FALSE keeps a data frame even if a
  # single column survives.
  is_eh_column <- names(temp_data) %in%
    c(catch_names, try_names, throws_names)
  temp_data <- temp_data[, !is_eh_column, drop = FALSE]

  print(nrow(temp_data))
  print(names(temp_data))

  all_list_omitted_m0[[i]] <- temp_data
}
## [1] "Project: hadoop-2.6"
## [1] 3662
##  [1] "fileSize"                          
##  [2] "Distinct.count.of.Author.Email.PRE"
##  [3] "Distinct.count.of.Commit.Hash.PRE" 
##  [4] "Distinct.count.of.Issue.Key.POST"  
##  [5] "Distinct.count.of.Issue.Key.PRE"   
##  [6] "Churn.PRE"                         
##  [7] "AvgCyclomaticModified"             
##  [8] "AvgCyclomaticStrict"               
##  [9] "AvgCyclomatic"                     
## [10] "AvgEssential"                      
## [11] "AvgLineBlank"                      
## [12] "AvgLineCode"                       
## [13] "AvgLineComment"                    
## [14] "AvgLine"                           
## [15] "CountDeclClassMethod"              
## [16] "CountDeclClassVariable"            
## [17] "CountDeclClass"                    
## [18] "CountDeclFunction"                 
## [19] "CountDeclInstanceMethod"           
## [20] "CountDeclInstanceVariable"         
## [21] "CountDeclMethodDefault"            
## [22] "CountDeclMethodPrivate"            
## [23] "CountDeclMethodProtected"          
## [24] "CountDeclMethodPublic"             
## [25] "CountDeclMethod"                   
## [26] "CountLineBlank"                    
## [27] "CountLineCodeDecl"                 
## [28] "CountLineCodeExe"                  
## [29] "CountLineCode"                     
## [30] "CountLineComment"                  
## [31] "CountLine"                         
## [32] "CountSemicolon"                    
## [33] "CountStmtDecl"                     
## [34] "CountStmtExe"                      
## [35] "CountStmt"                         
## [36] "File"                              
## [37] "Kind"                              
## [38] "MaxCyclomaticModified"             
## [39] "MaxCyclomaticStrict"               
## [40] "MaxCyclomatic"                     
## [41] "MaxEssential"                      
## [42] "MaxNesting"                        
## [43] "Name"                              
## [44] "Number.of.Records"                 
## [45] "RatioCommentToCode"                
## [46] "SumCyclomaticModified"             
## [47] "SumCyclomaticStrict"               
## [48] "SumCyclomatic"                     
## [49] "SumEssential"                      
## [50] "Table.Name"                        
## [1] "Project: hibernate-5.0"
## [1] 3450
##  [1] "fileSize"                          
##  [2] "Distinct.count.of.Author.Email.PRE"
##  [3] "Distinct.count.of.Commit.Hash.PRE" 
##  [4] "Distinct.count.of.Issue.Key.POST"  
##  [5] "Distinct.count.of.Issue.Key.PRE"   
##  [6] "Churn.PRE"                         
##  [7] "AvgCyclomaticModified"             
##  [8] "AvgCyclomaticStrict"               
##  [9] "AvgCyclomatic"                     
## [10] "AvgEssential"                      
## [11] "AvgLineBlank"                      
## [12] "AvgLineCode"                       
## [13] "AvgLineComment"                    
## [14] "AvgLine"                           
## [15] "CountDeclClassMethod"              
## [16] "CountDeclClassVariable"            
## [17] "CountDeclClass"                    
## [18] "CountDeclFunction"                 
## [19] "CountDeclInstanceMethod"           
## [20] "CountDeclInstanceVariable"         
## [21] "CountDeclMethodDefault"            
## [22] "CountDeclMethodPrivate"            
## [23] "CountDeclMethodProtected"          
## [24] "CountDeclMethodPublic"             
## [25] "CountDeclMethod"                   
## [26] "CountLineBlank"                    
## [27] "CountLineCodeDecl"                 
## [28] "CountLineCodeExe"                  
## [29] "CountLineCode"                     
## [30] "CountLineComment"                  
## [31] "CountLine"                         
## [32] "CountSemicolon"                    
## [33] "CountStmtDecl"                     
## [34] "CountStmtExe"                      
## [35] "CountStmt"                         
## [36] "File"                              
## [37] "Kind"                              
## [38] "MaxCyclomaticModified"             
## [39] "MaxCyclomaticStrict"               
## [40] "MaxCyclomatic"                     
## [41] "MaxEssential"                      
## [42] "MaxNesting"                        
## [43] "Name"                              
## [44] "Number.of.Records"                 
## [45] "RatioCommentToCode"                
## [46] "SumCyclomaticModified"             
## [47] "SumCyclomaticStrict"               
## [48] "SumCyclomatic"                     
## [49] "SumEssential"                      
## [50] "Table.Name"                        
## [1] "Project: umbraco-7.6"
## [1] 3083
##  [1] "fileSize"                          
##  [2] "Distinct.count.of.Author.Email.PRE"
##  [3] "Distinct.count.of.Commit.Hash.PRE" 
##  [4] "Distinct.count.of.Issue.Key.POST"  
##  [5] "Distinct.count.of.Issue.Key.PRE"   
##  [6] "Churn.PRE"                         
##  [7] "AvgCyclomaticModified"             
##  [8] "AvgCyclomaticStrict"               
##  [9] "AvgCyclomatic"                     
## [10] "AvgEssential"                      
## [11] "AvgLineBlank"                      
## [12] "AvgLineCode"                       
## [13] "AvgLineComment"                    
## [14] "AvgLine"                           
## [15] "CountDeclClassMethod"              
## [16] "CountDeclClassVariable"            
## [17] "CountDeclClass"                    
## [18] "CountDeclFunction"                 
## [19] "CountDeclInstanceMethod"           
## [20] "CountDeclInstanceVariable"         
## [21] "CountDeclMethodDefault"            
## [22] "CountDeclMethodPrivate"            
## [23] "CountDeclMethodProtected"          
## [24] "CountDeclMethodPublic"             
## [25] "CountDeclMethod"                   
## [26] "CountLineBlank"                    
## [27] "CountLineCodeDecl"                 
## [28] "CountLineCodeExe"                  
## [29] "CountLineCode"                     
## [30] "CountLineComment"                  
## [31] "CountLine"                         
## [32] "CountSemicolon"                    
## [33] "CountStmtDecl"                     
## [34] "CountStmtExe"                      
## [35] "CountStmt"                         
## [36] "File"                              
## [37] "Kind"                              
## [38] "MaxCyclomaticModified"             
## [39] "MaxCyclomaticStrict"               
## [40] "MaxCyclomatic"                     
## [41] "MaxEssential"                      
## [42] "MaxNesting"                        
## [43] "Name"                              
## [44] "Number.of.Records"                 
## [45] "RatioCommentToCode"                
## [46] "SumCyclomaticModified"             
## [47] "SumCyclomaticStrict"               
## [48] "SumCyclomatic"                     
## [49] "SumEssential"                      
## [50] "Table.Name"
# Redefinition for this model: identification columns and the dependent
# variable that must not be used as predictors.
dropToPredict <- c(
  "File.Path", "Project", "Language", "Table.Name", "Name", "Kind",
  "X..Bugs.Post", "File", "Distinct.count.of.Issue.Key.POST"
)

Correlation analysis (MC4), Redundancy Analysis (MC5), Budget based correlation analysis (MC6)

# Run the reduction steps named in the section heading -- correlation analysis
# (MC4), redundancy analysis (MC5) and budget-based correlation analysis
# (MC6) -- on the Model 0 data. The result is kept for the formula setup and
# model fitting steps that follow.
# NOTE(review): the helper is not defined in this section; it presumably reads
# the global `dropToPredict` and `CORR_THRESHOLD` -- confirm.
all_list_model_m0 = dataApplyReduction(all_list_omitted_m0)
## [1] "Project: hadoop-2.6"

## [1] "NumberOfMetricsInitial: 45"
## [1] "NumberOfMetricsKept: 15"
## [1] "Distinct.count.of.Issue.Key.PRE + Churn.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + MaxEssential"
## [1] "Project: hibernate-5.0"

## [1] "NumberOfMetricsInitial: 45"
## [1] "NumberOfMetricsKept: 16"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticModified + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + RatioCommentToCode"
## [1] "Project: umbraco-7.6"

## [1] "NumberOfMetricsInitial: 45"
## [1] "NumberOfMetricsKept: 8"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClass + CountDeclFunction + RatioCommentToCode"
## [1] "Project: hadoop-2.6"
## [1] "Redudant variables: CountDeclMethodPrivate,␣CountDeclClassMethod,␣CountDeclClass,␣CountDeclInstanceVariable,␣CountLineComment"
## [1] "Distinct.count.of.Issue.Key.POST + Distinct.count.of.Issue.Key.PRE + Churn.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassVariable + CountDeclMethodDefault + CountDeclMethodProtected + CountDeclMethodPublic + File + Kind + MaxEssential + Name + Table.Name"
## [1] "Project: hibernate-5.0"
## [1] "Redudant variables: "
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.POST + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticModified + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + File + Kind + Name + RatioCommentToCode + Table.Name"
## [1] "Project: umbraco-7.6"
## [1] "Redudant variables: "
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.POST + Distinct.count.of.Issue.Key.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClass + CountDeclFunction + File + Kind + Name + RatioCommentToCode + Table.Name"
## [1] "Project: hadoop-2.6"
## [1] "NumberOfMetricsInitial: 10 Budget: 244 Over Budget: FALSE NumberOfMetricsKept: 10 CorrelationCutoff: 0.7"
## [1] "Project: hibernate-5.0"
## [1] "NumberOfMetricsInitial: 16 Budget: 230 Over Budget: FALSE NumberOfMetricsKept: 16 CorrelationCutoff: 0.7"
## [1] "Project: umbraco-7.6"
## [1] "NumberOfMetricsInitial: 8 Budget: 205 Over Budget: FALSE NumberOfMetricsKept: 8 CorrelationCutoff: 0.7"

Setup formulas

# Build the formulas for the binary Model 0 fits from the reduced data. Per
# project, the call prints the retained predictors joined by " + ".
form_list_bin_m0 = dataSetupFormulasBinary(all_list_model_m0)
## [1] "Project: hadoop-2.6"
## [1] "Distinct.count.of.Issue.Key.PRE + Churn.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassVariable + CountDeclMethodDefault + CountDeclMethodProtected + CountDeclMethodPublic + MaxEssential"
## [1] "Project: hibernate-5.0"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticModified + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + RatioCommentToCode"
## [1] "Project: umbraco-7.6"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClass + CountDeclFunction + RatioCommentToCode"

Fit regression model (MC7)

# Fit the Model 0 (BASE only) logistic regression for every project (MC7) and
# keep the result. The printed summaries show one lrm() fit per project.
# NOTE(review): "ALL.BASE" looks like a label identifying this model family
# (all files, base metrics) -- confirm how the helper uses it.
models_1_BASE = modelFitLogistic(all_list_model_m0,form_list_bin_m0,"ALL.BASE")
## [1] "Project: hadoop-2.6"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3662    LR chi2     437.88    R2       0.306    C       0.843    
##   FALSE       3439    d.f.            10    g        1.103    Dxy     0.687    
##   TRUE         223    Pr(> chi2) <0.0001    gr       3.014    gamma   0.703    
##  max |deriv| 1e-13                          gp       0.076    tau-a   0.079    
##                                             Brier    0.045                     
##  
##                                  Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                       -4.3614 0.2832 -15.40 <0.0001 
##  Distinct.count.of.Issue.Key.PRE  1.9006 0.4227   4.50 <0.0001 
##  Churn.PRE                        0.7777 0.0906   8.58 <0.0001 
##  AvgEssential                    -0.9794 0.9775  -1.00 0.3164  
##  AvgLineBlank                     0.7203 0.6229   1.16 0.2476  
##  AvgLineComment                   0.7669 0.5625   1.36 0.1728  
##  CountDeclClassVariable           0.6099 0.2040   2.99 0.0028  
##  CountDeclMethodDefault           0.3829 0.1924   1.99 0.0466  
##  CountDeclMethodProtected        -0.0073 0.2461  -0.03 0.9763  
##  CountDeclMethodPublic           -0.0068 0.1907  -0.04 0.9717  
##  MaxEssential                     1.3270 0.4330   3.06 0.0022  
##  
## [1] "Project: hibernate-5.0"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3450    LR chi2     381.53    R2       0.216    C       0.788    
##   FALSE       3096    d.f.            16    g        1.301    Dxy     0.576    
##   TRUE         354    Pr(> chi2) <0.0001    gr       3.672    gamma   0.579    
##  max |deriv| 1e-13                          gp       0.105    tau-a   0.106    
##                                             Brier    0.079                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -3.7664 0.4069 -9.26  <0.0001 
##  Distinct.count.of.Author.Email.PRE  0.7984 0.4904  1.63  0.1035  
##  Distinct.count.of.Issue.Key.PRE     1.1755 0.2436  4.83  <0.0001 
##  AvgCyclomaticModified              -1.1593 0.7392 -1.57  0.1168  
##  AvgEssential                        0.0771 0.9183  0.08  0.9331  
##  AvgLineBlank                        1.1003 0.5101  2.16  0.0310  
##  AvgLineComment                      0.1181 0.5153  0.23  0.8187  
##  CountDeclClassMethod               -0.5231 0.2698 -1.94  0.0526  
##  CountDeclClassVariable             -0.5600 0.3033 -1.85  0.0649  
##  CountDeclClass                     -0.5608 0.4574 -1.23  0.2201  
##  CountDeclInstanceVariable          -0.5720 0.2058 -2.78  0.0054  
##  CountDeclMethodDefault              0.0603 0.2993  0.20  0.8402  
##  CountDeclMethodPrivate              0.8586 0.2708  3.17  0.0015  
##  CountDeclMethodProtected           -0.1600 0.2357 -0.68  0.4973  
##  CountDeclMethodPublic               1.2950 0.2343  5.53  <0.0001 
##  CountLineComment                    1.0226 0.3158  3.24  0.0012  
##  RatioCommentToCode                 -2.8372 0.5818 -4.88  <0.0001 
##  
## [1] "Project: umbraco-7.6"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3083    LR chi2      94.37    R2       0.136    C       0.750    
##   FALSE       2999    d.f.             8    g        1.005    Dxy     0.500    
##   TRUE          84    Pr(> chi2) <0.0001    gr       2.731    gamma   0.513    
##  max |deriv| 9e-07                          gp       0.028    tau-a   0.027    
##                                             Brier    0.025                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -5.4088 0.5125 -10.55 <0.0001 
##  Distinct.count.of.Author.Email.PRE  3.1202 0.6078   5.13 <0.0001 
##  Distinct.count.of.Issue.Key.PRE     0.3255 1.0879   0.30 0.7648  
##  AvgEssential                        0.8683 1.2120   0.72 0.4737  
##  AvgLineBlank                        1.5648 0.6844   2.29 0.0222  
##  AvgLineComment                     -1.0431 0.8267  -1.26 0.2070  
##  CountDeclClass                     -0.4660 1.0231  -0.46 0.6488  
##  CountDeclFunction                   1.3171 0.2756   4.78 <0.0001 
##  RatioCommentToCode                  1.5581 0.9372   1.66 0.0964  
## 

Model 1 - BASE + N Throws + N Catch

As we can see in the clusters, all the throws-block data were correlated with each other. Similarly, all catch-block data and try-block data were correlated with each other. In this situation, we can see that exception handling metrics are important, and we will dig further to understand them better.

Drop variables

# Model 1 (BASE + N Throws + N Catch): for every project, remove the columns
# named in `catch_names`, `try_names` and `throws_names`, except the
# identification columns and the two counts "X..Catch" and "X..Throws".
# The result is a list with one data frame per project.

# The keep/drop sets do not depend on the project, so they are computed once
# instead of on every iteration.
keepForID <- c("Project", "File.Path")
keepForCatch <- c("X..Catch", keepForID)
keepForTry <- c(keepForID)
keepForThrows <- c("X..Throws", keepForID)

catch_names_drop <- catch_names[!(catch_names %in% keepForCatch)]
try_names_drop <- try_names[!(try_names %in% keepForTry)]
throws_names_drop <- throws_names[!(throws_names %in% keepForThrows)]

all_list_omitted_m1 <- vector("list", length(projects))

# seq_along() is safe for an empty `projects`; 1:length() is not.
for (i in seq_along(projects)) {
  print(paste("Project:", projects[i]))
  temp_data <- all_list_omitted[[i]]$data

  # One filter over the union of the three drop sets is equivalent to the
  # three successive filters. drop = FALSE keeps a data frame even if a
  # single column survives.
  is_dropped <- names(temp_data) %in%
    c(catch_names_drop, try_names_drop, throws_names_drop)
  temp_data <- temp_data[, !is_dropped, drop = FALSE]

  print(names(temp_data))

  all_list_omitted_m1[[i]] <- temp_data
}
## [1] "Project: hadoop-2.6"
##  [1] "File.Path"                         
##  [2] "Project"                           
##  [3] "fileSize"                          
##  [4] "Distinct.count.of.Author.Email.PRE"
##  [5] "Distinct.count.of.Commit.Hash.PRE" 
##  [6] "Distinct.count.of.Issue.Key.POST"  
##  [7] "Distinct.count.of.Issue.Key.PRE"   
##  [8] "Churn.PRE"                         
##  [9] "AvgCyclomaticModified"             
## [10] "AvgCyclomaticStrict"               
## [11] "AvgCyclomatic"                     
## [12] "AvgEssential"                      
## [13] "AvgLineBlank"                      
## [14] "AvgLineCode"                       
## [15] "AvgLineComment"                    
## [16] "AvgLine"                           
## [17] "CountDeclClassMethod"              
## [18] "CountDeclClassVariable"            
## [19] "CountDeclClass"                    
## [20] "CountDeclFunction"                 
## [21] "CountDeclInstanceMethod"           
## [22] "CountDeclInstanceVariable"         
## [23] "CountDeclMethodDefault"            
## [24] "CountDeclMethodPrivate"            
## [25] "CountDeclMethodProtected"          
## [26] "CountDeclMethodPublic"             
## [27] "CountDeclMethod"                   
## [28] "CountLineBlank"                    
## [29] "CountLineCodeDecl"                 
## [30] "CountLineCodeExe"                  
## [31] "CountLineCode"                     
## [32] "CountLineComment"                  
## [33] "CountLine"                         
## [34] "CountSemicolon"                    
## [35] "CountStmtDecl"                     
## [36] "CountStmtExe"                      
## [37] "CountStmt"                         
## [38] "File"                              
## [39] "Kind"                              
## [40] "MaxCyclomaticModified"             
## [41] "MaxCyclomaticStrict"               
## [42] "MaxCyclomatic"                     
## [43] "MaxEssential"                      
## [44] "MaxNesting"                        
## [45] "Name"                              
## [46] "Number.of.Records"                 
## [47] "RatioCommentToCode"                
## [48] "SumCyclomaticModified"             
## [49] "SumCyclomaticStrict"               
## [50] "SumCyclomatic"                     
## [51] "SumEssential"                      
## [52] "Table.Name"                        
## [53] "X..Throws"                         
## [54] "X..Catch"                          
## [1] "Project: hibernate-5.0"
##  [1] "File.Path"                         
##  [2] "Project"                           
##  [3] "fileSize"                          
##  [4] "Distinct.count.of.Author.Email.PRE"
##  [5] "Distinct.count.of.Commit.Hash.PRE" 
##  [6] "Distinct.count.of.Issue.Key.POST"  
##  [7] "Distinct.count.of.Issue.Key.PRE"   
##  [8] "Churn.PRE"                         
##  [9] "AvgCyclomaticModified"             
## [10] "AvgCyclomaticStrict"               
## [11] "AvgCyclomatic"                     
## [12] "AvgEssential"                      
## [13] "AvgLineBlank"                      
## [14] "AvgLineCode"                       
## [15] "AvgLineComment"                    
## [16] "AvgLine"                           
## [17] "CountDeclClassMethod"              
## [18] "CountDeclClassVariable"            
## [19] "CountDeclClass"                    
## [20] "CountDeclFunction"                 
## [21] "CountDeclInstanceMethod"           
## [22] "CountDeclInstanceVariable"         
## [23] "CountDeclMethodDefault"            
## [24] "CountDeclMethodPrivate"            
## [25] "CountDeclMethodProtected"          
## [26] "CountDeclMethodPublic"             
## [27] "CountDeclMethod"                   
## [28] "CountLineBlank"                    
## [29] "CountLineCodeDecl"                 
## [30] "CountLineCodeExe"                  
## [31] "CountLineCode"                     
## [32] "CountLineComment"                  
## [33] "CountLine"                         
## [34] "CountSemicolon"                    
## [35] "CountStmtDecl"                     
## [36] "CountStmtExe"                      
## [37] "CountStmt"                         
## [38] "File"                              
## [39] "Kind"                              
## [40] "MaxCyclomaticModified"             
## [41] "MaxCyclomaticStrict"               
## [42] "MaxCyclomatic"                     
## [43] "MaxEssential"                      
## [44] "MaxNesting"                        
## [45] "Name"                              
## [46] "Number.of.Records"                 
## [47] "RatioCommentToCode"                
## [48] "SumCyclomaticModified"             
## [49] "SumCyclomaticStrict"               
## [50] "SumCyclomatic"                     
## [51] "SumEssential"                      
## [52] "Table.Name"                        
## [53] "X..Throws"                         
## [54] "X..Catch"                          
## [1] "Project: umbraco-7.6"
##  [1] "File.Path"                         
##  [2] "Project"                           
##  [3] "fileSize"                          
##  [4] "Distinct.count.of.Author.Email.PRE"
##  [5] "Distinct.count.of.Commit.Hash.PRE" 
##  [6] "Distinct.count.of.Issue.Key.POST"  
##  [7] "Distinct.count.of.Issue.Key.PRE"   
##  [8] "Churn.PRE"                         
##  [9] "AvgCyclomaticModified"             
## [10] "AvgCyclomaticStrict"               
## [11] "AvgCyclomatic"                     
## [12] "AvgEssential"                      
## [13] "AvgLineBlank"                      
## [14] "AvgLineCode"                       
## [15] "AvgLineComment"                    
## [16] "AvgLine"                           
## [17] "CountDeclClassMethod"              
## [18] "CountDeclClassVariable"            
## [19] "CountDeclClass"                    
## [20] "CountDeclFunction"                 
## [21] "CountDeclInstanceMethod"           
## [22] "CountDeclInstanceVariable"         
## [23] "CountDeclMethodDefault"            
## [24] "CountDeclMethodPrivate"            
## [25] "CountDeclMethodProtected"          
## [26] "CountDeclMethodPublic"             
## [27] "CountDeclMethod"                   
## [28] "CountLineBlank"                    
## [29] "CountLineCodeDecl"                 
## [30] "CountLineCodeExe"                  
## [31] "CountLineCode"                     
## [32] "CountLineComment"                  
## [33] "CountLine"                         
## [34] "CountSemicolon"                    
## [35] "CountStmtDecl"                     
## [36] "CountStmtExe"                      
## [37] "CountStmt"                         
## [38] "File"                              
## [39] "Kind"                              
## [40] "MaxCyclomaticModified"             
## [41] "MaxCyclomaticStrict"               
## [42] "MaxCyclomatic"                     
## [43] "MaxEssential"                      
## [44] "MaxNesting"                        
## [45] "Name"                              
## [46] "Number.of.Records"                 
## [47] "RatioCommentToCode"                
## [48] "SumCyclomaticModified"             
## [49] "SumCyclomaticStrict"               
## [50] "SumCyclomatic"                     
## [51] "SumEssential"                      
## [52] "Table.Name"                        
## [53] "X..Throws"                         
## [54] "X..Catch"
# Redefinition for this model: identification columns and the dependent
# variable that must not be used as predictors. "X..Catch" and "X..Throws"
# are not listed here, so they stay available as predictors.
dropToPredict <- c(
  "File.Path", "Project", "Language", "Table.Name", "Name", "Kind",
  "X..Bugs.Post", "File", "Distinct.count.of.Issue.Key.POST"
)

Correlation analysis (MC4), Redundancy Analysis (MC5), Budget based correlation analysis (MC6)

# Run the reduction steps named in the section heading -- correlation analysis
# (MC4), redundancy analysis (MC5) and budget-based correlation analysis
# (MC6) -- on the Model 1 data. The result is kept for the formula setup and
# model fitting steps that follow.
# NOTE(review): the helper is not defined in this section; it presumably reads
# the global `dropToPredict` and `CORR_THRESHOLD` -- confirm.
all_list_model_m1 = dataApplyReduction(all_list_omitted_m1)
## [1] "Project: hadoop-2.6"

## [1] "NumberOfMetricsInitial: 47"
## [1] "NumberOfMetricsKept: 18"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + MaxEssential + RatioCommentToCode + X..Throws + X..Catch"
## [1] "Project: hibernate-5.0"

## [1] "NumberOfMetricsInitial: 47"
## [1] "NumberOfMetricsKept: 18"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticModified + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + RatioCommentToCode + X..Throws + X..Catch"
## [1] "Project: umbraco-7.6"

## [1] "NumberOfMetricsInitial: 47"
## [1] "NumberOfMetricsKept: 9"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClass + CountDeclFunction + RatioCommentToCode + X..Catch"
## [1] "Project: hadoop-2.6"
## [1] "Redudant variables: CountDeclMethodPrivate,␣CountDeclClassMethod,␣X..Throws,␣CountDeclClass,␣CountDeclInstanceVariable,␣CountDeclMethodProtected,␣CountLineComment"
## [1] "File.Path + Project + Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.POST + Distinct.count.of.Issue.Key.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassVariable + CountDeclMethodDefault + CountDeclMethodPublic + File + Kind + MaxEssential + Name + RatioCommentToCode + Table.Name + X..Catch"
## [1] "Project: hibernate-5.0"
## [1] "Redudant variables: "
## [1] "File.Path + Project + Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.POST + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticModified + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + File + Kind + Name + RatioCommentToCode + Table.Name + X..Throws + X..Catch"
## [1] "Project: umbraco-7.6"
## [1] "Redudant variables: "
## [1] "File.Path + Project + Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.POST + Distinct.count.of.Issue.Key.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClass + CountDeclFunction + File + Kind + Name + RatioCommentToCode + Table.Name + X..Catch"
## [1] "Project: hadoop-2.6"
## [1] "NumberOfMetricsInitial: 11 Budget: 244 Over Budget: FALSE NumberOfMetricsKept: 11 CorrelationCutoff: 0.7"
## [1] "Project: hibernate-5.0"
## [1] "NumberOfMetricsInitial: 18 Budget: 230 Over Budget: FALSE NumberOfMetricsKept: 18 CorrelationCutoff: 0.7"
## [1] "Project: umbraco-7.6"
## [1] "NumberOfMetricsInitial: 9 Budget: 205 Over Budget: FALSE NumberOfMetricsKept: 9 CorrelationCutoff: 0.7"

Setup formulas

# Build the per-project model formulas for the binary outcome.
form_list_bin_m1 <- dataSetupFormulasBinary(all_list_model_m1)
## [1] "Project: hadoop-2.6"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassVariable + CountDeclMethodDefault + CountDeclMethodPublic + MaxEssential + RatioCommentToCode + X..Catch"
## [1] "Project: hibernate-5.0"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticModified + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + RatioCommentToCode + X..Throws + X..Catch"
## [1] "Project: umbraco-7.6"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClass + CountDeclFunction + RatioCommentToCode + X..Catch"

Fit regression model (MC7)

# Fit one logistic regression per project from the formulas above; the fits
# are labelled "ALL.BSEH".
models_1_BSEH <- modelFitLogistic(
  all_list_model_m1,
  form_list_bin_m1,
  "ALL.BSEH"
)
## [1] "Project: hadoop-2.6"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3662    LR chi2     482.25    R2       0.335    C       0.855    
##   FALSE       3439    d.f.            11    g        1.272    Dxy     0.711    
##   TRUE         223    Pr(> chi2) <0.0001    gr       3.566    gamma   0.718    
##  max |deriv| 8e-10                          gp       0.080    tau-a   0.081    
##                                             Brier    0.043                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -3.4030 0.4134 -8.23  <0.0001 
##  Distinct.count.of.Author.Email.PRE  3.8178 0.4021  9.50  <0.0001 
##  Distinct.count.of.Issue.Key.PRE     0.4338 0.4910  0.88  0.3769  
##  AvgEssential                       -1.4945 1.0469 -1.43  0.1534  
##  AvgLineBlank                        0.4635 0.6410  0.72  0.4696  
##  AvgLineComment                      0.3672 0.5883  0.62  0.5326  
##  CountDeclClassVariable              0.3420 0.2122  1.61  0.1070  
##  CountDeclMethodDefault              0.1828 0.1948  0.94  0.3480  
##  CountDeclMethodPublic              -0.3671 0.1982 -1.85  0.0640  
##  MaxEssential                        1.1103 0.4488  2.47  0.0134  
##  RatioCommentToCode                 -1.9394 0.6839 -2.84  0.0046  
##  X..Catch                            0.6860 0.2468  2.78  0.0054  
##  
## [1] "Project: hibernate-5.0"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3450    LR chi2     394.22    R2       0.223    C       0.791    
##   FALSE       3096    d.f.            18    g        1.299    Dxy     0.583    
##   TRUE         354    Pr(> chi2) <0.0001    gr       3.664    gamma   0.586    
##  max |deriv| 2e-13                          gp       0.106    tau-a   0.107    
##                                             Brier    0.079                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -3.6083 0.4140 -8.71  <0.0001 
##  Distinct.count.of.Author.Email.PRE  0.5765 0.4992  1.15  0.2482  
##  Distinct.count.of.Issue.Key.PRE     1.2353 0.2482  4.98  <0.0001 
##  AvgCyclomaticModified              -1.1501 0.7589 -1.52  0.1296  
##  AvgEssential                        0.2388 0.9173  0.26  0.7946  
##  AvgLineBlank                        1.0465 0.5128  2.04  0.0413  
##  AvgLineComment                      0.1323 0.5219  0.25  0.7999  
##  CountDeclClassMethod               -0.3915 0.2744 -1.43  0.1537  
##  CountDeclClassVariable             -0.5651 0.3079 -1.84  0.0665  
##  CountDeclClass                     -0.5269 0.4607 -1.14  0.2527  
##  CountDeclInstanceVariable          -0.5167 0.2092 -2.47  0.0135  
##  CountDeclMethodDefault             -0.0451 0.3042 -0.15  0.8821  
##  CountDeclMethodPrivate              0.8261 0.2780  2.97  0.0030  
##  CountDeclMethodProtected           -0.3077 0.2433 -1.26  0.2060  
##  CountDeclMethodPublic               1.1546 0.2401  4.81  <0.0001 
##  CountLineComment                    0.8879 0.3213  2.76  0.0057  
##  RatioCommentToCode                 -2.7248 0.5853 -4.66  <0.0001 
##  X..Throws                           0.7217 0.2017  3.58  0.0003  
##  X..Catch                           -0.1280 0.2801 -0.46  0.6476  
##  
## [1] "Project: umbraco-7.6"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3083    LR chi2     102.71    R2       0.148    C       0.758    
##   FALSE       2999    d.f.             9    g        0.951    Dxy     0.516    
##   TRUE          84    Pr(> chi2) <0.0001    gr       2.587    gamma   0.531    
##  max |deriv| 5e-07                          gp       0.028    tau-a   0.027    
##                                             Brier    0.024                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -5.3076 0.5079 -10.45 <0.0001 
##  Distinct.count.of.Author.Email.PRE  3.0602 0.6169   4.96 <0.0001 
##  Distinct.count.of.Issue.Key.PRE     0.2517 1.1103   0.23 0.8206  
##  AvgEssential                        1.2942 1.1674   1.11 0.2676  
##  AvgLineBlank                        1.3611 0.6901   1.97 0.0486  
##  AvgLineComment                     -1.4642 0.8769  -1.67 0.0950  
##  CountDeclClass                     -0.4619 1.0443  -0.44 0.6582  
##  CountDeclFunction                   0.9862 0.2991   3.30 0.0010  
##  RatioCommentToCode                  1.5917 0.9281   1.71 0.0864  
##  X..Catch                            1.5400 0.5152   2.99 0.0028  
## 

Model Analysis for ALL

In this section, we present the selected statistics for our analysis. As explained in our approach, they correspond to the steps MC7, MA1, MA2, MA3 and MA4.

Here we extract the selected statistics and add them as columns to an object that is exported to CSV in the Output section.

Fit regression model (MC7): summary stats

# Start from empty containers, then fill them with the summary statistics
# extracted from the fitted BASE and BSEH models.
model_things_1_BASE <- list()
model_things_1_BSEH <- list()
model_things_1_BASE <- modelStats(models_1_BASE)
model_things_1_BSEH <- modelStats(models_1_BSEH)

Model stability assessment (MA1)

# Model stability assessment (MA1): each call receives the statistics
# collected so far and its result replaces them.
model_things_1_BASE <- modelValidate(models_1_BASE, model_things_1_BASE)
model_things_1_BSEH <- modelValidate(models_1_BSEH, model_things_1_BSEH)

Model significant variables

# Significance of the BASE model predictors; prints a Wald statistics table
# per project and updates the collected statistics.
model_things_1_BASE <- modelSignificance(models_1_BASE, model_things_1_BASE)
## [1] "project:  hadoop-2.6 model:  ALL.BASE"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                          Chi-Square d.f. P     
##  Distinct.count.of.Issue.Key.PRE  20.22      1   <.0001
##  Churn.PRE                        73.62      1   <.0001
##  AvgEssential                      1.00      1   0.3164
##  AvgLineBlank                      1.34      1   0.2476
##  AvgLineComment                    1.86      1   0.1728
##  CountDeclClassVariable            8.94      1   0.0028
##  CountDeclMethodDefault            3.96      1   0.0466
##  CountDeclMethodProtected          0.00      1   0.9763
##  CountDeclMethodPublic             0.00      1   0.9717
##  MaxEssential                      9.39      1   0.0022
##  TOTAL                           365.46     10   <.0001
## [1] "project:  hibernate-5.0 model:  ALL.BASE"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE   2.65      1   0.1035
##  Distinct.count.of.Issue.Key.PRE     23.29      1   <.0001
##  AvgCyclomaticModified                2.46      1   0.1168
##  AvgEssential                         0.01      1   0.9331
##  AvgLineBlank                         4.65      1   0.0310
##  AvgLineComment                       0.05      1   0.8187
##  CountDeclClassMethod                 3.76      1   0.0526
##  CountDeclClassVariable               3.41      1   0.0649
##  CountDeclClass                       1.50      1   0.2201
##  CountDeclInstanceVariable            7.72      1   0.0054
##  CountDeclMethodDefault               0.04      1   0.8402
##  CountDeclMethodPrivate              10.05      1   0.0015
##  CountDeclMethodProtected             0.46      1   0.4973
##  CountDeclMethodPublic               30.55      1   <.0001
##  CountLineComment                    10.48      1   0.0012
##  RatioCommentToCode                  23.78      1   <.0001
##  TOTAL                              306.17     16   <.0001
## [1] "project:  umbraco-7.6 model:  ALL.BASE"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE 26.35      1    <.0001
##  Distinct.count.of.Issue.Key.PRE     0.09      1    0.7648
##  AvgEssential                        0.51      1    0.4737
##  AvgLineBlank                        5.23      1    0.0222
##  AvgLineComment                      1.59      1    0.2070
##  CountDeclClass                      0.21      1    0.6488
##  CountDeclFunction                  22.85      1    <.0001
##  RatioCommentToCode                  2.76      1    0.0964
##  TOTAL                              99.58      8    <.0001
# Significance of the BSEH model predictors; prints a Wald statistics table
# per project and updates the collected statistics.
model_things_1_BSEH <- modelSignificance(models_1_BSEH, model_things_1_BSEH)
## [1] "project:  hadoop-2.6 model:  ALL.BSEH"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE  90.17      1   <.0001
##  Distinct.count.of.Issue.Key.PRE      0.78      1   0.3769
##  AvgEssential                         2.04      1   0.1534
##  AvgLineBlank                         0.52      1   0.4696
##  AvgLineComment                       0.39      1   0.5326
##  CountDeclClassVariable               2.60      1   0.1070
##  CountDeclMethodDefault               0.88      1   0.3480
##  CountDeclMethodPublic                3.43      1   0.0640
##  MaxEssential                         6.12      1   0.0134
##  RatioCommentToCode                   8.04      1   0.0046
##  X..Catch                             7.72      1   0.0054
##  TOTAL                              377.62     11   <.0001
## [1] "project:  hibernate-5.0 model:  ALL.BSEH"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE   1.33      1   0.2482
##  Distinct.count.of.Issue.Key.PRE     24.77      1   <.0001
##  AvgCyclomaticModified                2.30      1   0.1296
##  AvgEssential                         0.07      1   0.7946
##  AvgLineBlank                         4.17      1   0.0413
##  AvgLineComment                       0.06      1   0.7999
##  CountDeclClassMethod                 2.03      1   0.1537
##  CountDeclClassVariable               3.37      1   0.0665
##  CountDeclClass                       1.31      1   0.2527
##  CountDeclInstanceVariable            6.10      1   0.0135
##  CountDeclMethodDefault               0.02      1   0.8821
##  CountDeclMethodPrivate               8.83      1   0.0030
##  CountDeclMethodProtected             1.60      1   0.2060
##  CountDeclMethodPublic               23.12      1   <.0001
##  CountLineComment                     7.64      1   0.0057
##  RatioCommentToCode                  21.68      1   <.0001
##  X..Throws                           12.81      1   0.0003
##  X..Catch                             0.21      1   0.6476
##  TOTAL                              312.19     18   <.0001
## [1] "project:  umbraco-7.6 model:  ALL.BSEH"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE  24.61     1    <.0001
##  Distinct.count.of.Issue.Key.PRE      0.05     1    0.8206
##  AvgEssential                         1.23     1    0.2676
##  AvgLineBlank                         3.89     1    0.0486
##  AvgLineComment                       2.79     1    0.0950
##  CountDeclClass                       0.20     1    0.6582
##  CountDeclFunction                   10.87     1    0.0010
##  RatioCommentToCode                   2.94     1    0.0864
##  X..Catch                             8.93     1    0.0028
##  TOTAL                              111.72     9    <.0001

Model simplification (MA2), Predictors’ explanatory power estimation (MA3), Predictors’ effect in the outcome measurement (MA4)

# Simplify the BASE models (MA2-MA4): prints the refitted model, its Wald
# statistics and the "Mean + 10%" effect figures for each project.
model_things_1_BASE <- modelSimplification(models_1_BASE, model_things_1_BASE)
## [1] "project:  hadoop-2.6 model:  ALL.BASE Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3662    LR chi2     435.76    R2       0.305    C       0.843    
##   FALSE       3439    d.f.             6    g        1.112    Dxy     0.686    
##   TRUE         223    Pr(> chi2) <0.0001    gr       3.039    gamma   0.704    
##  max |deriv| 7e-13                          gp       0.076    tau-a   0.078    
##                                             Brier    0.045                     
##  
##                                  Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                       -4.5565 0.1819 -25.04 <0.0001 
##  Distinct.count.of.Issue.Key.PRE  1.9348 0.4198   4.61 <0.0001 
##  Churn.PRE                        0.7776 0.0900   8.64 <0.0001 
##  AvgLineComment                   1.0720 0.4323   2.48 0.0131  
##  CountDeclClassVariable           0.6407 0.1821   3.52 0.0004  
##  CountDeclMethodDefault           0.3969 0.1810   2.19 0.0283  
##  MaxEssential                     1.0918 0.3207   3.40 0.0007  
##  
## [1] "project:  hadoop-2.6 model:  ALL.BASE Refit - summary"
## [1] "project:  hadoop-2.6 model:  ALL.BASE Refit - validate"
## [1] "project:  hadoop-2.6 model:  ALL.BASE Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                          Chi-Square d.f. P     
##  Distinct.count.of.Issue.Key.PRE  21.24     1    <.0001
##  Churn.PRE                        74.61     1    <.0001
##  AvgLineComment                    6.15     1    0.0131
##  CountDeclClassVariable           12.39     1    0.0004
##  CountDeclMethodDefault            4.81     1    0.0283
##  MaxEssential                     11.59     1    0.0007
##  TOTAL                           364.01     6    <.0001
## [1] "Distinct.count.of.Issue.Key.PRE"
## [1] "Churn.PRE"
## [1] "AvgLineComment"
## [1] "CountDeclClassVariable"
## [1] "CountDeclMethodDefault"
## [1] "MaxEssential"
##   Distinct.count.of.Issue.Key.PRE Churn.PRE AvgLineComment
## 1                       0.1815948  35.92873      0.1876024
##   CountDeclClassVariable CountDeclMethodDefault MaxEssential
## 1               3.207264                    2.5     2.105953
## [1] "Fixed at Mean: 0.123004383674709"
## [1] "Distinct.count.of.Issue.Key.PRE  Coef at Mean + 10%: 0.124393541892917"
## [1] "Churn.PRE  Coef at Mean + 10%: 0.126426941265937"
## [1] "AvgLineComment  Coef at Mean + 10%: 0.123793698883498"
## [1] "CountDeclClassVariable  Coef at Mean + 10%: 0.125226626518196"
## [1] "CountDeclMethodDefault  Coef at Mean + 10%: 0.124293128730026"
## [1] "MaxEssential  Coef at Mean + 10%: 0.126399646985173"
## [1] "project:  hibernate-5.0 model:  ALL.BASE Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3450    LR chi2     370.78    R2       0.211    C       0.784    
##   FALSE       3096    d.f.             8    g        1.320    Dxy     0.568    
##   TRUE         354    Pr(> chi2) <0.0001    gr       3.742    gamma   0.572    
##  max |deriv| 1e-13                          gp       0.103    tau-a   0.105    
##                                             Brier    0.080                     
##  
##                                  Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                       -4.0272 0.2605 -15.46 <0.0001 
##  Distinct.count.of.Issue.Key.PRE  1.3039 0.2232   5.84 <0.0001 
##  CountDeclClassMethod            -0.6381 0.2609  -2.45 0.0145  
##  CountDeclClassVariable          -0.6382 0.2939  -2.17 0.0299  
##  CountDeclInstanceVariable       -0.6315 0.2013  -3.14 0.0017  
##  CountDeclMethodPrivate           0.8407 0.2600   3.23 0.0012  
##  CountDeclMethodPublic            1.2029 0.1926   6.24 <0.0001 
##  CountLineComment                 1.0839 0.2559   4.24 <0.0001 
##  RatioCommentToCode              -2.7812 0.4801  -5.79 <0.0001 
##  
## [1] "project:  hibernate-5.0 model:  ALL.BASE Refit - summary"
## [1] "project:  hibernate-5.0 model:  ALL.BASE Refit - validate"
## [1] "project:  hibernate-5.0 model:  ALL.BASE Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                          Chi-Square d.f. P     
##  Distinct.count.of.Issue.Key.PRE  34.11     1    <.0001
##  CountDeclClassMethod              5.98     1    0.0145
##  CountDeclClassVariable            4.72     1    0.0299
##  CountDeclInstanceVariable         9.85     1    0.0017
##  CountDeclMethodPrivate           10.46     1    0.0012
##  CountDeclMethodPublic            38.99     1    <.0001
##  CountLineComment                 17.94     1    <.0001
##  RatioCommentToCode               33.56     1    <.0001
##  TOTAL                           294.73     8    <.0001
## [1] "Distinct.count.of.Issue.Key.PRE"
## [1] "CountDeclClassMethod"
## [1] "CountDeclClassVariable"
## [1] "CountDeclInstanceVariable"
## [1] "CountDeclMethodPrivate"
## [1] "CountDeclMethodPublic"
## [1] "CountLineComment"
## [1] "RatioCommentToCode"
##   Distinct.count.of.Issue.Key.PRE CountDeclClassMethod
## 1                       0.8608696            0.4921739
##   CountDeclClassVariable CountDeclInstanceVariable CountDeclMethodPrivate
## 1              0.5472464                  2.086667              0.8017391
##   CountDeclMethodPublic CountLineComment RatioCommentToCode
## 1              6.750435         30.19391           1.602136
## [1] "Fixed at Mean: 0.0781686690572031"
## [1] "Distinct.count.of.Issue.Key.PRE  Coef at Mean + 10%: 0.0800340031553238"
## [1] "CountDeclClassMethod  Coef at Mean + 10%: 0.0775231487255036"
## [1] "CountDeclClassVariable  Coef at Mean + 10%: 0.0774773009175703"
## [1] "CountDeclInstanceVariable  Coef at Mean + 10%: 0.0768856493217207"
## [1] "CountDeclMethodPrivate  Coef at Mean + 10%: 0.0793218101740674"
## [1] "CountDeclMethodPublic  Coef at Mean + 10%: 0.0813708002046483"
## [1] "CountLineComment  Coef at Mean + 10%: 0.0813606299522386"
## [1] "RatioCommentToCode  Coef at Mean + 10%: 0.0731241274231537"
## [1] "project:  umbraco-7.6 model:  ALL.BASE Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3083    LR chi2      90.19    R2       0.130    C       0.752    
##   FALSE       2999    d.f.             3    g        0.937    Dxy     0.504    
##   TRUE          84    Pr(> chi2) <0.0001    gr       2.552    gamma   0.526    
##  max |deriv| 4e-08                          gp       0.027    tau-a   0.027    
##                                             Brier    0.025                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -5.0906 0.2761 -18.44 <0.0001 
##  Distinct.count.of.Author.Email.PRE  3.2032 0.4964   6.45 <0.0001 
##  AvgLineBlank                        1.0669 0.4850   2.20 0.0278  
##  CountDeclFunction                   1.2719 0.2435   5.22 <0.0001 
##  
## [1] "project:  umbraco-7.6 model:  ALL.BASE Refit - summary"
## [1] "project:  umbraco-7.6 model:  ALL.BASE Refit - validate"
## [1] "project:  umbraco-7.6 model:  ALL.BASE Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE 41.64      1    <.0001
##  AvgLineBlank                        4.84      1    0.0278
##  CountDeclFunction                  27.29      1    <.0001
##  TOTAL                              97.44      3    <.0001
## [1] "Distinct.count.of.Author.Email.PRE"
## [1] "AvgLineBlank"
## [1] "CountDeclFunction"
##   Distinct.count.of.Author.Email.PRE AvgLineBlank CountDeclFunction
## 1                           0.203049    0.4469672          7.637366
## [1] "Fixed at Mean: 0.0301397678996167"
## [1] "Distinct.count.of.Author.Email.PRE  Coef at Mean + 10%: 0.0308278626078932"
## [1] "AvgLineBlank  Coef at Mean + 10%: 0.030554569960672"
## [1] "CountDeclFunction  Coef at Mean + 10%: 0.0315383464906605"
# Simplify the BSEH models (MA2-MA4): prints the refitted model, its Wald
# statistics and the "Mean + 10%" effect figures for each project.
model_things_1_BSEH <- modelSimplification(models_1_BSEH, model_things_1_BSEH)
## [1] "project:  hadoop-2.6 model:  ALL.BSEH Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3662    LR chi2     476.36    R2       0.331    C       0.854    
##   FALSE       3439    d.f.             6    g        1.296    Dxy     0.708    
##   TRUE         223    Pr(> chi2) <0.0001    gr       3.655    gamma   0.716    
##  max |deriv| 1e-09                          gp       0.080    tau-a   0.081    
##                                             Brier    0.043                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -3.8208 0.3011 -12.69 <0.0001 
##  Distinct.count.of.Author.Email.PRE  4.1195 0.3136  13.13 <0.0001 
##  CountDeclClassVariable              0.4366 0.2048   2.13 0.0331  
##  CountDeclMethodPublic              -0.3627 0.1804  -2.01 0.0444  
##  MaxEssential                        0.9128 0.3435   2.66 0.0079  
##  RatioCommentToCode                 -1.7704 0.6652  -2.66 0.0078  
##  X..Catch                            0.8016 0.2342   3.42 0.0006  
##  
## [1] "project:  hadoop-2.6 model:  ALL.BSEH Refit - summary"
## [1] "project:  hadoop-2.6 model:  ALL.BSEH Refit - validate"
## [1] "project:  hadoop-2.6 model:  ALL.BSEH Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE 172.53     1    <.0001
##  CountDeclClassVariable               4.54     1    0.0331
##  CountDeclMethodPublic                4.04     1    0.0444
##  MaxEssential                         7.06     1    0.0079
##  RatioCommentToCode                   7.08     1    0.0078
##  X..Catch                            11.71     1    0.0006
##  TOTAL                              375.72     6    <.0001
## [1] "Distinct.count.of.Author.Email.PRE"
## [1] "CountDeclClassVariable"
## [1] "CountDeclMethodPublic"
## [1] "MaxEssential"
## [1] "RatioCommentToCode"
## [1] "X..Catch"
##   Distinct.count.of.Author.Email.PRE CountDeclClassVariable
## 1                          0.5030038               3.207264
##   CountDeclMethodPublic MaxEssential RatioCommentToCode X..Catch
## 1              20.03359     2.105953           1.249798 1.610049
## [1] "Fixed at Mean: 0.0415111995740052"
## [1] "Distinct.count.of.Author.Email.PRE  Coef at Mean + 10%: 0.0439188000839555"
## [1] "CountDeclClassVariable  Coef at Mean + 10%: 0.0420689470587737"
## [1] "CountDeclMethodPublic  Coef at Mean + 10%: 0.0409447729221982"
## [1] "MaxEssential  Coef at Mean + 10%: 0.0425584030154636"
## [1] "RatioCommentToCode  Coef at Mean + 10%: 0.0398884338814381"
## [1] "X..Catch  Coef at Mean + 10%: 0.042348247764015"
## [1] "project:  hibernate-5.0 model:  ALL.BSEH Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3450    LR chi2     380.16    R2       0.216    C       0.787    
##   FALSE       3096    d.f.             8    g        1.288    Dxy     0.574    
##   TRUE         354    Pr(> chi2) <0.0001    gr       3.625    gamma   0.577    
##  max |deriv| 9e-14                          gp       0.104    tau-a   0.106    
##                                             Brier    0.079                     
##  
##                                  Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                       -3.7491 0.2627 -14.27 <0.0001 
##  Distinct.count.of.Issue.Key.PRE  1.2733 0.2245   5.67 <0.0001 
##  CountDeclClassVariable          -0.6819 0.2962  -2.30 0.0213  
##  CountDeclInstanceVariable       -0.5058 0.1964  -2.58 0.0100  
##  CountDeclMethodPrivate           0.6097 0.2483   2.46 0.0141  
##  CountDeclMethodPublic            1.0597 0.1952   5.43 <0.0001 
##  CountLineComment                 0.7713 0.2576   2.99 0.0028  
##  RatioCommentToCode              -2.4712 0.4795  -5.15 <0.0001 
##  X..Throws                        0.7479 0.1880   3.98 <0.0001 
##  
## [1] "project:  hibernate-5.0 model:  ALL.BSEH Refit - summary"
## [1] "project:  hibernate-5.0 model:  ALL.BSEH Refit - validate"
## [1] "project:  hibernate-5.0 model:  ALL.BSEH Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                          Chi-Square d.f. P     
##  Distinct.count.of.Issue.Key.PRE  32.17     1    <.0001
##  CountDeclClassVariable            5.30     1    0.0213
##  CountDeclInstanceVariable         6.63     1    0.0100
##  CountDeclMethodPrivate            6.03     1    0.0141
##  CountDeclMethodPublic            29.48     1    <.0001
##  CountLineComment                  8.96     1    0.0028
##  RatioCommentToCode               26.56     1    <.0001
##  X..Throws                        15.83     1    0.0001
##  TOTAL                           300.29     8    <.0001
## [1] "Distinct.count.of.Issue.Key.PRE"
## [1] "CountDeclClassVariable"
## [1] "CountDeclInstanceVariable"
## [1] "CountDeclMethodPrivate"
## [1] "CountDeclMethodPublic"
## [1] "CountLineComment"
## [1] "RatioCommentToCode"
## [1] "X..Throws"
##   Distinct.count.of.Issue.Key.PRE CountDeclClassVariable
## 1                       0.8608696              0.5472464
##   CountDeclInstanceVariable CountDeclMethodPrivate CountDeclMethodPublic
## 1                  2.086667              0.8017391              6.750435
##   CountLineComment RatioCommentToCode X..Throws
## 1         30.19391           1.602136 0.9147826
## [1] "Fixed at Mean: 0.0872898819679512"
## [1] "Distinct.count.of.Issue.Key.PRE  Coef at Mean + 10%: 0.0893029606500913"
## [1] "CountDeclClassVariable  Coef at Mean + 10%: 0.0864732499914876"
## [1] "CountDeclInstanceVariable  Coef at Mean + 10%: 0.0861519023412411"
## [1] "CountDeclMethodPrivate  Coef at Mean + 10%: 0.0882127342132414"
## [1] "CountDeclMethodPublic  Coef at Mean + 10%: 0.0904008426227624"
## [1] "CountLineComment  Coef at Mean + 10%: 0.0897872626790859"
## [1] "RatioCommentToCode  Coef at Mean + 10%: 0.0823145150572586"
## [1] "X..Throws  Coef at Mean + 10%: 0.0885050898270227"
## [1] "project:  umbraco-7.6 model:  ALL.BSEH Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3083    LR chi2      94.84    R2       0.137    C       0.750    
##   FALSE       2999    d.f.             3    g        0.790    Dxy     0.501    
##   TRUE          84    Pr(> chi2) <0.0001    gr       2.203    gamma   0.525    
##  max |deriv| 5e-09                          gp       0.026    tau-a   0.027    
##                                             Brier    0.025                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -4.7810 0.2516 -19.01 <0.0001 
##  Distinct.count.of.Author.Email.PRE  3.2044 0.5009   6.40 <0.0001 
##  CountDeclFunction                   0.9304 0.2542   3.66 0.0003  
##  X..Catch                            1.5387 0.4918   3.13 0.0018  
##  
## [1] "project:  umbraco-7.6 model:  ALL.BSEH Refit - summary"
## [1] "project:  umbraco-7.6 model:  ALL.BSEH Refit - validate"
## [1] "project:  umbraco-7.6 model:  ALL.BSEH Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE  40.93     1    <.0001
##  CountDeclFunction                   13.39     1    0.0003
##  X..Catch                             9.79     1    0.0018
##  TOTAL                              108.60     3    <.0001
## [1] "Distinct.count.of.Author.Email.PRE"
## [1] "CountDeclFunction"
## [1] "X..Catch"
##   Distinct.count.of.Author.Email.PRE CountDeclFunction X..Catch
## 1                           0.203049          7.637366 0.170613
## [1] "Fixed at Mean: 0.0279969681007204"
## [1] "Distinct.count.of.Author.Email.PRE  Coef at Mean + 10%: 0.0286378345266876"
## [1] "CountDeclFunction  Coef at Mean + 10%: 0.0289438662470886"
## [1] "X..Catch  Coef at Mean + 10%: 0.028261306759101"

Output 1

Here we take the selected statistics from the results of the R functions and write them to CSV files in the “output” folder.

# Create (or overwrite) the CSV with a header row only: the zero-row slice of
# the first element keeps the column names but contributes no data rows.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
write.table(
  data.frame(model_things_1_BASE[[1]])[0, ],
  "output/base_test_1.csv",
  append = FALSE, sep = ",", row.names = FALSE, col.names = TRUE
)

# Append the rows of every element of model_things_1_BASE below that header,
# without repeating the column names. lapply() is kept so the returned list of
# NULLs is still printed.
lapply(model_things_1_BASE, function(x) {
  write.table(
    data.frame(x),
    "output/base_test_1.csv",
    append = TRUE, sep = ",", row.names = FALSE, col.names = FALSE
  )
})
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
# Append the rows of every element of model_things_1_BSEH to the CSV, without
# column names. TRUE/FALSE are spelled out because T and F are reassignable.
# NOTE(review): the BSEH rows go into 'output/base_test_1.csv', the same file
# that receives the BASE rows -- confirm a combined file is intended.
lapply(model_things_1_BSEH, function(x) {
  write.table(
    data.frame(x),
    "output/base_test_1.csv",
    append = TRUE, sep = ",", row.names = FALSE, col.names = FALSE
  )
})
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL

2- Presented results: Only Files with catch blocks

Filtered files: dive in on Catch Blocks

We now treat files without catch blocks as missing data and re-run the analysis.

# Restrict every project to the files that contain at least one catch block.
# Rows with X..Catch == 0 are marked NA and dropped by na.omit(), then the
# columns named in `throws_names` are removed. The result, `all_list_omitted`,
# holds one list(project = , data = ) entry per project.
all_list_omitted_2 <- all_no_missing
# Preallocate the result instead of growing it with c() on every iteration.
all_list_omitted <- vector("list", length(projects))
# seq_along() avoids iterating over c(1, 0) when `projects` is empty.
for (i in seq_along(projects)) {
  print(paste("Project:", projects[i]))
  temp_data <- as.data.frame(all_list_omitted_2[i])
  # Keep the unfiltered data so it can be dumped if the filter removes all rows.
  temp_data_bkp <- temp_data
  print(paste("nrow:", nrow(temp_data), "ncol:", ncol(temp_data)))

  # Make 0 catch blocks become NA and remove NA's.
  # na.omit() drops a row that has an NA in ANY column, not only X..Catch.
  No_Catch <- temp_data$X..Catch == 0
  temp_data$X..Catch[No_Catch] <- NA
  temp_data <- na.omit(temp_data)
  print(paste("nrow:", nrow(temp_data), "ncol:", ncol(temp_data)))

  # Nothing left for this project: write the unfiltered data for inspection.
  if (nrow(temp_data) == 0) {
    write.csv(temp_data_bkp, file = "temp_data.csv")
  }

  # Remove throws columns. drop = FALSE keeps a data frame even if only one
  # column remains.
  temp_data <- temp_data[, !(names(temp_data) %in% throws_names), drop = FALSE]
  print(paste("nrow:", nrow(temp_data), "ncol:", ncol(temp_data)))

  all_list_omitted[[i]] <- list(project = projects[i], data = temp_data)
}
## [1] "Project: hadoop-2.6"
## [1] "nrow: 3662 ncol: 159"
## [1] "nrow: 890 ncol: 159"
## [1] "nrow: 890 ncol: 152"
## [1] "Project: hibernate-5.0"
## [1] "nrow: 3450 ncol: 159"
## [1] "nrow: 440 ncol: 159"
## [1] "nrow: 440 ncol: 152"
## [1] "Project: umbraco-7.6"
## [1] "nrow: 3083 ncol: 159"
## [1] "nrow: 230 ncol: 159"
## [1] "nrow: 230 ncol: 152"

Predictor budget estimation (MC2) and normality adjustment (MC3)

# Run the MC2/MC3 step on the catch-only data. Per project, the printed output
# reports a D.F. budget followed by skewness and kurtosis values.
# NOTE(review): the helper is defined outside this chunk (presumably in one of
# the sourced *_functions.R files); the return value is not assigned here, so
# any state it hands to later steps is not visible from this chunk - confirm.
modelSelectionAndNormalityAdjustment(all_list_omitted)
## [1] "Project: hadoop-2.6 D.F. Budget: 59"
## [1] "Project: hibernate-5.0 D.F. Budget: 29"
## [1] "Project: umbraco-7.6 D.F. Budget: 15"
## [1] "Project: hadoop-2.6 skewness, 6.74527749465777"
## [1] "Project: hadoop-2.6 kurtosis, 71.519920764886"
## [1] "Project: hibernate-5.0 skewness, 3.71910197274668"
## [1] "Project: hibernate-5.0 kurtosis, 17.4752211740334"
## [1] "Project: umbraco-7.6 skewness, 4.0532455757886"
## [1] "Project: umbraco-7.6 kurtosis, 20.7020533105116"

Model 0 - BASE Only

Drop variables

# Model 0 uses BASE metrics only, so every catch, try and throws metric is
# removed except the identification columns. These drop lists do not depend on
# the project, so they are computed once instead of on every iteration.
keepForID <- c("Project", "File.Path")
keepForCatch <- c(keepForID)
keepForTry <- c(keepForID)
keepForThrows <- c(keepForID)

catch_names_drop <- catch_names[!(catch_names %in% keepForCatch)]
try_names_drop <- try_names[!(try_names %in% keepForTry)]
throws_names_drop <- throws_names[!(throws_names %in% keepForThrows)]

# One BASE-only data frame per project, preallocated rather than grown with c().
all_list_omitted_m0 <- vector("list", length(projects))

# seq_along() is safe for an empty `projects`; 1:length(projects) is not.
for (i in seq_along(projects)) {
  print(paste("Project:", projects[i]))
  temp_data <- as.data.frame(all_list_omitted[[i]]$data)

  # drop = FALSE keeps a data frame even if a single column remains.
  temp_data <- temp_data[, !(names(temp_data) %in% catch_names_drop), drop = FALSE]
  temp_data <- temp_data[, !(names(temp_data) %in% try_names_drop), drop = FALSE]
  temp_data <- temp_data[, !(names(temp_data) %in% throws_names_drop), drop = FALSE]

  print(names(temp_data))

  all_list_omitted_m0[[i]] <- temp_data
}
## [1] "Project: hadoop-2.6"
##  [1] "fileSize"                          
##  [2] "Distinct.count.of.Author.Email.PRE"
##  [3] "Distinct.count.of.Commit.Hash.PRE" 
##  [4] "Distinct.count.of.Issue.Key.POST"  
##  [5] "Distinct.count.of.Issue.Key.PRE"   
##  [6] "Churn.PRE"                         
##  [7] "AvgCyclomaticModified"             
##  [8] "AvgCyclomaticStrict"               
##  [9] "AvgCyclomatic"                     
## [10] "AvgEssential"                      
## [11] "AvgLineBlank"                      
## [12] "AvgLineCode"                       
## [13] "AvgLineComment"                    
## [14] "AvgLine"                           
## [15] "CountDeclClassMethod"              
## [16] "CountDeclClassVariable"            
## [17] "CountDeclClass"                    
## [18] "CountDeclFunction"                 
## [19] "CountDeclInstanceMethod"           
## [20] "CountDeclInstanceVariable"         
## [21] "CountDeclMethodDefault"            
## [22] "CountDeclMethodPrivate"            
## [23] "CountDeclMethodProtected"          
## [24] "CountDeclMethodPublic"             
## [25] "CountDeclMethod"                   
## [26] "CountLineBlank"                    
## [27] "CountLineCodeDecl"                 
## [28] "CountLineCodeExe"                  
## [29] "CountLineCode"                     
## [30] "CountLineComment"                  
## [31] "CountLine"                         
## [32] "CountSemicolon"                    
## [33] "CountStmtDecl"                     
## [34] "CountStmtExe"                      
## [35] "CountStmt"                         
## [36] "File"                              
## [37] "Kind"                              
## [38] "MaxCyclomaticModified"             
## [39] "MaxCyclomaticStrict"               
## [40] "MaxCyclomatic"                     
## [41] "MaxEssential"                      
## [42] "MaxNesting"                        
## [43] "Name"                              
## [44] "Number.of.Records"                 
## [45] "RatioCommentToCode"                
## [46] "SumCyclomaticModified"             
## [47] "SumCyclomaticStrict"               
## [48] "SumCyclomatic"                     
## [49] "SumEssential"                      
## [50] "Table.Name"                        
## [1] "Project: hibernate-5.0"
##  [1] "fileSize"                          
##  [2] "Distinct.count.of.Author.Email.PRE"
##  [3] "Distinct.count.of.Commit.Hash.PRE" 
##  [4] "Distinct.count.of.Issue.Key.POST"  
##  [5] "Distinct.count.of.Issue.Key.PRE"   
##  [6] "Churn.PRE"                         
##  [7] "AvgCyclomaticModified"             
##  [8] "AvgCyclomaticStrict"               
##  [9] "AvgCyclomatic"                     
## [10] "AvgEssential"                      
## [11] "AvgLineBlank"                      
## [12] "AvgLineCode"                       
## [13] "AvgLineComment"                    
## [14] "AvgLine"                           
## [15] "CountDeclClassMethod"              
## [16] "CountDeclClassVariable"            
## [17] "CountDeclClass"                    
## [18] "CountDeclFunction"                 
## [19] "CountDeclInstanceMethod"           
## [20] "CountDeclInstanceVariable"         
## [21] "CountDeclMethodDefault"            
## [22] "CountDeclMethodPrivate"            
## [23] "CountDeclMethodProtected"          
## [24] "CountDeclMethodPublic"             
## [25] "CountDeclMethod"                   
## [26] "CountLineBlank"                    
## [27] "CountLineCodeDecl"                 
## [28] "CountLineCodeExe"                  
## [29] "CountLineCode"                     
## [30] "CountLineComment"                  
## [31] "CountLine"                         
## [32] "CountSemicolon"                    
## [33] "CountStmtDecl"                     
## [34] "CountStmtExe"                      
## [35] "CountStmt"                         
## [36] "File"                              
## [37] "Kind"                              
## [38] "MaxCyclomaticModified"             
## [39] "MaxCyclomaticStrict"               
## [40] "MaxCyclomatic"                     
## [41] "MaxEssential"                      
## [42] "MaxNesting"                        
## [43] "Name"                              
## [44] "Number.of.Records"                 
## [45] "RatioCommentToCode"                
## [46] "SumCyclomaticModified"             
## [47] "SumCyclomaticStrict"               
## [48] "SumCyclomatic"                     
## [49] "SumEssential"                      
## [50] "Table.Name"                        
## [1] "Project: umbraco-7.6"
##  [1] "fileSize"                          
##  [2] "Distinct.count.of.Author.Email.PRE"
##  [3] "Distinct.count.of.Commit.Hash.PRE" 
##  [4] "Distinct.count.of.Issue.Key.POST"  
##  [5] "Distinct.count.of.Issue.Key.PRE"   
##  [6] "Churn.PRE"                         
##  [7] "AvgCyclomaticModified"             
##  [8] "AvgCyclomaticStrict"               
##  [9] "AvgCyclomatic"                     
## [10] "AvgEssential"                      
## [11] "AvgLineBlank"                      
## [12] "AvgLineCode"                       
## [13] "AvgLineComment"                    
## [14] "AvgLine"                           
## [15] "CountDeclClassMethod"              
## [16] "CountDeclClassVariable"            
## [17] "CountDeclClass"                    
## [18] "CountDeclFunction"                 
## [19] "CountDeclInstanceMethod"           
## [20] "CountDeclInstanceVariable"         
## [21] "CountDeclMethodDefault"            
## [22] "CountDeclMethodPrivate"            
## [23] "CountDeclMethodProtected"          
## [24] "CountDeclMethodPublic"             
## [25] "CountDeclMethod"                   
## [26] "CountLineBlank"                    
## [27] "CountLineCodeDecl"                 
## [28] "CountLineCodeExe"                  
## [29] "CountLineCode"                     
## [30] "CountLineComment"                  
## [31] "CountLine"                         
## [32] "CountSemicolon"                    
## [33] "CountStmtDecl"                     
## [34] "CountStmtExe"                      
## [35] "CountStmt"                         
## [36] "File"                              
## [37] "Kind"                              
## [38] "MaxCyclomaticModified"             
## [39] "MaxCyclomaticStrict"               
## [40] "MaxCyclomatic"                     
## [41] "MaxEssential"                      
## [42] "MaxNesting"                        
## [43] "Name"                              
## [44] "Number.of.Records"                 
## [45] "RatioCommentToCode"                
## [46] "SumCyclomaticModified"             
## [47] "SumCyclomaticStrict"               
## [48] "SumCyclomatic"                     
## [49] "SumEssential"                      
## [50] "Table.Name"

Correlation analysis (MC4), Redundancy Analysis (MC5), Budget based correlation analysis (MC6)

# Apply the MC4-MC6 reduction to the BASE-only metric sets. Per project, the
# printed output reports the metrics kept after the correlation step, the
# variables flagged as redundant, and a budget check with the correlation cutoff.
# NOTE(review): dataApplyReduction() is defined outside this chunk; the shape of
# the returned object is not visible here.
all_list_model_m0 = dataApplyReduction(all_list_omitted_m0)
## [1] "Project: hadoop-2.6"

## [1] "NumberOfMetricsInitial: 45"
## [1] "NumberOfMetricsKept: 17"
## [1] "Distinct.count.of.Commit.Hash.PRE + AvgCyclomaticStrict + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + MaxEssential + MaxNesting + RatioCommentToCode"
## [1] "Project: hibernate-5.0"

## [1] "NumberOfMetricsInitial: 45"
## [1] "NumberOfMetricsKept: 18"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticModified + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + MaxEssential + MaxNesting + RatioCommentToCode"
## [1] "Project: umbraco-7.6"

## [1] "NumberOfMetricsInitial: 45"
## [1] "NumberOfMetricsKept: 12"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticStrict + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClass + CountLineBlank + MaxCyclomaticModified + MaxEssential + MaxNesting + RatioCommentToCode"
## [1] "Project: hadoop-2.6"
## [1] "Redudant variables: CountDeclMethodPublic,␣CountDeclClassMethod,␣CountDeclMethodPrivate,␣CountDeclInstanceVariable,␣CountDeclClass,␣CountDeclClassVariable,␣CountLineComment"
## [1] "Distinct.count.of.Commit.Hash.PRE + Distinct.count.of.Issue.Key.POST + AvgCyclomaticStrict + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclMethodDefault + CountDeclMethodProtected + File + Kind + MaxEssential + MaxNesting + Name + RatioCommentToCode + Table.Name"
## [1] "Project: hibernate-5.0"
## [1] "Redudant variables: "
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.POST + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticModified + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + File + Kind + MaxEssential + MaxNesting + Name + RatioCommentToCode + Table.Name"
## [1] "Project: umbraco-7.6"
## [1] "Redudant variables: "
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.POST + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticStrict + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClass + CountLineBlank + File + Kind + MaxCyclomaticModified + MaxEssential + MaxNesting + Name + RatioCommentToCode + Table.Name"
## [1] "Project: hadoop-2.6"
## [1] "NumberOfMetricsInitial: 10 Budget: 59 Over Budget: FALSE NumberOfMetricsKept: 10 CorrelationCutoff: 0.7"
## [1] "Project: hibernate-5.0"
## [1] "NumberOfMetricsInitial: 18 Budget: 29 Over Budget: FALSE NumberOfMetricsKept: 18 CorrelationCutoff: 0.7"
## [1] "Project: umbraco-7.6"
## [1] "NumberOfMetricsInitial: 12 Budget: 15 Over Budget: FALSE NumberOfMetricsKept: 12 CorrelationCutoff: 0.7"

Setup formulas

# Build the per-project model formulas from the reduced metric sets; the printed
# output lists the predictor terms selected for each project.
# NOTE(review): presumably the response is the binary "has post-release issues"
# outcome shown in the later anova tables - confirm in dataSetupFormulasBinary().
form_list_bin_m0 = dataSetupFormulasBinary(all_list_model_m0)
## [1] "Project: hadoop-2.6"
## [1] "Distinct.count.of.Commit.Hash.PRE + AvgCyclomaticStrict + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclMethodDefault + CountDeclMethodProtected + MaxEssential + MaxNesting + RatioCommentToCode"
## [1] "Project: hibernate-5.0"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticModified + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClassMethod + CountDeclClassVariable + CountDeclClass + CountDeclInstanceVariable + CountDeclMethodDefault + CountDeclMethodPrivate + CountDeclMethodProtected + CountDeclMethodPublic + CountLineComment + MaxEssential + MaxNesting + RatioCommentToCode"
## [1] "Project: umbraco-7.6"
## [1] "Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.PRE + AvgCyclomaticStrict + AvgEssential + AvgLineBlank + AvgLineComment + CountDeclClass + CountLineBlank + MaxCyclomaticModified + MaxEssential + MaxNesting + RatioCommentToCode"

Fit regression model (MC7)

# Fit one logistic regression per project (the printed call shows lrm() with
# x = T, y = T) and label each fit "CAT.BASE". Later chunks read the $name,
# $project and $fit elements of every entry of the result.
models_2_BASE = modelFitLogistic(all_list_model_m0,form_list_bin_m0,"CAT.BASE")
## [1] "Project: hadoop-2.6"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           890    LR chi2     201.97    R2       0.347    C       0.838    
##   FALSE        747    d.f.            10    g        1.472    Dxy     0.677    
##   TRUE         143    Pr(> chi2) <0.0001    gr       4.359    gamma   0.679    
##  max |deriv| 6e-09                          gp       0.179    tau-a   0.183    
##                                             Brier    0.100                     
##  
##                                    Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                         -3.8209 0.7348 -5.20  <0.0001 
##  Distinct.count.of.Commit.Hash.PRE  3.0387 0.3237  9.39  <0.0001 
##  AvgCyclomaticStrict                4.0722 1.2426  3.28  0.0010  
##  AvgEssential                      -4.2398 1.6071 -2.64  0.0083  
##  AvgLineBlank                      -1.1918 0.7955 -1.50  0.1341  
##  AvgLineComment                     0.2730 0.7050  0.39  0.6986  
##  CountDeclMethodDefault             0.2415 0.2406  1.00  0.3157  
##  CountDeclMethodProtected           0.0093 0.2697  0.03  0.9724  
##  MaxEssential                       0.8405 0.5782  1.45  0.1460  
##  MaxNesting                        -0.1061 1.0227 -0.10  0.9173  
##  RatioCommentToCode                -2.2086 1.7023 -1.30  0.1945  
##  
## [1] "Project: hibernate-5.0"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           440    LR chi2      81.26    R2       0.262    C       0.780    
##   FALSE        347    d.f.            18    g        1.250    Dxy     0.561    
##   TRUE          93    Pr(> chi2) <0.0001    gr       3.491    gamma   0.562    
##  max |deriv| 6e-12                          gp       0.187    tau-a   0.187    
##                                             Brier    0.133                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -3.5085 1.1036 -3.18  0.0015  
##  Distinct.count.of.Author.Email.PRE  1.8362 0.9499  1.93  0.0532  
##  Distinct.count.of.Issue.Key.PRE     0.9870 0.5853  1.69  0.0917  
##  AvgCyclomaticModified              -3.1864 1.7111 -1.86  0.0626  
##  AvgEssential                        1.1959 2.0271  0.59  0.5552  
##  AvgLineBlank                        0.8497 0.9166  0.93  0.3539  
##  AvgLineComment                      0.8245 0.9984  0.83  0.4089  
##  CountDeclClassMethod               -0.5618 0.4770 -1.18  0.2389  
##  CountDeclClassVariable             -0.2766 0.5120 -0.54  0.5890  
##  CountDeclClass                     -0.1171 0.8290 -0.14  0.8876  
##  CountDeclInstanceVariable          -0.4749 0.4822 -0.99  0.3246  
##  CountDeclMethodDefault              0.0178 0.7730  0.02  0.9816  
##  CountDeclMethodPrivate              0.4233 0.5320  0.80  0.4262  
##  CountDeclMethodProtected           -0.3484 0.3928 -0.89  0.3750  
##  CountDeclMethodPublic               0.5626 0.6941  0.81  0.4177  
##  CountLineComment                    1.3411 1.0419  1.29  0.1980  
##  MaxEssential                        0.2553 0.7863  0.32  0.7455  
##  MaxNesting                          0.6304 1.1964  0.53  0.5983  
##  RatioCommentToCode                 -7.1742 4.6800 -1.53  0.1253  
##  
## [1] "Project: umbraco-7.6"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                       Model Likelihood     Discrimination    Rank Discrim.    
##                          Ratio Test           Indexes           Indexes       
##  Obs           230    LR chi2     19.27    R2       0.172    C       0.770    
##   FALSE        208    d.f.           12    g        1.105    Dxy     0.539    
##   TRUE          22    Pr(> chi2) 0.0822    gr       3.020    gamma   0.544    
##  max |deriv| 1e-09                         gp       0.090    tau-a   0.094    
##                                            Brier    0.078                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -4.3057 1.8728 -2.30  0.0215  
##  Distinct.count.of.Author.Email.PRE  0.9873 1.4593  0.68  0.4987  
##  Distinct.count.of.Issue.Key.PRE     2.4547 2.3846  1.03  0.3033  
##  AvgCyclomaticStrict                -0.7456 2.5588 -0.29  0.7708  
##  AvgEssential                       -0.5002 3.9258 -0.13  0.8986  
##  AvgLineBlank                        0.8868 1.6796  0.53  0.5975  
##  AvgLineComment                     -0.6507 1.6507 -0.39  0.6934  
##  CountDeclClass                     -0.4990 1.8050 -0.28  0.7822  
##  CountLineBlank                      1.5155 0.9517  1.59  0.1113  
##  MaxCyclomaticModified              -2.5628 2.0197 -1.27  0.2045  
##  MaxEssential                        2.1070 1.4524  1.45  0.1469  
##  MaxNesting                          1.8571 2.0379  0.91  0.3622  
##  RatioCommentToCode                 -4.3581 4.7294 -0.92  0.3568  
## 

Model Analysis for BASE

In this section, we present the selected statistics for our analysis. As explained in our approach, these correspond to steps MC7, MA1, MA2, MA3 and MA4.

Here we extract the selected statistics and we add the data (columns) to an object that will be exported to CSV in the section Output.

Fit regression model (MC7): summary stats

# Collect the summary statistics (MC7) of every CAT.BASE fit. The earlier
# empty-list initialisation was a dead store: it was overwritten by this
# assignment before ever being read, so it has been removed.
model_things_2_BASE <- modelStats(models_2_BASE)

Model stability assessment (MA1)

# Model stability assessment (MA1): pass the fits together with the statistics
# collected so far and keep the returned, updated statistics object.
# NOTE(review): modelValidate() is defined outside this chunk and prints
# nothing here; which fields it adds cannot be seen from this chunk.
model_things_2_BASE = modelValidate(models_2_BASE, model_things_2_BASE)

Model significant variables

# Record which predictors are significant in each fit. For every project the
# call prints a Wald chi-square table (Chi-Square, d.f., P per factor and TOTAL).
# NOTE(review): presumably this fills the "signifcant_r" field read by the
# Model 2 construction below - confirm in modelSignificance().
model_things_2_BASE = modelSignificance(models_2_BASE, model_things_2_BASE)
## [1] "project:  hadoop-2.6 model:  CAT.BASE"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                            Chi-Square d.f. P     
##  Distinct.count.of.Commit.Hash.PRE  88.14      1   <.0001
##  AvgCyclomaticStrict                10.74      1   0.0010
##  AvgEssential                        6.96      1   0.0083
##  AvgLineBlank                        2.24      1   0.1341
##  AvgLineComment                      0.15      1   0.6986
##  CountDeclMethodDefault              1.01      1   0.3157
##  CountDeclMethodProtected            0.00      1   0.9724
##  MaxEssential                        2.11      1   0.1460
##  MaxNesting                          0.01      1   0.9173
##  RatioCommentToCode                  1.68      1   0.1945
##  TOTAL                             146.76     10   <.0001
## [1] "project:  hibernate-5.0 model:  CAT.BASE"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE  3.74       1   0.0532
##  Distinct.count.of.Issue.Key.PRE     2.84       1   0.0917
##  AvgCyclomaticModified               3.47       1   0.0626
##  AvgEssential                        0.35       1   0.5552
##  AvgLineBlank                        0.86       1   0.3539
##  AvgLineComment                      0.68       1   0.4089
##  CountDeclClassMethod                1.39       1   0.2389
##  CountDeclClassVariable              0.29       1   0.5890
##  CountDeclClass                      0.02       1   0.8876
##  CountDeclInstanceVariable           0.97       1   0.3246
##  CountDeclMethodDefault              0.00       1   0.9816
##  CountDeclMethodPrivate              0.63       1   0.4262
##  CountDeclMethodProtected            0.79       1   0.3750
##  CountDeclMethodPublic               0.66       1   0.4177
##  CountLineComment                    1.66       1   0.1980
##  MaxEssential                        0.11       1   0.7455
##  MaxNesting                          0.28       1   0.5983
##  RatioCommentToCode                  2.35       1   0.1253
##  TOTAL                              63.90      18   <.0001
## [1] "project:  umbraco-7.6 model:  CAT.BASE"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE  0.46       1   0.4987
##  Distinct.count.of.Issue.Key.PRE     1.06       1   0.3033
##  AvgCyclomaticStrict                 0.08       1   0.7708
##  AvgEssential                        0.02       1   0.8986
##  AvgLineBlank                        0.28       1   0.5975
##  AvgLineComment                      0.16       1   0.6934
##  CountDeclClass                      0.08       1   0.7822
##  CountLineBlank                      2.54       1   0.1113
##  MaxCyclomaticModified               1.61       1   0.2045
##  MaxEssential                        2.10       1   0.1469
##  MaxNesting                          0.83       1   0.3622
##  RatioCommentToCode                  0.85       1   0.3568
##  TOTAL                              17.11      12   0.1455

Model simplification (MA2), Predictors’ explanatory power estimation (MA3), Predictors’ effect in the outcome measurement (MA4)

# Steps MA2-MA4. Per project, the printed output shows a refitted lrm() model
# with fewer predictors ("CAT.BASE Refit"), its Wald table, the mean of each
# remaining predictor, and the values labelled "Fixed at Mean" and
# "Coef at Mean + 10%".
# NOTE(review): the selection rule used for the refit lives in
# modelSimplification(), defined outside this chunk.
model_things_2_BASE = modelSimplification(models_2_BASE, model_things_2_BASE)
## [1] "project:  hadoop-2.6 model:  CAT.BASE Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           890    LR chi2     196.89    R2       0.339    C       0.834    
##   FALSE        747    d.f.             4    g        1.412    Dxy     0.669    
##   TRUE         143    Pr(> chi2) <0.0001    gr       4.105    gamma   0.676    
##  max |deriv| 6e-10                          gp       0.176    tau-a   0.181    
##                                             Brier    0.101                     
##  
##                                    Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                         -3.8552 0.5049 -7.64  <0.0001 
##  Distinct.count.of.Commit.Hash.PRE  3.1135 0.3097 10.05  <0.0001 
##  AvgCyclomaticStrict                3.2955 0.9393  3.51  0.0005  
##  AvgEssential                      -4.5507 1.5179 -3.00  0.0027  
##  MaxEssential                       1.2191 0.4702  2.59  0.0095  
##  
## [1] "project:  hadoop-2.6 model:  CAT.BASE Refit - summary"
## [1] "project:  hadoop-2.6 model:  CAT.BASE Refit - validate"
## [1] "project:  hadoop-2.6 model:  CAT.BASE Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                            Chi-Square d.f. P     
##  Distinct.count.of.Commit.Hash.PRE 101.09     1    <.0001
##  AvgCyclomaticStrict                12.31     1    0.0005
##  AvgEssential                        8.99     1    0.0027
##  MaxEssential                        6.72     1    0.0095
##  TOTAL                             145.32     4    <.0001
## [1] "Distinct.count.of.Commit.Hash.PRE"
## [1] "AvgCyclomaticStrict"
## [1] "AvgEssential"
## [1] "MaxEssential"
##   Distinct.count.of.Commit.Hash.PRE AvgCyclomaticStrict AvgEssential
## 1                          1.695506            2.660674     1.296629
##   MaxEssential
## 1     3.969663
## [1] "Fixed at Mean: 0.189754400454327"
## [1] "Distinct.count.of.Commit.Hash.PRE  Coef at Mean + 10%: 0.202761812337136"
## [1] "AvgCyclomaticStrict  Coef at Mean + 10%: 0.205675857475548"
## [1] "AvgEssential  Coef at Mean + 10%: 0.173624906372303"
## [1] "MaxEssential  Coef at Mean + 10%: 0.196088895383609"
## [1] "project:  hibernate-5.0 model:  CAT.BASE Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           440    LR chi2      69.04    R2       0.226    C       0.763    
##   FALSE        347    d.f.             3    g        1.138    Dxy     0.526    
##   TRUE          93    Pr(> chi2) <0.0001    gr       3.122    gamma   0.528    
##  max |deriv| 3e-13                          gp       0.172    tau-a   0.176    
##                                             Brier    0.137                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -4.2571 0.5728 -7.43  <0.0001 
##  Distinct.count.of.Author.Email.PRE  2.1756 0.8454  2.57  0.0101  
##  CountLineComment                    1.8173 0.4474  4.06  <0.0001 
##  RatioCommentToCode                 -9.7887 2.5889 -3.78  0.0002  
##  
## [1] "project:  hibernate-5.0 model:  CAT.BASE Refit - summary"
## [1] "project:  hibernate-5.0 model:  CAT.BASE Refit - validate"
## [1] "project:  hibernate-5.0 model:  CAT.BASE Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE  6.62      1    0.0101
##  CountLineComment                   16.50      1    <.0001
##  RatioCommentToCode                 14.30      1    0.0002
##  TOTAL                              54.39      3    <.0001
## [1] "Distinct.count.of.Author.Email.PRE"
## [1] "CountLineComment"
## [1] "RatioCommentToCode"
##   Distinct.count.of.Author.Email.PRE CountLineComment RatioCommentToCode
## 1                           2.129545         46.95455            0.27575
## [1] "Fixed at Mean: 0.238682329326376"
## [1] "Distinct.count.of.Author.Email.PRE  Coef at Mean + 10%: 0.250167924092426"
## [1] "CountLineComment  Coef at Mean + 10%: 0.252335980398188"
## [1] "RatioCommentToCode  Coef at Mean + 10%: 0.222557134553471"
## [1] "project:  umbraco-7.6 model:  CAT.BASE Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                       Model Likelihood     Discrimination    Rank Discrim.    
##                          Ratio Test           Indexes           Indexes       
##  Obs           230    LR chi2     11.11    R2       0.101    C       0.696    
##   FALSE        208    d.f.            1    g        0.895    Dxy     0.391    
##   TRUE          22    Pr(> chi2) 0.0009    gr       2.448    gamma   0.396    
##  max |deriv| 5e-07                         gp       0.072    tau-a   0.068    
##                                            Brier    0.080                     
##  
##                 Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept      -5.2746 1.0511 -5.02  <0.0001 
##  CountLineBlank  1.8580 0.5873  3.16  0.0016  
##  
## [1] "project:  umbraco-7.6 model:  CAT.BASE Refit - summary"
## [1] "project:  umbraco-7.6 model:  CAT.BASE Refit - validate"
## [1] "project:  umbraco-7.6 model:  CAT.BASE Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor         Chi-Square d.f. P     
##  CountLineBlank 10.01      1    0.0016
##  TOTAL          10.01      1    0.0016
## [1] "CountLineBlank"
##   CountLineBlank
## 1       49.35652
## [1] "Fixed at Mean: 0.107922559017229"
## [1] "CountLineBlank  Coef at Mean + 10%: 0.115404374892222"

Model 2 - Extension: BASE + EH Catch

# NOTE(review): this copy is never read - the next code chunk re-initialises
# all_list_omitted_m2 as an empty list before using it, so this assignment looks
# redundant.
all_list_omitted_m2 = all_list_omitted

Drop variables - adjust metric sets

To build new models that extend the base model, we remove all metrics that were insignificant in the corresponding base model. This is reasonable because metric sets can be adjusted based on expertise: in the previous step we learned which base metrics are significant, so the other base metrics can be removed. To do this we loop through each previously built model. Each model can have a different set of metrics, so the extension of each model has to be constructed separately.

# Build, for every successfully fitted BASE model, the data set for its
# BASE + EH Catch extension: catch metrics are kept, try and throws metrics are
# removed, and the base metrics are reduced to those significant in the BASE model.
# Each entry is list(name, project, data, sig).
all_list_omitted_m2 <- vector("list", 0)

# seq_along() iterates zero times for an empty model list; 1:length() would not.
for (i in seq_along(models_2_BASE)) {
  name <- models_2_BASE[[i]]$name
  project <- models_2_BASE[[i]]$project
  fit <- models_2_BASE[[i]]$fit
  print(paste("Project:", project, "Name:", name))

  # A fit that failed is stored as a "try-error" object; skip it.
  # inherits() replaces the string comparison on a local variable named `class`.
  if (inherits(fit, "try-error")) {
    print(paste("!-ERROR-! - model construction had issues."))
    next
  }

  temp_data_index <- findProjectData(all_list_omitted, project)
  temp_data <- as.data.frame(all_list_omitted[[temp_data_index]]$data)

  # Keep catch metrics, remove try and throws, adjust base.
  keepForID <- c("Project", "File.Path")
  keepForBase <- c("Distinct.count.of.Issue.Key.POST", keepForID)
  keepForTry <- c(keepForID)
  keepForThrows <- c(keepForID)

  base_names_drop <- base_names[!(base_names %in% keepForBase)]
  try_names_drop <- try_names[!(try_names %in% keepForTry)]
  throws_names_drop <- throws_names[!(throws_names %in% keepForThrows)]

  # drop = FALSE keeps a data frame even if a single column remains.
  temp_data <- temp_data[, !(names(temp_data) %in% try_names_drop), drop = FALSE]
  temp_data <- temp_data[, !(names(temp_data) %in% throws_names_drop), drop = FALSE]

  # Look up the statistics recorded for the related base model.
  temp_sig_index <- findModel(model_things_2_BASE, name, project)

  # Only move forward with the models that are under budget.
  if (model_things_2_BASE[[temp_sig_index]][["over_budget"]]) {
    print(paste("!-ERROR-! - model over budget."))
    next
  }

  # The significant metrics of the base model are stored as one ", "-separated
  # string (NA when there are none); turn them into a character vector.
  temp_significant <- model_things_2_BASE[[temp_sig_index]][["signifcant_r"]]
  if (!is.na(temp_significant)) {
    temp_significant_list <- unlist(strsplit(temp_significant, ", "))
  } else {
    # Same value the original obtained from unlist(strsplit("", ", ")).
    temp_significant_list <- character(0)
  }

  # The insignificant metrics: all from the base model minus the significant ones.
  base_names_insignificant <- base_names_drop[!(base_names_drop %in% temp_significant_list)]
  # The clean list of metrics for modeling: all metrics minus the base insignificant ones.
  temp_data <- temp_data[, !(names(temp_data) %in% base_names_insignificant), drop = FALSE]
  print(names(temp_data))

  all_list_omitted_m2 <- c(
    all_list_omitted_m2,
    list(list(name = name, project = project, data = temp_data, sig = temp_significant_list))
  )
}
## [1] "Project: hadoop-2.6 Name: CAT.BASE"
##  [1] "Distinct.count.of.Commit.Hash.PRE"     
##  [2] "Distinct.count.of.Issue.Key.POST"      
##  [3] "AvgCyclomaticStrict"                   
##  [4] "AvgEssential"                          
##  [5] "MaxEssential"                          
##  [6] "X..Catch"                              
##  [7] "Avg.Catch.LOC"                         
##  [8] "Avg.Catch.SLOC"                        
##  [9] "Avg..AP.Catch.and.do.nothing"          
## [10] "Avg..AP.Catch.and.Return.null"         
## [11] "Avg..AP.Destructive.Wrapping"          
## [12] "Avg..AP.Dummy.Handler"                 
## [13] "Avg..AP.Generic.Catch"                 
## [14] "Avg..AP.Ignoring.Interrupted.Exception"
## [15] "Avg..AP.Incomplete.implementation"     
## [16] "Avg..AP.Log.and.Return.null"           
## [17] "Avg..AP.Log.and.Throw"                 
## [18] "Avg..AP.Multi.line.log.messages"       
## [19] "Avg..AP.Nested.try.block"              
## [20] "Avg..AP.Overcatch.and.Abort"           
## [21] "Avg..AP.Overcatch"                     
## [22] "Avg..AP.Relying.on.getCause.."         
## [23] "Avg..AP.Throw.within.finally"          
## [24] "Avg..AP.Unhandled.exceptions"          
## [25] "Avg..AP.Unreachable.Catch.Handler"     
## [26] "Avg..Potentially.Recoverable"          
## [27] "Avg..Potentially.UnRecoverable"        
## [28] "Avg..Recoverability.Not.Relevant"      
## [29] "AP.Catch.and.do.nothing"               
## [30] "AP.Catch.and.Return.null"              
## [31] "Sum.of.AP.Destructive.Wrapping"        
## [32] "Sum.of.AP.Dummy.Handler"               
## [33] "Sum.of.AP.Generic.Catch"               
## [34] "AP.Ignoring.Interrupted.Exception"     
## [35] "AP.Incomplete.implementation"          
## [36] "Sum.of.AP.Log.and.Return.null"         
## [37] "AP.Log.and.Throw"                      
## [38] "AP.Multi.line.log.messages"            
## [39] "AP.Nested.try.block"                   
## [40] "Sum.of.AP.Overcatch.and.Abort"         
## [41] "Sum.of.AP.Overcatch"                   
## [42] "AP.Relying.on.getCause.."              
## [43] "AP.Throw.within.finally"               
## [44] "Sum.of.AP.Unhandled.exceptions"        
## [45] "Sum.of.AP.Unreachable.Catch.Handler"   
## [46] "Catch.LOC"                             
## [47] "Catch.SLOC"                            
## [48] "Potentially.Recoverable"               
## [49] "Potentially.UnRecoverable"             
## [50] "Recoverability.Not.Relevant"           
## [1] "Project: hibernate-5.0 Name: CAT.BASE"
##  [1] "Distinct.count.of.Author.Email.PRE"    
##  [2] "Distinct.count.of.Issue.Key.POST"      
##  [3] "CountLineComment"                      
##  [4] "RatioCommentToCode"                    
##  [5] "X..Catch"                              
##  [6] "Avg.Catch.LOC"                         
##  [7] "Avg.Catch.SLOC"                        
##  [8] "Avg..AP.Catch.and.do.nothing"          
##  [9] "Avg..AP.Catch.and.Return.null"         
## [10] "Avg..AP.Destructive.Wrapping"          
## [11] "Avg..AP.Dummy.Handler"                 
## [12] "Avg..AP.Generic.Catch"                 
## [13] "Avg..AP.Ignoring.Interrupted.Exception"
## [14] "Avg..AP.Incomplete.implementation"     
## [15] "Avg..AP.Log.and.Return.null"           
## [16] "Avg..AP.Log.and.Throw"                 
## [17] "Avg..AP.Multi.line.log.messages"       
## [18] "Avg..AP.Nested.try.block"              
## [19] "Avg..AP.Overcatch.and.Abort"           
## [20] "Avg..AP.Overcatch"                     
## [21] "Avg..AP.Relying.on.getCause.."         
## [22] "Avg..AP.Throw.within.finally"          
## [23] "Avg..AP.Unhandled.exceptions"          
## [24] "Avg..AP.Unreachable.Catch.Handler"     
## [25] "Avg..Potentially.Recoverable"          
## [26] "Avg..Potentially.UnRecoverable"        
## [27] "Avg..Recoverability.Not.Relevant"      
## [28] "AP.Catch.and.do.nothing"               
## [29] "AP.Catch.and.Return.null"              
## [30] "Sum.of.AP.Destructive.Wrapping"        
## [31] "Sum.of.AP.Dummy.Handler"               
## [32] "Sum.of.AP.Generic.Catch"               
## [33] "AP.Ignoring.Interrupted.Exception"     
## [34] "AP.Incomplete.implementation"          
## [35] "Sum.of.AP.Log.and.Return.null"         
## [36] "AP.Log.and.Throw"                      
## [37] "AP.Multi.line.log.messages"            
## [38] "AP.Nested.try.block"                   
## [39] "Sum.of.AP.Overcatch.and.Abort"         
## [40] "Sum.of.AP.Overcatch"                   
## [41] "AP.Relying.on.getCause.."              
## [42] "AP.Throw.within.finally"               
## [43] "Sum.of.AP.Unhandled.exceptions"        
## [44] "Sum.of.AP.Unreachable.Catch.Handler"   
## [45] "Catch.LOC"                             
## [46] "Catch.SLOC"                            
## [47] "Potentially.Recoverable"               
## [48] "Potentially.UnRecoverable"             
## [49] "Recoverability.Not.Relevant"           
## [1] "Project: umbraco-7.6 Name: CAT.BASE"
##  [1] "Distinct.count.of.Issue.Key.POST"      
##  [2] "CountLineBlank"                        
##  [3] "X..Catch"                              
##  [4] "Avg.Catch.LOC"                         
##  [5] "Avg.Catch.SLOC"                        
##  [6] "Avg..AP.Catch.and.do.nothing"          
##  [7] "Avg..AP.Catch.and.Return.null"         
##  [8] "Avg..AP.Destructive.Wrapping"          
##  [9] "Avg..AP.Dummy.Handler"                 
## [10] "Avg..AP.Generic.Catch"                 
## [11] "Avg..AP.Ignoring.Interrupted.Exception"
## [12] "Avg..AP.Incomplete.implementation"     
## [13] "Avg..AP.Log.and.Return.null"           
## [14] "Avg..AP.Log.and.Throw"                 
## [15] "Avg..AP.Multi.line.log.messages"       
## [16] "Avg..AP.Nested.try.block"              
## [17] "Avg..AP.Overcatch.and.Abort"           
## [18] "Avg..AP.Overcatch"                     
## [19] "Avg..AP.Relying.on.getCause.."         
## [20] "Avg..AP.Throw.within.finally"          
## [21] "Avg..AP.Unhandled.exceptions"          
## [22] "Avg..AP.Unreachable.Catch.Handler"     
## [23] "Avg..Potentially.Recoverable"          
## [24] "Avg..Potentially.UnRecoverable"        
## [25] "Avg..Recoverability.Not.Relevant"      
## [26] "AP.Catch.and.do.nothing"               
## [27] "AP.Catch.and.Return.null"              
## [28] "Sum.of.AP.Destructive.Wrapping"        
## [29] "Sum.of.AP.Dummy.Handler"               
## [30] "Sum.of.AP.Generic.Catch"               
## [31] "AP.Ignoring.Interrupted.Exception"     
## [32] "AP.Incomplete.implementation"          
## [33] "Sum.of.AP.Log.and.Return.null"         
## [34] "AP.Log.and.Throw"                      
## [35] "AP.Multi.line.log.messages"            
## [36] "AP.Nested.try.block"                   
## [37] "Sum.of.AP.Overcatch.and.Abort"         
## [38] "Sum.of.AP.Overcatch"                   
## [39] "AP.Relying.on.getCause.."              
## [40] "AP.Throw.within.finally"               
## [41] "Sum.of.AP.Unhandled.exceptions"        
## [42] "Sum.of.AP.Unreachable.Catch.Handler"   
## [43] "Catch.LOC"                             
## [44] "Catch.SLOC"                            
## [45] "Potentially.Recoverable"               
## [46] "Potentially.UnRecoverable"             
## [47] "Recoverability.Not.Relevant"
# Identification columns and the dependent variable, redefined for this model
# construction. Unlike the definition at the top of the file, "X..Catch" and
# "X..Throws" are not listed here.
dropToPredict <- c(
  "File.Path",
  "Project",
  "Language",
  "Table.Name",
  "Name",
  "Kind",
  "X..Bugs.Post",
  "File",
  "Distinct.count.of.Issue.Key.POST"
)

Correlation analysis (MC4), Redundancy Analysis (MC5), Budget based correlation analysis (MC6)

# Run the metric-reduction steps (MC4-MC6) on every per-model data set
# prepared above; "BSAP" is the label passed for this model family.
all_list_model_m2 <- dataApplyReductionByModel(
  all_list_omitted_m2,
  "BSAP"
)
## [1] "Project: hadoop-2.6 Name: CAT.BASE"

## [1] "NumberOfMetricsInitial: 49"
## [1] "NumberOfMetricsKept: 23"
## [1] "Distinct.count.of.Commit.Hash.PRE + AvgCyclomaticStrict + AvgEssential + MaxEssential + Avg.Catch.LOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Generic.Catch + Avg..AP.Ignoring.Interrupted.Exception + Avg..AP.Multi.line.log.messages + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + Avg..Recoverability.Not.Relevant + AP.Incomplete.implementation + AP.Log.and.Throw + Sum.of.AP.Overcatch.and.Abort + AP.Relying.on.getCause.. + Potentially.Recoverable"
##  [1] "Distinct.count.of.Commit.Hash.PRE"     
##  [2] "Distinct.count.of.Issue.Key.POST"      
##  [3] "AvgCyclomaticStrict"                   
##  [4] "AvgEssential"                          
##  [5] "MaxEssential"                          
##  [6] "Avg.Catch.LOC"                         
##  [7] "Avg..AP.Catch.and.do.nothing"          
##  [8] "Avg..AP.Catch.and.Return.null"         
##  [9] "Avg..AP.Destructive.Wrapping"          
## [10] "Avg..AP.Dummy.Handler"                 
## [11] "Avg..AP.Generic.Catch"                 
## [12] "Avg..AP.Ignoring.Interrupted.Exception"
## [13] "Avg..AP.Multi.line.log.messages"       
## [14] "Avg..AP.Nested.try.block"              
## [15] "Avg..AP.Throw.within.finally"          
## [16] "Avg..AP.Unhandled.exceptions"          
## [17] "Avg..AP.Unreachable.Catch.Handler"     
## [18] "Avg..Potentially.UnRecoverable"        
## [19] "Avg..Recoverability.Not.Relevant"      
## [20] "AP.Incomplete.implementation"          
## [21] "AP.Log.and.Throw"                      
## [22] "Sum.of.AP.Overcatch.and.Abort"         
## [23] "AP.Relying.on.getCause.."              
## [24] "Potentially.Recoverable"               
## [1] "Project: hibernate-5.0 Name: CAT.BASE"

## [1] "NumberOfMetricsInitial: 48"
## [1] "NumberOfMetricsKept: 19"
## [1] "Distinct.count.of.Author.Email.PRE + CountLineComment + RatioCommentToCode + Avg.Catch.LOC + Avg.Catch.SLOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Generic.Catch + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.Recoverable + Avg..Recoverability.Not.Relevant + Sum.of.AP.Destructive.Wrapping + Sum.of.AP.Dummy.Handler + Sum.of.AP.Log.and.Return.null + Catch.SLOC + Potentially.Recoverable"
##  [1] "Distinct.count.of.Author.Email.PRE"
##  [2] "Distinct.count.of.Issue.Key.POST"  
##  [3] "CountLineComment"                  
##  [4] "RatioCommentToCode"                
##  [5] "Avg.Catch.LOC"                     
##  [6] "Avg.Catch.SLOC"                    
##  [7] "Avg..AP.Catch.and.do.nothing"      
##  [8] "Avg..AP.Catch.and.Return.null"     
##  [9] "Avg..AP.Generic.Catch"             
## [10] "Avg..AP.Nested.try.block"          
## [11] "Avg..AP.Throw.within.finally"      
## [12] "Avg..AP.Unhandled.exceptions"      
## [13] "Avg..AP.Unreachable.Catch.Handler" 
## [14] "Avg..Potentially.Recoverable"      
## [15] "Avg..Recoverability.Not.Relevant"  
## [16] "Sum.of.AP.Destructive.Wrapping"    
## [17] "Sum.of.AP.Dummy.Handler"           
## [18] "Sum.of.AP.Log.and.Return.null"     
## [19] "Catch.SLOC"                        
## [20] "Potentially.Recoverable"           
## [1] "Project: umbraco-7.6 Name: CAT.BASE"

## [1] "NumberOfMetricsInitial: 46"
## [1] "NumberOfMetricsKept: 16"
## [1] "CountLineBlank + Avg.Catch.LOC + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Log.and.Return.null + Avg..AP.Overcatch + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + AP.Catch.and.do.nothing + Sum.of.AP.Generic.Catch + AP.Multi.line.log.messages + AP.Nested.try.block + Sum.of.AP.Overcatch + AP.Relying.on.getCause.."
##  [1] "Distinct.count.of.Issue.Key.POST" 
##  [2] "CountLineBlank"                   
##  [3] "Avg.Catch.LOC"                    
##  [4] "Avg..AP.Catch.and.Return.null"    
##  [5] "Avg..AP.Destructive.Wrapping"     
##  [6] "Avg..AP.Dummy.Handler"            
##  [7] "Avg..AP.Log.and.Return.null"      
##  [8] "Avg..AP.Overcatch"                
##  [9] "Avg..AP.Unhandled.exceptions"     
## [10] "Avg..AP.Unreachable.Catch.Handler"
## [11] "Avg..Potentially.UnRecoverable"   
## [12] "AP.Catch.and.do.nothing"          
## [13] "Sum.of.AP.Generic.Catch"          
## [14] "AP.Multi.line.log.messages"       
## [15] "AP.Nested.try.block"              
## [16] "Sum.of.AP.Overcatch"              
## [17] "AP.Relying.on.getCause.."         
## [1] "Project: hadoop-2.6 Name: CAT.BASE"
## [1] "Redudant variables: "
## [1] "Final variables: Distinct.count.of.Commit.Hash.PRE + Distinct.count.of.Issue.Key.POST + AvgCyclomaticStrict + AvgEssential + MaxEssential + Avg.Catch.LOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Generic.Catch + Avg..AP.Ignoring.Interrupted.Exception + Avg..AP.Multi.line.log.messages + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + Avg..Recoverability.Not.Relevant + AP.Incomplete.implementation + AP.Log.and.Throw + Sum.of.AP.Overcatch.and.Abort + AP.Relying.on.getCause.. + Potentially.Recoverable"
## [1] "NumberOfMetricsInitial: 24"
## [1] "NumberOfMetricsKept: 24"
## [1] "Project: hibernate-5.0 Name: CAT.BASE"
## [1] "Redudant variables: "
## [1] "Final variables: Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.POST + CountLineComment + RatioCommentToCode + Avg.Catch.LOC + Avg.Catch.SLOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Generic.Catch + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.Recoverable + Avg..Recoverability.Not.Relevant + Sum.of.AP.Destructive.Wrapping + Sum.of.AP.Dummy.Handler + Sum.of.AP.Log.and.Return.null + Catch.SLOC + Potentially.Recoverable"
## [1] "NumberOfMetricsInitial: 20"
## [1] "NumberOfMetricsKept: 20"
## [1] "Project: umbraco-7.6 Name: CAT.BASE"
## [1] "Redudant variables: "
## [1] "Final variables: Distinct.count.of.Issue.Key.POST + CountLineBlank + Avg.Catch.LOC + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Log.and.Return.null + Avg..AP.Overcatch + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + AP.Catch.and.do.nothing + Sum.of.AP.Generic.Catch + AP.Multi.line.log.messages + AP.Nested.try.block + Sum.of.AP.Overcatch + AP.Relying.on.getCause.."
## [1] "NumberOfMetricsInitial: 17"
## [1] "NumberOfMetricsKept: 17"
## [1] "------------------------------------Project: hadoop-2.6 Name: CAT.BASE"
## [1] "NumberOfMetricsInitial: 23 Budget: 59 Over Budget: FALSE NumberOfMetricsKept: 23 CorrelationCutoff: 0.7"
## [1] "NumberOfMetricsInitial: 23 Budget: 59 Over Budget: FALSE NumberOfMetricsKept: 23 CorrelationCutoff: 0.7"
## [1] "------------------------------------Project: hibernate-5.0 Name: CAT.BASE"
## [1] "NumberOfMetricsInitial: 19 Budget: 29 Over Budget: FALSE NumberOfMetricsKept: 19 CorrelationCutoff: 0.7"
## [1] "NumberOfMetricsInitial: 19 Budget: 29 Over Budget: FALSE NumberOfMetricsKept: 19 CorrelationCutoff: 0.7"
## [1] "------------------------------------Project: umbraco-7.6 Name: CAT.BASE"

## [1] "CountLineBlank + Avg.Catch.LOC + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Log.and.Return.null + Avg..AP.Overcatch + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + AP.Catch.and.do.nothing + Sum.of.AP.Generic.Catch + AP.Multi.line.log.messages + AP.Nested.try.block + AP.Relying.on.getCause.."
## [1] "NumberOfMetricsInitial: 16 Budget: 15 Over Budget: TRUE NumberOfMetricsKept: 15 CorrelationCutoff: 0.671"
## [1] "NumberOfMetricsInitial: 16 Budget: 15 Over Budget: TRUE NumberOfMetricsKept: 15 CorrelationCutoff: 0.671"

Setup formulas

# Derive one binary-response formula per reduced data set.
form_list_bin_m2 <- dataSetupFormulasBinaryByModel(all_list_model_m2)
## [1] "Project: hadoop-2.6 Name: CAT.BASE"
## [1] "Distinct.count.of.Commit.Hash.PRE + AvgCyclomaticStrict + AvgEssential + MaxEssential + Avg.Catch.LOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Generic.Catch + Avg..AP.Ignoring.Interrupted.Exception + Avg..AP.Multi.line.log.messages + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + Avg..Recoverability.Not.Relevant + AP.Incomplete.implementation + AP.Log.and.Throw + Sum.of.AP.Overcatch.and.Abort + AP.Relying.on.getCause.. + Potentially.Recoverable"
## [1] "Project: hibernate-5.0 Name: CAT.BASE"
## [1] "Distinct.count.of.Author.Email.PRE + CountLineComment + RatioCommentToCode + Avg.Catch.LOC + Avg.Catch.SLOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Generic.Catch + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.Recoverable + Avg..Recoverability.Not.Relevant + Sum.of.AP.Destructive.Wrapping + Sum.of.AP.Dummy.Handler + Sum.of.AP.Log.and.Return.null + Catch.SLOC + Potentially.Recoverable"
## [1] "Project: umbraco-7.6 Name: CAT.BASE"
## [1] "CountLineBlank + Avg.Catch.LOC + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Log.and.Return.null + Avg..AP.Overcatch + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + AP.Catch.and.do.nothing + Sum.of.AP.Generic.Catch + AP.Multi.line.log.messages + AP.Nested.try.block + AP.Relying.on.getCause.."

Fit regression model (MC7)

# Fit one logistic regression per project from the reduced data and its
# formula; "CAT.BSAP" is the name given to the resulting models.
models_2_BSAP <- modelFitLogisticByModel(
  all_list_model_m2,
  form_list_bin_m2,
  "CAT.BSAP"
)
## [1] "Project: hadoop-2.6 Name: CAT.BASE"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           890    LR chi2     233.46    R2       0.394    C       0.862    
##   FALSE        747    d.f.            23    g        1.730    Dxy     0.724    
##   TRUE         143    Pr(> chi2) <0.0001    gr       5.640    gamma   0.726    
##  max |deriv| 2e-06                          gp       0.191    tau-a   0.196    
##                                             Brier    0.095                     
##  
##                                         Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                              -4.8011 1.0842 -4.43  <0.0001 
##  Distinct.count.of.Commit.Hash.PRE       2.9187 0.3478  8.39  <0.0001 
##  AvgCyclomaticStrict                     2.9866 1.0155  2.94  0.0033  
##  AvgEssential                           -3.9923 1.6472 -2.42  0.0154  
##  MaxEssential                            0.6824 0.5346  1.28  0.2018  
##  Avg.Catch.LOC                           1.3236 1.1710  1.13  0.2584  
##  Avg..AP.Catch.and.do.nothing            0.5908 2.2376  0.26  0.7917  
##  Avg..AP.Catch.and.Return.null          -2.4554 2.1356 -1.15  0.2502  
##  Avg..AP.Destructive.Wrapping           -0.9398 1.1800 -0.80  0.4258  
##  Avg..AP.Dummy.Handler                   0.9190 1.4537  0.63  0.5273  
##  Avg..AP.Generic.Catch                  -1.9456 1.4520 -1.34  0.1803  
##  Avg..AP.Ignoring.Interrupted.Exception  5.1785 1.5936  3.25  0.0012  
##  Avg..AP.Multi.line.log.messages         0.8636 3.0204  0.29  0.7749  
##  Avg..AP.Nested.try.block                2.0894 2.3797  0.88  0.3799  
##  Avg..AP.Throw.within.finally           -3.5422 2.3058 -1.54  0.1245  
##  Avg..AP.Unhandled.exceptions           -1.1283 1.1293 -1.00  0.3177  
##  Avg..AP.Unreachable.Catch.Handler       0.2953 1.9069  0.15  0.8769  
##  Avg..Potentially.UnRecoverable          0.5821 1.8214  0.32  0.7493  
##  Avg..Recoverability.Not.Relevant       -4.4751 3.2719 -1.37  0.1714  
##  AP.Incomplete.implementation            2.6999 4.1987  0.64  0.5202  
##  AP.Log.and.Throw                        4.1366 2.0714  2.00  0.0458  
##  Sum.of.AP.Overcatch.and.Abort           0.1437 2.1522  0.07  0.9468  
##  AP.Relying.on.getCause..               -0.6957 1.3245 -0.53  0.5994  
##  Potentially.Recoverable                 0.8492 0.3661  2.32  0.0204  
##  
## [1] "Project: hibernate-5.0 Name: CAT.BASE"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           440    LR chi2      91.81    R2       0.293    C       0.798    
##   FALSE        347    d.f.            19    g        1.414    Dxy     0.596    
##   TRUE          93    Pr(> chi2) <0.0001    gr       4.113    gamma   0.598    
##  max |deriv| 8e-10                          gp       0.199    tau-a   0.199    
##                                             Brier    0.128                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -4.8631 1.2596 -3.86  0.0001  
##  Distinct.count.of.Author.Email.PRE  2.8461 0.9505  2.99  0.0028  
##  CountLineComment                    1.7835 0.5056  3.53  0.0004  
##  RatioCommentToCode                 -9.6397 2.9651 -3.25  0.0011  
##  Avg.Catch.LOC                      -0.9464 1.4915 -0.63  0.5257  
##  Avg.Catch.SLOC                      3.9438 1.9831  1.99  0.0467  
##  Avg..AP.Catch.and.do.nothing        0.8616 2.5484  0.34  0.7353  
##  Avg..AP.Catch.and.Return.null       4.5145 2.6002  1.74  0.0825  
##  Avg..AP.Generic.Catch              -0.6409 1.4690 -0.44  0.6626  
##  Avg..AP.Nested.try.block           -0.2740 2.9307 -0.09  0.9255  
##  Avg..AP.Throw.within.finally        4.0430 2.6656  1.52  0.1293  
##  Avg..AP.Unhandled.exceptions        1.5102 1.2390  1.22  0.2229  
##  Avg..AP.Unreachable.Catch.Handler   0.5841 1.9556  0.30  0.7652  
##  Avg..Potentially.Recoverable       -0.6864 2.6134 -0.26  0.7928  
##  Avg..Recoverability.Not.Relevant   -0.9000 2.7225 -0.33  0.7410  
##  Sum.of.AP.Destructive.Wrapping      0.8648 0.6441  1.34  0.1793  
##  Sum.of.AP.Dummy.Handler             3.0046 1.0077  2.98  0.0029  
##  Sum.of.AP.Log.and.Return.null      -0.0299 2.4376 -0.01  0.9902  
##  Catch.SLOC                         -1.0628 0.9990 -1.06  0.2874  
##  Potentially.Recoverable            -0.5009 1.2268 -0.41  0.6831  
##  
## [1] "Project: umbraco-7.6 Name: CAT.BASE"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                       Model Likelihood     Discrimination    Rank Discrim.    
##                          Ratio Test           Indexes           Indexes       
##  Obs           230    LR chi2     43.16    R2       0.366    C       0.872    
##   FALSE        208    d.f.           15    g        5.042    Dxy     0.744    
##   TRUE          22    Pr(> chi2) 0.0001    gr     154.833    gamma   0.747    
##  max |deriv| 0.003                         gp       0.129    tau-a   0.129    
##                                            Brier    0.066                     
##  
##                                    Coef     S.E.     Wald Z Pr(>|Z|)
##  Intercept                          -5.4210   1.7433 -3.11  0.0019  
##  CountLineBlank                      1.3541   0.7092  1.91  0.0562  
##  Avg.Catch.LOC                      -2.0201   2.4529 -0.82  0.4102  
##  Avg..AP.Catch.and.Return.null       1.4990   3.9958  0.38  0.7076  
##  Avg..AP.Destructive.Wrapping      -89.7540 349.8585 -0.26  0.7975  
##  Avg..AP.Dummy.Handler               6.3450   2.5707  2.47  0.0136  
##  Avg..AP.Log.and.Return.null       -16.9573  19.2147 -0.88  0.3775  
##  Avg..AP.Overcatch                  -1.0034   2.9495 -0.34  0.7337  
##  Avg..AP.Unhandled.exceptions      -11.7078   6.3336 -1.85  0.0645  
##  Avg..AP.Unreachable.Catch.Handler  -7.8174   4.3969 -1.78  0.0754  
##  Avg..Potentially.UnRecoverable     10.1290   6.9465  1.46  0.1448  
##  AP.Catch.and.do.nothing            -0.4214   1.9185 -0.22  0.8261  
##  Sum.of.AP.Generic.Catch             5.7515   1.8439  3.12  0.0018  
##  AP.Multi.line.log.messages        -30.0528 283.2997 -0.11  0.9155  
##  AP.Nested.try.block                 0.1366   3.6782  0.04  0.9704  
##  AP.Relying.on.getCause..          -18.4489 234.3618 -0.08  0.9373  
## 

Model Analysis for BSAP

In this section, we present the selected statistics for our analysis. As explained in our approach, these correspond to steps MC7, MA1, MA2, MA3 and MA4.

Here we extract the selected statistics and we add the data (columns) to an object that will be exported to CSV in the section Output.

Fit regression model (MC7): summary stats
# Start from an empty list, then store the summary statistics extracted from
# the fitted BSAP models.
model_things_2_BSAP <- list()
model_things_2_BSAP <- modelStats(models_2_BSAP)
Model stability assessment (MA1)
# Validate each fitted model and merge the results into the statistics
# collected so far. The log below shows lrm.fit reporting singular
# information matrices for some of the samples.
model_things_2_BSAP <- modelValidate(
  models_2_BSAP,
  model_things_2_BSAP
)
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## 
## Divergence or singularity in 57 samples
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Relying.on.getCause.. 
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Nested.try.block 
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Multi.line.log.messages 
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Multi.line.log.messages 
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Multi.line.log.messages 
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Multi.line.log.messages 
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Multi.line.log.messages 
## 
## Divergence or singularity in 19 samples
Model significant variables
# Run the significance step on the BSAP models. The bookkeeping list is passed
# in and replaced by the value the helper returns.
# NOTE(review): modelSignificance() is defined outside this chunk; presumably
# it also prints the per-project Wald statistics tables - confirm.
model_things_2_BSAP <- modelSignificance(models_2_BSAP, model_things_2_BSAP)
## [1] "project:  hadoop-2.6 model:  CAT.BSAP"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                                 Chi-Square d.f. P     
##  Distinct.count.of.Commit.Hash.PRE       70.44      1   <.0001
##  AvgCyclomaticStrict                      8.65      1   0.0033
##  AvgEssential                             5.87      1   0.0154
##  MaxEssential                             1.63      1   0.2018
##  Avg.Catch.LOC                            1.28      1   0.2584
##  Avg..AP.Catch.and.do.nothing             0.07      1   0.7917
##  Avg..AP.Catch.and.Return.null            1.32      1   0.2502
##  Avg..AP.Destructive.Wrapping             0.63      1   0.4258
##  Avg..AP.Dummy.Handler                    0.40      1   0.5273
##  Avg..AP.Generic.Catch                    1.80      1   0.1803
##  Avg..AP.Ignoring.Interrupted.Exception  10.56      1   0.0012
##  Avg..AP.Multi.line.log.messages          0.08      1   0.7749
##  Avg..AP.Nested.try.block                 0.77      1   0.3799
##  Avg..AP.Throw.within.finally             2.36      1   0.1245
##  Avg..AP.Unhandled.exceptions             1.00      1   0.3177
##  Avg..AP.Unreachable.Catch.Handler        0.02      1   0.8769
##  Avg..Potentially.UnRecoverable           0.10      1   0.7493
##  Avg..Recoverability.Not.Relevant         1.87      1   0.1714
##  AP.Incomplete.implementation             0.41      1   0.5202
##  AP.Log.and.Throw                         3.99      1   0.0458
##  Sum.of.AP.Overcatch.and.Abort            0.00      1   0.9468
##  AP.Relying.on.getCause..                 0.28      1   0.5994
##  Potentially.Recoverable                  5.38      1   0.0204
##  TOTAL                                  149.23     23   <.0001
## [1] "project:  hibernate-5.0 model:  CAT.BSAP"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE  8.97       1   0.0028
##  CountLineComment                   12.44       1   0.0004
##  RatioCommentToCode                 10.57       1   0.0011
##  Avg.Catch.LOC                       0.40       1   0.5257
##  Avg.Catch.SLOC                      3.95       1   0.0467
##  Avg..AP.Catch.and.do.nothing        0.11       1   0.7353
##  Avg..AP.Catch.and.Return.null       3.01       1   0.0825
##  Avg..AP.Generic.Catch               0.19       1   0.6626
##  Avg..AP.Nested.try.block            0.01       1   0.9255
##  Avg..AP.Throw.within.finally        2.30       1   0.1293
##  Avg..AP.Unhandled.exceptions        1.49       1   0.2229
##  Avg..AP.Unreachable.Catch.Handler   0.09       1   0.7652
##  Avg..Potentially.Recoverable        0.07       1   0.7928
##  Avg..Recoverability.Not.Relevant    0.11       1   0.7410
##  Sum.of.AP.Destructive.Wrapping      1.80       1   0.1793
##  Sum.of.AP.Dummy.Handler             8.89       1   0.0029
##  Sum.of.AP.Log.and.Return.null       0.00       1   0.9902
##  Catch.SLOC                          1.13       1   0.2874
##  Potentially.Recoverable             0.17       1   0.6831
##  TOTAL                              66.79      19   <.0001
## [1] "project:  umbraco-7.6 model:  CAT.BSAP"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                            Chi-Square d.f. P     
##  CountLineBlank                     3.65       1   0.0562
##  Avg.Catch.LOC                      0.68       1   0.4102
##  Avg..AP.Catch.and.Return.null      0.14       1   0.7076
##  Avg..AP.Destructive.Wrapping       0.07       1   0.7975
##  Avg..AP.Dummy.Handler              6.09       1   0.0136
##  Avg..AP.Log.and.Return.null        0.78       1   0.3775
##  Avg..AP.Overcatch                  0.12       1   0.7337
##  Avg..AP.Unhandled.exceptions       3.42       1   0.0645
##  Avg..AP.Unreachable.Catch.Handler  3.16       1   0.0754
##  Avg..Potentially.UnRecoverable     2.13       1   0.1448
##  AP.Catch.and.do.nothing            0.05       1   0.8261
##  Sum.of.AP.Generic.Catch            9.73       1   0.0018
##  AP.Multi.line.log.messages         0.01       1   0.9155
##  AP.Nested.try.block                0.00       1   0.9704
##  AP.Relying.on.getCause..           0.01       1   0.9373
##  TOTAL                             21.98      15   0.1082
Model simplification (MA2), Predictors’ explanatory power estimation (MA3), Predictors’ effect in the outcome measurement (MA4)
# Run the simplification step on the BSAP models. The bookkeeping list is
# passed in and replaced by the value the helper returns.
# NOTE(review): modelSimplification() is defined outside this chunk; presumably
# it refits each model and prints the refit, anova and "Mean + 10%" effect
# lines - confirm.
model_things_2_BSAP <- modelSimplification(models_2_BSAP, model_things_2_BSAP)
## [1] "project:  hadoop-2.6 model:  CAT.BSAP Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           890    LR chi2     215.21    R2       0.367    C       0.848    
##   FALSE        747    d.f.             6    g        1.530    Dxy     0.696    
##   TRUE         143    Pr(> chi2) <0.0001    gr       4.616    gamma   0.700    
##  max |deriv| 3e-09                          gp       0.184    tau-a   0.188    
##                                             Brier    0.098                     
##  
##                                         Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                              -4.2845 0.5392 -7.95  <0.0001 
##  Distinct.count.of.Commit.Hash.PRE       3.2525 0.3124 10.41  <0.0001 
##  AvgCyclomaticStrict                     3.1582 0.9624  3.28  0.0010  
##  AvgEssential                           -2.9884 1.3791 -2.17  0.0302  
##  Avg..AP.Ignoring.Interrupted.Exception  5.1063 1.4278  3.58  0.0003  
##  AP.Log.and.Throw                        4.1034 1.9466  2.11  0.0350  
##  Potentially.Recoverable                 0.7444 0.2817  2.64  0.0082  
##  
## [1] "project:  hadoop-2.6 model:  CAT.BSAP Refit - summary"
## [1] "project:  hadoop-2.6 model:  CAT.BSAP Refit - validate"
## [1] "project:  hadoop-2.6 model:  CAT.BSAP Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                                 Chi-Square d.f. P     
##  Distinct.count.of.Commit.Hash.PRE      108.40     1    <.0001
##  AvgCyclomaticStrict                     10.77     1    0.0010
##  AvgEssential                             4.70     1    0.0302
##  Avg..AP.Ignoring.Interrupted.Exception  12.79     1    0.0003
##  AP.Log.and.Throw                         4.44     1    0.0350
##  Potentially.Recoverable                  6.98     1    0.0082
##  TOTAL                                  147.46     6    <.0001
## [1] "Distinct.count.of.Commit.Hash.PRE"
## [1] "AvgCyclomaticStrict"
## [1] "AvgEssential"
## [1] "Avg..AP.Ignoring.Interrupted.Exception"
## [1] "AP.Log.and.Throw"
## [1] "Potentially.Recoverable"
##   Distinct.count.of.Commit.Hash.PRE AvgCyclomaticStrict AvgEssential
## 1                          1.695506            2.660674     1.296629
##   Avg..AP.Ignoring.Interrupted.Exception AP.Log.and.Throw
## 1                             0.08380526       0.02022472
##   Potentially.Recoverable
## 1                6.004494
## [1] "Fixed at Mean: 0.207569898618515"
## [1] "Distinct.count.of.Commit.Hash.PRE  Coef at Mean + 10%: 0.222100399673733"
## [1] "AvgCyclomaticStrict  Coef at Mean + 10%: 0.223844488271616"
## [1] "AvgEssential  Coef at Mean + 10%: 0.196089668996072"
## [1] "Avg..AP.Ignoring.Interrupted.Exception  Coef at Mean + 10%: 0.210393662256916"
## [1] "AP.Log.and.Throw  Coef at Mean + 10%: 0.208151005992509"
## [1] "Potentially.Recoverable  Coef at Mean + 10%: 0.211977383118221"
## [1] "project:  hibernate-5.0 model:  CAT.BSAP Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           440    LR chi2      79.74    R2       0.258    C       0.786    
##   FALSE        347    d.f.             5    g        1.272    Dxy     0.571    
##   TRUE          93    Pr(> chi2) <0.0001    gr       3.569    gamma   0.573    
##  max |deriv| 8e-12                          gp       0.185    tau-a   0.191    
##                                             Brier    0.134                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -5.0342 0.6565 -7.67  <0.0001 
##  Distinct.count.of.Author.Email.PRE  2.4206 0.8657  2.80  0.0052  
##  CountLineComment                    1.6859 0.4563  3.69  0.0002  
##  RatioCommentToCode                 -9.3723 2.6388 -3.55  0.0004  
##  Avg.Catch.SLOC                      2.1113 0.8268  2.55  0.0107  
##  Sum.of.AP.Dummy.Handler             1.9714 0.9105  2.17  0.0304  
##  
## [1] "project:  hibernate-5.0 model:  CAT.BSAP Refit - summary"
## [1] "project:  hibernate-5.0 model:  CAT.BSAP Refit - validate"
## [1] "project:  hibernate-5.0 model:  CAT.BSAP Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE  7.82      1    0.0052
##  CountLineComment                   13.65      1    0.0002
##  RatioCommentToCode                 12.61      1    0.0004
##  Avg.Catch.SLOC                      6.52      1    0.0107
##  Sum.of.AP.Dummy.Handler             4.69      1    0.0304
##  TOTAL                              60.95      5    <.0001
## [1] "Distinct.count.of.Author.Email.PRE"
## [1] "CountLineComment"
## [1] "RatioCommentToCode"
## [1] "Avg.Catch.SLOC"
## [1] "Sum.of.AP.Dummy.Handler"
##   Distinct.count.of.Author.Email.PRE CountLineComment RatioCommentToCode
## 1                           2.129545         46.95455            0.27575
##   Avg.Catch.SLOC Sum.of.AP.Dummy.Handler
## 1       1.286742               0.1409091
## [1] "Fixed at Mean: 0.24582152754868"
## [1] "Distinct.count.of.Author.Email.PRE  Coef at Mean + 10%: 0.258875914390769"
## [1] "CountLineComment  Coef at Mean + 10%: 0.25872098632262"
## [1] "RatioCommentToCode  Coef at Mean + 10%: 0.230043577539418"
## [1] "Avg.Catch.SLOC  Coef at Mean + 10%: 0.255245915466901"
## [1] "Sum.of.AP.Dummy.Handler  Coef at Mean + 10%: 0.24777507698998"
## [1] "project:  umbraco-7.6 model:  CAT.BSAP Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           230    LR chi2      22.86    R2       0.202    C       0.794    
##   FALSE        208    d.f.             2    g        1.228    Dxy     0.589    
##   TRUE          22    Pr(> chi2) <0.0001    gr       3.413    gamma   0.645    
##  max |deriv| 8e-11                          gp       0.097    tau-a   0.102    
##                                             Brier    0.078                     
##  
##                          Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept               -4.5430 0.6758 -6.72  <0.0001 
##  Avg..AP.Dummy.Handler    5.3086 1.9281  2.75  0.0059  
##  Sum.of.AP.Generic.Catch  4.0066 1.0406  3.85  0.0001  
##  
## [1] "project:  umbraco-7.6 model:  CAT.BSAP Refit - summary"
## [1] "project:  umbraco-7.6 model:  CAT.BSAP Refit - validate"
## [1] "project:  umbraco-7.6 model:  CAT.BSAP Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                  Chi-Square d.f. P     
##  Avg..AP.Dummy.Handler    7.58      1    0.0059
##  Sum.of.AP.Generic.Catch 14.82      1    0.0001
##  TOTAL                   18.68      2    0.0001
## [1] "Avg..AP.Dummy.Handler"
## [1] "Sum.of.AP.Generic.Catch"
##   Avg..AP.Dummy.Handler Sum.of.AP.Generic.Catch
## 1              0.160471                1.852174
## [1] "Fixed at Mean: 0.0850053202474649"
## [1] "Avg..AP.Dummy.Handler  Coef at Mean + 10%: 0.0875005781572317"
## [1] "Sum.of.AP.Generic.Catch  Coef at Mean + 10%: 0.0939165463703944"

Model Extension: BASE + EH Try

# Start the m3 (BASE + EH Try) data list as a copy of the shared data list.
# NOTE(review): the next code chunk re-initialises all_list_omitted_m3 to an
# empty list before this copy is read - confirm whether this line is needed.
all_list_omitted_m3 <- all_list_omitted

Drop variables - adjust metric sets

To build new models that extend the base model, we remove all metrics that were insignificant in the related base model construction. This is reasonable because the metric set can be adjusted based on expertise: the previous step showed which base metrics are significant, so the remaining base metrics can be removed. To do this we loop through each previously built model. Each model can have a different set of metrics and, therefore, each extension has to be constructed separately.

# Build the per-model data sets for the "BASE + EH Try" extension.
#
# For every entry of models_2_BASE whose fit is not a "try-error" and whose
# bookkeeping entry in model_things_2_BASE is not over budget:
#   1. fetch the project's data frame from all_list_omitted,
#   2. drop the catch and throws metric columns,
#   3. drop the base metric columns that are not listed as significant for
#      that base model,
#   4. append list(name, project, data, sig) to all_list_omitted_m3.
# Models that failed or are over budget are reported with a printed message
# and skipped.
all_list_omitted_m3 <- vector("list", 0)

# Columns that are never dropped: identifiers, plus the outcome for the base
# set. These do not depend on the model, so they are computed once.
keepForID <- c("Project", "File.Path")
keepForBase <- c("Distinct.count.of.Issue.Key.POST", keepForID)
keepForCatch <- c(keepForID)
keepForThrows <- c(keepForID)

# Keep try metrics, remove catch and throws, adjust base.
base_names_drop <- base_names[!(base_names %in% keepForBase)]
catch_names_drop <- catch_names[!(catch_names %in% keepForCatch)]
throws_names_drop <- throws_names[!(throws_names %in% keepForThrows)]

# seq_along() yields an empty sequence for an empty model list, whereas
# 1:length(x) would yield c(1, 0) and fail on the first subscript.
for (i in seq_along(models_2_BASE)) {
  name <- models_2_BASE[[i]]$name
  project <- models_2_BASE[[i]]$project
  fit <- models_2_BASE[[i]]$fit
  print(paste("Project:", project, "Name:", name))

  # A fit of class "try-error" means the base model could not be built.
  if (inherits(fit, "try-error")) {
    print("!-ERROR-! - model construction had issues.")
    next
  }

  temp_data_index <- findProjectData(all_list_omitted, project)
  temp_data <- as.data.frame(all_list_omitted[[temp_data_index]]$data)

  # drop = FALSE keeps a data frame even if a single column survives.
  keep_cols <- !(names(temp_data) %in% catch_names_drop)
  temp_data <- temp_data[, keep_cols, drop = FALSE]
  keep_cols <- !(names(temp_data) %in% throws_names_drop)
  temp_data <- temp_data[, keep_cols, drop = FALSE]

  # Remove insignificant metrics according to related base model.
  temp_sig_index <- findModel(model_things_2_BASE, name, project)

  # Only move forward with the models that are under budget.
  if (model_things_2_BASE[[temp_sig_index]][["over_budget"]]) {
    print("!-ERROR-! - model over budget.")
    next
  }

  # The key is reproduced exactly as written (including its spelling) because
  # it is a runtime lookup into the bookkeeping list.
  temp_significant <- model_things_2_BASE[[temp_sig_index]][["signifcant_r"]]

  # The significant metrics from the base model as a vector of char. A
  # missing (NULL) or NA entry is treated as "no significant metrics".
  if (length(temp_significant) > 0 && !anyNA(temp_significant)) {
    temp_significant_list <- unlist(strsplit(temp_significant, ", "))
  } else {
    temp_significant_list <- character(0)
  }

  # The insignificant metrics: all from the base model minus the significant
  # ones.
  base_names_insignificant <-
    base_names_drop[!(base_names_drop %in% temp_significant_list)]

  # The clean list of metrics for modeling: all metrics minus the base
  # insignificant ones.
  keep_cols <- !(names(temp_data) %in% base_names_insignificant)
  temp_data <- temp_data[, keep_cols, drop = FALSE]
  print(names(temp_data))

  all_list_omitted_m3 <- c(
    all_list_omitted_m3,
    list(list(
      name = name,
      project = project,
      data = temp_data,
      sig = temp_significant_list
    ))
  )
}
## [1] "Project: hadoop-2.6 Name: CAT.BASE"
##  [1] "Distinct.count.of.Commit.Hash.PRE"          
##  [2] "Distinct.count.of.Issue.Key.POST"           
##  [3] "AvgCyclomaticStrict"                        
##  [4] "AvgEssential"                               
##  [5] "MaxEssential"                               
##  [6] "X..Try.in.Conditional"                      
##  [7] "X..Try.in.Declaration"                      
##  [8] "X..Try.in.EH.Feature"                       
##  [9] "X..Try.in.Loop"                             
## [10] "X..Try.in.Other"                            
## [11] "X..Try"                                     
## [12] "X..Try.in.Conditional.1"                    
## [13] "X..Try.in.Declaration.1"                    
## [14] "X..Try.in.EH.Feature.1"                     
## [15] "X..Try.in.Loop.1"                           
## [16] "X..Try.in.Other.1"                          
## [17] "Avg.Max.Depth"                              
## [18] "Avg.Try.LOC"                                
## [19] "Avg.Try.SLOC"                               
## [20] "X..Handled.with.Abort"                      
## [21] "X..Handled.with.Continue"                   
## [22] "X..Handled.with.Default"                    
## [23] "X..Handled.with.Empty"                      
## [24] "X..Handled.with.Log"                        
## [25] "X..Handled.with.Method"                     
## [26] "X..Handled.with.Nested.Try"                 
## [27] "X..Handled.with.Return"                     
## [28] "X..Handled.with.Throw.with.New"             
## [29] "X..Handled.with.Throw.without.New"          
## [30] "X..Handled.with.Throw.Wrap"                 
## [31] "X..Handled.with.ToDo"                       
## [32] "X..Possible.Exceptions"                     
## [33] "X..Propagated.and.Potentially.Recoverable"  
## [34] "X..Propagated"                              
## [35] "X..Doc.in.Comment"                          
## [36] "X..External.Doc"                            
## [37] "X..Handled.with.Abort.1"                    
## [38] "X..Handled.with.Continue.1"                 
## [39] "X..Handled.with.Default.1"                  
## [40] "X..Handled.with.Empty.1"                    
## [41] "X..Handled.with.Log.1"                      
## [42] "X..Handled.with.Method.1"                   
## [43] "X..Handled.with.Nested.Try.1"               
## [44] "X..Handled.with.Return.1"                   
## [45] "X..Handled.with.Throw.with.New.1"           
## [46] "X..Handled.with.Throw.without.New.1"        
## [47] "X..Handled.with.Throw.Wrap.1"               
## [48] "X..Handled.with.ToDo.1"                     
## [49] "X..Method.Declaration"                      
## [50] "X..Propagated.and.Potentially.Recoverable.1"
## [51] "X..Propagated.1"                            
## [52] "X..Specific"                                
## [53] "X..Subsumption"                             
## [54] "X..Throw.Statement"                         
## [55] "Avg...Declaring.Methods"                    
## [56] "Avg...Invoked.Method"                       
## [57] "Avg.Depth"                                  
## [58] "Avg.Exc.Type.Prevalence"                    
## [59] "Num.Distinct.Methods"                       
## [60] "Sum.of.Num.Distinct.Methods"                
## [61] "Try.LOC"                                    
## [62] "Try.SLOC"                                   
## [1] "Project: hibernate-5.0 Name: CAT.BASE"
##  [1] "Distinct.count.of.Author.Email.PRE"         
##  [2] "Distinct.count.of.Issue.Key.POST"           
##  [3] "CountLineComment"                           
##  [4] "RatioCommentToCode"                         
##  [5] "X..Try.in.Conditional"                      
##  [6] "X..Try.in.Declaration"                      
##  [7] "X..Try.in.EH.Feature"                       
##  [8] "X..Try.in.Loop"                             
##  [9] "X..Try.in.Other"                            
## [10] "X..Try"                                     
## [11] "X..Try.in.Conditional.1"                    
## [12] "X..Try.in.Declaration.1"                    
## [13] "X..Try.in.EH.Feature.1"                     
## [14] "X..Try.in.Loop.1"                           
## [15] "X..Try.in.Other.1"                          
## [16] "Avg.Max.Depth"                              
## [17] "Avg.Try.LOC"                                
## [18] "Avg.Try.SLOC"                               
## [19] "X..Handled.with.Abort"                      
## [20] "X..Handled.with.Continue"                   
## [21] "X..Handled.with.Default"                    
## [22] "X..Handled.with.Empty"                      
## [23] "X..Handled.with.Log"                        
## [24] "X..Handled.with.Method"                     
## [25] "X..Handled.with.Nested.Try"                 
## [26] "X..Handled.with.Return"                     
## [27] "X..Handled.with.Throw.with.New"             
## [28] "X..Handled.with.Throw.without.New"          
## [29] "X..Handled.with.Throw.Wrap"                 
## [30] "X..Handled.with.ToDo"                       
## [31] "X..Possible.Exceptions"                     
## [32] "X..Propagated.and.Potentially.Recoverable"  
## [33] "X..Propagated"                              
## [34] "X..Doc.in.Comment"                          
## [35] "X..External.Doc"                            
## [36] "X..Handled.with.Abort.1"                    
## [37] "X..Handled.with.Continue.1"                 
## [38] "X..Handled.with.Default.1"                  
## [39] "X..Handled.with.Empty.1"                    
## [40] "X..Handled.with.Log.1"                      
## [41] "X..Handled.with.Method.1"                   
## [42] "X..Handled.with.Nested.Try.1"               
## [43] "X..Handled.with.Return.1"                   
## [44] "X..Handled.with.Throw.with.New.1"           
## [45] "X..Handled.with.Throw.without.New.1"        
## [46] "X..Handled.with.Throw.Wrap.1"               
## [47] "X..Handled.with.ToDo.1"                     
## [48] "X..Method.Declaration"                      
## [49] "X..Propagated.and.Potentially.Recoverable.1"
## [50] "X..Propagated.1"                            
## [51] "X..Specific"                                
## [52] "X..Subsumption"                             
## [53] "X..Throw.Statement"                         
## [54] "Avg...Declaring.Methods"                    
## [55] "Avg...Invoked.Method"                       
## [56] "Avg.Depth"                                  
## [57] "Avg.Exc.Type.Prevalence"                    
## [58] "Num.Distinct.Methods"                       
## [59] "Sum.of.Num.Distinct.Methods"                
## [60] "Try.LOC"                                    
## [61] "Try.SLOC"                                   
## [1] "Project: umbraco-7.6 Name: CAT.BASE"
##  [1] "Distinct.count.of.Issue.Key.POST"           
##  [2] "CountLineBlank"                             
##  [3] "X..Try.in.Conditional"                      
##  [4] "X..Try.in.Declaration"                      
##  [5] "X..Try.in.EH.Feature"                       
##  [6] "X..Try.in.Loop"                             
##  [7] "X..Try.in.Other"                            
##  [8] "X..Try"                                     
##  [9] "X..Try.in.Conditional.1"                    
## [10] "X..Try.in.Declaration.1"                    
## [11] "X..Try.in.EH.Feature.1"                     
## [12] "X..Try.in.Loop.1"                           
## [13] "X..Try.in.Other.1"                          
## [14] "Avg.Max.Depth"                              
## [15] "Avg.Try.LOC"                                
## [16] "Avg.Try.SLOC"                               
## [17] "X..Handled.with.Abort"                      
## [18] "X..Handled.with.Continue"                   
## [19] "X..Handled.with.Default"                    
## [20] "X..Handled.with.Empty"                      
## [21] "X..Handled.with.Log"                        
## [22] "X..Handled.with.Method"                     
## [23] "X..Handled.with.Nested.Try"                 
## [24] "X..Handled.with.Return"                     
## [25] "X..Handled.with.Throw.with.New"             
## [26] "X..Handled.with.Throw.without.New"          
## [27] "X..Handled.with.Throw.Wrap"                 
## [28] "X..Handled.with.ToDo"                       
## [29] "X..Possible.Exceptions"                     
## [30] "X..Propagated.and.Potentially.Recoverable"  
## [31] "X..Propagated"                              
## [32] "X..Doc.in.Comment"                          
## [33] "X..External.Doc"                            
## [34] "X..Handled.with.Abort.1"                    
## [35] "X..Handled.with.Continue.1"                 
## [36] "X..Handled.with.Default.1"                  
## [37] "X..Handled.with.Empty.1"                    
## [38] "X..Handled.with.Log.1"                      
## [39] "X..Handled.with.Method.1"                   
## [40] "X..Handled.with.Nested.Try.1"               
## [41] "X..Handled.with.Return.1"                   
## [42] "X..Handled.with.Throw.with.New.1"           
## [43] "X..Handled.with.Throw.without.New.1"        
## [44] "X..Handled.with.Throw.Wrap.1"               
## [45] "X..Handled.with.ToDo.1"                     
## [46] "X..Method.Declaration"                      
## [47] "X..Propagated.and.Potentially.Recoverable.1"
## [48] "X..Propagated.1"                            
## [49] "X..Specific"                                
## [50] "X..Subsumption"                             
## [51] "X..Throw.Statement"                         
## [52] "Avg...Declaring.Methods"                    
## [53] "Avg...Invoked.Method"                       
## [54] "Avg.Depth"                                  
## [55] "Avg.Exc.Type.Prevalence"                    
## [56] "Num.Distinct.Methods"                       
## [57] "Sum.of.Num.Distinct.Methods"                
## [58] "Try.LOC"                                    
## [59] "Try.SLOC"
# Identification columns and dependent-variable columns that are excluded
# when predictors are selected. This vector is redefined for each model
# construction.
dropToPredict <- c(
  "File.Path",
  "Project",
  "Language",
  "Table.Name",
  "Name",
  "Kind",
  "X..Bugs.Post",
  "File",
  "Distinct.count.of.Issue.Key.POST"
)

Correlation analysis (MC4), Redundancy Analysis (MC5), Budget based correlation analysis (MC6)

# Run the metric-reduction helper on the m3 data sets and keep its result.
# NOTE(review): dataApplyReductionByModel() is defined outside this chunk;
# presumably it performs the correlation, redundancy and budget steps
# (MC4-MC6) - confirm. Also confirm that "BSFC" is the intended label for the
# BASE + EH Try extension.
all_list_model_m3 <- dataApplyReductionByModel(all_list_omitted_m3, "BSFC")
## [1] "Project: hadoop-2.6 Name: CAT.BASE"

## [1] "NumberOfMetricsInitial: 49"
## [1] "NumberOfMetricsKept: 23"
## [1] "Distinct.count.of.Commit.Hash.PRE + AvgCyclomaticStrict + AvgEssential + MaxEssential + Avg.Catch.LOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Generic.Catch + Avg..AP.Ignoring.Interrupted.Exception + Avg..AP.Multi.line.log.messages + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + Avg..Recoverability.Not.Relevant + AP.Incomplete.implementation + AP.Log.and.Throw + Sum.of.AP.Overcatch.and.Abort + AP.Relying.on.getCause.. + Potentially.Recoverable"
##  [1] "Distinct.count.of.Commit.Hash.PRE"     
##  [2] "Distinct.count.of.Issue.Key.POST"      
##  [3] "AvgCyclomaticStrict"                   
##  [4] "AvgEssential"                          
##  [5] "MaxEssential"                          
##  [6] "Avg.Catch.LOC"                         
##  [7] "Avg..AP.Catch.and.do.nothing"          
##  [8] "Avg..AP.Catch.and.Return.null"         
##  [9] "Avg..AP.Destructive.Wrapping"          
## [10] "Avg..AP.Dummy.Handler"                 
## [11] "Avg..AP.Generic.Catch"                 
## [12] "Avg..AP.Ignoring.Interrupted.Exception"
## [13] "Avg..AP.Multi.line.log.messages"       
## [14] "Avg..AP.Nested.try.block"              
## [15] "Avg..AP.Throw.within.finally"          
## [16] "Avg..AP.Unhandled.exceptions"          
## [17] "Avg..AP.Unreachable.Catch.Handler"     
## [18] "Avg..Potentially.UnRecoverable"        
## [19] "Avg..Recoverability.Not.Relevant"      
## [20] "AP.Incomplete.implementation"          
## [21] "AP.Log.and.Throw"                      
## [22] "Sum.of.AP.Overcatch.and.Abort"         
## [23] "AP.Relying.on.getCause.."              
## [24] "Potentially.Recoverable"               
## [1] "Project: hibernate-5.0 Name: CAT.BASE"

## [1] "NumberOfMetricsInitial: 48"
## [1] "NumberOfMetricsKept: 19"
## [1] "Distinct.count.of.Author.Email.PRE + CountLineComment + RatioCommentToCode + Avg.Catch.LOC + Avg.Catch.SLOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Generic.Catch + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.Recoverable + Avg..Recoverability.Not.Relevant + Sum.of.AP.Destructive.Wrapping + Sum.of.AP.Dummy.Handler + Sum.of.AP.Log.and.Return.null + Catch.SLOC + Potentially.Recoverable"
##  [1] "Distinct.count.of.Author.Email.PRE"
##  [2] "Distinct.count.of.Issue.Key.POST"  
##  [3] "CountLineComment"                  
##  [4] "RatioCommentToCode"                
##  [5] "Avg.Catch.LOC"                     
##  [6] "Avg.Catch.SLOC"                    
##  [7] "Avg..AP.Catch.and.do.nothing"      
##  [8] "Avg..AP.Catch.and.Return.null"     
##  [9] "Avg..AP.Generic.Catch"             
## [10] "Avg..AP.Nested.try.block"          
## [11] "Avg..AP.Throw.within.finally"      
## [12] "Avg..AP.Unhandled.exceptions"      
## [13] "Avg..AP.Unreachable.Catch.Handler" 
## [14] "Avg..Potentially.Recoverable"      
## [15] "Avg..Recoverability.Not.Relevant"  
## [16] "Sum.of.AP.Destructive.Wrapping"    
## [17] "Sum.of.AP.Dummy.Handler"           
## [18] "Sum.of.AP.Log.and.Return.null"     
## [19] "Catch.SLOC"                        
## [20] "Potentially.Recoverable"           
## [1] "Project: umbraco-7.6 Name: CAT.BASE"

## [1] "NumberOfMetricsInitial: 46"
## [1] "NumberOfMetricsKept: 16"
## [1] "CountLineBlank + Avg.Catch.LOC + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Log.and.Return.null + Avg..AP.Overcatch + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + AP.Catch.and.do.nothing + Sum.of.AP.Generic.Catch + AP.Multi.line.log.messages + AP.Nested.try.block + Sum.of.AP.Overcatch + AP.Relying.on.getCause.."
##  [1] "Distinct.count.of.Issue.Key.POST" 
##  [2] "CountLineBlank"                   
##  [3] "Avg.Catch.LOC"                    
##  [4] "Avg..AP.Catch.and.Return.null"    
##  [5] "Avg..AP.Destructive.Wrapping"     
##  [6] "Avg..AP.Dummy.Handler"            
##  [7] "Avg..AP.Log.and.Return.null"      
##  [8] "Avg..AP.Overcatch"                
##  [9] "Avg..AP.Unhandled.exceptions"     
## [10] "Avg..AP.Unreachable.Catch.Handler"
## [11] "Avg..Potentially.UnRecoverable"   
## [12] "AP.Catch.and.do.nothing"          
## [13] "Sum.of.AP.Generic.Catch"          
## [14] "AP.Multi.line.log.messages"       
## [15] "AP.Nested.try.block"              
## [16] "Sum.of.AP.Overcatch"              
## [17] "AP.Relying.on.getCause.."         
## [1] "Project: hadoop-2.6 Name: CAT.BASE"
## [1] "Redudant variables: "
## [1] "Final variables: Distinct.count.of.Commit.Hash.PRE + Distinct.count.of.Issue.Key.POST + AvgCyclomaticStrict + AvgEssential + MaxEssential + Avg.Catch.LOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Generic.Catch + Avg..AP.Ignoring.Interrupted.Exception + Avg..AP.Multi.line.log.messages + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + Avg..Recoverability.Not.Relevant + AP.Incomplete.implementation + AP.Log.and.Throw + Sum.of.AP.Overcatch.and.Abort + AP.Relying.on.getCause.. + Potentially.Recoverable"
## [1] "NumberOfMetricsInitial: 24"
## [1] "NumberOfMetricsKept: 24"
## [1] "Project: hibernate-5.0 Name: CAT.BASE"
## [1] "Redudant variables: "
## [1] "Final variables: Distinct.count.of.Author.Email.PRE + Distinct.count.of.Issue.Key.POST + CountLineComment + RatioCommentToCode + Avg.Catch.LOC + Avg.Catch.SLOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Generic.Catch + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.Recoverable + Avg..Recoverability.Not.Relevant + Sum.of.AP.Destructive.Wrapping + Sum.of.AP.Dummy.Handler + Sum.of.AP.Log.and.Return.null + Catch.SLOC + Potentially.Recoverable"
## [1] "NumberOfMetricsInitial: 20"
## [1] "NumberOfMetricsKept: 20"
## [1] "Project: umbraco-7.6 Name: CAT.BASE"
## [1] "Redudant variables: "
## [1] "Final variables: Distinct.count.of.Issue.Key.POST + CountLineBlank + Avg.Catch.LOC + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Log.and.Return.null + Avg..AP.Overcatch + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + AP.Catch.and.do.nothing + Sum.of.AP.Generic.Catch + AP.Multi.line.log.messages + AP.Nested.try.block + Sum.of.AP.Overcatch + AP.Relying.on.getCause.."
## [1] "NumberOfMetricsInitial: 17"
## [1] "NumberOfMetricsKept: 17"
## [1] "------------------------------------Project: hadoop-2.6 Name: CAT.BASE"
## [1] "NumberOfMetricsInitial: 23 Budget: 59 Over Budget: FALSE NumberOfMetricsKept: 23 CorrelationCutoff: 0.7"
## [1] "NumberOfMetricsInitial: 23 Budget: 59 Over Budget: FALSE NumberOfMetricsKept: 23 CorrelationCutoff: 0.7"
## [1] "------------------------------------Project: hibernate-5.0 Name: CAT.BASE"
## [1] "NumberOfMetricsInitial: 19 Budget: 29 Over Budget: FALSE NumberOfMetricsKept: 19 CorrelationCutoff: 0.7"
## [1] "NumberOfMetricsInitial: 19 Budget: 29 Over Budget: FALSE NumberOfMetricsKept: 19 CorrelationCutoff: 0.7"
## [1] "------------------------------------Project: umbraco-7.6 Name: CAT.BASE"

## [1] "CountLineBlank + Avg.Catch.LOC + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Log.and.Return.null + Avg..AP.Overcatch + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + AP.Catch.and.do.nothing + Sum.of.AP.Generic.Catch + AP.Multi.line.log.messages + AP.Nested.try.block + AP.Relying.on.getCause.."
## [1] "NumberOfMetricsInitial: 16 Budget: 15 Over Budget: TRUE NumberOfMetricsKept: 15 CorrelationCutoff: 0.671"
## [1] "NumberOfMetricsInitial: 16 Budget: 15 Over Budget: TRUE NumberOfMetricsKept: 15 CorrelationCutoff: 0.671"

Setup formulas

# Build the per-project formula terms for the m3 model data. The helper is
# defined outside this section (presumably in one of the sourced
# *_functions.R files); it prints the predictor list for each project.
form_list_bin_m3 <- dataSetupFormulasBinaryByModel(all_list_model_m3)
## [1] "Project: hadoop-2.6 Name: CAT.BASE"
## [1] "Distinct.count.of.Commit.Hash.PRE + AvgCyclomaticStrict + AvgEssential + MaxEssential + Avg.Catch.LOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Generic.Catch + Avg..AP.Ignoring.Interrupted.Exception + Avg..AP.Multi.line.log.messages + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + Avg..Recoverability.Not.Relevant + AP.Incomplete.implementation + AP.Log.and.Throw + Sum.of.AP.Overcatch.and.Abort + AP.Relying.on.getCause.. + Potentially.Recoverable"
## [1] "Project: hibernate-5.0 Name: CAT.BASE"
## [1] "Distinct.count.of.Author.Email.PRE + CountLineComment + RatioCommentToCode + Avg.Catch.LOC + Avg.Catch.SLOC + Avg..AP.Catch.and.do.nothing + Avg..AP.Catch.and.Return.null + Avg..AP.Generic.Catch + Avg..AP.Nested.try.block + Avg..AP.Throw.within.finally + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.Recoverable + Avg..Recoverability.Not.Relevant + Sum.of.AP.Destructive.Wrapping + Sum.of.AP.Dummy.Handler + Sum.of.AP.Log.and.Return.null + Catch.SLOC + Potentially.Recoverable"
## [1] "Project: umbraco-7.6 Name: CAT.BASE"
## [1] "CountLineBlank + Avg.Catch.LOC + Avg..AP.Catch.and.Return.null + Avg..AP.Destructive.Wrapping + Avg..AP.Dummy.Handler + Avg..AP.Log.and.Return.null + Avg..AP.Overcatch + Avg..AP.Unhandled.exceptions + Avg..AP.Unreachable.Catch.Handler + Avg..Potentially.UnRecoverable + AP.Catch.and.do.nothing + Sum.of.AP.Generic.Catch + AP.Multi.line.log.messages + AP.Nested.try.block + AP.Relying.on.getCause.."

Fit regression model (MC7)

# Fit one logistic regression per project and label the result "CAT.BSFC".
# The printed calls below show the fits are done with lrm().
models_2_BSFC <- modelFitLogisticByModel(
  all_list_model_m3,
  form_list_bin_m3,
  "CAT.BSFC"
)
## [1] "Project: hadoop-2.6 Name: CAT.BASE"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           890    LR chi2     233.46    R2       0.394    C       0.862    
##   FALSE        747    d.f.            23    g        1.730    Dxy     0.724    
##   TRUE         143    Pr(> chi2) <0.0001    gr       5.640    gamma   0.726    
##  max |deriv| 2e-06                          gp       0.191    tau-a   0.196    
##                                             Brier    0.095                     
##  
##                                         Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                              -4.8011 1.0842 -4.43  <0.0001 
##  Distinct.count.of.Commit.Hash.PRE       2.9187 0.3478  8.39  <0.0001 
##  AvgCyclomaticStrict                     2.9866 1.0155  2.94  0.0033  
##  AvgEssential                           -3.9923 1.6472 -2.42  0.0154  
##  MaxEssential                            0.6824 0.5346  1.28  0.2018  
##  Avg.Catch.LOC                           1.3236 1.1710  1.13  0.2584  
##  Avg..AP.Catch.and.do.nothing            0.5908 2.2376  0.26  0.7917  
##  Avg..AP.Catch.and.Return.null          -2.4554 2.1356 -1.15  0.2502  
##  Avg..AP.Destructive.Wrapping           -0.9398 1.1800 -0.80  0.4258  
##  Avg..AP.Dummy.Handler                   0.9190 1.4537  0.63  0.5273  
##  Avg..AP.Generic.Catch                  -1.9456 1.4520 -1.34  0.1803  
##  Avg..AP.Ignoring.Interrupted.Exception  5.1785 1.5936  3.25  0.0012  
##  Avg..AP.Multi.line.log.messages         0.8636 3.0204  0.29  0.7749  
##  Avg..AP.Nested.try.block                2.0894 2.3797  0.88  0.3799  
##  Avg..AP.Throw.within.finally           -3.5422 2.3058 -1.54  0.1245  
##  Avg..AP.Unhandled.exceptions           -1.1283 1.1293 -1.00  0.3177  
##  Avg..AP.Unreachable.Catch.Handler       0.2953 1.9069  0.15  0.8769  
##  Avg..Potentially.UnRecoverable          0.5821 1.8214  0.32  0.7493  
##  Avg..Recoverability.Not.Relevant       -4.4751 3.2719 -1.37  0.1714  
##  AP.Incomplete.implementation            2.6999 4.1987  0.64  0.5202  
##  AP.Log.and.Throw                        4.1366 2.0714  2.00  0.0458  
##  Sum.of.AP.Overcatch.and.Abort           0.1437 2.1522  0.07  0.9468  
##  AP.Relying.on.getCause..               -0.6957 1.3245 -0.53  0.5994  
##  Potentially.Recoverable                 0.8492 0.3661  2.32  0.0204  
##  
## [1] "Project: hibernate-5.0 Name: CAT.BASE"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           440    LR chi2      91.81    R2       0.293    C       0.798    
##   FALSE        347    d.f.            19    g        1.414    Dxy     0.596    
##   TRUE          93    Pr(> chi2) <0.0001    gr       4.113    gamma   0.598    
##  max |deriv| 8e-10                          gp       0.199    tau-a   0.199    
##                                             Brier    0.128                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -4.8631 1.2596 -3.86  0.0001  
##  Distinct.count.of.Author.Email.PRE  2.8461 0.9505  2.99  0.0028  
##  CountLineComment                    1.7835 0.5056  3.53  0.0004  
##  RatioCommentToCode                 -9.6397 2.9651 -3.25  0.0011  
##  Avg.Catch.LOC                      -0.9464 1.4915 -0.63  0.5257  
##  Avg.Catch.SLOC                      3.9438 1.9831  1.99  0.0467  
##  Avg..AP.Catch.and.do.nothing        0.8616 2.5484  0.34  0.7353  
##  Avg..AP.Catch.and.Return.null       4.5145 2.6002  1.74  0.0825  
##  Avg..AP.Generic.Catch              -0.6409 1.4690 -0.44  0.6626  
##  Avg..AP.Nested.try.block           -0.2740 2.9307 -0.09  0.9255  
##  Avg..AP.Throw.within.finally        4.0430 2.6656  1.52  0.1293  
##  Avg..AP.Unhandled.exceptions        1.5102 1.2390  1.22  0.2229  
##  Avg..AP.Unreachable.Catch.Handler   0.5841 1.9556  0.30  0.7652  
##  Avg..Potentially.Recoverable       -0.6864 2.6134 -0.26  0.7928  
##  Avg..Recoverability.Not.Relevant   -0.9000 2.7225 -0.33  0.7410  
##  Sum.of.AP.Destructive.Wrapping      0.8648 0.6441  1.34  0.1793  
##  Sum.of.AP.Dummy.Handler             3.0046 1.0077  2.98  0.0029  
##  Sum.of.AP.Log.and.Return.null      -0.0299 2.4376 -0.01  0.9902  
##  Catch.SLOC                         -1.0628 0.9990 -1.06  0.2874  
##  Potentially.Recoverable            -0.5009 1.2268 -0.41  0.6831  
##  
## [1] "Project: umbraco-7.6 Name: CAT.BASE"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(form_bin), data = temp_data_log, x = T, 
##      y = T)
##  
##                       Model Likelihood     Discrimination    Rank Discrim.    
##                          Ratio Test           Indexes           Indexes       
##  Obs           230    LR chi2     43.16    R2       0.366    C       0.872    
##   FALSE        208    d.f.           15    g        5.042    Dxy     0.744    
##   TRUE          22    Pr(> chi2) 0.0001    gr     154.833    gamma   0.747    
##  max |deriv| 0.003                         gp       0.129    tau-a   0.129    
##                                            Brier    0.066                     
##  
##                                    Coef     S.E.     Wald Z Pr(>|Z|)
##  Intercept                          -5.4210   1.7433 -3.11  0.0019  
##  CountLineBlank                      1.3541   0.7092  1.91  0.0562  
##  Avg.Catch.LOC                      -2.0201   2.4529 -0.82  0.4102  
##  Avg..AP.Catch.and.Return.null       1.4990   3.9958  0.38  0.7076  
##  Avg..AP.Destructive.Wrapping      -89.7540 349.8585 -0.26  0.7975  
##  Avg..AP.Dummy.Handler               6.3450   2.5707  2.47  0.0136  
##  Avg..AP.Log.and.Return.null       -16.9573  19.2147 -0.88  0.3775  
##  Avg..AP.Overcatch                  -1.0034   2.9495 -0.34  0.7337  
##  Avg..AP.Unhandled.exceptions      -11.7078   6.3336 -1.85  0.0645  
##  Avg..AP.Unreachable.Catch.Handler  -7.8174   4.3969 -1.78  0.0754  
##  Avg..Potentially.UnRecoverable     10.1290   6.9465  1.46  0.1448  
##  AP.Catch.and.do.nothing            -0.4214   1.9185 -0.22  0.8261  
##  Sum.of.AP.Generic.Catch             5.7515   1.8439  3.12  0.0018  
##  AP.Multi.line.log.messages        -30.0528 283.2997 -0.11  0.9155  
##  AP.Nested.try.block                 0.1366   3.6782  0.04  0.9704  
##  AP.Relying.on.getCause..          -18.4489 234.3618 -0.08  0.9373  
## 

Model Analysis for BSFC

In this section, we present the selected statistics for our analysis. As explained in our approach, these statistics correspond to the steps MC7, MA1, MA2, MA3 and MA4.

Here we extract the selected statistics and we add the data (columns) to an object that will be exported to CSV in the section Output.

Fit regression model (MC7): summary stats
# Collect the summary statistics of the fitted BSFC models. The former
# empty-list initialisation was a dead store (overwritten immediately by
# the modelStats() result), so it has been dropped.
model_things_2_BSFC <- modelStats(models_2_BSFC)
Model stability assessment (MA1)
# Run the stability assessment and carry its results forward in the same
# stats object that is passed in.
# NOTE(review): the run below reports singular lrm fits in 40 and 19
# resamples; near-constant predictors (e.g. AP.Incomplete.implementation)
# look like the cause -- confirm before relying on these estimates.
model_things_2_BSFC <- modelValidate(
  models_2_BSFC,
  model_things_2_BSFC
)
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## singular information matrix in lrm.fit (rank= 23 ).  Offending variable(s):
## AP.Incomplete.implementation 
## 
## Divergence or singularity in 40 samples
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Multi.line.log.messages 
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Nested.try.block 
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Relying.on.getCause.. 
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Multi.line.log.messages 
## singular information matrix in lrm.fit (rank= 15 ).  Offending variable(s):
## AP.Multi.line.log.messages 
## 
## Divergence or singularity in 19 samples
Model significant variables
# Add the per-predictor significance results to the stats object; the
# helper prints a Wald statistics table for each project.
model_things_2_BSFC <- modelSignificance(
  models_2_BSFC,
  model_things_2_BSFC
)
## [1] "project:  hadoop-2.6 model:  CAT.BSFC"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                                 Chi-Square d.f. P     
##  Distinct.count.of.Commit.Hash.PRE       70.44      1   <.0001
##  AvgCyclomaticStrict                      8.65      1   0.0033
##  AvgEssential                             5.87      1   0.0154
##  MaxEssential                             1.63      1   0.2018
##  Avg.Catch.LOC                            1.28      1   0.2584
##  Avg..AP.Catch.and.do.nothing             0.07      1   0.7917
##  Avg..AP.Catch.and.Return.null            1.32      1   0.2502
##  Avg..AP.Destructive.Wrapping             0.63      1   0.4258
##  Avg..AP.Dummy.Handler                    0.40      1   0.5273
##  Avg..AP.Generic.Catch                    1.80      1   0.1803
##  Avg..AP.Ignoring.Interrupted.Exception  10.56      1   0.0012
##  Avg..AP.Multi.line.log.messages          0.08      1   0.7749
##  Avg..AP.Nested.try.block                 0.77      1   0.3799
##  Avg..AP.Throw.within.finally             2.36      1   0.1245
##  Avg..AP.Unhandled.exceptions             1.00      1   0.3177
##  Avg..AP.Unreachable.Catch.Handler        0.02      1   0.8769
##  Avg..Potentially.UnRecoverable           0.10      1   0.7493
##  Avg..Recoverability.Not.Relevant         1.87      1   0.1714
##  AP.Incomplete.implementation             0.41      1   0.5202
##  AP.Log.and.Throw                         3.99      1   0.0458
##  Sum.of.AP.Overcatch.and.Abort            0.00      1   0.9468
##  AP.Relying.on.getCause..                 0.28      1   0.5994
##  Potentially.Recoverable                  5.38      1   0.0204
##  TOTAL                                  149.23     23   <.0001
## [1] "project:  hibernate-5.0 model:  CAT.BSFC"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE  8.97       1   0.0028
##  CountLineComment                   12.44       1   0.0004
##  RatioCommentToCode                 10.57       1   0.0011
##  Avg.Catch.LOC                       0.40       1   0.5257
##  Avg.Catch.SLOC                      3.95       1   0.0467
##  Avg..AP.Catch.and.do.nothing        0.11       1   0.7353
##  Avg..AP.Catch.and.Return.null       3.01       1   0.0825
##  Avg..AP.Generic.Catch               0.19       1   0.6626
##  Avg..AP.Nested.try.block            0.01       1   0.9255
##  Avg..AP.Throw.within.finally        2.30       1   0.1293
##  Avg..AP.Unhandled.exceptions        1.49       1   0.2229
##  Avg..AP.Unreachable.Catch.Handler   0.09       1   0.7652
##  Avg..Potentially.Recoverable        0.07       1   0.7928
##  Avg..Recoverability.Not.Relevant    0.11       1   0.7410
##  Sum.of.AP.Destructive.Wrapping      1.80       1   0.1793
##  Sum.of.AP.Dummy.Handler             8.89       1   0.0029
##  Sum.of.AP.Log.and.Return.null       0.00       1   0.9902
##  Catch.SLOC                          1.13       1   0.2874
##  Potentially.Recoverable             0.17       1   0.6831
##  TOTAL                              66.79      19   <.0001
## [1] "project:  umbraco-7.6 model:  CAT.BSFC"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                            Chi-Square d.f. P     
##  CountLineBlank                     3.65       1   0.0562
##  Avg.Catch.LOC                      0.68       1   0.4102
##  Avg..AP.Catch.and.Return.null      0.14       1   0.7076
##  Avg..AP.Destructive.Wrapping       0.07       1   0.7975
##  Avg..AP.Dummy.Handler              6.09       1   0.0136
##  Avg..AP.Log.and.Return.null        0.78       1   0.3775
##  Avg..AP.Overcatch                  0.12       1   0.7337
##  Avg..AP.Unhandled.exceptions       3.42       1   0.0645
##  Avg..AP.Unreachable.Catch.Handler  3.16       1   0.0754
##  Avg..Potentially.UnRecoverable     2.13       1   0.1448
##  AP.Catch.and.do.nothing            0.05       1   0.8261
##  Sum.of.AP.Generic.Catch            9.73       1   0.0018
##  AP.Multi.line.log.messages         0.01       1   0.9155
##  AP.Nested.try.block                0.00       1   0.9704
##  AP.Relying.on.getCause..           0.01       1   0.9373
##  TOTAL                             21.98      15   0.1082
Model simplification (MA2), Predictors’ explanatory power estimation (MA3), Predictors’ effect in the outcome measurement (MA4)
# Refit each project's model on a reduced predictor set and record the
# results; the helper prints the refit, its anova table and the
# "Coef at Mean + 10%" effect values shown below.
model_things_2_BSFC <- modelSimplification(
  models_2_BSFC,
  model_things_2_BSFC
)
## [1] "project:  hadoop-2.6 model:  CAT.BSFC Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           890    LR chi2     215.21    R2       0.367    C       0.848    
##   FALSE        747    d.f.             6    g        1.530    Dxy     0.696    
##   TRUE         143    Pr(> chi2) <0.0001    gr       4.616    gamma   0.700    
##  max |deriv| 3e-09                          gp       0.184    tau-a   0.188    
##                                             Brier    0.098                     
##  
##                                         Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                              -4.2845 0.5392 -7.95  <0.0001 
##  Distinct.count.of.Commit.Hash.PRE       3.2525 0.3124 10.41  <0.0001 
##  AvgCyclomaticStrict                     3.1582 0.9624  3.28  0.0010  
##  AvgEssential                           -2.9884 1.3791 -2.17  0.0302  
##  Avg..AP.Ignoring.Interrupted.Exception  5.1063 1.4278  3.58  0.0003  
##  AP.Log.and.Throw                        4.1034 1.9466  2.11  0.0350  
##  Potentially.Recoverable                 0.7444 0.2817  2.64  0.0082  
##  
## [1] "project:  hadoop-2.6 model:  CAT.BSFC Refit - summary"
## [1] "project:  hadoop-2.6 model:  CAT.BSFC Refit - validate"
## [1] "project:  hadoop-2.6 model:  CAT.BSFC Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                                 Chi-Square d.f. P     
##  Distinct.count.of.Commit.Hash.PRE      108.40     1    <.0001
##  AvgCyclomaticStrict                     10.77     1    0.0010
##  AvgEssential                             4.70     1    0.0302
##  Avg..AP.Ignoring.Interrupted.Exception  12.79     1    0.0003
##  AP.Log.and.Throw                         4.44     1    0.0350
##  Potentially.Recoverable                  6.98     1    0.0082
##  TOTAL                                  147.46     6    <.0001
## [1] "Distinct.count.of.Commit.Hash.PRE"
## [1] "AvgCyclomaticStrict"
## [1] "AvgEssential"
## [1] "Avg..AP.Ignoring.Interrupted.Exception"
## [1] "AP.Log.and.Throw"
## [1] "Potentially.Recoverable"
##   Distinct.count.of.Commit.Hash.PRE AvgCyclomaticStrict AvgEssential
## 1                          1.695506            2.660674     1.296629
##   Avg..AP.Ignoring.Interrupted.Exception AP.Log.and.Throw
## 1                             0.08380526       0.02022472
##   Potentially.Recoverable
## 1                6.004494
## [1] "Fixed at Mean: 0.207569898618515"
## [1] "Distinct.count.of.Commit.Hash.PRE  Coef at Mean + 10%: 0.222100399673733"
## [1] "AvgCyclomaticStrict  Coef at Mean + 10%: 0.223844488271616"
## [1] "AvgEssential  Coef at Mean + 10%: 0.196089668996072"
## [1] "Avg..AP.Ignoring.Interrupted.Exception  Coef at Mean + 10%: 0.210393662256916"
## [1] "AP.Log.and.Throw  Coef at Mean + 10%: 0.208151005992509"
## [1] "Potentially.Recoverable  Coef at Mean + 10%: 0.211977383118221"
## [1] "project:  hibernate-5.0 model:  CAT.BSFC Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           440    LR chi2      79.74    R2       0.258    C       0.786    
##   FALSE        347    d.f.             5    g        1.272    Dxy     0.571    
##   TRUE          93    Pr(> chi2) <0.0001    gr       3.569    gamma   0.573    
##  max |deriv| 8e-12                          gp       0.185    tau-a   0.191    
##                                             Brier    0.134                     
##  
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept                          -5.0342 0.6565 -7.67  <0.0001 
##  Distinct.count.of.Author.Email.PRE  2.4206 0.8657  2.80  0.0052  
##  CountLineComment                    1.6859 0.4563  3.69  0.0002  
##  RatioCommentToCode                 -9.3723 2.6388 -3.55  0.0004  
##  Avg.Catch.SLOC                      2.1113 0.8268  2.55  0.0107  
##  Sum.of.AP.Dummy.Handler             1.9714 0.9105  2.17  0.0304  
##  
## [1] "project:  hibernate-5.0 model:  CAT.BSFC Refit - summary"
## [1] "project:  hibernate-5.0 model:  CAT.BSFC Refit - validate"
## [1] "project:  hibernate-5.0 model:  CAT.BSFC Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                             Chi-Square d.f. P     
##  Distinct.count.of.Author.Email.PRE  7.82      1    0.0052
##  CountLineComment                   13.65      1    0.0002
##  RatioCommentToCode                 12.61      1    0.0004
##  Avg.Catch.SLOC                      6.52      1    0.0107
##  Sum.of.AP.Dummy.Handler             4.69      1    0.0304
##  TOTAL                              60.95      5    <.0001
## [1] "Distinct.count.of.Author.Email.PRE"
## [1] "CountLineComment"
## [1] "RatioCommentToCode"
## [1] "Avg.Catch.SLOC"
## [1] "Sum.of.AP.Dummy.Handler"
##   Distinct.count.of.Author.Email.PRE CountLineComment RatioCommentToCode
## 1                           2.129545         46.95455            0.27575
##   Avg.Catch.SLOC Sum.of.AP.Dummy.Handler
## 1       1.286742               0.1409091
## [1] "Fixed at Mean: 0.24582152754868"
## [1] "Distinct.count.of.Author.Email.PRE  Coef at Mean + 10%: 0.258875914390769"
## [1] "CountLineComment  Coef at Mean + 10%: 0.25872098632262"
## [1] "RatioCommentToCode  Coef at Mean + 10%: 0.230043577539418"
## [1] "Avg.Catch.SLOC  Coef at Mean + 10%: 0.255245915466901"
## [1] "Sum.of.AP.Dummy.Handler  Coef at Mean + 10%: 0.24777507698998"
## [1] "project:  umbraco-7.6 model:  CAT.BSFC Refit"
## Logistic Regression Model
##  
##  lrm(formula = as.formula(final_form), data = temp_data, x = T, 
##      y = T)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           230    LR chi2      22.86    R2       0.202    C       0.794    
##   FALSE        208    d.f.             2    g        1.228    Dxy     0.589    
##   TRUE          22    Pr(> chi2) <0.0001    gr       3.413    gamma   0.645    
##  max |deriv| 8e-11                          gp       0.097    tau-a   0.102    
##                                             Brier    0.078                     
##  
##                          Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept               -4.5430 0.6758 -6.72  <0.0001 
##  Avg..AP.Dummy.Handler    5.3086 1.9281  2.75  0.0059  
##  Sum.of.AP.Generic.Catch  4.0066 1.0406  3.85  0.0001  
##  
## [1] "project:  umbraco-7.6 model:  CAT.BSFC Refit - summary"
## [1] "project:  umbraco-7.6 model:  CAT.BSFC Refit - validate"
## [1] "project:  umbraco-7.6 model:  CAT.BSFC Refit - anova"
##                 Wald Statistics          Response: Distinct.count.of.Issue.Key.POST > 0 
## 
##  Factor                  Chi-Square d.f. P     
##  Avg..AP.Dummy.Handler    7.58      1    0.0059
##  Sum.of.AP.Generic.Catch 14.82      1    0.0001
##  TOTAL                   18.68      2    0.0001
## [1] "Avg..AP.Dummy.Handler"
## [1] "Sum.of.AP.Generic.Catch"
##   Avg..AP.Dummy.Handler Sum.of.AP.Generic.Catch
## 1              0.160471                1.852174
## [1] "Fixed at Mean: 0.0850053202474649"
## [1] "Avg..AP.Dummy.Handler  Coef at Mean + 10%: 0.0875005781572317"
## [1] "Sum.of.AP.Generic.Catch  Coef at Mean + 10%: 0.0939165463703944"

Output 2

Here we take the selected statistics returned by the R functions and write them to CSV files in the folder “output”.

# Start the CSV afresh with a header row only: the zero-row slice of the
# first BASE record keeps the column names but contributes no data.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned.
write.table(
  data.frame(model_things_2_BASE[[1]])[0, ],
  "output/base_test_2.csv",
  append = FALSE, sep = ",", row.names = FALSE, col.names = TRUE
)

# Append every BASE record to the CSV without repeating the header.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned. The list of NULLs returned by lapply() is printed.
lapply(model_things_2_BASE, function(x) {
  write.table(
    data.frame(x), "output/base_test_2.csv",
    append = TRUE, sep = ",", row.names = FALSE, col.names = FALSE
  )
})
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
# Append every BSAP record to the same CSV without repeating the header.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned.
# NOTE(review): rows land under the header taken from the BASE record;
# this assumes BSAP records have the same columns -- confirm.
lapply(model_things_2_BSAP, function(x) {
  write.table(
    data.frame(x), "output/base_test_2.csv",
    append = TRUE, sep = ",", row.names = FALSE, col.names = FALSE
  )
})
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
# Append every BSFC record to the same CSV without repeating the header.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned.
# NOTE(review): rows land under the header taken from the BASE record;
# this assumes BSFC records have the same columns -- confirm.
lapply(model_things_2_BSFC, function(x) {
  write.table(
    data.frame(x), "output/base_test_2.csv",
    append = TRUE, sep = ",", row.names = FALSE, col.names = FALSE
  )
})
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL