suppressMessages(require(rms, quietly = TRUE, warn.conflicts = FALSE))
require(splines, quietly = TRUE)
require(plotly, quietly = TRUE, warn.conflicts = FALSE)
require(Hmisc, quietly = TRUE)
require(e1071, quietly = TRUE)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute
require(caret, quietly = TRUE)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
require(rmarkdown, quietly = TRUE)
## Warning: package 'rmarkdown' was built under R version 3.4.2
require(BiodiversityR, quietly = TRUE)
## This is vegan 2.4-3
## 
## Attaching package: 'vegan'
## The following object is masked from 'package:caret':
## 
##     tolerance
## The following object is masked from 'package:rms':
## 
##     calibrate
## BiodiversityR 2.8-3: Use command BiodiversityRGUI() to launch the Graphical User Interface and to learn about backward compatibility

Constants and Functions

# Cut-off used by the missing-data check in myMissingData().
NA_THRESHOLD <- 0.03
# Correlation cut-off; not referenced in the part of the file shown here.
CORR_THRESHOLD <- 0.7

# Split a data set into one subset per project.
#
# Args:
#   x: data frame (or matrix) that has a "Project" column.
#   projects: vector of project identifiers to split by.
#
# Returns:
#   An unnamed list with one element per entry of `projects`, in the same
#   order. Element i holds the rows of `x` whose "Project" value equals
#   projects[i]. An empty `projects` yields an empty list.
myProjectList <- function(x, projects) {
  # seq_along() (rather than 1:length()) keeps the empty case from iterating
  # over c(1, 0) and producing two meaningless subsets.
  lapply(seq_along(projects), function(i) {
    x[x[, "Project"] == projects[i], ]
  })
}

# Inspect the missing-value pattern of a data set and drop incomplete rows
# when the missingness check passes.
#
# Args:
#   x: data set passed to naclus() (Hmisc), plotted, and possibly filtered.
#   threshold: cut-off compared against the naclus similarity values;
#     defaults to the file-level NA_THRESHOLD.
#
# Returns:
#   A data frame: na.omit(x) when at least one similarity value is below
#   `threshold`, otherwise `x` unchanged (multiple imputation is still a TODO).
#
# Side effects: draws the naclus plot and prints a one-line summary.
myMissingData <- function(x, threshold = NA_THRESHOLD) {
  na_clusters <- naclus(x)
  plot(na_clusters)

  # any() states the intent directly; na.rm = TRUE avoids an error in `if`
  # should the similarity matrix ever contain NA.
  # NOTE(review): this tests pairwise naclus similarities, not the overall
  # fraction of missing rows -- confirm this is the intended criterion.
  if (any(na_clusters$sim < threshold, na.rm = TRUE)) {
    temp_project_data_omitted <- na.omit(x)
    print(paste("MissingData: Small fraction found: Single imputation or discard. Discard is chosen. Discarded rows:", nrow(x) - nrow(temp_project_data_omitted), "/", nrow(x)))
  } else {
    # TODO: implement multiple imputation
    # Until then the data is passed through untouched, so the reported number
    # of discarded rows is always 0 in this branch.
    temp_project_data_omitted <- x
    print(paste("MissingData: High fraction found: Multiple imputation needed! Discarded rows:", nrow(x) - nrow(temp_project_data_omitted), "/", nrow(x)))
  }
  as.data.frame(temp_project_data_omitted)
}

Model Construction

Data sources - Individual Analysis

Our data is collected at the file level. We do not consider unit test files or build-generated files. The data comes from five different sources: the source control and issue tracker, the Understand tool, throws blocks, catch blocks, and try blocks.

To load the data sources, set your working directory to the root folder of the scripts package. This can be done with the setwd command, for example: setwd("~/Downloads/eh-model-defects2018_data_scripts")

# Load the five file-level data sources (paths are relative to the working
# directory) and remove the columns listed in `drop` from each of them.
# Text columns are kept as character vectors.
drop <- c("Language")

git_data <- read.csv("input/File_Level-Git-Jira.csv", stringsAsFactors = FALSE)
git_data <- git_data[, !(names(git_data) %in% drop)]

understand_data <- read.csv("input/File_Level-Understand.csv", stringsAsFactors = FALSE)
understand_data <- understand_data[, !(names(understand_data) %in% drop)]

throws_data <- read.csv("input/File_Level-Throws_Based.csv", stringsAsFactors = FALSE)
throws_data <- throws_data[, !(names(throws_data) %in% drop)]

catch_data <- read.csv("input/File_Level-Catch_Based.csv", stringsAsFactors = FALSE)
catch_data <- catch_data[, !(names(catch_data) %in% drop)]

try_data <- read.csv("input/File_Level-Try_Based.csv", stringsAsFactors = FALSE)
try_data <- try_data[, !(names(try_data) %in% drop)]

We inspected the number of files in each data source. Not all data sources have the same number of files; for example, some files contain no throws blocks or catch blocks. We also report the number of metrics in each data source.

# Print a header line, then one line per data source with its label, its
# number of rows (files) and its number of columns (metrics).
print(paste("DataSource,", "NumberFiles,", "NumberMetrics"))
## [1] "DataSource, NumberFiles, NumberMetrics"
invisible(local({
  # local() keeps the helper list and loop variable out of the workspace.
  sources <- list(
    "VCS_ITS," = git_data,
    "Understand," = understand_data,
    "Throws," = throws_data,
    "Catch," = catch_data,
    "Try," = try_data
  )
  for (label in names(sources)) {
    print(paste(label, nrow(sources[[label]]), ",", ncol(sources[[label]])))
  }
}))
## [1] "VCS_ITS, 17685 , 8"
## [1] "Understand, 10360 , 46"
## [1] "Throws, 2459 , 7"
## [1] "Catch, 1735 , 47"
## [1] "Try, 1569 , 59"

Separate data according to projects

# Distinct project identifiers found in the VCS/ITS data.
projects <- as.vector(unique(git_data[["Project"]]))

Throws blocks data

# Column names of the throws data, and the throws data split per project.
throws_names <- names(throws_data)
throws_data_list <- myProjectList(throws_data, projects)

Catch blocks data

# Column names of the catch data, and the catch data split per project.
catch_names <- names(catch_data)
catch_data_list <- myProjectList(catch_data, projects)

Try blocks data

# Column names of the try data, and the try data split per project.
try_names <- names(try_data)
try_data_list <- myProjectList(try_data, projects)

Understand data

# Column names of the Understand data, and that data split per project.
understand_names <- names(understand_data)
understand_data_list <- myProjectList(understand_data, projects)

VCS_ITS data

# Column names of the VCS/ITS data, and that data split per project.
git_names <- names(git_data)
git_data_list <- myProjectList(git_data, projects)

Data Sources - Combined Analysis

After separate analysis of each data source we now merge the data sources for model construction.

Merging Data Sources

These values are the number of files in each dataset. As we can see, the git and Understand datasets have many more files. This happens because the remaining files contain no catch blocks, throws, or possible exceptions.

# For every project, print a header line followed by one line per data source
# with the number of rows (files) and columns (metrics) of that project's
# subset. seq_along() makes the loop a no-op when `projects` is empty, where
# 1:length(projects) would run for i = 1 and i = 0.
for (i in seq_along(projects)) {
  print(paste(projects[i], ",DataSource,", "NumberFiles,", "NumberMetrics"))
  print(paste(projects[i], ",VCS_ITS,", nrow(as.data.frame(git_data_list[i])), ",", ncol(as.data.frame(git_data_list[i]))))
  print(paste(projects[i], ",Understand,", nrow(as.data.frame(understand_data_list[i])), ",", ncol(as.data.frame(understand_data_list[i]))))
  print(paste(projects[i], ",Throws,", nrow(as.data.frame(throws_data_list[i])), ",", ncol(as.data.frame(throws_data_list[i]))))
  print(paste(projects[i], ",Catch,", nrow(as.data.frame(catch_data_list[i])), ",", ncol(as.data.frame(catch_data_list[i]))))
  print(paste(projects[i], ",Try,", nrow(as.data.frame(try_data_list[i])), ",", ncol(as.data.frame(try_data_list[i]))))
}
## [1] "hadoop-2.6 ,DataSource, NumberFiles, NumberMetrics"
## [1] "hadoop-2.6 ,VCS_ITS, 6073 , 8"
## [1] "hadoop-2.6 ,Understand, 3698 , 46"
## [1] "hadoop-2.6 ,Throws, 1583 , 7"
## [1] "hadoop-2.6 ,Catch, 926 , 47"
## [1] "hadoop-2.6 ,Try, 890 , 59"
## [1] "hibernate-5.0 ,DataSource, NumberFiles, NumberMetrics"
## [1] "hibernate-5.0 ,VCS_ITS, 8006 , 8"
## [1] "hibernate-5.0 ,Understand, 3488 , 46"
## [1] "hibernate-5.0 ,Throws, 876 , 7"
## [1] "hibernate-5.0 ,Catch, 488 , 47"
## [1] "hibernate-5.0 ,Try, 449 , 59"
## [1] "umbraco-7.6 ,DataSource, NumberFiles, NumberMetrics"
## [1] "umbraco-7.6 ,VCS_ITS, 3606 , 8"
## [1] "umbraco-7.6 ,Understand, 3174 , 46"
## [1] "umbraco-7.6 ,Throws, 0 , 7"
## [1] "umbraco-7.6 ,Catch, 321 , 47"
## [1] "umbraco-7.6 ,Try, 230 , 59"

Our model construction will follow merging these datasets. We will accept the files that exist in at least one extra dataset besides git and understand. We aim to evaluate the files that are missing in the next step: missing data.

# Build one merged data frame per project, stored in `all_list` in the same
# order as `projects`. The result list is preallocated and the loop uses
# seq_along(), so an empty `projects` yields an empty list instead of
# iterating over c(1, 0).
#
# Sentinel values used below:
#   -1  valid "not applicable" category
#    0  count of throws/catch blocks for files absent from that data source
#   -9  temporary marker for try data missing for an existing catch row;
#       converted back to NA at the end of each iteration
all_list <- vector("list", length(projects))

for (i in seq_along(projects)) {

  g = as.data.frame(git_data_list[i])
  u = as.data.frame(understand_data_list[i])
  th = as.data.frame(throws_data_list[i])
  # NOTE: `c` shadows the name of base::c() here; calls such as c(...) still
  # resolve to the function because R skips non-function bindings when
  # looking up a name in call position.
  c = as.data.frame(catch_data_list[i])
  tr = as.data.frame(try_data_list[i])

  # Understand metrics for Java only: NA means real not applicable, since those don't exist in C#
  # Adding an extra category for valid non-applicable is suggested and accepted, according to Frank Harrell.
  NAs_Other <- is.na(u)
  u[NAs_Other] <- -1

  # if (g$Project == "nhibernate"){
  #   drop = c("Language")
  #   git_data = git_data[,!(names(git_data) %in% drop)]
  # }

  # Try metrics for Java only: NA means real not applicable, since those don't exist in C#
  # Adding an extra category for valid non-applicable is suggested and accepted, according to Frank Harrell.
  NAs_Other <- is.na(tr$X..Method.Declaration)
  tr$X..Method.Declaration[NAs_Other] <- -1

  # Git + Understand (inner join: only files present in both are kept)
  g_u = merge(g, u, by = c("File.Path", "Project"))

  # Git + Understand + Throws (left join: files without throws data are kept)
  g_u_t = merge(g_u, th, all.x = TRUE, by = c("File.Path", "Project"))

  # N of Throws: NA actually means 0 throws in that file.
  NAs_N_Trows <- is.na(g_u_t$X..Throws)
  g_u_t$X..Throws[NAs_N_Trows] <- 0

  # Throws APs: NA means real not applicable, since N Throws is zero.
  # Adding an extra category for valid non-applicable is suggested and accepted, according to Frank Harrell.
  NAs_Other <- is.na(g_u_t)
  g_u_t[NAs_Other] <- -1

  # Catch blocks + Try blocks
  # During data inspection we noticed that some catch blocks don't have related try blocks. (The other way around is not true).
  # That can happen because of non-identified possible exceptions for that try block.
  # However, in reality there should always be a try block for any catch block.
  c_t = merge(c, tr, all.x = TRUE, by = c("File.Path", "Project"))

  # In this case we should not eliminate rows in which catch data is available, but no try data. We don't want to miss catch data, even if try data is missing.
  # So, we flag these try blocks as real NAs that will be kept as NA for missing data analysis in the model construction.
  # NOTE(review): any genuine -9 value in the data would also be turned into NA further down -- confirm -9 cannot occur.
  NAs_Try_Blocks <- is.na(c_t)
  c_t[NAs_Try_Blocks] <- -9

  # (Git + Understand + Throws) + (Catch blocks + Try Blocks)
  g_u_t_c_t = merge(g_u_t, c_t, all.x = TRUE, by = c("File.Path", "Project"))

  # N of Catch: NA actually means 0 catches in that file.
  NAs_N_Catch <- is.na(g_u_t_c_t$X..Catch)
  g_u_t_c_t$X..Catch[NAs_N_Catch] <- 0

  # Other Catch data: NA means real not applicable, since N Catch is zero.
  # Adding an extra category for valid non-applicable is suggested and accepted, according to Frank Harrell.
  NAs_Other <- is.na(g_u_t_c_t)
  g_u_t_c_t[NAs_Other] <- -1

  # Now we convert back the missing data due to catch without try blocks.
  Missing_Try_Blocks <- g_u_t_c_t == -9
  g_u_t_c_t[Missing_Try_Blocks] <- NA

  # write.csv(g_u_t_c_t, file = "g_u_t_c_t.csv")
  # write.csv(c_t, file = "c_t.csv")

  all_list[[i]] <- g_u_t_c_t
}
# Export all merged projects to a single CSV file: first the header row only
# (a zero-row slice of the first project), then each project's rows appended
# without column names. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
write.table(all_list[[1]][0, ], "data.csv", append = FALSE, sep = ",", row.names = FALSE, col.names = TRUE)
lapply(all_list, function(x) write.table(x, "data.csv", append = TRUE, sep = ",", row.names = FALSE, col.names = FALSE))
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL

Missing data

As expected given the difference in the number of files across data sources, some data is missing, and we evaluate it here. If only a small fraction of the data is missing (i.e. < 0.03), we choose to discard the affected rows, since our number of records is large (i.e. > 100).

# Run the missing-data check on every project's merged data and collect the
# results in `all_no_missing`, in the same order as `projects`. The result
# list is preallocated and seq_along() keeps the loop from running when
# `projects` is empty.
all_no_missing <- vector("list", length(projects))
for (i in seq_along(projects)) {
  print(paste("Project:", projects[i]))
  temp_data = as.data.frame(all_list[i])

  temp_omitted = myMissingData(temp_data)
  all_no_missing[[i]] <- temp_omitted
}
## [1] "Project: hadoop-2.6"

## [1] "MissingData: Small fraction found: Single imputation or discard. Discard is chosen. Discarded rows: 36 / 3698"
## [1] "Project: hibernate-5.0"

## [1] "MissingData: Small fraction found: Single imputation or discard. Discard is chosen. Discarded rows: 38 / 3488"
## [1] "Project: umbraco-7.6"

## [1] "MissingData: Small fraction found: Single imputation or discard. Discard is chosen. Discarded rows: 91 / 3174"
# Column names of the two base data sources (VCS/ITS followed by Understand).
base_names <- c(git_names, understand_names)
# Persist the cleaned per-project data and the name vectors in an RData file.
save(all_no_missing, projects, try_names, catch_names, throws_names, base_names, file = "0-all_no_missing.RData")

# Export the cleaned data to one CSV file: header row first (zero-row slice of
# the first project), then each project's rows appended without column names.
# TRUE/FALSE are spelled out because T and F can be reassigned.
write.table(all_no_missing[[1]][0, ], "all_no_missing.csv", append = FALSE, sep = ",", row.names = FALSE, col.names = TRUE)
lapply(all_no_missing, function(x) write.table(x, "all_no_missing.csv", append = TRUE, sep = ",", row.names = FALSE, col.names = FALSE))
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL