update readme file and few minor changes
luongthanhanhduc committed Mar 6, 2017
1 parent c913275 commit d5aa9f0
Showing 63 changed files with 145 additions and 32,699 deletions.
114 changes: 114 additions & 0 deletions .Rhistory
@@ -0,0 +1,114 @@
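# compute clustering-quality measures for the IIT Bombay dataset (Aligon metric)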
source(file = "./evaluation.R")
source(file = "./utils.R")
# load supporting libraries
library(cluster)
library(factoextra)
library(RColorBrewer)
dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/bombay_aligon.csv")
avgSilhoette(distMat, dataset$label)
BetaCV(distMat, dataset$label)
DunnIndex(distMat, dataset$label)
setwd("~/github/EttuBench")
source(file = "./evaluation.R")
source(file = "./utils.R")
# load supporting libraries
library(cluster)
library(factoextra)
library(RColorBrewer)
dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/bombay_aligon.csv")
avgSilhoette(distMat, dataset$label)
BetaCV(distMat, dataset$label)
DunnIndex(distMat, dataset$label)
# load two files evaluation.R and utils.R
source(file = "./evaluation.R")
source(file = "./utils.R")
# load supporting libraries
library(cluster)
library(factoextra)
library(RColorBrewer)
dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/bombay_aligon.csv")
print(avgSilhoette(distMat, dataset$label))
print(BetaCV(distMat, dataset$label))
print(DunnIndex(distMat, dataset$label))
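# silhouette-distribution plots with and without regularization for each dataset (cf. script_figure_2.R)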
setwd("~/github/EttuBench")
# load two files evaluation.R and utils.R
source(file = "./evaluation.R")
source(file = "./utils.R")
# load supporting libraries
library(cluster)
library(factoextra)
library(RColorBrewer)
dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/bombay_aligon.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_bombay_Aligon.pdf")
distMat <- readDistMat("./data/bombay_aligon_regularization.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_bombay_Aligon_regularization.pdf")
dataset <- read.csv(file = "./data/ub_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/ub_aligon.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_ub_Aligon.pdf")
distMat <- readDistMat("./data/ub_aligon_regularization.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_ub_Aligon_regularization.pdf")
dataset <- read.csv(file = "./data/googleplus_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/googleplus_aligon.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_googleplus_Aligon.pdf")
distMat <- readDistMat("./data/googleplus_aligon_regularization.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_googleplus_Aligon_regularization.pdf")
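# bar charts comparing the quality measures across metrics, datasets, and regularization settings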
setwd("~/github/EttuBench")
library(ggplot2)
comparison <- read.csv(file = "./data/result.csv", header = TRUE)
comparison$dataset <- factor(comparison$dataset,
levels = c("IIT Bombay Dataset",
"UB Exam Dataset",
"PocketData-Google+"))
ggplot(data = comparison, aes(x = metric, y = silhouette, fill = regularization)) +
geom_bar(position="dodge", stat="identity") + facet_grid(~ dataset) +
ylab("Average Silhouette Coefficient") + xlab("Metric") +
theme_bw(base_size = 18) + theme(legend.position = "top") + scale_fill_grey() +
ggsave(filename = "./figure/compare_silhouette.pdf")
ggplot(data = comparison, aes(x = metric, y = beta_cv, fill = regularization)) +
geom_bar(position="dodge", stat="identity") + facet_grid(~ dataset) +
ylab("Average Silhouette Coefficient") + xlab("Metric") +
theme_bw(base_size = 18) + theme(legend.position = "top") + scale_fill_grey() +
ggsave(filename = "./figure/compare_betacv.pdf")
ggplot(data = comparison, aes(x = metric, y = dunn, fill = regularization)) +
geom_bar(position="dodge", stat="identity") + facet_grid(~ dataset) +
ylab("Average Silhouette Coefficient") + xlab("Metric") +
theme_bw(base_size = 18) + theme(legend.position = "top") + scale_fill_grey() +
ggsave(filename = "./figure/compare_dunn.pdf")
setwd("~/github/EttuBench")
library(ggplot2)
comparison <- read.csv(file = "./data/modules.csv")
comparison$Regularization <- factor(comparison$Regularization,
levels = c("No Regularization",
"Naming",
"Expression Standardization",
"FROM-nested Subquery",
"UNION Pull-out"))
comparison$Dataset <- factor(comparison$Dataset,
levels = c("IIT Bombay Dataset", "UB Exam Dataset",
"PhoneLab-Google+"))
# individual module analysis
ggplot(data = comparison, aes(x = Metric, y = Silhouette, fill=Regularization)) +
geom_bar(position="dodge", stat="identity") + facet_grid(~ Dataset) +
ylab("Average Silhouette Coefficient") + xlab("Metric") +
theme_bw(base_size = 14) + theme(legend.position = "top", legend.title = element_blank()) +
scale_fill_brewer(palette = "Dark2") +
ggsave(file = "./figure/module.pdf")
setwd("~/github/EttuBench")
source(file = "./evaluation.R")
source(file = "./utils.R")
library(cluster)
library(factoextra)
library(RColorBrewer)
dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/bombay_aligon_regularization.csv")
print(avgSilhoette(distMat, dataset$label))
print(BetaCV(distMat, dataset$label))
print(DunnIndex(distMat, dataset$label))
distMat <- readDistMat("./data/bombay_aouiche_regularization.csv")
print(avgSilhoette(distMat, dataset$label))
print(BetaCV(distMat, dataset$label))
print(DunnIndex(distMat, dataset$label))
20 changes: 19 additions & 1 deletion README.md
@@ -4,6 +4,7 @@ This repository contains all the code and data we use to produce experimental results
## Organization of the repository
- data folder: Contains all data files that are used in the experiments
- figure folder: This folder is used to store all the output figures from experiments
- SQLFeatureExtraction: This folder stores the Java code that we use to extract features from SQL queries and produce the pairwise distance matrices
- evaluation.R: contains implementations of 3 clustering validation measures (average silhouette coefficient, Dunn Index, and BetaCV), plus a function that plots the distribution of silhouette coefficients; see the short usage sketch after this list.
- utils.R: other supporting functions, such as reading a distance matrix.
- script_figure_2.R: produces Figure 2 as shown in the paper.
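
A minimal usage sketch of these R helpers, mirroring the calls recorded in the committed .Rhistory (the file paths assume the data folder layout described above):

    # load the helper functions
    source(file = "./evaluation.R")
    source(file = "./utils.R")
    # supporting libraries used by the helpers
    library(cluster)
    library(factoextra)
    library(RColorBrewer)
    # ground-truth labels and a precomputed pairwise distance matrix
    dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
    distMat <- readDistMat("./data/bombay_aligon.csv")
    # the three clustering validation measures
    print(avgSilhoette(distMat, dataset$label))
    print(BetaCV(distMat, dataset$label))
    print(DunnIndex(distMat, dataset$label))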
@@ -26,6 +27,23 @@ In order to evaluate different modules in the regularization step, we consider 4 different modules.
**modules.csv** contains all the numbers that are needed to produce Figure 4 in the paper.

## Reproducing experimental results
## Extracting features from SQL queries and computing pairwise distance matrix
Our Java code for extracting features from SQL queries and computing the pairwise distance matrix among queries is given in the folder **SQLFeatureExtraction**. The Java source code is provided in the folder **SQLFeatureExtraction/src**.

**SQLComparison.jar** is an executable file that users can use to reproduce all pairwise distance matrices with all possible regularization options. This jar file can be run from the command line as follows:

java -jar SQLComparison.jar [-options]

where the possible options are as follows:
- dataset (*ub*, *bombay*, or *googleplus*) to be applied, using the option *-input*. If no *-input* option is given, all three datasets (*ub*, *bombay*, and *googleplus*) will be used by default to reproduce all possible pairwise distance matrices with all regularization options. For example, the *ub* dataset can be specified as follows:
java -jar SQLComparison.jar -input ub
- similarity metric (*aligon*, *makiyama*, or *aouiche*) to be applied, using the option *-metric*. If no *-metric* option is given, all three metrics (*aligon*, *aouiche*, and *makiyama*) will be used by default. For example, the *aligon* metric can be specified using the following command:
java -jar SQLComparison.jar -metric aligon
- query regularization modules (ID=1: Naming; ID=2: Expression Standardization; ID=3: Flattening From-Nested Subquery; ID=4: Union Pull-out) to be applied, using the option *-modules*. Multiple modules can be specified by joining their IDs with the "&" delimiter (note that most shells require the ampersand to be quoted, e.g. *-modules "1&2"*). If no *-modules* option is given, all modules will be used by default. For example, to apply the Naming (ID=1) and Expression Standardization (ID=2) modules, the following command can be used:
java -jar SQLComparison.jar -modules 1&2

The output distance matrices can be found in the folder *SQLFeatureExtraction/data/*.
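
For instance, assuming the three options can be combined in a single invocation (each option is documented individually above, so treat this as a sketch rather than a guaranteed interface), a run restricted to the *bombay* dataset, the *aligon* metric, and modules 1 and 2 would look like:

    java -jar SQLComparison.jar -input bombay -metric aligon -modules "1&2"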

### Reproduce figure 2
In order to reproduce the distribution of silhouette coefficients when the Aligon similarity is used without regularization and when regularization is applied, as shown in Figure 2 of the paper, users can open the file *script_figure_2.R*. Running this script will produce the silhouette plots in the folder *figure*.
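
Judging from the .Rhistory committed here, the script boils down to calls like the following (the UB and Google+ datasets are handled analogously with their respective files):

    source(file = "./evaluation.R")
    source(file = "./utils.R")
    dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
    # Aligon similarity without regularization
    distMat <- readDistMat("./data/bombay_aligon.csv")
    silhouettePlot(distMat, dataset$label, "./figure/sil_bombay_Aligon.pdf")
    # Aligon similarity with regularization applied
    distMat <- readDistMat("./data/bombay_aligon_regularization.csv")
    silhouettePlot(distMat, dataset$label, "./figure/sil_bombay_Aligon_regularization.pdf")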

@@ -61,4 +79,4 @@ In order to reproduce the plots for comparing the effect of different modules in the regularization step (Figure 4 in the paper), users can open the file *script_figure_4.R*.

This script requires an input file *modules.csv* in the *data* folder. We have already filled in all the numbers in this file. For reproducibility, these numbers can also be recomputed manually by calculating the average silhouette coefficient, BetaCV, and Dunn Index for each regularization module.
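
For example, one such set of numbers can be recomputed along the following lines (a sketch based on the .Rhistory in this commit; the distance matrix shown applies all regularization modules, and per-module matrices would come from running SQLComparison.jar with the corresponding *-modules* option):

    source(file = "./evaluation.R")
    source(file = "./utils.R")
    dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
    distMat <- readDistMat("./data/bombay_aligon_regularization.csv")
    print(avgSilhoette(distMat, dataset$label))  # average silhouette coefficient
    print(BetaCV(distMat, dataset$label))        # BetaCV
    print(DunnIndex(distMat, dataset$label))     # Dunn Index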

When the input file is ready, running *script_figure_4.R* will produce the corresponding figure in the folder *figure*.