update readme file and few minor changes
luongthanhanhduc committed Mar 6, 2017
1 parent c913275 commit d5aa9f0
Showing 63 changed files with 145 additions and 32,699 deletions.
114 changes: 114 additions & 0 deletions .Rhistory
@@ -0,0 +1,114 @@
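# compute clustering-quality measures for the IIT Bombay dataset (Aligon metric)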
source(file = "./evaluation.R")
source(file = "./utils.R")
# load supporting libraries
library(cluster)
library(factoextra)
library(RColorBrewer)
dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/bombay_aligon.csv")
avgSilhoette(distMat, dataset$label)
BetaCV(distMat, dataset$label)
DunnIndex(distMat, dataset$label)
setwd("~/github/EttuBench")
source(file = "./evaluation.R")
source(file = "./utils.R")
# load supporting libraries
library(cluster)
library(factoextra)
library(RColorBrewer)
dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/bombay_aligon.csv")
avgSilhoette(distMat, dataset$label)
BetaCV(distMat, dataset$label)
DunnIndex(distMat, dataset$label)
# load two files evaluation.R and utils.R
source(file = "./evaluation.R")
source(file = "./utils.R")
# load supporting libraries
library(cluster)
library(factoextra)
library(RColorBrewer)
dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/bombay_aligon.csv")
print(avgSilhoette(distMat, dataset$label))
print(BetaCV(distMat, dataset$label))
print(DunnIndex(distMat, dataset$label))
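# silhouette-distribution plots with and without regularization for each dataset (cf. script_figure_2.R)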
setwd("~/github/EttuBench")
# load two files evaluation.R and utils.R
source(file = "./evaluation.R")
source(file = "./utils.R")
# load supporting libraries
library(cluster)
library(factoextra)
library(RColorBrewer)
dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/bombay_aligon.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_bombay_Aligon.pdf")
distMat <- readDistMat("./data/bombay_aligon_regularization.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_bombay_Aligon_regularization.pdf")
dataset <- read.csv(file = "./data/ub_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/ub_aligon.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_ub_Aligon.pdf")
distMat <- readDistMat("./data/ub_aligon_regularization.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_ub_Aligon_regularization.pdf")
dataset <- read.csv(file = "./data/googleplus_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/googleplus_aligon.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_googleplus_Aligon.pdf")
distMat <- readDistMat("./data/googleplus_aligon_regularization.csv")
silhouettePlot(distMat, dataset$label, "./figure/sil_googleplus_Aligon_regularization.pdf")
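# bar charts comparing the quality measures across metrics, datasets, and regularization settings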
setwd("~/github/EttuBench")
library(ggplot2)
comparison <- read.csv(file = "./data/result.csv", header = TRUE)
comparison$dataset <- factor(comparison$dataset,
levels = c("IIT Bombay Dataset",
"UB Exam Dataset",
"PocketData-Google+"))
ggplot(data = comparison, aes(x = metric, y = silhouette, fill = regularization)) +
geom_bar(position="dodge", stat="identity") + facet_grid(~ dataset) +
ylab("Average Silhouette Coefficient") + xlab("Metric") +
theme_bw(base_size = 18) + theme(legend.position = "top") + scale_fill_grey() +
ggsave(filename = "./figure/compare_silhouette.pdf")
ggplot(data = comparison, aes(x = metric, y = beta_cv, fill = regularization)) +
geom_bar(position="dodge", stat="identity") + facet_grid(~ dataset) +
ylab("Average Silhouette Coefficient") + xlab("Metric") +
theme_bw(base_size = 18) + theme(legend.position = "top") + scale_fill_grey() +
ggsave(filename = "./figure/compare_betacv.pdf")
ggplot(data = comparison, aes(x = metric, y = dunn, fill = regularization)) +
geom_bar(position="dodge", stat="identity") + facet_grid(~ dataset) +
ylab("Average Silhouette Coefficient") + xlab("Metric") +
theme_bw(base_size = 18) + theme(legend.position = "top") + scale_fill_grey() +
ggsave(filename = "./figure/compare_dunn.pdf")
setwd("~/github/EttuBench")
library(ggplot2)
comparison <- read.csv(file = "./data/modules.csv")
comparison$Regularization <- factor(comparison$Regularization,
levels = c("No Regularization",
"Naming",
"Expression Standardization",
"FROM-nested Subquery",
"UNION Pull-out"))
comparison$Dataset <- factor(comparison$Dataset,
levels = c("IIT Bombay Dataset", "UB Exam Dataset",
"PhoneLab-Google+"))
# individual module analysis
ggplot(data = comparison, aes(x = Metric, y = Silhouette, fill=Regularization)) +
geom_bar(position="dodge", stat="identity") + facet_grid(~ Dataset) +
ylab("Average Silhouette Coefficient") + xlab("Metric") +
theme_bw(base_size = 14) + theme(legend.position = "top", legend.title = element_blank()) +
scale_fill_brewer(palette = "Dark2") +
ggsave(file = "./figure/module.pdf")
setwd("~/github/EttuBench")
source(file = "./evaluation.R")
source(file = "./utils.R")
library(cluster)
library(factoextra)
library(RColorBrewer)
dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
distMat <- readDistMat("./data/bombay_aligon_regularization.csv")
print(avgSilhoette(distMat, dataset$label))
print(BetaCV(distMat, dataset$label))
print(DunnIndex(distMat, dataset$label))
distMat <- readDistMat("./data/bombay_aouiche_regularization.csv")
print(avgSilhoette(distMat, dataset$label))
print(BetaCV(distMat, dataset$label))
print(DunnIndex(distMat, dataset$label))
20 changes: 19 additions & 1 deletion README.md
@@ -4,6 +4,7 @@ This repository contains all the code and data we use to produce experimental results
## Organization of the repository
- data folder: Contains all data files that are used in the experiments
- figure folder: This folder is used to store all the output figures from experiments
- SQLFeatureExtraction: This folder stores the Java code that we use to extract features from SQL queries and produce the pairwise distance matrices
- evaluation.R: contains implementations of 3 clustering validation measures (average silhouette coefficient, Dunn Index, and BetaCV), plus a function that plots the distribution of silhouette coefficients; see the short usage sketch after this list.
- utils.R: other supporting functions, such as reading a distance matrix.
- script_figure_2.R: produces Figure 2 as shown in the paper.
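
A minimal usage sketch of these R helpers, mirroring the calls recorded in the committed .Rhistory (the file paths assume the data folder layout described above):

    # load the helper functions
    source(file = "./evaluation.R")
    source(file = "./utils.R")
    # supporting libraries used by the helpers
    library(cluster)
    library(factoextra)
    library(RColorBrewer)
    # ground-truth labels and a precomputed pairwise distance matrix
    dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
    distMat <- readDistMat("./data/bombay_aligon.csv")
    # the three clustering validation measures
    print(avgSilhoette(distMat, dataset$label))
    print(BetaCV(distMat, dataset$label))
    print(DunnIndex(distMat, dataset$label))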
@@ -26,6 +27,23 @@ In order to evaluate different modules in the regularization step, we consider 4 different modules.
**modules.csv** contains all the numbers that are needed to produce Figure 4 in the paper.

## Reproducing experimental results
## Extracting features from SQL queries and computing pairwise distance matrix
Our Java code for extracting features from SQL queries and computing the pairwise distance matrix among queries is given in the folder **SQLFeatureExtraction**. The Java source code is provided in the folder **SQLFeatureExtraction/src**.

**SQLComparison.jar** is an executable file that users can use to reproduce all pairwise distance matrices with all possible regularization options. This jar file can be run from the command line as follows:

java -jar SQLComparison.jar [-options]

where the possible options are as follows:
- dataset (*ub*, *bombay*, or *googleplus*) to be applied, using the option *-input*. If no *-input* option is given, all three datasets (*ub*, *bombay*, and *googleplus*) will be used by default to reproduce all possible pairwise distance matrices with all regularization options. For example, the *ub* dataset can be specified as follows:
java -jar SQLComparison.jar -input ub
- similarity metric (*aligon*, *makiyama*, or *aouiche*) to be applied, using the option *-metric*. If no *-metric* option is given, all three metrics (*aligon*, *aouiche*, and *makiyama*) will be used by default. For example, the *aligon* metric can be specified using the following command:
java -jar SQLComparison.jar -metric aligon
- query regularization modules (ID=1: Naming; ID=2: Expression Standardization; ID=3: Flattening From-Nested Subquery; ID=4: Union Pull-out) to be applied, using the option *-modules*. Multiple modules can be specified by joining their IDs with the "&" delimiter (note that most shells require the ampersand to be quoted, e.g. *-modules "1&2"*). If no *-modules* option is given, all modules will be used by default. For example, to apply the Naming (ID=1) and Expression Standardization (ID=2) modules, the following command can be used:
java -jar SQLComparison.jar -modules 1&2

The output distance matrices can be found in the folder *SQLFeatureExtraction/data/*.
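
For instance, assuming the three options can be combined in a single invocation (each option is documented individually above, so treat this as a sketch rather than a guaranteed interface), a run restricted to the *bombay* dataset, the *aligon* metric, and modules 1 and 2 would look like:

    java -jar SQLComparison.jar -input bombay -metric aligon -modules "1&2"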

### Reproduce figure 2
In order to reproduce the distribution of silhouette coefficients when the Aligon similarity is used without regularization and when regularization is applied, as shown in Figure 2 of the paper, users can open the file *script_figure_2.R*. Running this script will produce the silhouette plots in the folder *figure*.
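
Judging from the .Rhistory committed here, the script boils down to calls like the following (the UB and Google+ datasets are handled analogously with their respective files):

    source(file = "./evaluation.R")
    source(file = "./utils.R")
    dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
    # Aligon similarity without regularization
    distMat <- readDistMat("./data/bombay_aligon.csv")
    silhouettePlot(distMat, dataset$label, "./figure/sil_bombay_Aligon.pdf")
    # Aligon similarity with regularization applied
    distMat <- readDistMat("./data/bombay_aligon_regularization.csv")
    silhouettePlot(distMat, dataset$label, "./figure/sil_bombay_Aligon_regularization.pdf")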

@@ -61,4 +79,4 @@ In order to reproduce the plots for comparing the effect of different modules in the regularization step (Figure 4 in the paper), users can open the file *script_figure_4.R*.

This script requires an input file *modules.csv* in the *data* folder. We have already filled in all the numbers in this file. For reproducibility, these numbers can also be recomputed manually by calculating the average silhouette coefficient, BetaCV, and Dunn Index for each regularization module.
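
For example, one such set of numbers can be recomputed along the following lines (a sketch based on the .Rhistory in this commit; the distance matrix shown applies all regularization modules, and per-module matrices would come from running SQLComparison.jar with the corresponding *-modules* option):

    source(file = "./evaluation.R")
    source(file = "./utils.R")
    dataset <- read.csv(file = "./data/bombay_queries.csv", header = TRUE, sep = "\t")
    distMat <- readDistMat("./data/bombay_aligon_regularization.csv")
    print(avgSilhoette(distMat, dataset$label))  # average silhouette coefficient
    print(BetaCV(distMat, dataset$label))        # BetaCV
    print(DunnIndex(distMat, dataset$label))     # Dunn Index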

When the input file is ready, running *script_figure_4.R* will produce the corresponding figure in the folder *figure*.