New Scala version. Added CEP (Cardinality Edge Pruning) and WEP (Weighted Edge Pruning), fixed some bugs.
Gaglia88 committed Apr 12, 2019
1 parent 3c003ef commit 5fbf6e8
Showing 125 changed files with 4,355 additions and 0 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file added scala/README.md
37 changes: 37 additions & 0 deletions scala/sparker/build.sbt
@@ -0,0 +1,37 @@
name := "spark_er"
version := "1.0"
scalaVersion := "2.11.8"
val sparkVersion = "2.1.0"

unmanagedBase := baseDirectory.value / "custom_lib"
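// jars dropped into custom_lib/ (such as the bundled serializedLoader.jar) are picked up as
// unmanaged dependencies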


libraryDependencies += "org.apache.spark" % "spark-core_2.11" % "2.1.0"

// https://mvnrepository.com/artifact/org.apache.spark/spark-sql_2.11
libraryDependencies += "org.apache.spark" % "spark-sql_2.11" % "2.1.0"

// https://mvnrepository.com/artifact/org.apache.spark/spark-graphx_2.11
libraryDependencies += "org.apache.spark" % "spark-graphx_2.11" % "2.1.0"

libraryDependencies += "org.apache.spark" % "spark-mllib_2.11" % "2.1.0"

libraryDependencies += "org.apache.spark" % "spark-hive_2.11" % "2.1.0"

// https://mvnrepository.com/artifact/com.twitter/algebird-core_2.11
//libraryDependencies += "com.twitter" % "algebird-core_2.11" % "0.12.3"

// https://mvnrepository.com/artifact/org.apache.commons/commons-math3
libraryDependencies += "org.apache.commons" % "commons-math3" % "3.6.1"

// https://mvnrepository.com/artifact/commons-codec/commons-codec
libraryDependencies += "commons-codec" % "commons-codec" % "1.11"

// https://mvnrepository.com/artifact/org.jgrapht/jgrapht-core
libraryDependencies += "org.jgrapht" % "jgrapht-core" % "1.0.1"

// https://mvnrepository.com/artifact/org.json/json
libraryDependencies += "org.json" % "json" % "20170516"


//mainClass in Compile := Some("Experiments.Main")
Binary file added scala/sparker/custom_lib/serializedLoader.jar
157 changes: 157 additions & 0 deletions scala/sparker/src/main/scala-2.11/Experiments/Main.scala
@@ -0,0 +1,157 @@
package Experiments

import SparkER.BlockBuildingMethods.{BlockingUtils, LSH, TokenBlocking}
import SparkER.BlockBuildingMethods.LSH.Settings
import SparkER.BlockRefinementMethods.PruningMethods._
import SparkER.BlockRefinementMethods.{BlockFiltering, BlockPurging}
import SparkER.Utilities.Converters
import SparkER.Wrappers.{CSVWrapper, JSONWrapper}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.{SparkConf, SparkContext}


/**
 * Test WNP meta-blocking
 *
 * @author Luca Gagliardelli
 * @since 18/12/2018
 **/
object Main {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Main")
      .setMaster("local[*]")
      .set("spark.default.parallelism", "4")

    val sc = new SparkContext(conf)

    /**
     * Loads two datasets
     **/
    val path = "C:\\Users\\gagli\\Desktop\\datasets\\clean\\movies\\"

    val dataset1 = JSONWrapper.loadProfiles(path + "dataset1.json", realIDField = "realProfileID", sourceId = 1)
    val maxIdDataset1 = dataset1.map(_.id).max()

    val dataset2 = JSONWrapper.loadProfiles(path + "dataset2.json", realIDField = "realProfileID", sourceId = 2, startIDFrom = maxIdDataset1 + 1)

    val maxProfileID = dataset2.map(_.id).max()

    val separators = Array(maxIdDataset1)
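    // dataset2's internal IDs start right after dataset1's last one, so all profiles share a
    // single contiguous ID space; `separators` records that boundary so later stages can tell
    // the two sources apart and generate only cross-source comparisons.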

    val profiles = dataset1.union(dataset2)


    //Loads the groundtruth
    val groundtruth = JSONWrapper.loadGroundtruth(path + "groundtruth.json", firstDatasetAttribute = "id1", secondDatasetAttribute = "id2")

    //Converts the ids in the groundtruth to the autogenerated ones
    val realIdIds1 = sc.broadcast(dataset1.map { p =>
      (p.originalID, p.id)
    }.collectAsMap())

    val realIdIds2 = sc.broadcast(dataset2.map { p =>
      (p.originalID, p.id)
    }.collectAsMap())
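    // The originalID -> internal ID maps are broadcast once to the workers instead of being
    // captured and re-shipped with every task that uses them.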

    val newGT: Set[(Long, Long)] = groundtruth.map { g =>
      val first = realIdIds1.value.get(g.firstEntityID)
      val second = realIdIds2.value.get(g.secondEntityID)
      if (first.isDefined && second.isDefined) {
        val f = first.get
        val s = second.get
        if (f < s) {
          (f, s)
        }
        else {
          (s, f)
        }
      }
      else {
        (-1L, -1L)
      }
    }.filter(_._1 >= 0).collect().toSet


    val newGTSize = newGT.size

    val gt = sc.broadcast(newGT)

    //Token blocking
    val blocks = TokenBlocking.createBlocks(profiles, separators)
    val useEntropy = false
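    // Token blocking: every distinct token found in the attribute values acts as a blocking key,
    // so two profiles that share at least one token co-occur in at least one block.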

    //Loose meta-blocking
    /*val clusters = LSHMio.clusterSimilarAttributes(
      profiles = profiles,
      numHashes = 128,
      targetThreshold = 0.3,
      maxFactor = 1.0,
      numBands = -1,
      keysToExclude = Nil,
      computeEntropy = useEntropy,
      separator = Settings.SOURCE_NAME_SEPARATOR
    )
    clusters.foreach(println)
    val useEntropy = true
    val blocks = TokenBlocking.createBlocksCluster(profiles, separators, clusters)
    */

    //Purging
    val blocksPurged = BlockPurging.blockPurging(blocks, 1.015)
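    // Block purging discards the largest, least discriminative blocks; 1.015 is the smoothing
    // factor that tunes how aggressive the cut is.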

    //Filtering
    val profileBlocks = Converters.blocksToProfileBlocks(blocksPurged)
    val profileBlocksFiltered = BlockFiltering.blockFiltering(profileBlocks, 0.8)
    val blocksAfterFiltering = Converters.profilesBlockToBlocks(profileBlocksFiltered, separators)
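    // Block filtering keeps, for each profile, only the 80% smallest blocks it appears in
    // (small blocks are the most discriminative), then converts back to the block-centric view.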


    //Metablocking

    val blockIndexMap = blocksAfterFiltering.map(b => (b.blockID, b.profiles)).collectAsMap()
    val blockIndex = sc.broadcast(blockIndexMap)
    val profileBlocksSizeIndex: Broadcast[scala.collection.Map[Long, Int]] = sc.broadcast(profileBlocksFiltered.map(pb => (pb.profileID, pb.blocks.size)).collectAsMap())
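    // Maps each profile to the number of blocks it appears in; the weighting schemes use this
    // when scoring candidate pairs.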

    val blocksEntropiesMap: Broadcast[scala.collection.Map[Long, Double]] = {
      if (useEntropy) {
        val blocksEntropies = blocks.map(b => (b.blockID, b.entropy)).collectAsMap()
        sc.broadcast(blocksEntropies)
      }
      else {
        null
      }
    }

    val edgesAndCount = WNP.WNP(
      profileBlocksFiltered,
      blockIndex,
      maxProfileID.toInt,
      separators,
      gt,
      PruningUtils.ThresholdTypes.AVG,
      PruningUtils.WeightTypes.CBS,
      profileBlocksSizeIndex,
      useEntropy,
      blocksEntropiesMap,
      2.0,
      PruningUtils.ComparisonTypes.OR
    )
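    // WNP (Weighted Node Pruning): each profile keeps only the neighbours whose edge weight
    // beats its local AVG threshold; with the CBS scheme an edge is weighted by the number of
    // blocks the two profiles share.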

    val numCandidates = edgesAndCount.map(_._1).sum()
    val perfectMatch = edgesAndCount.map(_._2).sum()
    val candidatePairs = edgesAndCount.flatMap(_._3)

    val pc = perfectMatch.toFloat / newGTSize.toFloat
    val pq = perfectMatch.toFloat / numCandidates.toFloat
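    // PC (pairs completeness) is the recall over the ground truth; PQ (pairs quality) is the
    // precision of the retained candidate set.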

println("PC = " + pc)
println("PQ = " + pq)
println("Retained edges " + numCandidates)

}
}
@@ -0,0 +1,42 @@
package SparkER.BlockBuildingMethods

import SparkER.BlockBuildingMethods.LSH.Settings
//import BlockBuildingMethods.LSHTwitter.Settings
import SparkER.DataStructures.{BlockAbstract, KeyValue, Profile}
import org.apache.spark.rdd.RDD

/**
 * Common methods for the different blocking techniques
 * @author Luca Gagliardelli
 * @since 2016/12/07
 */
object BlockingUtils {
  /** Defines the pattern used for tokenization */
  object TokenizerPattern {
    /** Splits tokens on underscores, whitespace, and punctuation */
    val DEFAULT_SPLITTING = "[\\W_]"
  }
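  // e.g. "New_York-City".split(TokenizerPattern.DEFAULT_SPLITTING)
  // yields Array("New", "York", "City")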

  /**
   * Given a tuple (entity ID, [list of entity tokens])
   * produces a list of tuples (token, entityID).
   *
   * @param profileKs couple (entity ID, [list of entity keys])
   **/
  def associateKeysToProfileID(profileKs: (Long, Iterable[String])): Iterable[(String, Long)] = {
    val profileId = profileKs._1
    val keys = profileKs._2
    keys.map(key => (key, profileId))
  }
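  // Example: associateKeysToProfileID((42L, List("john", "smith")))
  // returns List(("john", 42L), ("smith", 42L)).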

  /**
   * Used in the method that calculates the entropy of each block.
   * @param profileKs couple (entity ID, [list of entity tokens])
   * @return a list of (token, (profileID, [hash codes of the token's characters]))
   **/
  def associateKeysToProfileIdEntropy(profileKs: (Long, Iterable[String])): Iterable[(String, (Long, Iterable[Int]))] = {
    val profileId = profileKs._1
    val tokens = profileKs._2
    // note: each token is paired with the hash codes of its own characters, not with the
    // hashes of all the profile's tokens
    tokens.map(token => (token, (profileId, token.map(_.hashCode))))
  }
}