Improve Cluster Average Similarity Calculation

& add JDoc
jplag · Jun 25, 2022 · 76ecc87 · 76ecc87
1 parent ae68a2f
commit 76ecc87
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 19 deletions.
diff --git a/jplag/src/main/java/de/jplag/clustering/Cluster.java b/jplag/src/main/java/de/jplag/clustering/Cluster.java
@@ -7,6 +7,7 @@
 
 /**
  * Cluster part of a {@link ClusteringResult}.
+ *
  * @param <T> type of the cluster's members
  */
 public class Cluster<T> {
@@ -16,6 +17,11 @@ public class Cluster<T> {
     private ClusteringResult<T> clusteringResult = null;
     private final float averageSimilarity;
 
+    /**
+     * @param members           Members of the cluster.
+     * @param communityStrength A metric of how strongly the members of this cluster are connected.
+     * @param averageSimilarity The average similarity between all tuple comparisons of the members in this cluster.
+     */
     public Cluster(Collection<T> members, float communityStrength, float averageSimilarity) {
         this.members = new ArrayList<>(members);
         this.communityStrength = communityStrength;
@@ -33,6 +39,7 @@ public float getAverageSimilarity() {
 
     /**
      * See {@link ClusteringResult#getCommunityStrength}
+     *
      * @return community strength of the cluster
      */
     public float getCommunityStrength() {
@@ -42,6 +49,7 @@ public float getCommunityStrength() {
     /**
      * Sets this cluster's clustering result. Should only be called by classes extending {@link ClusteringResult} on their
      * own clusters.
+     *
      * @param clusteringResult the clustering result
      */
     public void setClusteringResult(ClusteringResult<T> clusteringResult) {
@@ -62,6 +70,7 @@ public float getCommunityStrengthPerConnection() {
      * Computes a normalized community strength per connection. Can be used as measure for strength of evidence in
      * comparison to other clusters in the same clustering. Guaranteed to be smaller than 1. Negative values indicate
      * non-clusters. This method may only be called on clusters that are part of a ClusteringResult.
+     *
      * @return normalized community strength per connection
      */
     public float getNormalizedCommunityStrengthPerConnection() {
@@ -88,6 +97,7 @@ public double getWorth(BiFunction<T, T, Float> similarity) {
 
     /**
      * Computes the average similarity inside the cluster.
+     *
      * @param similarity function that supplies the similarity of two cluster members.
      * @return average similarity
      */
@@ -112,6 +122,7 @@ private int connections() {
 
     /**
      * Whether this cluster is very uninformative or wrong and should be pruned as last step of the clustering process.
+     *
      * @return {@code true} if this cluster is bad and should be pruned
      */
     public boolean isBadCluster() {

diff --git a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
@@ -9,12 +9,13 @@
 
 /**
  * Set of clusters dividing a set of entities.
+ *
  * @param <T> type of the clustered entities (e.g. Submission)
  */
 public class ClusteringResult<T> {
 
     private final List<Cluster<T>> clusters;
-    private float communityStrength;
+    private final float communityStrength;
 
     public ClusteringResult(Collection<Cluster<T>> clusters, float communityStrength) {
         this.clusters = List.copyOf(clusters);
@@ -34,6 +35,7 @@ public Collection<Cluster<T>> getClusters() {
      * changed, a higher community strength denotes a better clustering. See: Finding and evaluating community structure in
      * networks, M. E. J. Newman and M. Girvan, Phys. Rev. E 69, 026113 – Published 26 February 2004, Doi:
      * 10.1103/PhysRevE.69.026113 It's called modularity in that paper.
+     *
      * @return community strength
      */
     public float getCommunityStrength() {
@@ -42,6 +44,7 @@ public float getCommunityStrength() {
 
     /**
      * How much this clustering result is worth during optimization.
+     *
      * @param similarity presumably a function that supplies the similarity of two clustered entities (TODO DF: confirm)
      * @return worth
      */
@@ -96,27 +99,24 @@ public static ClusteringResult<Integer> fromIntegerCollections(List<Collection<I
 
     private static float calculateAverageSimilarityFor(Collection<Integer> cluster, RealMatrix similarityMatrix) {
         var sumOfSimilarities = 0f;
-        var submissionIndicesWithoutIndicesAlreadyProcessed = new ArrayList<>(List.copyOf(cluster));
-        for (Integer indexOfSubmission1 : cluster) {
-            for (Integer indexOfSubmission2 : submissionIndicesWithoutIndicesAlreadyProcessed) {
-                if (!Objects.equals(indexOfSubmission1, indexOfSubmission2)) {
-                    sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2);
-                }
+        List<Integer> indices = List.copyOf(cluster);
+        for (int i = 1; i < cluster.size(); i++) {
+            int indexOfSubmission1 = indices.get(i);
+            for (int j = 0; j < i; j++) { // as the similarity matrix is symmetrical we need only iterate over one half of it
+                int indexOfSubmission2 = indices.get(j);
+                sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2);
             }
-            submissionIndicesWithoutIndicesAlreadyProcessed.remove(indexOfSubmission1); // remove indices we already processed from second list to
-                                                                                        // avoid adding them anew unnecessary
         }
         int nMinusOne = cluster.size() - 1;
-        float numberOfComparisons = (nMinusOne * (nMinusOne + 1))
-                / 2f; /*
-                       * Use Gauss sum to calculate number of comparisons in cluster: Given cluster of size n we need Gauss sum of n-1
-                       * comparisons: compare first element of cluster to all other except itself: n-1 comparisons. compare second element two
-                       * all other except itself and first element (as these two were already compared when we processed the first element),
-                       * n-2 comparisons. compare third element to all other but itself and all previously compared: n-3 comparisons and so
-                       * on. when we reach the second to last element we have n-(n-1)=1 comparisons left. when we reach the last element it
-                       * has already been compared to all other. adding up all comparisons we get: (n-1) + (n-2) + (n-3) + ... + (n-(n-1)) =
-                       * Gauss sum of (n-1)
-                       */
+        float numberOfComparisons = (nMinusOne * (nMinusOne + 1)) / 2f; /*
+         * Use Gauss sum to calculate number of comparisons in cluster: Given cluster of size n we need Gauss sum of n-1
+         * comparisons: compare first element of cluster to all other except itself: n-1 comparisons. compare second element to
+         * all other except itself and first element (as these two were already compared when we processed the first element),
+         * n-2 comparisons. compare third element to all other but itself and all previously compared: n-3 comparisons and so
+         * on. when we reach the second to last element we have n-(n-1)=1 comparisons left. when we reach the last element it
+         * has already been compared to all other. adding up all comparisons we get: (n-1) + (n-2) + (n-3) + ... + (n-(n-1)) =
+         * Gauss sum of (n-1). Note: for a cluster with fewer than two members this is 0, so the division below yields NaN.
+         */
         return sumOfSimilarities / numberOfComparisons;
     }