From 8503fbf8089d4748871dcf3b9340f91a21677e40 Mon Sep 17 00:00:00 2001
From: Nesta Bentum <nesta.bentum@gmail.com>
Date: Fri, 10 Jun 2022 11:42:51 +0200
Subject: [PATCH 1/3] Incorporate Average Similarity into Cluster

---
 .../java/de/jplag/clustering/Cluster.java     |  20 +-
 .../jplag/clustering/ClusteringAdapter.java   |   6 +-
 .../de/jplag/clustering/ClusteringResult.java |  37 ++-
 .../reporting/reportobject/model/Cluster.java |  30 +--
 .../java/de/jplag/clustering/ClusterTest.java |  17 +-
 .../clustering/ClusteringRealDataTest.java    | 215 ------------------
 .../clustering/ClusteringResultTest.java      |  63 ++++-
 7 files changed, 114 insertions(+), 274 deletions(-)
 delete mode 100644 jplag/src/test/java/de/jplag/clustering/ClusteringRealDataTest.java
diff --git a/jplag/src/main/java/de/jplag/clustering/Cluster.java b/jplag/src/main/java/de/jplag/clustering/Cluster.java
index b91312d5b..8a1225b23 100644
--- a/jplag/src/main/java/de/jplag/clustering/Cluster.java
+++ b/jplag/src/main/java/de/jplag/clustering/Cluster.java
@@ -8,6 +8,7 @@
 
 /**
  * Cluster part of a {@link ClusteringResult}.
+ *
  * @param <T> type of the clusters members
  */
 public class Cluster<T> {
@@ -15,10 +16,13 @@ public class Cluster<T> {
     private final float communityStrength;
     private final Collection<T> members;
     private ClusteringResult<T> clusteringResult = null;
+    private final float averageSimilarity;
 
-    public Cluster(Collection<T> members, float communityStrength) {
+
+    public Cluster(Collection<T> members, float communityStrength, float averageSimilarity) {
         this.members = new ArrayList<>(members);
         this.communityStrength = communityStrength;
+        this.averageSimilarity = averageSimilarity;
     }
 
     public Collection<T> getMembers() {
@@ -26,8 +30,13 @@ public Collection<T> getMembers() {
         return members;
     }
 
+    public float getAverageSimilarity() {
+        return averageSimilarity;
+    }
+
     /**
      * See {@link ClusteringResult#getCommunityStrength}
+     *
      * @return community strength of the cluster
      */
     public float getCommunityStrength() {
@@ -37,6 +46,7 @@ public float getCommunityStrength() {
     /**
      * Sets this clusters clustering result. Should only be called by classes extending {@link ClusteringResult} on their
      * own clusters.
+     *
      * @param clusteringResult the clustering result
      */
     public void setClusteringResult(ClusteringResult<T> clusteringResult) {
@@ -57,11 +67,11 @@ public float getCommunityStrengthPerConnection() {
      * Computes a normalized community strength per connection. Can be used as measure for strength of evidence in
      * comparison to other clusters in the same clustering. Guaranteed to be smaller than 1. Negative values indicate
      * non-clusters. This method may only be called on clusters that are part of a ClusteringResult.
+     *
      * @return normalized community strength per connection
      */
     public float getNormalizedCommunityStrengthPerConnection() {
-        List<Cluster<T>> goodClusters = clusteringResult.getClusters().stream().filter(cluster -> cluster.getCommunityStrength() > 0)
-                .collect(Collectors.toList());
+        List<Cluster<T>> goodClusters = clusteringResult.getClusters().stream().filter(cluster -> cluster.getCommunityStrength() > 0).toList();
         float posCommunityStrengthSum = (float) goodClusters.stream().mapToDouble(Cluster::getCommunityStrengthPerConnection).sum();
 
         int size = clusteringResult.getClusters().size();
@@ -84,10 +94,11 @@ public double getWorth(BiFunction<T, T, Float> similarity) {
 
     /**
      * Computes the average similarity inside the cluster.
+     *
      * @param similarity function that supplies the similarity of two cluster members.
      * @return average similarity
      */
-    public float averageSimilarity(BiFunction<T, T, Float> similarity) {
+    private float averageSimilarity(BiFunction<T, T, Float> similarity) {
         List<T> members = new ArrayList<>(getMembers());
         if (members.size() < 2) {
             return 1;
@@ -108,6 +119,7 @@ private int connections() {
 
     /**
      * Whether this cluster is very uninformative or wrong and should be pruned as last step of the clustering process.
+     *
      * @return is bad
      */
     public boolean isBadCluster() {
diff --git a/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java b/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java
index 8ff57851d..c94893ebe 100644
--- a/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java
+++ b/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java
@@ -28,8 +28,9 @@ public class ClusteringAdapter {
     /**
      * Creates the clustering adapter. Only submissions that appear in those similarities might also appear in
      * {@link ClusteringResult}s obtained from this adapter.
+     *
      * @param comparisons that should be included in the process of clustering
-     * @param metric function that assigns a similarity to each comparison
+     * @param metric      function that assigns a similarity to each comparison
      */
     public ClusteringAdapter(Collection<JPlagComparison> comparisons, Function<JPlagComparison, Float> metric) {
         mapping = new IntegerMapping<>(comparisons.size());
@@ -52,6 +53,7 @@ public ClusteringAdapter(Collection<JPlagComparison> comparisons, Function<JPlag
     /**
      * Use a generic clustering algorithm to cluster the submissions, that were included in this {@link ClusteringAdapter}'s
      * comparison.
+     *
      * @param algorithm that is used for clustering
      * @return the clustered submissions
      */
@@ -60,7 +62,7 @@ public ClusteringResult<Submission> doClustering(GenericClusteringAlgorithm algo
         ClusteringResult<Integer> modularityClusterResult = ClusteringResult.fromIntegerCollections(new ArrayList<>(intResult), similarityMatrix);
         List<Cluster<Submission>> mappedClusters = modularityClusterResult.getClusters().stream()
                 .map(unmappedCluster -> new Cluster<>(unmappedCluster.getMembers().stream().map(mapping::unmap).collect(Collectors.toList()),
-                        unmappedCluster.getCommunityStrength()))
+                        unmappedCluster.getCommunityStrength(), unmappedCluster.getAverageSimilarity()))
                 .collect(Collectors.toList());
         return new ClusteringResult<>(mappedClusters, modularityClusterResult.getCommunityStrength());
     }
diff --git a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
index cf8bc57af..007b5eae9 100644
--- a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
+++ b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
@@ -1,12 +1,6 @@
 package de.jplag.clustering;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.function.BiFunction;
 import java.util.stream.DoubleStream;
 
@@ -15,12 +9,13 @@
 
 /**
  * Set of clusters dividing a set of entities.
+ *
  * @param <T> type of the clustered entities (e.g. Submission)
  */
 public class ClusteringResult<T> {
 
     private final List<Cluster<T>> clusters;
-    private float communityStrength = 0;
+    private float communityStrength;
 
     public ClusteringResult(Collection<Cluster<T>> clusters, float communityStrength) {
         this.clusters = List.copyOf(clusters);
@@ -40,6 +35,7 @@ public Collection<Cluster<T>> getClusters() {
      * changed, a higher community strength denotes a better clustering. See: Finding and evaluating community structure in
      * networks, M. E. J. Newman and M. Girvan, Phys. Rev. E 69, 026113 – Published 26 February 2004, Doi:
      * 10.1103/PhysRevE.69.026113 It's called modularity in that paper.
+     *
      * @return community strength
      */
     public float getCommunityStrength() {
@@ -48,6 +44,7 @@ public float getCommunityStrength() {
 
     /**
      * How much this clustering result is worth during optimization.
+     *
      * @param similarity TODO DF: JAVADOC
      * @return worth
      */
@@ -92,11 +89,33 @@ public static ClusteringResult<Integer> fromIntegerCollections(List<Collection<I
             for (int i = 0; i < clustering.size(); i++) {
                 double outWeightSum = percentagesOfSimilaritySums.getRowVector(i).getL1Norm();
                 double clusterCommunityStrength = percentagesOfSimilaritySums.getEntry(i, i) - outWeightSum * outWeightSum;
-                clusters.add(new Cluster<>(clustering.get(i), (float) clusterCommunityStrength));
+                float averageSimilarity = calculateAverageSimilarityFor(clustering.get(i), similarity);
+                clusters.add(new Cluster<>(clustering.get(i), (float) clusterCommunityStrength, averageSimilarity));
                 communityStrength += clusterCommunityStrength;
             }
         }
         return new ClusteringResult<>(clusters, communityStrength);
     }
 
+    /**
+     * Computes the mean similarity over all unordered pairs of members of one cluster.
+     * @param cluster row/column indices of the cluster members in the similarity matrix; expected to be distinct
+     * @param similarityMatrix matrix queried via {@code getEntry(i, j)} for the similarity of members i and j
+     * @return average pairwise similarity, or 1 for clusters with fewer than two members
+     */
+    private static float calculateAverageSimilarityFor(Collection<Integer> cluster, RealMatrix similarityMatrix) {
+        List<Integer> indices = List.copyOf(cluster);
+        if (indices.size() < 2) {
+            return 1; // no pair to average: avoids 0 / 0 = NaN and matches Cluster#averageSimilarity
+        }
+        var sumOfSimilarities = 0f;
+        for (int first = 0; first < indices.size(); first++) {
+            for (int second = first + 1; second < indices.size(); second++) { // visit every unordered pair exactly once
+                sumOfSimilarities += similarityMatrix.getEntry(indices.get(first), indices.get(second));
+            }
+        }
+        float numberOfComparisons = indices.size() * (indices.size() - 1) / 2f; // Gauss sum: (n-1) + (n-2) + ... + 1
+        return sumOfSimilarities / numberOfComparisons;
+    }
+
 }
diff --git a/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java b/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java
index 29c6bac88..3f229f538 100644
--- a/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java
+++ b/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java
@@ -4,32 +4,16 @@
 
 import com.fasterxml.jackson.annotation.JsonProperty;
 
-public class Cluster {
-
-    @JsonProperty("average_similarity")
-    private final float averageSimilarity;
-
-    @JsonProperty("strength")
-    private final float strength;
-
-    @JsonProperty("members")
-    private final List<String> members;
-
-    public Cluster(float averageSimilarity, float strength, List<String> members) {
-        this.averageSimilarity = averageSimilarity;
-        this.strength = strength;
-        this.members = List.copyOf(members);
-    }
-
-    public float getAverageSimilarity() {
-        return averageSimilarity;
-    }
-
-    public float getStrength() {
-        return strength;
-    }
-
-    public List<String> getMembers() {
-        return members;
-    }
+/**
+ * JSON report model of a cluster.
+ * @param averageSimilarity serialized as {@code average_similarity}
+ * @param strength serialized as {@code strength}
+ * @param members serialized as {@code members}; stored as an unmodifiable copy
+ */
+public record Cluster(@JsonProperty("average_similarity") float averageSimilarity,
+                      @JsonProperty("strength") float strength, @JsonProperty("members") List<String> members) {
+
+    public Cluster {
+        members = List.copyOf(members); // keep the defensive copy that the replaced class made in its constructor
+    }
 }
diff --git a/jplag/src/test/java/de/jplag/clustering/ClusterTest.java b/jplag/src/test/java/de/jplag/clustering/ClusterTest.java
index 5142f61fa..6e7a442a2 100644
--- a/jplag/src/test/java/de/jplag/clustering/ClusterTest.java
+++ b/jplag/src/test/java/de/jplag/clustering/ClusterTest.java
@@ -13,36 +13,27 @@ public class ClusterTest {
     private static final double EPSILON = 0.00001;
     Cluster<Character> cluster;
 
-    @Test
-    public void testAverageSimilarity() {
-        cluster = new Cluster<>(List.of('a', 'b', 'c'), 0);
-        float averageSimilarity = cluster.averageSimilarity((a, b) -> {
-            return Math.abs((float) (((int) a) - ((int) b)));
-        });
-        assertEquals((1.f + 2.f + 1.f + 1.f + 2.f + 1.f) / 6, averageSimilarity, EPSILON);
-    }
-
     @Test
     public void testCommunityStrengthPerConnectionOneMember() {
-        cluster = new Cluster<>(List.of('a'), 10);
+        cluster = new Cluster<>(List.of('a'), 10, 0);
         assertEquals(0.0, cluster.getCommunityStrengthPerConnection(), EPSILON);
     }
 
     @Test
     public void testCommunityStrengthPerConnectionTwoMembers() {
-        cluster = new Cluster<>(List.of('a', 'b'), 10);
+        cluster = new Cluster<>(List.of('a', 'b'), 10, 0);
         assertEquals(10.0, cluster.getCommunityStrengthPerConnection(), EPSILON);
     }
 
     @Test
     public void testCommunityStrengthPerConnectionThreeMembers() {
-        cluster = new Cluster<>(List.of('a', 'b', 'c'), 10);
+        cluster = new Cluster<>(List.of('a', 'b', 'c'), 10, 0);
         assertEquals(10.0 / 3, cluster.getCommunityStrengthPerConnection(), EPSILON);
     }
 
     @Test
     public void testNormalizedCommunityStrength() {
-        cluster = new Cluster<>(List.of('a', 'b', 'c'), 10);
+        cluster = new Cluster<>(List.of('a', 'b', 'c'), 10, 0);
         @SuppressWarnings("unchecked")
         ClusteringResult<Character> clusteringResult = mock(ClusteringResult.class);
         when(clusteringResult.getClusters()).thenReturn(List.of(cluster, cluster));
diff --git a/jplag/src/test/java/de/jplag/clustering/ClusteringRealDataTest.java b/jplag/src/test/java/de/jplag/clustering/ClusteringRealDataTest.java
deleted file mode 100644
index 8e3844332..000000000
--- a/jplag/src/test/java/de/jplag/clustering/ClusteringRealDataTest.java
+++ /dev/null
@@ -1,215 +0,0 @@
-package de.jplag.clustering;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assumptions.assumeTrue;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.util.*;
-import java.util.stream.Collectors;
-
-import org.apache.commons.math3.linear.Array2DRowRealMatrix;
-import org.apache.commons.math3.linear.RealMatrix;
-import org.junit.jupiter.api.Test;
-
-import de.jplag.clustering.algorithm.GenericClusteringAlgorithm;
-import de.jplag.clustering.algorithm.SpectralClustering;
-import de.jplag.clustering.preprocessors.CumulativeDistributionFunctionPreprocessor;
-
-/**
- * These test are not meant to be run during normal unit testing. They can be used to test the clustering algorithms
- * against data from the private pseudomized reports repository. These tests test PROBABILISTIC behavior, so use with
- * caution!
- */
-public class ClusteringRealDataTest {
-
-    private static class TestFile {
-
-        private String uri;
-        private Optional<List<String>> expected;
-
-        public TestFile(String uri, Optional<List<String>> expected) {
-            this.uri = uri;
-            this.expected = expected;
-        }
-    }
-
-    private static List<String> B_POSITIVE = Arrays.asList(new String[] {"Student (31)", "Student (223)"});
-    private static List<String> C_POSITIVE = Arrays.asList(new String[] {"Student (166)", "Student (212)", "Student (236)", "Student (229)"});
-
-    private static final TestFile[] OLD_CLUSTERING_DATA = {new TestFile("de/jplag/PseudonymizedReports/alt/A_1000_matches_max.csv", Optional.empty()),
-            new TestFile("de/jplag/PseudonymizedReports/alt/B_1000_matches_max.csv", Optional.of(B_POSITIVE)),
-            new TestFile("de/jplag/PseudonymizedReports/alt/C_1000_matches_max.csv", Optional.of(C_POSITIVE)),};
-
-    private static final TestFile[] NEW_CLUSTERING_DATA = {new TestFile("de/jplag/PseudonymizedReports/neu/A_matches_avg.csv", Optional.empty()),
-            new TestFile("de/jplag/PseudonymizedReports/neu/B_matches_avg.csv", Optional.of(B_POSITIVE)),
-            new TestFile("de/jplag/PseudonymizedReports/neu/C_matches_avg.csv", Optional.of(C_POSITIVE)),};
-
-    private String str(float f) {
-        return String.format("%.4f", f);
-    }
-
-    private URL loadFromClasspath(String file) throws FileNotFoundException {
-        URL url = getClass().getClassLoader().getResource(file);
-        if (url == null) {
-            assumeTrue(false, file + " not found. 'de/jpag/PseudonymizedReports' must contain the data from the PseudonymizedReports repository.");
-        }
-        return url;
-    }
-
-    private void doTesting(ReadResult readResult, Optional<List<String>> expected) {
-        RealMatrix clusteringSimilarity = new Array2DRowRealMatrix(readResult.similarity.getData());
-
-        /*
-         * AgglomerativeClustering.ClusteringOptions options = new AgglomerativeClustering.ClusteringOptions();
-         * options.minimalSimilarity = 0.15f; options.similarity = AgglomerativeClustering.InterClusterSimilarity.AVERAGE;
-         * ClusteringAlgorithm clusteringAlg = new AgglomerativeClustering(options);
-         */
-
-        SpectralClustering clusteringAlg = new SpectralClustering(ClusteringOptions.DEFAULTS);
-        ClusteringPreprocessor preprocessor = new CumulativeDistributionFunctionPreprocessor();
-        GenericClusteringAlgorithm preprocessedClusteringAlg = new PreprocessedClusteringAlgorithm(clusteringAlg, preprocessor);
-        Collection<Collection<Integer>> clustering = preprocessedClusteringAlg.cluster(clusteringSimilarity);
-        ClusteringResult<Integer> mRes = ClusteringResult.fromIntegerCollections(new ArrayList<>(clustering), readResult.similarity);
-        List<Cluster<Integer>> clusters = new ArrayList<>(mRes.getClusters());
-        clusters.sort(Comparator.comparingDouble(c -> -c.getNormalizedCommunityStrengthPerConnection()));
-
-        System.out.println("cs\tncsm\tavgSim\tcombined\tmembers");
-        for (Cluster<Integer> c : clusters) {
-            float ncsm = c.getCommunityStrengthPerConnection();
-            float avgSim = c.averageSimilarity((a, b) -> (float) readResult.similarity.getEntry(a, b));
-            System.out.println(str(c.getCommunityStrength()) + "\t" + str(ncsm) + "\t" + str(avgSim) + "\t"
-                    + c.getMembers().stream().map(readResult.mapping::unmap).collect(Collectors.toList()));
-        }
-        System.out.println("Community Strength: " + mRes.getCommunityStrength());
-        System.out.println("Clusters: " + clusters.size());
-
-        expected.ifPresent(expectedIdentifiers -> {
-            Set<String> expectedIdentifiersSet = new HashSet<>(expectedIdentifiers);
-            Set<String> bestClusters = clusters.get(0).getMembers().stream().map(readResult.mapping::unmap).collect(Collectors.toSet());
-            assertEquals(expectedIdentifiersSet, bestClusters);
-            System.out.println("hey");
-        });
-    }
-
-    @Test
-    public void aClusteringOld() throws FileNotFoundException, URISyntaxException {
-        for (TestFile testFile : OLD_CLUSTERING_DATA) {
-            URL url = loadFromClasspath(testFile.uri);
-            File file = new File(url.toURI());
-            ReadResult r = readOldCsv(file);
-            doTesting(r, testFile.expected);
-        }
-    }
-
-    @Test
-    public void aClusteringNew() throws FileNotFoundException, URISyntaxException {
-        for (TestFile testFile : NEW_CLUSTERING_DATA) {
-            URL url = loadFromClasspath(testFile.uri);
-            File file = new File(url.toURI());
-            ReadResult r = readNewCsv(file);
-            doTesting(r, testFile.expected);
-        }
-    }
-
-    private static class ReadComparison {
-        int left;
-        int right;
-        float similarity;
-    }
-
-    private static class ReadResult {
-        IntegerMapping<String> mapping;
-        RealMatrix similarity;
-    }
-
-    private static ReadResult readNewCsv(File fileName) throws FileNotFoundException {
-        IntegerMapping<String> mapping = new IntegerMapping<>(512);
-        List<ReadComparison> comparisons = new ArrayList<>(512);
-        try (CSVReader reader = new CSVReader(fileName, ";")) {
-            while (reader.hasNext()) {
-                List<String> records = reader.next();
-                if (records.isEmpty())
-                    continue;
-                String leftStudent = records.get(1);
-                String rightStudent = records.get(2);
-                String similarity = records.get(3);
-                ReadComparison comparison = new ReadComparison();
-                comparison.left = mapping.map(leftStudent);
-                comparison.right = mapping.map(rightStudent);
-                comparison.similarity = Float.parseFloat(similarity) / 100;
-                comparisons.add(comparison);
-            }
-        }
-        RealMatrix matrix = new Array2DRowRealMatrix(mapping.size(), mapping.size());
-        for (ReadComparison comparison : comparisons) {
-            matrix.setEntry(comparison.left, comparison.right, comparison.similarity);
-            matrix.setEntry(comparison.right, comparison.left, comparison.similarity);
-        }
-        ReadResult r = new ReadResult();
-        r.similarity = matrix;
-        r.mapping = mapping;
-        return r;
-    }
-
-    private static ReadResult readOldCsv(File fileName) throws FileNotFoundException {
-        IntegerMapping<String> mapping = new IntegerMapping<>(512);
-        List<ReadComparison> comparisons = new ArrayList<>(512);
-        try (CSVReader reader = new CSVReader(fileName, ";")) {
-            while (reader.hasNext()) {
-                List<String> records = reader.next();
-                if (records.isEmpty())
-                    continue;
-                Deque<String> stuff = new ArrayDeque<>(records);
-                String leftStudent = stuff.removeFirst();
-                int leftID = mapping.map(leftStudent);
-                while (stuff.size() >= 3) {
-                    ReadComparison comparison = new ReadComparison();
-                    comparison.left = leftID;
-                    stuff.removeFirst(); // comparison ID not needed
-                    String rightStudent = stuff.removeFirst();
-                    String similarity = stuff.removeFirst();
-                    comparison.right = mapping.map(rightStudent);
-                    comparison.similarity = Float.parseFloat(similarity) / 100;
-                    comparisons.add(comparison);
-                }
-            }
-        }
-        RealMatrix matrix = new Array2DRowRealMatrix(mapping.size(), mapping.size());
-        for (ReadComparison comparison : comparisons) {
-            matrix.setEntry(comparison.left, comparison.right, comparison.similarity);
-            matrix.setEntry(comparison.right, comparison.left, comparison.similarity);
-        }
-        ReadResult r = new ReadResult();
-        r.similarity = matrix;
-        r.mapping = mapping;
-        return r;
-    }
-
-    private static class CSVReader implements AutoCloseable {
-        private String delimiter;
-        private Scanner scanner;
-
-        private CSVReader(File fileName, String delimiter) throws FileNotFoundException {
-            this.delimiter = delimiter;
-            scanner = new Scanner(fileName);
-        }
-
-        @Override
-        public void close() {
-            scanner.close();
-        }
-
-        List<String> next() {
-            String line = scanner.nextLine();
-            String[] records = line.split(delimiter);
-            return Arrays.asList(records);
-        }
-
-        boolean hasNext() {
-            return scanner.hasNextLine();
-        }
-    }
-}
diff --git a/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java b/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java
index 0c0d7fbd4..5bc2484f0 100644
--- a/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java
+++ b/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java
@@ -2,6 +2,7 @@
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
+import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.commons.math3.linear.Array2DRowRealMatrix;
@@ -49,8 +50,64 @@ void uniformClustering() {
         assertEquals(0.0, result.getCommunityStrength(), 0.00001);
     }
 
-    private static void setEntries(RealMatrix matrix, int i, int j, double value) {
-        matrix.setEntry(i, j, value);
-        matrix.setEntry(j, i, value);
+    @Test
+    void averageSimilarity() { // a single cluster holding all four members
+        var similarity = new Array2DRowRealMatrix(4, 4);
+
+        setEntries(similarity, 0, 1, 0.5);
+        setEntries(similarity, 0, 2, 0.3);
+        setEntries(similarity, 0, 3, 0.4);
+        setEntries(similarity, 1, 2, 0.1);
+        setEntries(similarity, 1, 3, 0.1);
+        setEntries(similarity, 2, 3, 0.7);
+
+        ClusteringResult<Integer> result = ClusteringResult.fromIntegerCollections(List.of(List.of(0, 1, 2, 3)), similarity);
+
+        assertEquals(0.35, result.getClusters().stream().findFirst().orElseThrow().getAverageSimilarity(), 0.00001); // (0.5 + 0.3 + 0.4 + 0.1 + 0.1 + 0.7) / 6 pairs
+    }
+
+    @Test
+    void averageSimilarity2() { // two clusters of three members each; expected values use intra-cluster pairs only
+        var similarity = new Array2DRowRealMatrix(6, 6);
+
+        setEntries(similarity, 0, 1, 0.5);
+        setEntries(similarity, 0, 2, 0.3);
+        setEntries(similarity, 0, 3, 0.4);
+        setEntries(similarity, 0, 4, 0.4);
+        setEntries(similarity, 1, 2, 0.1);
+        setEntries(similarity, 1, 3, 0.1);
+        setEntries(similarity, 1, 4, 0.3);
+        setEntries(similarity, 2, 3, 0.7);
+        setEntries(similarity, 2, 4, 0.2);
+        setEntries(similarity, 2, 5, 0.9);
+        setEntries(similarity, 3, 4, 0.5);
+        setEntries(similarity, 3, 5, 0.05);
+
+
+        ClusteringResult<Integer> result = ClusteringResult.fromIntegerCollections(List.of(List.of(0, 1, 4), List.of(2, 3, 5)), similarity);
+        var clusters = new ArrayList<>(result.getClusters());
+
+        assertEquals(0.4, clusters.get(0).getAverageSimilarity(), 0.00001); // cluster {0, 1, 4}: (0.5 + 0.4 + 0.3) / 3
+        assertEquals(0.55, clusters.get(1).getAverageSimilarity(), 0.00001); // cluster {2, 3, 5}: (0.7 + 0.9 + 0.05) / 3
+    }
+
+    @Test
+    void averageSimilarityPerfectClustering() {
+        RealMatrix similarity = new Array2DRowRealMatrix(4, 4);
+
+        // These pairs are fully similar and each forms its own cluster below
+        setEntries(similarity, 0, 1, 1f);
+        setEntries(similarity, 2, 3, 1f);
+
+        // Others are dissimilar: their entries are never set
+
+        ClusteringResult<Integer> result = ClusteringResult.fromIntegerCollections(List.of(List.of(0, 1), List.of(2, 3)), similarity);
+        var cluster = result.getClusters().stream().findFirst().orElseThrow(); // only the first cluster is asserted on
+        assertEquals(1f, cluster.getAverageSimilarity(), 0.00001);
+    }
+
+    private static void setEntries(RealMatrix matrix, int i, int j, double similarity) {
+        matrix.setEntry(i, j, similarity);
+        matrix.setEntry(j, i, similarity);
     }
 }

From ae68a2f93deef3ea90973708dae5e6e8657a3e71 Mon Sep 17 00:00:00 2001
From: Nesta Bentum <nesta.bentum@gmail.com>
Date: Fri, 10 Jun 2022 13:05:32 +0200
Subject: [PATCH 2/3] Write ClusteringResult to Report

---
 .../java/de/jplag/clustering/Cluster.java     |  8 ----
 .../jplag/clustering/ClusteringAdapter.java   |  4 +-
 .../de/jplag/clustering/ClusteringResult.java | 22 +++++----
 .../reportobject/ReportObjectFactory.java     | 11 ++---
 .../mapper/ClusteringResultMapper.java        | 26 ++++++++++
 .../reporting/reportobject/model/Cluster.java |  4 +-
 .../clustering/ClusteringResultTest.java      |  1 -
 .../mapper/ClusteringResultMapperTest.java    | 48 +++++++++++++++++++
 8 files changed, 92 insertions(+), 32 deletions(-)
 create mode 100644 jplag/src/main/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapper.java
 create mode 100644 jplag/src/test/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapperTest.java

diff --git a/jplag/src/main/java/de/jplag/clustering/Cluster.java b/jplag/src/main/java/de/jplag/clustering/Cluster.java
index 8a1225b23..21b03fc5e 100644
--- a/jplag/src/main/java/de/jplag/clustering/Cluster.java
+++ b/jplag/src/main/java/de/jplag/clustering/Cluster.java
@@ -4,11 +4,9 @@
 import java.util.Collection;
 import java.util.List;
 import java.util.function.BiFunction;
-import java.util.stream.Collectors;
 
 /**
  * Cluster part of a {@link ClusteringResult}.
- *
  * @param <T> type of the clusters members
  */
 public class Cluster<T> {
@@ -18,7 +16,6 @@ public class Cluster<T> {
     private ClusteringResult<T> clusteringResult = null;
     private final float averageSimilarity;
 
-
     public Cluster(Collection<T> members, float communityStrength, float averageSimilarity) {
         this.members = new ArrayList<>(members);
         this.communityStrength = communityStrength;
@@ -36,7 +33,6 @@ public float getAverageSimilarity() {
 
     /**
      * See {@link ClusteringResult#getCommunityStrength}
-     *
      * @return community strength of the cluster
      */
     public float getCommunityStrength() {
@@ -46,7 +42,6 @@ public float getCommunityStrength() {
     /**
      * Sets this clusters clustering result. Should only be called by classes extending {@link ClusteringResult} on their
      * own clusters.
-     *
      * @param clusteringResult the clustering result
      */
     public void setClusteringResult(ClusteringResult<T> clusteringResult) {
@@ -67,7 +62,6 @@ public float getCommunityStrengthPerConnection() {
      * Computes a normalized community strength per connection. Can be used as measure for strength of evidence in
      * comparison to other clusters in the same clustering. Guaranteed to be smaller than 1. Negative values indicate
      * non-clusters. This method may only be called on clusters that are part of a ClusteringResult.
-     *
      * @return normalized community strength per connection
      */
     public float getNormalizedCommunityStrengthPerConnection() {
@@ -94,7 +88,6 @@ public double getWorth(BiFunction<T, T, Float> similarity) {
 
     /**
      * Computes the average similarity inside the cluster.
-     *
      * @param similarity function that supplies the similarity of two cluster members.
      * @return average similarity
      */
@@ -119,7 +112,6 @@ private int connections() {
 
     /**
      * Whether this cluster is very uninformative or wrong and should be pruned as last step of the clustering process.
-     *
      * @return is bad
      */
     public boolean isBadCluster() {
diff --git a/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java b/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java
index c94893ebe..19b8dbfa9 100644
--- a/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java
+++ b/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java
@@ -28,9 +28,8 @@ public class ClusteringAdapter {
     /**
      * Creates the clustering adapter. Only submissions that appear in those similarities might also appear in
      * {@link ClusteringResult}s obtained from this adapter.
-     *
      * @param comparisons that should be included in the process of clustering
-     * @param metric      function that assigns a similarity to each comparison
+     * @param metric function that assigns a similarity to each comparison
      */
     public ClusteringAdapter(Collection<JPlagComparison> comparisons, Function<JPlagComparison, Float> metric) {
         mapping = new IntegerMapping<>(comparisons.size());
@@ -53,7 +52,6 @@ public ClusteringAdapter(Collection<JPlagComparison> comparisons, Function<JPlag
     /**
      * Use a generic clustering algorithm to cluster the submissions, that were included in this {@link ClusteringAdapter}'s
      * comparison.
-     *
      * @param algorithm that is used for clustering
      * @return the clustered submissions
      */
diff --git a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
index 007b5eae9..3da872b1b 100644
--- a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
+++ b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
@@ -9,7 +9,6 @@
 
 /**
  * Set of clusters dividing a set of entities.
- *
  * @param <T> type of the clustered entities (e.g. Submission)
  */
 public class ClusteringResult<T> {
@@ -35,7 +34,6 @@ public Collection<Cluster<T>> getClusters() {
      * changed, a higher community strength denotes a better clustering. See: Finding and evaluating community structure in
      * networks, M. E. J. Newman and M. Girvan, Phys. Rev. E 69, 026113 – Published 26 February 2004, Doi:
      * 10.1103/PhysRevE.69.026113 It's called modularity in that paper.
-     *
      * @return community strength
      */
     public float getCommunityStrength() {
@@ -44,7 +42,6 @@ public float getCommunityStrength() {
 
     /**
      * How much this clustering result is worth during optimization.
-     *
      * @param similarity TODO DF: JAVADOC
      * @return worth
      */
@@ -106,15 +103,20 @@ private static float calculateAverageSimilarityFor(Collection<Integer> cluster,
                     sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2);
                 }
             }
-            submissionIndicesWithoutIndicesAlreadyProcessed.remove(indexOfSubmission1); // remove indices we already processed from second list to avoid adding them anew unnecessary
+            submissionIndicesWithoutIndicesAlreadyProcessed.remove(indexOfSubmission1); // remove indices we already processed from second list to
+                                                                                        // avoid adding them anew unnecessary
         }
         int nMinusOne = cluster.size() - 1;
-        float numberOfComparisons = (nMinusOne * (nMinusOne + 1)) / 2f; /* Use Gauss sum to calculate number of comparisons in cluster: Given cluster of size n we need Gauss sum of n-1 comparisons:
-                                                                           compare first element of cluster to all other except itself: n-1 comparisons. compare second element two all other except itself and first element
-                                                                         (as these two were already compared when we processed the first element), n-2 comparisons. compare third element to all other but itself and all previously compared: n-3 comparisons and so on.
-                                                                         when we reach the second to last element we have n-(n-1)=1 comparisons left. when we reach the last element it has already been compared to all other.
-                                                                         adding up all comparisons we get: (n-1) + (n-2) + (n-3) + ... + (n-(n-1)) = Gauss sum of (n-1)
-         */
+        float numberOfComparisons = (nMinusOne * (nMinusOne + 1))
+                / 2f; /*
+                       * Use Gauss sum to calculate number of comparisons in cluster: Given cluster of size n we need Gauss sum of n-1
+                       * comparisons: compare first element of cluster to all other except itself: n-1 comparisons. compare second element two
+                       * all other except itself and first element (as these two were already compared when we processed the first element),
+                       * n-2 comparisons. compare third element to all other but itself and all previously compared: n-3 comparisons and so
+                       * on. when we reach the second to last element we have n-(n-1)=1 comparisons left. when we reach the last element it
+                       * has already been compared to all other. adding up all comparisons we get: (n-1) + (n-2) + (n-3) + ... + (n-(n-1)) =
+                       * Gauss sum of (n-1)
+                       */
         return sumOfSimilarities / numberOfComparisons;
     }
 
diff --git a/jplag/src/main/java/de/jplag/reporting/reportobject/ReportObjectFactory.java b/jplag/src/main/java/de/jplag/reporting/reportobject/ReportObjectFactory.java
index ad97eb92c..03861ee68 100644
--- a/jplag/src/main/java/de/jplag/reporting/reportobject/ReportObjectFactory.java
+++ b/jplag/src/main/java/de/jplag/reporting/reportobject/ReportObjectFactory.java
@@ -12,6 +12,7 @@
 import org.slf4j.LoggerFactory;
 
 import de.jplag.*;
+import de.jplag.reporting.reportobject.mapper.ClusteringResultMapper;
 import de.jplag.reporting.reportobject.model.*;
 import de.jplag.reporting.reportobject.model.Match;
 
@@ -21,6 +22,7 @@
 public class ReportObjectFactory {
 
     private static final Logger logger = LoggerFactory.getLogger(ReportObjectFactory.class);
+    private static final ClusteringResultMapper clusteringResultMapper = new ClusteringResultMapper();
 
     /**
      * Converts a JPlagResult to a JPlagReport.
@@ -58,7 +60,7 @@ private static OverviewReport generateOverviewReport(JPlagResult result) {
         overviewReport.setExecutionTime(result.getDuration());
         overviewReport.setComparisonNames(getComparisonNames(comparisons));
         overviewReport.setMetrics(getMetrics(result));
-        overviewReport.setClusters(getClusters(result));
+        overviewReport.setClusters(clusteringResultMapper.map(result));
 
         return overviewReport;
     }
@@ -166,13 +168,6 @@ private static Match convertMatchToReportMatch(JPlagComparison comparison, de.jp
         return new Match(startTokenFirst.getFile(), startTokenSecond.getFile(), startFirst, endFirst, startSecond, endSecond, tokens);
     }
 
-    // TODO implement after PR Read clustering #281
-    private static List<Cluster> getClusters(JPlagResult result) {
-        // List<ClusteringResult<Submission>> clusters = result.getClusteringResult();
-        // return clusters.map( c -> new Cluster(getAvgSimilarity, getStrength, c.getMembers().map(Submission::getName)))
-        return List.of();
-    }
-
     private static List<String> readFileLines(File file) {
         List<String> lines = new ArrayList<>();
         try (BufferedReader bufferedReader = new BufferedReader(new FileReader(file))) {
diff --git a/jplag/src/main/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapper.java b/jplag/src/main/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapper.java
new file mode 100644
index 000000000..ea75f2cbd
--- /dev/null
+++ b/jplag/src/main/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapper.java
@@ -0,0 +1,26 @@
+package de.jplag.reporting.reportobject.mapper;
+
+import java.util.Collection;
+import java.util.List;
+
+import de.jplag.JPlagResult;
+import de.jplag.Submission;
+import de.jplag.clustering.ClusteringResult;
+import de.jplag.reporting.reportobject.model.Cluster;
+
+/**
+ * Extracts the clusters from a {@link JPlagResult} and maps them to the corresponding JSON DTOs.
+ */
+public class ClusteringResultMapper {
+    /** Flattens the clusters of all clustering results of {@code result} into report DTOs. */
+    public List<Cluster> map(JPlagResult result) {
+        var allClusters = result.getClusteringResult().stream().map(ClusteringResult::getClusters).flatMap(Collection::stream);
+        return allClusters.map(this::convertCluster).toList();
+    }
+
+    /** Copies average similarity, community strength and the member names of {@code from} into a report DTO. */
+    private Cluster convertCluster(de.jplag.clustering.Cluster<Submission> from) {
+        List<String> memberNames = from.getMembers().stream().map(Submission::getName).toList();
+        return new Cluster(from.getAverageSimilarity(), from.getCommunityStrength(), memberNames);
+    }
+}
diff --git a/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java b/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java
index 3f229f538..17af0728d 100644
--- a/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java
+++ b/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java
@@ -4,6 +4,6 @@
 
 import com.fasterxml.jackson.annotation.JsonProperty;
 
-public record Cluster(@JsonProperty("average_similarity") float averageSimilarity,
-                      @JsonProperty("strength") float strength, @JsonProperty("members") List<String> members) {
+public record Cluster(@JsonProperty("average_similarity") float averageSimilarity, @JsonProperty("strength") float strength,
+        @JsonProperty("members") List<String> members) {
 }
diff --git a/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java b/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java
index 5bc2484f0..4b01691b6 100644
--- a/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java
+++ b/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java
@@ -83,7 +83,6 @@ void averageSimilarity2() {
         setEntries(similarity, 3, 4, 0.5);
         setEntries(similarity, 3, 5, 0.05);
 
-
         ClusteringResult<Integer> result = ClusteringResult.fromIntegerCollections(List.of(List.of(0, 1, 4), List.of(2, 3, 5)), similarity);
         var clusters = new ArrayList<>(result.getClusters());
 
diff --git a/jplag/src/test/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapperTest.java b/jplag/src/test/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapperTest.java
new file mode 100644
index 000000000..103126841
--- /dev/null
+++ b/jplag/src/test/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapperTest.java
@@ -0,0 +1,48 @@
+package de.jplag.reporting.reportobject.mapper;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import de.jplag.JPlagResult;
+import de.jplag.Submission;
+import de.jplag.clustering.Cluster;
+import de.jplag.clustering.ClusteringResult;
+
+public class ClusteringResultMapperTest {
+    private final ClusteringResultMapper clusteringResultMapper = new ClusteringResultMapper();
+
+    @Test
+    public void mapsEveryClusterToReportCluster() {
+        // given
+        JPlagResult resultMock = mock(JPlagResult.class);
+        Cluster<Submission> cluster1 = createClusterWith(0.2f, 0.4f, "1", "2");
+        Cluster<Submission> cluster2 = createClusterWith(0.3f, 0.6f, "3", "4", "5");
+        when(resultMock.getClusteringResult()).thenReturn(List.of(new ClusteringResult<>(List.of(cluster1, cluster2), 0.3f)));
+
+        // when
+        var result = clusteringResultMapper.map(resultMock);
+
+        // then: the report cluster takes the average similarity first, then the community strength
+        assertEquals(List.of(new de.jplag.reporting.reportobject.model.Cluster(0.4f, 0.2f, List.of("1", "2")),
+                new de.jplag.reporting.reportobject.model.Cluster(0.6f, 0.3f, List.of("3", "4", "5"))), result);
+    }
+
+    // Builds a cluster of mocked submissions; primitive floats avoid needless boxing of the literals.
+    private Cluster<Submission> createClusterWith(float communityStrength, float averageSimilarity, String... ids) {
+        var submissions = Arrays.stream(ids).map(this::submissionWithId).toList();
+        return new Cluster<>(submissions, communityStrength, averageSimilarity);
+    }
+
+    // Creates a Submission mock whose getName() returns the given id.
+    private Submission submissionWithId(String id) {
+        Submission submission = mock(Submission.class);
+        when(submission.getName()).thenReturn(id);
+        return submission;
+    }
+}

From 02b1960463dbd0dd2fb0f7a19c9d2612814b097a Mon Sep 17 00:00:00 2001
From: Nesta Bentum <nesta.bentum@gmail.com>
Date: Sat, 25 Jun 2022 11:44:11 +0200
Subject: [PATCH 3/3] Improve Cluster Average Similarity Calculation

& add Javadoc
---
 .../main/java/de/jplag/clustering/Cluster.java |  5 +++++
 .../de/jplag/clustering/ClusteringResult.java  | 18 ++++++++----------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/jplag/src/main/java/de/jplag/clustering/Cluster.java b/jplag/src/main/java/de/jplag/clustering/Cluster.java
index 21b03fc5e..d5c5e886a 100644
--- a/jplag/src/main/java/de/jplag/clustering/Cluster.java
+++ b/jplag/src/main/java/de/jplag/clustering/Cluster.java
@@ -16,6 +16,11 @@ public class Cluster<T> {
     private ClusteringResult<T> clusteringResult = null;
     private final float averageSimilarity;
 
+    /**
+     * @param members Members of the cluster.
+     * @param communityStrength A metric of how strongly the members of this cluster are connected.
+     * @param averageSimilarity The average similarity between all tuple comparisons of the members in this cluster.
+     */
     public Cluster(Collection<T> members, float communityStrength, float averageSimilarity) {
         this.members = new ArrayList<>(members);
         this.communityStrength = communityStrength;
diff --git a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
index 3da872b1b..ce46e764c 100644
--- a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
+++ b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
@@ -14,7 +14,7 @@
 public class ClusteringResult<T> {
 
     private final List<Cluster<T>> clusters;
-    private float communityStrength;
+    private final float communityStrength;
 
     public ClusteringResult(Collection<Cluster<T>> clusters, float communityStrength) {
         this.clusters = List.copyOf(clusters);
@@ -96,21 +96,19 @@ public static ClusteringResult<Integer> fromIntegerCollections(List<Collection<I
 
     private static float calculateAverageSimilarityFor(Collection<Integer> cluster, RealMatrix similarityMatrix) {
         var sumOfSimilarities = 0f;
-        var submissionIndicesWithoutIndicesAlreadyProcessed = new ArrayList<>(List.copyOf(cluster));
-        for (Integer indexOfSubmission1 : cluster) {
-            for (Integer indexOfSubmission2 : submissionIndicesWithoutIndicesAlreadyProcessed) {
-                if (!Objects.equals(indexOfSubmission1, indexOfSubmission2)) {
-                    sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2);
-                }
+        List<Integer> indices = List.copyOf(cluster);
+        for (int i = 1; i < cluster.size(); i++) {
+            int indexOfSubmission1 = indices.get(i);
+            for (int j = 0; j < i; j++) { // as the similarity matrix is symmetrical we need only iterate over one half of it
+                int indexOfSubmission2 = indices.get(j);
+                sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2);
             }
-            submissionIndicesWithoutIndicesAlreadyProcessed.remove(indexOfSubmission1); // remove indices we already processed from second list to
-                                                                                        // avoid adding them anew unnecessary
         }
         int nMinusOne = cluster.size() - 1;
         float numberOfComparisons = (nMinusOne * (nMinusOne + 1))
                 / 2f; /*
                        * Use Gauss sum to calculate number of comparisons in cluster: Given cluster of size n we need Gauss sum of n-1
-                       * comparisons: compare first element of cluster to all other except itself: n-1 comparisons. compare second element two
+                       * comparisons: compare first element of cluster to all other except itself: n-1 comparisons. compare second element to
                        * all other except itself and first element (as these two were already compared when we processed the first element),
                        * n-2 comparisons. compare third element to all other but itself and all previously compared: n-3 comparisons and so
                        * on. when we reach the second to last element we have n-(n-1)=1 comparisons left. when we reach the last element it