From 8503fbf8089d4748871dcf3b9340f91a21677e40 Mon Sep 17 00:00:00 2001 From: Nesta Bentum Date: Fri, 10 Jun 2022 11:42:51 +0200 Subject: [PATCH 1/3] Incorporate Average Similarity into Cluster --- .../java/de/jplag/clustering/Cluster.java | 20 +- .../jplag/clustering/ClusteringAdapter.java | 6 +- .../de/jplag/clustering/ClusteringResult.java | 37 ++- .../reporting/reportobject/model/Cluster.java | 30 +-- .../java/de/jplag/clustering/ClusterTest.java | 17 +- .../clustering/ClusteringRealDataTest.java | 215 ------------------ .../clustering/ClusteringResultTest.java | 63 ++++- 7 files changed, 114 insertions(+), 274 deletions(-) delete mode 100644 jplag/src/test/java/de/jplag/clustering/ClusteringRealDataTest.java diff --git a/jplag/src/main/java/de/jplag/clustering/Cluster.java b/jplag/src/main/java/de/jplag/clustering/Cluster.java index b91312d5b..8a1225b23 100644 --- a/jplag/src/main/java/de/jplag/clustering/Cluster.java +++ b/jplag/src/main/java/de/jplag/clustering/Cluster.java @@ -8,6 +8,7 @@ /** * Cluster part of a {@link ClusteringResult}. + * * @param type of the clusters members */ public class Cluster { @@ -15,10 +16,13 @@ public class Cluster { private final float communityStrength; private final Collection members; private ClusteringResult clusteringResult = null; + private final float averageSimilarity; - public Cluster(Collection members, float communityStrength) { + + public Cluster(Collection members, float communityStrength, float averageSimilarity) { this.members = new ArrayList<>(members); this.communityStrength = communityStrength; + this.averageSimilarity = averageSimilarity; } public Collection getMembers() { @@ -26,8 +30,13 @@ public Collection getMembers() { return members; } + public float getAverageSimilarity() { + return averageSimilarity; + } + /** * See {@link ClusteringResult#getCommunityStrength} + * * @return community strength of the cluster */ public float getCommunityStrength() { @@ -37,6 +46,7 @@ public float getCommunityStrength() { /** * Sets this clusters clustering result. Should only be called by classes extending {@link ClusteringResult} on their * own clusters. + * * @param clusteringResult the clustering result */ public void setClusteringResult(ClusteringResult clusteringResult) { @@ -57,11 +67,11 @@ public float getCommunityStrengthPerConnection() { * Computes a normalized community strength per connection. Can be used as measure for strength of evidence in * comparison to other clusters in the same clustering. Guaranteed to be smaller than 1. Negative values indicate * non-clusters. This method may only be called on clusters that are part of a ClusteringResult. + * * @return normalized community strength per connection */ public float getNormalizedCommunityStrengthPerConnection() { - List> goodClusters = clusteringResult.getClusters().stream().filter(cluster -> cluster.getCommunityStrength() > 0) - .collect(Collectors.toList()); + List> goodClusters = clusteringResult.getClusters().stream().filter(cluster -> cluster.getCommunityStrength() > 0).toList(); float posCommunityStrengthSum = (float) goodClusters.stream().mapToDouble(Cluster::getCommunityStrengthPerConnection).sum(); int size = clusteringResult.getClusters().size(); @@ -84,10 +94,11 @@ public double getWorth(BiFunction similarity) { /** * Computes the average similarity inside the cluster. + * * @param similarity function that supplies the similarity of two cluster members. * @return average similarity */ - public float averageSimilarity(BiFunction similarity) { + private float averageSimilarity(BiFunction similarity) { List members = new ArrayList<>(getMembers()); if (members.size() < 2) { return 1; @@ -108,6 +119,7 @@ private int connections() { /** * Whether this cluster is very uninformative or wrong and should be pruned as last step of the clustering process. + * * @return is bad */ public boolean isBadCluster() { diff --git a/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java b/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java index 8ff57851d..c94893ebe 100644 --- a/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java +++ b/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java @@ -28,8 +28,9 @@ public class ClusteringAdapter { /** * Creates the clustering adapter. Only submissions that appear in those similarities might also appear in * {@link ClusteringResult}s obtained from this adapter. + * * @param comparisons that should be included in the process of clustering - * @param metric function that assigns a similarity to each comparison + * @param metric function that assigns a similarity to each comparison */ public ClusteringAdapter(Collection comparisons, Function metric) { mapping = new IntegerMapping<>(comparisons.size()); @@ -52,6 +53,7 @@ public ClusteringAdapter(Collection comparisons, Function doClustering(GenericClusteringAlgorithm algo ClusteringResult modularityClusterResult = ClusteringResult.fromIntegerCollections(new ArrayList<>(intResult), similarityMatrix); List> mappedClusters = modularityClusterResult.getClusters().stream() .map(unmappedCluster -> new Cluster<>(unmappedCluster.getMembers().stream().map(mapping::unmap).collect(Collectors.toList()), - unmappedCluster.getCommunityStrength())) + unmappedCluster.getCommunityStrength(), unmappedCluster.getAverageSimilarity())) .collect(Collectors.toList()); return new ClusteringResult<>(mappedClusters, modularityClusterResult.getCommunityStrength()); } diff --git a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java index cf8bc57af..007b5eae9 100644 --- a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java +++ b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java @@ -1,12 +1,6 @@ package de.jplag.clustering; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.function.BiFunction; import java.util.stream.DoubleStream; @@ -15,12 +9,13 @@ /** * Set of clusters dividing a set of entities. + * * @param type of the clustered entities (e.g. Submission) */ public class ClusteringResult { private final List> clusters; - private float communityStrength = 0; + private float communityStrength; public ClusteringResult(Collection> clusters, float communityStrength) { this.clusters = List.copyOf(clusters); @@ -40,6 +35,7 @@ public Collection> getClusters() { * changed, a higher community strength denotes a better clustering. See: Finding and evaluating community structure in * networks, M. E. J. Newman and M. Girvan, Phys. Rev. E 69, 026113 – Published 26 February 2004, Doi: * 10.1103/PhysRevE.69.026113 It's called modularity in that paper. + * * @return community strength */ public float getCommunityStrength() { @@ -48,6 +44,7 @@ public float getCommunityStrength() { /** * How much this clustering result is worth during optimization. + * * @param similarity TODO DF: JAVADOC * @return worth */ @@ -92,11 +89,33 @@ public static ClusteringResult fromIntegerCollections(List(clustering.get(i), (float) clusterCommunityStrength)); + float averageSimilarity = calculateAverageSimilarityFor(clustering.get(i), similarity); + clusters.add(new Cluster<>(clustering.get(i), (float) clusterCommunityStrength, averageSimilarity)); communityStrength += clusterCommunityStrength; } } return new ClusteringResult<>(clusters, communityStrength); } + private static float calculateAverageSimilarityFor(Collection cluster, RealMatrix similarityMatrix) { + var sumOfSimilarities = 0f; + var submissionIndicesWithoutIndicesAlreadyProcessed = new ArrayList<>(List.copyOf(cluster)); + for (Integer indexOfSubmission1 : cluster) { + for (Integer indexOfSubmission2 : submissionIndicesWithoutIndicesAlreadyProcessed) { + if (!Objects.equals(indexOfSubmission1, indexOfSubmission2)) { + sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2); + } + } + submissionIndicesWithoutIndicesAlreadyProcessed.remove(indexOfSubmission1); // remove indices we already processed from second list to avoid adding them anew unnecessary + } + int nMinusOne = cluster.size() - 1; + float numberOfComparisons = (nMinusOne * (nMinusOne + 1)) / 2f; /* Use Gauss sum to calculate number of comparisons in cluster: Given cluster of size n we need Gauss sum of n-1 comparisons: + compare first element of cluster to all other except itself: n-1 comparisons. compare second element two all other except itself and first element + (as these two were already compared when we processed the first element), n-2 comparisons. compare third element to all other but itself and all previously compared: n-3 comparisons and so on. + when we reach the second to last element we have n-(n-1)=1 comparisons left. when we reach the last element it has already been compared to all other. + adding up all comparisons we get: (n-1) + (n-2) + (n-3) + ... + (n-(n-1)) = Gauss sum of (n-1) + */ + return sumOfSimilarities / numberOfComparisons; + } + } diff --git a/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java b/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java index 29c6bac88..3f229f538 100644 --- a/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java +++ b/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java @@ -4,32 +4,6 @@ import com.fasterxml.jackson.annotation.JsonProperty; -public class Cluster { - - @JsonProperty("average_similarity") - private final float averageSimilarity; - - @JsonProperty("strength") - private final float strength; - - @JsonProperty("members") - private final List members; - - public Cluster(float averageSimilarity, float strength, List members) { - this.averageSimilarity = averageSimilarity; - this.strength = strength; - this.members = List.copyOf(members); - } - - public float getAverageSimilarity() { - return averageSimilarity; - } - - public float getStrength() { - return strength; - } - - public List getMembers() { - return members; - } +public record Cluster(@JsonProperty("average_similarity") float averageSimilarity, + @JsonProperty("strength") float strength, @JsonProperty("members") List members) { } diff --git a/jplag/src/test/java/de/jplag/clustering/ClusterTest.java b/jplag/src/test/java/de/jplag/clustering/ClusterTest.java index 5142f61fa..6e7a442a2 100644 --- a/jplag/src/test/java/de/jplag/clustering/ClusterTest.java +++ b/jplag/src/test/java/de/jplag/clustering/ClusterTest.java @@ -13,36 +13,27 @@ public class ClusterTest { private static final double EPSILON = 0.00001; Cluster cluster; - @Test - public void testAverageSimilarity() { - cluster = new Cluster<>(List.of('a', 'b', 'c'), 0); - float averageSimilarity = cluster.averageSimilarity((a, b) -> { - return Math.abs((float) (((int) a) - ((int) b))); - }); - assertEquals((1.f + 2.f + 1.f + 1.f + 2.f + 1.f) / 6, averageSimilarity, EPSILON); - } - @Test public void testCommunityStrengthPerConnectionOneMember() { - cluster = new Cluster<>(List.of('a'), 10); + cluster = new Cluster<>(List.of('a'), 10, 0); assertEquals(0.0, cluster.getCommunityStrengthPerConnection(), EPSILON); } @Test public void testCommunityStrengthPerConnectionTwoMembers() { - cluster = new Cluster<>(List.of('a', 'b'), 10); + cluster = new Cluster<>(List.of('a', 'b'), 10, 0); assertEquals(10.0, cluster.getCommunityStrengthPerConnection(), EPSILON); } @Test public void testCommunityStrengthPerConnectionThreeMembers() { - cluster = new Cluster<>(List.of('a', 'b', 'c'), 10); + cluster = new Cluster<>(List.of('a', 'b', 'c'), 10, 0); assertEquals(10.0 / 3, cluster.getCommunityStrengthPerConnection(), EPSILON); } @Test public void testNormalizedCommunityStrength() { - cluster = new Cluster<>(List.of('a', 'b', 'c'), 10); + cluster = new Cluster<>(List.of('a', 'b', 'c'), 10, 0); @SuppressWarnings("unchecked") ClusteringResult clusteringResult = mock(ClusteringResult.class); when(clusteringResult.getClusters()).thenReturn(List.of(cluster, cluster)); diff --git a/jplag/src/test/java/de/jplag/clustering/ClusteringRealDataTest.java b/jplag/src/test/java/de/jplag/clustering/ClusteringRealDataTest.java deleted file mode 100644 index 8e3844332..000000000 --- a/jplag/src/test/java/de/jplag/clustering/ClusteringRealDataTest.java +++ /dev/null @@ -1,215 +0,0 @@ -package de.jplag.clustering; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assumptions.assumeTrue; - -import java.io.File; -import java.io.FileNotFoundException; -import java.net.URISyntaxException; -import java.net.URL; -import java.util.*; -import java.util.stream.Collectors; - -import org.apache.commons.math3.linear.Array2DRowRealMatrix; -import org.apache.commons.math3.linear.RealMatrix; -import org.junit.jupiter.api.Test; - -import de.jplag.clustering.algorithm.GenericClusteringAlgorithm; -import de.jplag.clustering.algorithm.SpectralClustering; -import de.jplag.clustering.preprocessors.CumulativeDistributionFunctionPreprocessor; - -/** - * These test are not meant to be run during normal unit testing. They can be used to test the clustering algorithms - * against data from the private pseudomized reports repository. These tests test PROBABILISTIC behavior, so use with - * caution! - */ -public class ClusteringRealDataTest { - - private static class TestFile { - - private String uri; - private Optional> expected; - - public TestFile(String uri, Optional> expected) { - this.uri = uri; - this.expected = expected; - } - } - - private static List B_POSITIVE = Arrays.asList(new String[] {"Student (31)", "Student (223)"}); - private static List C_POSITIVE = Arrays.asList(new String[] {"Student (166)", "Student (212)", "Student (236)", "Student (229)"}); - - private static final TestFile[] OLD_CLUSTERING_DATA = {new TestFile("de/jplag/PseudonymizedReports/alt/A_1000_matches_max.csv", Optional.empty()), - new TestFile("de/jplag/PseudonymizedReports/alt/B_1000_matches_max.csv", Optional.of(B_POSITIVE)), - new TestFile("de/jplag/PseudonymizedReports/alt/C_1000_matches_max.csv", Optional.of(C_POSITIVE)),}; - - private static final TestFile[] NEW_CLUSTERING_DATA = {new TestFile("de/jplag/PseudonymizedReports/neu/A_matches_avg.csv", Optional.empty()), - new TestFile("de/jplag/PseudonymizedReports/neu/B_matches_avg.csv", Optional.of(B_POSITIVE)), - new TestFile("de/jplag/PseudonymizedReports/neu/C_matches_avg.csv", Optional.of(C_POSITIVE)),}; - - private String str(float f) { - return String.format("%.4f", f); - } - - private URL loadFromClasspath(String file) throws FileNotFoundException { - URL url = getClass().getClassLoader().getResource(file); - if (url == null) { - assumeTrue(false, file + " not found. 'de/jpag/PseudonymizedReports' must contain the data from the PseudonymizedReports repository."); - } - return url; - } - - private void doTesting(ReadResult readResult, Optional> expected) { - RealMatrix clusteringSimilarity = new Array2DRowRealMatrix(readResult.similarity.getData()); - - /* - * AgglomerativeClustering.ClusteringOptions options = new AgglomerativeClustering.ClusteringOptions(); - * options.minimalSimilarity = 0.15f; options.similarity = AgglomerativeClustering.InterClusterSimilarity.AVERAGE; - * ClusteringAlgorithm clusteringAlg = new AgglomerativeClustering(options); - */ - - SpectralClustering clusteringAlg = new SpectralClustering(ClusteringOptions.DEFAULTS); - ClusteringPreprocessor preprocessor = new CumulativeDistributionFunctionPreprocessor(); - GenericClusteringAlgorithm preprocessedClusteringAlg = new PreprocessedClusteringAlgorithm(clusteringAlg, preprocessor); - Collection> clustering = preprocessedClusteringAlg.cluster(clusteringSimilarity); - ClusteringResult mRes = ClusteringResult.fromIntegerCollections(new ArrayList<>(clustering), readResult.similarity); - List> clusters = new ArrayList<>(mRes.getClusters()); - clusters.sort(Comparator.comparingDouble(c -> -c.getNormalizedCommunityStrengthPerConnection())); - - System.out.println("cs\tncsm\tavgSim\tcombined\tmembers"); - for (Cluster c : clusters) { - float ncsm = c.getCommunityStrengthPerConnection(); - float avgSim = c.averageSimilarity((a, b) -> (float) readResult.similarity.getEntry(a, b)); - System.out.println(str(c.getCommunityStrength()) + "\t" + str(ncsm) + "\t" + str(avgSim) + "\t" - + c.getMembers().stream().map(readResult.mapping::unmap).collect(Collectors.toList())); - } - System.out.println("Community Strength: " + mRes.getCommunityStrength()); - System.out.println("Clusters: " + clusters.size()); - - expected.ifPresent(expectedIdentifiers -> { - Set expectedIdentifiersSet = new HashSet<>(expectedIdentifiers); - Set bestClusters = clusters.get(0).getMembers().stream().map(readResult.mapping::unmap).collect(Collectors.toSet()); - assertEquals(expectedIdentifiersSet, bestClusters); - System.out.println("hey"); - }); - } - - @Test - public void aClusteringOld() throws FileNotFoundException, URISyntaxException { - for (TestFile testFile : OLD_CLUSTERING_DATA) { - URL url = loadFromClasspath(testFile.uri); - File file = new File(url.toURI()); - ReadResult r = readOldCsv(file); - doTesting(r, testFile.expected); - } - } - - @Test - public void aClusteringNew() throws FileNotFoundException, URISyntaxException { - for (TestFile testFile : NEW_CLUSTERING_DATA) { - URL url = loadFromClasspath(testFile.uri); - File file = new File(url.toURI()); - ReadResult r = readNewCsv(file); - doTesting(r, testFile.expected); - } - } - - private static class ReadComparison { - int left; - int right; - float similarity; - } - - private static class ReadResult { - IntegerMapping mapping; - RealMatrix similarity; - } - - private static ReadResult readNewCsv(File fileName) throws FileNotFoundException { - IntegerMapping mapping = new IntegerMapping<>(512); - List comparisons = new ArrayList<>(512); - try (CSVReader reader = new CSVReader(fileName, ";")) { - while (reader.hasNext()) { - List records = reader.next(); - if (records.isEmpty()) - continue; - String leftStudent = records.get(1); - String rightStudent = records.get(2); - String similarity = records.get(3); - ReadComparison comparison = new ReadComparison(); - comparison.left = mapping.map(leftStudent); - comparison.right = mapping.map(rightStudent); - comparison.similarity = Float.parseFloat(similarity) / 100; - comparisons.add(comparison); - } - } - RealMatrix matrix = new Array2DRowRealMatrix(mapping.size(), mapping.size()); - for (ReadComparison comparison : comparisons) { - matrix.setEntry(comparison.left, comparison.right, comparison.similarity); - matrix.setEntry(comparison.right, comparison.left, comparison.similarity); - } - ReadResult r = new ReadResult(); - r.similarity = matrix; - r.mapping = mapping; - return r; - } - - private static ReadResult readOldCsv(File fileName) throws FileNotFoundException { - IntegerMapping mapping = new IntegerMapping<>(512); - List comparisons = new ArrayList<>(512); - try (CSVReader reader = new CSVReader(fileName, ";")) { - while (reader.hasNext()) { - List records = reader.next(); - if (records.isEmpty()) - continue; - Deque stuff = new ArrayDeque<>(records); - String leftStudent = stuff.removeFirst(); - int leftID = mapping.map(leftStudent); - while (stuff.size() >= 3) { - ReadComparison comparison = new ReadComparison(); - comparison.left = leftID; - stuff.removeFirst(); // comparison ID not needed - String rightStudent = stuff.removeFirst(); - String similarity = stuff.removeFirst(); - comparison.right = mapping.map(rightStudent); - comparison.similarity = Float.parseFloat(similarity) / 100; - comparisons.add(comparison); - } - } - } - RealMatrix matrix = new Array2DRowRealMatrix(mapping.size(), mapping.size()); - for (ReadComparison comparison : comparisons) { - matrix.setEntry(comparison.left, comparison.right, comparison.similarity); - matrix.setEntry(comparison.right, comparison.left, comparison.similarity); - } - ReadResult r = new ReadResult(); - r.similarity = matrix; - r.mapping = mapping; - return r; - } - - private static class CSVReader implements AutoCloseable { - private String delimiter; - private Scanner scanner; - - private CSVReader(File fileName, String delimiter) throws FileNotFoundException { - this.delimiter = delimiter; - scanner = new Scanner(fileName); - } - - @Override - public void close() { - scanner.close(); - } - - List next() { - String line = scanner.nextLine(); - String[] records = line.split(delimiter); - return Arrays.asList(records); - } - - boolean hasNext() { - return scanner.hasNextLine(); - } - } -} diff --git a/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java b/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java index 0c0d7fbd4..5bc2484f0 100644 --- a/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java +++ b/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java @@ -2,6 +2,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.ArrayList; import java.util.List; import org.apache.commons.math3.linear.Array2DRowRealMatrix; @@ -49,8 +50,64 @@ void uniformClustering() { assertEquals(0.0, result.getCommunityStrength(), 0.00001); } - private static void setEntries(RealMatrix matrix, int i, int j, double value) { - matrix.setEntry(i, j, value); - matrix.setEntry(j, i, value); + @Test + void averageSimilarity() { + var similarity = new Array2DRowRealMatrix(4, 4); + + setEntries(similarity, 0, 1, 0.5); + setEntries(similarity, 0, 2, 0.3); + setEntries(similarity, 0, 3, 0.4); + setEntries(similarity, 1, 2, 0.1); + setEntries(similarity, 1, 3, 0.1); + setEntries(similarity, 2, 3, 0.7); + + ClusteringResult result = ClusteringResult.fromIntegerCollections(List.of(List.of(0, 1, 2, 3)), similarity); + + assertEquals(0.35, result.getClusters().stream().findFirst().orElseThrow().getAverageSimilarity(), 0.00001); + } + + @Test + void averageSimilarity2() { + var similarity = new Array2DRowRealMatrix(6, 6); + + setEntries(similarity, 0, 1, 0.5); + setEntries(similarity, 0, 2, 0.3); + setEntries(similarity, 0, 3, 0.4); + setEntries(similarity, 0, 4, 0.4); + setEntries(similarity, 1, 2, 0.1); + setEntries(similarity, 1, 3, 0.1); + setEntries(similarity, 1, 4, 0.3); + setEntries(similarity, 2, 3, 0.7); + setEntries(similarity, 2, 4, 0.2); + setEntries(similarity, 2, 5, 0.9); + setEntries(similarity, 3, 4, 0.5); + setEntries(similarity, 3, 5, 0.05); + + + ClusteringResult result = ClusteringResult.fromIntegerCollections(List.of(List.of(0, 1, 4), List.of(2, 3, 5)), similarity); + var clusters = new ArrayList<>(result.getClusters()); + + assertEquals(0.4, clusters.get(0).getAverageSimilarity(), 0.00001); + assertEquals(0.55, clusters.get(1).getAverageSimilarity(), 0.00001); + } + + @Test + void averageSimilarityPerfectClustering() { + RealMatrix similarity = new Array2DRowRealMatrix(4, 4); + + // These are similar + setEntries(similarity, 0, 1, 1f); + setEntries(similarity, 2, 3, 1f); + + // Others are dissimilar + + ClusteringResult result = ClusteringResult.fromIntegerCollections(List.of(List.of(0, 1), List.of(2, 3)), similarity); + var cluster = result.getClusters().stream().findFirst().orElseThrow(); + assertEquals(1f, cluster.getAverageSimilarity(), 0.00001); + } + + private static void setEntries(RealMatrix matrix, int i, int j, double similarity) { + matrix.setEntry(i, j, similarity); + matrix.setEntry(j, i, similarity); } } From ae68a2f93deef3ea90973708dae5e6e8657a3e71 Mon Sep 17 00:00:00 2001 From: Nesta Bentum Date: Fri, 10 Jun 2022 13:05:32 +0200 Subject: [PATCH 2/3] Write ClusteringResult to Report --- .../java/de/jplag/clustering/Cluster.java | 8 ---- .../jplag/clustering/ClusteringAdapter.java | 4 +- .../de/jplag/clustering/ClusteringResult.java | 22 +++++---- .../reportobject/ReportObjectFactory.java | 11 ++--- .../mapper/ClusteringResultMapper.java | 26 ++++++++++ .../reporting/reportobject/model/Cluster.java | 4 +- .../clustering/ClusteringResultTest.java | 1 - .../mapper/ClusteringResultMapperTest.java | 48 +++++++++++++++++++ 8 files changed, 92 insertions(+), 32 deletions(-) create mode 100644 jplag/src/main/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapper.java create mode 100644 jplag/src/test/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapperTest.java diff --git a/jplag/src/main/java/de/jplag/clustering/Cluster.java b/jplag/src/main/java/de/jplag/clustering/Cluster.java index 8a1225b23..21b03fc5e 100644 --- a/jplag/src/main/java/de/jplag/clustering/Cluster.java +++ b/jplag/src/main/java/de/jplag/clustering/Cluster.java @@ -4,11 +4,9 @@ import java.util.Collection; import java.util.List; import java.util.function.BiFunction; -import java.util.stream.Collectors; /** * Cluster part of a {@link ClusteringResult}. - * * @param type of the clusters members */ public class Cluster { @@ -18,7 +16,6 @@ public class Cluster { private ClusteringResult clusteringResult = null; private final float averageSimilarity; - public Cluster(Collection members, float communityStrength, float averageSimilarity) { this.members = new ArrayList<>(members); this.communityStrength = communityStrength; @@ -36,7 +33,6 @@ public float getAverageSimilarity() { /** * See {@link ClusteringResult#getCommunityStrength} - * * @return community strength of the cluster */ public float getCommunityStrength() { @@ -46,7 +42,6 @@ public float getCommunityStrength() { /** * Sets this clusters clustering result. Should only be called by classes extending {@link ClusteringResult} on their * own clusters. - * * @param clusteringResult the clustering result */ public void setClusteringResult(ClusteringResult clusteringResult) { @@ -67,7 +62,6 @@ public float getCommunityStrengthPerConnection() { * Computes a normalized community strength per connection. Can be used as measure for strength of evidence in * comparison to other clusters in the same clustering. Guaranteed to be smaller than 1. Negative values indicate * non-clusters. This method may only be called on clusters that are part of a ClusteringResult. - * * @return normalized community strength per connection */ public float getNormalizedCommunityStrengthPerConnection() { @@ -94,7 +88,6 @@ public double getWorth(BiFunction similarity) { /** * Computes the average similarity inside the cluster. - * * @param similarity function that supplies the similarity of two cluster members. * @return average similarity */ @@ -119,7 +112,6 @@ private int connections() { /** * Whether this cluster is very uninformative or wrong and should be pruned as last step of the clustering process. - * * @return is bad */ public boolean isBadCluster() { diff --git a/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java b/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java index c94893ebe..19b8dbfa9 100644 --- a/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java +++ b/jplag/src/main/java/de/jplag/clustering/ClusteringAdapter.java @@ -28,9 +28,8 @@ public class ClusteringAdapter { /** * Creates the clustering adapter. Only submissions that appear in those similarities might also appear in * {@link ClusteringResult}s obtained from this adapter. - * * @param comparisons that should be included in the process of clustering - * @param metric function that assigns a similarity to each comparison + * @param metric function that assigns a similarity to each comparison */ public ClusteringAdapter(Collection comparisons, Function metric) { mapping = new IntegerMapping<>(comparisons.size()); @@ -53,7 +52,6 @@ public ClusteringAdapter(Collection comparisons, Function type of the clustered entities (e.g. Submission) */ public class ClusteringResult { @@ -35,7 +34,6 @@ public Collection> getClusters() { * changed, a higher community strength denotes a better clustering. See: Finding and evaluating community structure in * networks, M. E. J. Newman and M. Girvan, Phys. Rev. E 69, 026113 – Published 26 February 2004, Doi: * 10.1103/PhysRevE.69.026113 It's called modularity in that paper. - * * @return community strength */ public float getCommunityStrength() { @@ -44,7 +42,6 @@ public float getCommunityStrength() { /** * How much this clustering result is worth during optimization. - * * @param similarity TODO DF: JAVADOC * @return worth */ @@ -106,15 +103,20 @@ private static float calculateAverageSimilarityFor(Collection cluster, sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2); } } - submissionIndicesWithoutIndicesAlreadyProcessed.remove(indexOfSubmission1); // remove indices we already processed from second list to avoid adding them anew unnecessary + submissionIndicesWithoutIndicesAlreadyProcessed.remove(indexOfSubmission1); // remove indices we already processed from second list to + // avoid adding them anew unnecessary } int nMinusOne = cluster.size() - 1; - float numberOfComparisons = (nMinusOne * (nMinusOne + 1)) / 2f; /* Use Gauss sum to calculate number of comparisons in cluster: Given cluster of size n we need Gauss sum of n-1 comparisons: - compare first element of cluster to all other except itself: n-1 comparisons. compare second element two all other except itself and first element - (as these two were already compared when we processed the first element), n-2 comparisons. compare third element to all other but itself and all previously compared: n-3 comparisons and so on. - when we reach the second to last element we have n-(n-1)=1 comparisons left. when we reach the last element it has already been compared to all other. - adding up all comparisons we get: (n-1) + (n-2) + (n-3) + ... + (n-(n-1)) = Gauss sum of (n-1) - */ + float numberOfComparisons = (nMinusOne * (nMinusOne + 1)) + / 2f; /* + * Use Gauss sum to calculate number of comparisons in cluster: Given cluster of size n we need Gauss sum of n-1 + * comparisons: compare first element of cluster to all other except itself: n-1 comparisons. compare second element two + * all other except itself and first element (as these two were already compared when we processed the first element), + * n-2 comparisons. compare third element to all other but itself and all previously compared: n-3 comparisons and so + * on. when we reach the second to last element we have n-(n-1)=1 comparisons left. when we reach the last element it + * has already been compared to all other. adding up all comparisons we get: (n-1) + (n-2) + (n-3) + ... + (n-(n-1)) = + * Gauss sum of (n-1) + */ return sumOfSimilarities / numberOfComparisons; } diff --git a/jplag/src/main/java/de/jplag/reporting/reportobject/ReportObjectFactory.java b/jplag/src/main/java/de/jplag/reporting/reportobject/ReportObjectFactory.java index ad97eb92c..03861ee68 100644 --- a/jplag/src/main/java/de/jplag/reporting/reportobject/ReportObjectFactory.java +++ b/jplag/src/main/java/de/jplag/reporting/reportobject/ReportObjectFactory.java @@ -12,6 +12,7 @@ import org.slf4j.LoggerFactory; import de.jplag.*; +import de.jplag.reporting.reportobject.mapper.ClusteringResultMapper; import de.jplag.reporting.reportobject.model.*; import de.jplag.reporting.reportobject.model.Match; @@ -21,6 +22,7 @@ public class ReportObjectFactory { private static final Logger logger = LoggerFactory.getLogger(ReportObjectFactory.class); + private static final ClusteringResultMapper clusteringResultMapper = new ClusteringResultMapper(); /** * Converts a JPlagResult to a JPlagReport. @@ -58,7 +60,7 @@ private static OverviewReport generateOverviewReport(JPlagResult result) { overviewReport.setExecutionTime(result.getDuration()); overviewReport.setComparisonNames(getComparisonNames(comparisons)); overviewReport.setMetrics(getMetrics(result)); - overviewReport.setClusters(getClusters(result)); + overviewReport.setClusters(clusteringResultMapper.map(result)); return overviewReport; } @@ -166,13 +168,6 @@ private static Match convertMatchToReportMatch(JPlagComparison comparison, de.jp return new Match(startTokenFirst.getFile(), startTokenSecond.getFile(), startFirst, endFirst, startSecond, endSecond, tokens); } - // TODO implement after PR Read clustering #281 - private static List getClusters(JPlagResult result) { - // List> clusters = result.getClusteringResult(); - // return clusters.map( c -> new Cluster(getAvgSimilarity, getStrength, c.getMembers().map(Submission::getName))) - return List.of(); - } - private static List readFileLines(File file) { List lines = new ArrayList<>(); try (BufferedReader bufferedReader = new BufferedReader(new FileReader(file))) { diff --git a/jplag/src/main/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapper.java b/jplag/src/main/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapper.java new file mode 100644 index 000000000..ea75f2cbd --- /dev/null +++ b/jplag/src/main/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapper.java @@ -0,0 +1,26 @@ +package de.jplag.reporting.reportobject.mapper; + +import java.util.Collection; +import java.util.List; + +import de.jplag.JPlagResult; +import de.jplag.Submission; +import de.jplag.clustering.ClusteringResult; +import de.jplag.reporting.reportobject.model.Cluster; + +/** + * Extracts and maps the clusters from the JPlagResult to the corresponding JSON DTO + */ +public class ClusteringResultMapper { + public List map(JPlagResult result) { + var clusteringResult = result.getClusteringResult(); + return clusteringResult.stream().map(ClusteringResult::getClusters).flatMap(Collection::stream).map(this::convertCluster).toList(); + } + + private Cluster convertCluster(de.jplag.clustering.Cluster from) { + var strength = from.getCommunityStrength(); + var avgSimilarity = from.getAverageSimilarity(); + var member = from.getMembers().stream().map(Submission::getName).toList(); + return new Cluster(avgSimilarity, strength, member); + } +} diff --git a/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java b/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java index 3f229f538..17af0728d 100644 --- a/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java +++ b/jplag/src/main/java/de/jplag/reporting/reportobject/model/Cluster.java @@ -4,6 +4,6 @@ import com.fasterxml.jackson.annotation.JsonProperty; -public record Cluster(@JsonProperty("average_similarity") float averageSimilarity, - @JsonProperty("strength") float strength, @JsonProperty("members") List members) { +public record Cluster(@JsonProperty("average_similarity") float averageSimilarity, @JsonProperty("strength") float strength, + @JsonProperty("members") List members) { } diff --git a/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java b/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java index 5bc2484f0..4b01691b6 100644 --- a/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java +++ b/jplag/src/test/java/de/jplag/clustering/ClusteringResultTest.java @@ -83,7 +83,6 @@ void averageSimilarity2() { setEntries(similarity, 3, 4, 0.5); setEntries(similarity, 3, 5, 0.05); - ClusteringResult result = ClusteringResult.fromIntegerCollections(List.of(List.of(0, 1, 4), List.of(2, 3, 5)), similarity); var clusters = new ArrayList<>(result.getClusters()); diff --git a/jplag/src/test/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapperTest.java b/jplag/src/test/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapperTest.java new file mode 100644 index 000000000..103126841 --- /dev/null +++ b/jplag/src/test/java/de/jplag/reporting/reportobject/mapper/ClusteringResultMapperTest.java @@ -0,0 +1,48 @@ +package de.jplag.reporting.reportobject.mapper; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import de.jplag.JPlagResult; +import de.jplag.Submission; +import de.jplag.clustering.Cluster; +import de.jplag.clustering.ClusteringResult; + +public class ClusteringResultMapperTest { + private final ClusteringResultMapper clusteringResultMapper = new ClusteringResultMapper(); + + @Test + public void test() { + // given + JPlagResult resultMock = mock(JPlagResult.class); + Cluster cluster1 = createClusterWith(0.2f, 0.4f, "1", "2"); + Cluster cluster2 = createClusterWith(0.3f, 0.6f, "3", "4", "5"); + when(resultMock.getClusteringResult()).thenReturn(List.of(new ClusteringResult<>(List.of(cluster1, cluster2), 0.3f))); + + // when + var result = clusteringResultMapper.map(resultMock); + + // then + assertEquals(List.of(new de.jplag.reporting.reportobject.model.Cluster(0.4f, 0.2f, List.of("1", "2")), + new de.jplag.reporting.reportobject.model.Cluster(0.6f, 0.3f, List.of("3", "4", "5")) + + ), result); + } + + private Cluster createClusterWith(Float communityStrength, Float averageSimilarity, String... ids) { + var submissions = Arrays.stream(ids).map(this::submissionWithId).toList(); + return new Cluster<>(submissions, communityStrength, averageSimilarity); + } + + private Submission submissionWithId(String id) { + Submission submission = mock(Submission.class); + when(submission.getName()).thenReturn(id); + return submission; + } +} From 02b1960463dbd0dd2fb0f7a19c9d2612814b097a Mon Sep 17 00:00:00 2001 From: Nesta Bentum Date: Sat, 25 Jun 2022 11:44:11 +0200 Subject: [PATCH 3/3] Improve Cluster Average Similarity Calculation & add JDoc --- .../main/java/de/jplag/clustering/Cluster.java | 5 +++++ .../de/jplag/clustering/ClusteringResult.java | 18 ++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/jplag/src/main/java/de/jplag/clustering/Cluster.java b/jplag/src/main/java/de/jplag/clustering/Cluster.java index 21b03fc5e..d5c5e886a 100644 --- a/jplag/src/main/java/de/jplag/clustering/Cluster.java +++ b/jplag/src/main/java/de/jplag/clustering/Cluster.java @@ -16,6 +16,11 @@ public class Cluster { private ClusteringResult clusteringResult = null; private final float averageSimilarity; + /** + * @param members Members of the cluster. + * @param communityStrength A metric of how strongly the members of this cluster are connected. + * @param averageSimilarity The average similarity between all tuple comparisons of the members in this cluster. + */ public Cluster(Collection members, float communityStrength, float averageSimilarity) { this.members = new ArrayList<>(members); this.communityStrength = communityStrength; diff --git a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java index 3da872b1b..ce46e764c 100644 --- a/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java +++ b/jplag/src/main/java/de/jplag/clustering/ClusteringResult.java @@ -14,7 +14,7 @@ public class ClusteringResult { private final List> clusters; - private float communityStrength; + private final float communityStrength; public ClusteringResult(Collection> clusters, float communityStrength) { this.clusters = List.copyOf(clusters); @@ -96,21 +96,19 @@ public static ClusteringResult fromIntegerCollections(List cluster, RealMatrix similarityMatrix) { var sumOfSimilarities = 0f; - var submissionIndicesWithoutIndicesAlreadyProcessed = new ArrayList<>(List.copyOf(cluster)); - for (Integer indexOfSubmission1 : cluster) { - for (Integer indexOfSubmission2 : submissionIndicesWithoutIndicesAlreadyProcessed) { - if (!Objects.equals(indexOfSubmission1, indexOfSubmission2)) { - sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2); - } + List indices = List.copyOf(cluster); + for (int i = 1; i < cluster.size(); i++) { + int indexOfSubmission1 = indices.get(i); + for (int j = 0; j < i; j++) { // as the similarity matrix is symmetrical we need only iterate over one half of it + int indexOfSubmission2 = indices.get(j); + sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2); } - submissionIndicesWithoutIndicesAlreadyProcessed.remove(indexOfSubmission1); // remove indices we already processed from second list to - // avoid adding them anew unnecessary } int nMinusOne = cluster.size() - 1; float numberOfComparisons = (nMinusOne * (nMinusOne + 1)) / 2f; /* * Use Gauss sum to calculate number of comparisons in cluster: Given cluster of size n we need Gauss sum of n-1 - * comparisons: compare first element of cluster to all other except itself: n-1 comparisons. compare second element two + * comparisons: compare first element of cluster to all other except itself: n-1 comparisons. compare second element to * all other except itself and first element (as these two were already compared when we processed the first element), * n-2 comparisons. compare third element to all other but itself and all previously compared: n-3 comparisons and so * on. when we reach the second to last element we have n-(n-1)=1 comparisons left. when we reach the last element it