Skip to content

Commit

Permalink
Merge pull request #453 from nestabentum/cluster_in_report
Browse files Browse the repository at this point in the history
Write Clusters To JSON Report
  • Loading branch information
tsaglam authored Jun 25, 2022
2 parents 9f441e6 + 02b1960 commit 6a7ba9b
Show file tree
Hide file tree
Showing 10 changed files with 185 additions and 282 deletions.
19 changes: 14 additions & 5 deletions jplag/src/main/java/de/jplag/clustering/Cluster.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import java.util.Collection;
import java.util.List;
import java.util.function.BiFunction;
import java.util.stream.Collectors;

/**
* Cluster part of a {@link ClusteringResult}.
Expand All @@ -15,17 +14,28 @@ public class Cluster<T> {
private final float communityStrength;
private final Collection<T> members;
private ClusteringResult<T> clusteringResult = null;
private final float averageSimilarity;

public Cluster(Collection<T> members, float communityStrength) {
/**
* Creates a cluster from its members and the given metrics.
* <p>
* The members are copied into a new list, so later changes to the passed collection do not affect this cluster.
* @param members Members of the cluster.
* @param communityStrength A metric of how strongly the members of this cluster are connected.
* @param averageSimilarity The average similarity between all tuple comparisons of the members in this cluster.
*/
public Cluster(Collection<T> members, float communityStrength, float averageSimilarity) {
this.members = new ArrayList<>(members);
this.communityStrength = communityStrength;
this.averageSimilarity = averageSimilarity;
}

/**
* Returns the members of this cluster.
* <p>
* The returned collection is the list held by this cluster itself, not a copy.
* @return members of the cluster
*/
public Collection<T> getMembers() {
// TODO Check why access to local attribute.
return members;
}

/**
* @return the average similarity that was passed to this cluster on construction
*/
public float getAverageSimilarity() {
return averageSimilarity;
}

/**
* See {@link ClusteringResult#getCommunityStrength}
* @return community strength of the cluster
Expand Down Expand Up @@ -60,8 +70,7 @@ public float getCommunityStrengthPerConnection() {
* @return normalized community strength per connection
*/
public float getNormalizedCommunityStrengthPerConnection() {
List<Cluster<T>> goodClusters = clusteringResult.getClusters().stream().filter(cluster -> cluster.getCommunityStrength() > 0)
.collect(Collectors.toList());
List<Cluster<T>> goodClusters = clusteringResult.getClusters().stream().filter(cluster -> cluster.getCommunityStrength() > 0).toList();
float posCommunityStrengthSum = (float) goodClusters.stream().mapToDouble(Cluster::getCommunityStrengthPerConnection).sum();

int size = clusteringResult.getClusters().size();
Expand All @@ -87,7 +96,7 @@ public double getWorth(BiFunction<T, T, Float> similarity) {
* @param similarity function that supplies the similarity of two cluster members.
* @return average similarity
*/
public float averageSimilarity(BiFunction<T, T, Float> similarity) {
private float averageSimilarity(BiFunction<T, T, Float> similarity) {
List<T> members = new ArrayList<>(getMembers());
if (members.size() < 2) {
return 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public ClusteringResult<Submission> doClustering(GenericClusteringAlgorithm algo
ClusteringResult<Integer> modularityClusterResult = ClusteringResult.fromIntegerCollections(new ArrayList<>(intResult), similarityMatrix);
List<Cluster<Submission>> mappedClusters = modularityClusterResult.getClusters().stream()
.map(unmappedCluster -> new Cluster<>(unmappedCluster.getMembers().stream().map(mapping::unmap).collect(Collectors.toList()),
unmappedCluster.getCommunityStrength()))
unmappedCluster.getCommunityStrength(), unmappedCluster.getAverageSimilarity()))
.collect(Collectors.toList());
return new ClusteringResult<>(mappedClusters, modularityClusterResult.getCommunityStrength());
}
Expand Down
37 changes: 28 additions & 9 deletions jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
package de.jplag.clustering;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.function.BiFunction;
import java.util.stream.DoubleStream;

Expand All @@ -20,7 +14,7 @@
public class ClusteringResult<T> {

private final List<Cluster<T>> clusters;
private float communityStrength = 0;
private final float communityStrength;

public ClusteringResult(Collection<Cluster<T>> clusters, float communityStrength) {
this.clusters = List.copyOf(clusters);
Expand Down Expand Up @@ -92,11 +86,36 @@ public static ClusteringResult<Integer> fromIntegerCollections(List<Collection<I
for (int i = 0; i < clustering.size(); i++) {
double outWeightSum = percentagesOfSimilaritySums.getRowVector(i).getL1Norm();
double clusterCommunityStrength = percentagesOfSimilaritySums.getEntry(i, i) - outWeightSum * outWeightSum;
clusters.add(new Cluster<>(clustering.get(i), (float) clusterCommunityStrength));
float averageSimilarity = calculateAverageSimilarityFor(clustering.get(i), similarity);
clusters.add(new Cluster<>(clustering.get(i), (float) clusterCommunityStrength, averageSimilarity));
communityStrength += clusterCommunityStrength;
}
}
return new ClusteringResult<>(clusters, communityStrength);
}

/**
 * Computes the mean pairwise similarity of the members of one cluster.
 * <p>
 * Every unordered pair of distinct members is read exactly once from the similarity matrix, which is treated as
 * symmetric. For a cluster of size n that is the Gauss sum of n-1, i.e. (n-1) + (n-2) + ... + 1 = n * (n-1) / 2
 * comparisons: the first member is compared to the n-1 others, the second to the n-2 members it has not been compared
 * with yet, and so on.
 * @param cluster indices of the cluster's members; used as row/column indices into {@code similarityMatrix}
 * @param similarityMatrix pairwise similarities, addressed by the indices contained in {@code cluster}
 * @return the average similarity over all member pairs, or 1 if the cluster has fewer than two members
 */
private static float calculateAverageSimilarityFor(Collection<Integer> cluster, RealMatrix similarityMatrix) {
    List<Integer> indices = List.copyOf(cluster);
    int size = indices.size();
    if (size < 2) {
        // Without a pair there are zero comparisons and the division below would produce NaN.
        // Return 1 instead, the same value Cluster#averageSimilarity uses for fewer than two members.
        return 1;
    }

    // Accumulate in double: the matrix entries are doubles and a float accumulator would round on every addition.
    double sumOfSimilarities = 0;
    for (int i = 1; i < size; i++) {
        int indexOfSubmission1 = indices.get(i);
        for (int j = 0; j < i; j++) { // as the similarity matrix is symmetrical we need only iterate over one half of it
            int indexOfSubmission2 = indices.get(j);
            sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2);
        }
    }

    // Computed as long so that the product cannot overflow int for very large clusters.
    long numberOfComparisons = (long) size * (size - 1) / 2;
    return (float) (sumOfSimilarities / numberOfComparisons);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.slf4j.LoggerFactory;

import de.jplag.*;
import de.jplag.reporting.reportobject.mapper.ClusteringResultMapper;
import de.jplag.reporting.reportobject.model.*;
import de.jplag.reporting.reportobject.model.Match;

Expand All @@ -21,6 +22,7 @@
public class ReportObjectFactory {

private static final Logger logger = LoggerFactory.getLogger(ReportObjectFactory.class);
private static final ClusteringResultMapper clusteringResultMapper = new ClusteringResultMapper();

/**
* Converts a JPlagResult to a JPlagReport.
Expand Down Expand Up @@ -59,7 +61,7 @@ private static OverviewReport generateOverviewReport(JPlagResult result) {
overviewReport.setExecutionTime(result.getDuration());
overviewReport.setComparisonNames(getComparisonNames(comparisons));
overviewReport.setMetrics(getMetrics(result));
overviewReport.setClusters(getClusters(result));
overviewReport.setClusters(clusteringResultMapper.map(result));

return overviewReport;
}
Expand Down Expand Up @@ -168,13 +170,6 @@ private static Match convertMatchToReportMatch(JPlagComparison comparison, de.jp
return new Match(startTokenFirst.getFile(), startTokenSecond.getFile(), startFirst, endFirst, startSecond, endSecond, tokens);
}

// TODO implement after PR Read clustering #281
private static List<Cluster> getClusters(JPlagResult result) {
// List<ClusteringResult<Submission>> clusters = result.getClusteringResult();
// return clusters.map( c -> new Cluster(getAvgSimilarity, getStrength, c.getMembers().map(Submission::getName)))
return List.of();
}

private static List<String> readFileLines(File file) {
List<String> lines = new ArrayList<>();
try (BufferedReader bufferedReader = new BufferedReader(new FileReader(file))) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package de.jplag.reporting.reportobject.mapper;

import java.util.Collection;
import java.util.List;

import de.jplag.JPlagResult;
import de.jplag.Submission;
import de.jplag.clustering.ClusteringResult;
import de.jplag.reporting.reportobject.model.Cluster;

/**
 * Maps the clustering results of a {@link JPlagResult} to the {@link Cluster} DTOs of the JSON report.
 */
public class ClusteringResultMapper {

    /**
     * Flattens all clustering results of the given JPlag result into a single list of report clusters.
     * @param result the JPlag result whose clustering results are converted
     * @return one report cluster for every cluster found in any of the clustering results
     */
    public List<Cluster> map(JPlagResult result) {
        return result.getClusteringResult().stream()
                .map(ClusteringResult::getClusters)
                .flatMap(Collection::stream)
                .map(this::convertCluster)
                .toList();
    }

    /**
     * Converts a single cluster of submissions into its report representation, identifying members by name.
     */
    private Cluster convertCluster(de.jplag.clustering.Cluster<Submission> from) {
        List<String> memberNames = from.getMembers().stream().map(Submission::getName).toList();
        return new Cluster(from.getAverageSimilarity(), from.getCommunityStrength(), memberNames);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,6 @@

import com.fasterxml.jackson.annotation.JsonProperty;

public class Cluster {

@JsonProperty("average_similarity")
private final float averageSimilarity;

@JsonProperty("strength")
private final float strength;

@JsonProperty("members")
private final List<String> members;

public Cluster(float averageSimilarity, float strength, List<String> members) {
this.averageSimilarity = averageSimilarity;
this.strength = strength;
this.members = List.copyOf(members);
}

public float getAverageSimilarity() {
return averageSimilarity;
}

public float getStrength() {
return strength;
}

public List<String> getMembers() {
return members;
}
/**
 * Report DTO describing a single cluster.
 * @param averageSimilarity average similarity of the cluster, serialized as {@code average_similarity}
 * @param strength strength of the cluster, serialized as {@code strength}
 * @param members members of the cluster, serialized as {@code members}; copied on construction
 */
public record Cluster(@JsonProperty("average_similarity") float averageSimilarity, @JsonProperty("strength") float strength,
        @JsonProperty("members") List<String> members) {

    /**
     * Stores an unmodifiable copy of the member list so that the record does not alias a list the caller may still
     * modify.
     * @throws NullPointerException if {@code members} is null or contains a null element
     */
    public Cluster {
        members = List.copyOf(members);
    }
}
17 changes: 4 additions & 13 deletions jplag/src/test/java/de/jplag/clustering/ClusterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,27 @@ public class ClusterTest {
private static final double EPSILON = 0.00001;
Cluster<Character> cluster;

@Test
public void testAverageSimilarity() {
cluster = new Cluster<>(List.of('a', 'b', 'c'), 0);
float averageSimilarity = cluster.averageSimilarity((a, b) -> {
return Math.abs((float) (((int) a) - ((int) b)));
});
assertEquals((1.f + 2.f + 1.f + 1.f + 2.f + 1.f) / 6, averageSimilarity, EPSILON);
}

@Test
public void testCommunityStrengthPerConnectionOneMember() {
cluster = new Cluster<>(List.of('a'), 10);
cluster = new Cluster<>(List.of('a'), 10, 0);
assertEquals(0.0, cluster.getCommunityStrengthPerConnection(), EPSILON);
}

@Test
public void testCommunityStrengthPerConnectionTwoMembers() {
cluster = new Cluster<>(List.of('a', 'b'), 10);
cluster = new Cluster<>(List.of('a', 'b'), 10, 0);
assertEquals(10.0, cluster.getCommunityStrengthPerConnection(), EPSILON);
}

@Test
public void testCommunityStrengthPerConnectionThreeMembers() {
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10);
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10, 0);
assertEquals(10.0 / 3, cluster.getCommunityStrengthPerConnection(), EPSILON);
}

@Test
public void testNormalizedCommunityStrength() {
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10);
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10, 0);
@SuppressWarnings("unchecked")
ClusteringResult<Character> clusteringResult = mock(ClusteringResult.class);
when(clusteringResult.getClusters()).thenReturn(List.of(cluster, cluster));
Expand Down
Loading

0 comments on commit 6a7ba9b

Please sign in to comment.