Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write Clusters To JSON Report #453

Merged
merged 3 commits into from
Jun 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions jplag/src/main/java/de/jplag/clustering/Cluster.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import java.util.Collection;
import java.util.List;
import java.util.function.BiFunction;
import java.util.stream.Collectors;

/**
* Cluster part of a {@link ClusteringResult}.
Expand All @@ -15,17 +14,28 @@ public class Cluster<T> {
private final float communityStrength;
private final Collection<T> members;
private ClusteringResult<T> clusteringResult = null;
private final float averageSimilarity;

public Cluster(Collection<T> members, float communityStrength) {
/**
* @param members Members of the cluster.
* @param communityStrength A metric of how strongly the members of this cluster are connected.
* @param averageSimilarity The average similarity between all tuple comparisons of the members in this cluster.
*/
public Cluster(Collection<T> members, float communityStrength, float averageSimilarity) {
nestabentum marked this conversation as resolved.
Show resolved Hide resolved
this.members = new ArrayList<>(members);
this.communityStrength = communityStrength;
this.averageSimilarity = averageSimilarity;
}

/**
 * Returns the members of this cluster. The collection held by this cluster is handed out directly; no copy is made.
 * @return members of the cluster
 */
public Collection<T> getMembers() {
    // TODO Check why access to local attribute.
    return this.members;
}

/**
 * Returns the average similarity that was supplied when this cluster was constructed.
 * @return average similarity of the cluster
 */
public float getAverageSimilarity() {
    return this.averageSimilarity;
}

/**
* See {@link ClusteringResult#getCommunityStrength}
* @return community strength of the cluster
Expand Down Expand Up @@ -60,8 +70,7 @@ public float getCommunityStrengthPerConnection() {
* @return normalized community strength per connection
*/
public float getNormalizedCommunityStrengthPerConnection() {
List<Cluster<T>> goodClusters = clusteringResult.getClusters().stream().filter(cluster -> cluster.getCommunityStrength() > 0)
.collect(Collectors.toList());
List<Cluster<T>> goodClusters = clusteringResult.getClusters().stream().filter(cluster -> cluster.getCommunityStrength() > 0).toList();
float posCommunityStrengthSum = (float) goodClusters.stream().mapToDouble(Cluster::getCommunityStrengthPerConnection).sum();

int size = clusteringResult.getClusters().size();
Expand All @@ -87,7 +96,7 @@ public double getWorth(BiFunction<T, T, Float> similarity) {
* @param similarity function that supplies the similarity of two cluster members.
* @return average similarity
*/
public float averageSimilarity(BiFunction<T, T, Float> similarity) {
private float averageSimilarity(BiFunction<T, T, Float> similarity) {
List<T> members = new ArrayList<>(getMembers());
if (members.size() < 2) {
return 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public ClusteringResult<Submission> doClustering(GenericClusteringAlgorithm algo
ClusteringResult<Integer> modularityClusterResult = ClusteringResult.fromIntegerCollections(new ArrayList<>(intResult), similarityMatrix);
List<Cluster<Submission>> mappedClusters = modularityClusterResult.getClusters().stream()
.map(unmappedCluster -> new Cluster<>(unmappedCluster.getMembers().stream().map(mapping::unmap).collect(Collectors.toList()),
unmappedCluster.getCommunityStrength()))
unmappedCluster.getCommunityStrength(), unmappedCluster.getAverageSimilarity()))
.collect(Collectors.toList());
return new ClusteringResult<>(mappedClusters, modularityClusterResult.getCommunityStrength());
}
Expand Down
37 changes: 28 additions & 9 deletions jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
package de.jplag.clustering;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.function.BiFunction;
import java.util.stream.DoubleStream;

Expand All @@ -20,7 +14,7 @@
public class ClusteringResult<T> {

private final List<Cluster<T>> clusters;
private float communityStrength = 0;
private final float communityStrength;

public ClusteringResult(Collection<Cluster<T>> clusters, float communityStrength) {
this.clusters = List.copyOf(clusters);
Expand Down Expand Up @@ -92,11 +86,36 @@ public static ClusteringResult<Integer> fromIntegerCollections(List<Collection<I
for (int i = 0; i < clustering.size(); i++) {
double outWeightSum = percentagesOfSimilaritySums.getRowVector(i).getL1Norm();
double clusterCommunityStrength = percentagesOfSimilaritySums.getEntry(i, i) - outWeightSum * outWeightSum;
clusters.add(new Cluster<>(clustering.get(i), (float) clusterCommunityStrength));
float averageSimilarity = calculateAverageSimilarityFor(clustering.get(i), similarity);
clusters.add(new Cluster<>(clustering.get(i), (float) clusterCommunityStrength, averageSimilarity));
communityStrength += clusterCommunityStrength;
}
}
return new ClusteringResult<>(clusters, communityStrength);
}

/**
 * Calculates the average pairwise similarity of the members of one cluster.
 * <p>
 * Every unordered pair of distinct members is looked up exactly once in the similarity matrix. A cluster of size n
 * contains (n-1) + (n-2) + ... + 1 = n * (n-1) / 2 such pairs (Gauss sum of n-1): the first member is compared to
 * the n-1 others, the second to the n-2 members not yet paired with it, and so on.
 * @param cluster indices of the cluster members, used as row and column indices into the similarity matrix.
 * @param similarityMatrix matrix that holds the similarity for each pair of indices.
 * @return mean of all pairwise similarities, or 1 if the cluster has fewer than two members and therefore no pairs
 * (same convention as Cluster#averageSimilarity).
 */
private static float calculateAverageSimilarityFor(Collection<Integer> cluster, RealMatrix similarityMatrix) {
    List<Integer> indices = List.copyOf(cluster);
    int size = indices.size();
    if (size < 2) {
        // Without any pair the division below would be 0 / 0 = NaN.
        return 1;
    }
    var sumOfSimilarities = 0f;
    for (int i = 1; i < size; i++) {
        int indexOfSubmission1 = indices.get(i);
        for (int j = 0; j < i; j++) { // as the similarity matrix is symmetrical we need only iterate over one half of it
            int indexOfSubmission2 = indices.get(j);
            sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2);
        }
    }
    // Long arithmetic keeps the product from overflowing for very large clusters.
    float numberOfComparisons = size * (size - 1L) / 2f;
    return sumOfSimilarities / numberOfComparisons;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.slf4j.LoggerFactory;

import de.jplag.*;
import de.jplag.reporting.reportobject.mapper.ClusteringResultMapper;
import de.jplag.reporting.reportobject.model.*;
import de.jplag.reporting.reportobject.model.Match;

Expand All @@ -21,6 +22,7 @@
public class ReportObjectFactory {

private static final Logger logger = LoggerFactory.getLogger(ReportObjectFactory.class);
private static final ClusteringResultMapper clusteringResultMapper = new ClusteringResultMapper();

/**
* Converts a JPlagResult to a JPlagReport.
Expand Down Expand Up @@ -58,7 +60,7 @@ private static OverviewReport generateOverviewReport(JPlagResult result) {
overviewReport.setExecutionTime(result.getDuration());
overviewReport.setComparisonNames(getComparisonNames(comparisons));
overviewReport.setMetrics(getMetrics(result));
overviewReport.setClusters(getClusters(result));
overviewReport.setClusters(clusteringResultMapper.map(result));

return overviewReport;
}
Expand Down Expand Up @@ -166,13 +168,6 @@ private static Match convertMatchToReportMatch(JPlagComparison comparison, de.jp
return new Match(startTokenFirst.getFile(), startTokenSecond.getFile(), startFirst, endFirst, startSecond, endSecond, tokens);
}

// TODO implement after PR Read clustering #281
private static List<Cluster> getClusters(JPlagResult result) {
// List<ClusteringResult<Submission>> clusters = result.getClusteringResult();
// return clusters.map( c -> new Cluster(getAvgSimilarity, getStrength, c.getMembers().map(Submission::getName)))
return List.of();
}

private static List<String> readFileLines(File file) {
List<String> lines = new ArrayList<>();
try (BufferedReader bufferedReader = new BufferedReader(new FileReader(file))) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package de.jplag.reporting.reportobject.mapper;

import java.util.Collection;
import java.util.List;

import de.jplag.JPlagResult;
import de.jplag.Submission;
import de.jplag.clustering.ClusteringResult;
import de.jplag.reporting.reportobject.model.Cluster;

/**
 * Reads the clustering results of a {@link JPlagResult} and turns every cluster into the DTO used for the JSON report.
 */
public class ClusteringResultMapper {
    /**
     * Flattens the clusters of all clustering results contained in the given result and converts each of them.
     * @param result result whose clustering results are read.
     * @return one DTO per cluster.
     */
    public List<Cluster> map(JPlagResult result) {
        return result.getClusteringResult().stream()
                .flatMap(clustering -> clustering.getClusters().stream())
                .map(this::convertCluster)
                .toList();
    }

    /**
     * Builds the report DTO from a cluster: average similarity, community strength and the names of its members.
     */
    private Cluster convertCluster(de.jplag.clustering.Cluster<Submission> from) {
        List<String> memberNames = from.getMembers().stream().map(Submission::getName).toList();
        return new Cluster(from.getAverageSimilarity(), from.getCommunityStrength(), memberNames);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,6 @@

import com.fasterxml.jackson.annotation.JsonProperty;

public class Cluster {

@JsonProperty("average_similarity")
private final float averageSimilarity;

@JsonProperty("strength")
private final float strength;

@JsonProperty("members")
private final List<String> members;

public Cluster(float averageSimilarity, float strength, List<String> members) {
this.averageSimilarity = averageSimilarity;
this.strength = strength;
this.members = List.copyOf(members);
}

public float getAverageSimilarity() {
return averageSimilarity;
}

public float getStrength() {
return strength;
}

public List<String> getMembers() {
return members;
}
public record Cluster(@JsonProperty("average_similarity") float averageSimilarity, @JsonProperty("strength") float strength,
nestabentum marked this conversation as resolved.
Show resolved Hide resolved
@JsonProperty("members") List<String> members) {
}
17 changes: 4 additions & 13 deletions jplag/src/test/java/de/jplag/clustering/ClusterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,27 @@ public class ClusterTest {
private static final double EPSILON = 0.00001;
Cluster<Character> cluster;

@Test
public void testAverageSimilarity() {
cluster = new Cluster<>(List.of('a', 'b', 'c'), 0);
float averageSimilarity = cluster.averageSimilarity((a, b) -> {
return Math.abs((float) (((int) a) - ((int) b)));
});
assertEquals((1.f + 2.f + 1.f + 1.f + 2.f + 1.f) / 6, averageSimilarity, EPSILON);
}

@Test
public void testCommunityStrengthPerConnectionOneMember() {
cluster = new Cluster<>(List.of('a'), 10);
cluster = new Cluster<>(List.of('a'), 10, 0);
assertEquals(0.0, cluster.getCommunityStrengthPerConnection(), EPSILON);
}

@Test
public void testCommunityStrengthPerConnectionTwoMembers() {
cluster = new Cluster<>(List.of('a', 'b'), 10);
cluster = new Cluster<>(List.of('a', 'b'), 10, 0);
assertEquals(10.0, cluster.getCommunityStrengthPerConnection(), EPSILON);
}

@Test
public void testCommunityStrengthPerConnectionThreeMembers() {
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10);
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10, 0);
assertEquals(10.0 / 3, cluster.getCommunityStrengthPerConnection(), EPSILON);
}

@Test
public void testNormalizedCommunityStrength() {
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10);
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10, 0);
@SuppressWarnings("unchecked")
ClusteringResult<Character> clusteringResult = mock(ClusteringResult.class);
when(clusteringResult.getClusters()).thenReturn(List.of(cluster, cluster));
Expand Down
Loading