Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write Clusters To JSON Report #453

Merged
merged 3 commits into from
Jun 25, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions jplag/src/main/java/de/jplag/clustering/Cluster.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,35 @@

/**
* Cluster part of a {@link ClusteringResult}.
*
* @param <T> type of the clusters members
*/
public class Cluster<T> {

private final float communityStrength;
private final Collection<T> members;
private ClusteringResult<T> clusteringResult = null;
private final float averageSimilarity;

public Cluster(Collection<T> members, float communityStrength) {

/**
* Creates a cluster from its members and two precomputed metrics.
*
* @param members elements of the cluster; they are copied into a new list, so later changes to the passed collection
* do not affect this cluster
* @param communityStrength community strength of this cluster, stored as given
* @param averageSimilarity average similarity of this cluster, stored as given
*/
public Cluster(Collection<T> members, float communityStrength, float averageSimilarity) {
nestabentum marked this conversation as resolved.
Show resolved Hide resolved
this.members = new ArrayList<>(members);
this.communityStrength = communityStrength;
this.averageSimilarity = averageSimilarity;
}

/**
* Returns the members of this cluster.
*
* @return the cluster's internal member collection itself, not a copy
*/
public Collection<T> getMembers() {
// TODO Check why access to local attribute.
return members;
}

/**
* Returns the average similarity that was passed to this cluster on construction.
*
* @return average similarity of the cluster
*/
public float getAverageSimilarity() {
return averageSimilarity;
}

/**
* See {@link ClusteringResult#getCommunityStrength}
*
* @return community strength of the cluster
*/
public float getCommunityStrength() {
Expand All @@ -37,6 +46,7 @@ public float getCommunityStrength() {
/**
* Sets this clusters clustering result. Should only be called by classes extending {@link ClusteringResult} on their
* own clusters.
*
* @param clusteringResult the clustering result
*/
public void setClusteringResult(ClusteringResult<T> clusteringResult) {
Expand All @@ -57,11 +67,11 @@ public float getCommunityStrengthPerConnection() {
* Computes a normalized community strength per connection. Can be used as measure for strength of evidence in
* comparison to other clusters in the same clustering. Guaranteed to be smaller than 1. Negative values indicate
* non-clusters. This method may only be called on clusters that are part of a ClusteringResult.
*
* @return normalized community strength per connection
*/
public float getNormalizedCommunityStrengthPerConnection() {
List<Cluster<T>> goodClusters = clusteringResult.getClusters().stream().filter(cluster -> cluster.getCommunityStrength() > 0)
.collect(Collectors.toList());
List<Cluster<T>> goodClusters = clusteringResult.getClusters().stream().filter(cluster -> cluster.getCommunityStrength() > 0).toList();
float posCommunityStrengthSum = (float) goodClusters.stream().mapToDouble(Cluster::getCommunityStrengthPerConnection).sum();

int size = clusteringResult.getClusters().size();
Expand All @@ -84,10 +94,11 @@ public double getWorth(BiFunction<T, T, Float> similarity) {

/**
* Computes the average similarity inside the cluster.
*
* @param similarity function that supplies the similarity of two cluster members.
* @return average similarity
*/
public float averageSimilarity(BiFunction<T, T, Float> similarity) {
private float averageSimilarity(BiFunction<T, T, Float> similarity) {
List<T> members = new ArrayList<>(getMembers());
if (members.size() < 2) {
return 1;
Expand All @@ -108,6 +119,7 @@ private int connections() {

/**
* Whether this cluster is very uninformative or wrong and should be pruned as last step of the clustering process.
*
* @return is bad
*/
public boolean isBadCluster() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ public class ClusteringAdapter {
/**
* Creates the clustering adapter. Only submissions that appear in those similarities might also appear in
* {@link ClusteringResult}s obtained from this adapter.
*
* @param comparisons that should be included in the process of clustering
* @param metric function that assigns a similarity to each comparison
* @param metric function that assigns a similarity to each comparison
*/
public ClusteringAdapter(Collection<JPlagComparison> comparisons, Function<JPlagComparison, Float> metric) {
mapping = new IntegerMapping<>(comparisons.size());
Expand All @@ -52,6 +53,7 @@ public ClusteringAdapter(Collection<JPlagComparison> comparisons, Function<JPlag
/**
* Use a generic clustering algorithm to cluster the submissions, that were included in this {@link ClusteringAdapter}'s
* comparison.
*
* @param algorithm that is used for clustering
* @return the clustered submissions
*/
Expand All @@ -60,7 +62,7 @@ public ClusteringResult<Submission> doClustering(GenericClusteringAlgorithm algo
ClusteringResult<Integer> modularityClusterResult = ClusteringResult.fromIntegerCollections(new ArrayList<>(intResult), similarityMatrix);
List<Cluster<Submission>> mappedClusters = modularityClusterResult.getClusters().stream()
.map(unmappedCluster -> new Cluster<>(unmappedCluster.getMembers().stream().map(mapping::unmap).collect(Collectors.toList()),
unmappedCluster.getCommunityStrength()))
unmappedCluster.getCommunityStrength(), unmappedCluster.getAverageSimilarity()))
.collect(Collectors.toList());
return new ClusteringResult<>(mappedClusters, modularityClusterResult.getCommunityStrength());
}
Expand Down
37 changes: 28 additions & 9 deletions jplag/src/main/java/de/jplag/clustering/ClusteringResult.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
package de.jplag.clustering;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.function.BiFunction;
import java.util.stream.DoubleStream;

Expand All @@ -15,12 +9,13 @@

/**
* Set of clusters dividing a set of entities.
*
* @param <T> type of the clustered entities (e.g. Submission)
*/
public class ClusteringResult<T> {

private final List<Cluster<T>> clusters;
private float communityStrength = 0;
private float communityStrength;

public ClusteringResult(Collection<Cluster<T>> clusters, float communityStrength) {
this.clusters = List.copyOf(clusters);
Expand All @@ -40,6 +35,7 @@ public Collection<Cluster<T>> getClusters() {
* changed, a higher community strength denotes a better clustering. See: Finding and evaluating community structure in
* networks, M. E. J. Newman and M. Girvan, Phys. Rev. E 69, 026113 – Published 26 February 2004, Doi:
* 10.1103/PhysRevE.69.026113 It's called modularity in that paper.
*
* @return community strength
*/
public float getCommunityStrength() {
Expand All @@ -48,6 +44,7 @@ public float getCommunityStrength() {

/**
* How much this clustering result is worth during optimization.
*
* @param similarity TODO DF: JAVADOC
* @return worth
*/
Expand Down Expand Up @@ -92,11 +89,33 @@ public static ClusteringResult<Integer> fromIntegerCollections(List<Collection<I
for (int i = 0; i < clustering.size(); i++) {
double outWeightSum = percentagesOfSimilaritySums.getRowVector(i).getL1Norm();
double clusterCommunityStrength = percentagesOfSimilaritySums.getEntry(i, i) - outWeightSum * outWeightSum;
clusters.add(new Cluster<>(clustering.get(i), (float) clusterCommunityStrength));
float averageSimilarity = calculateAverageSimilarityFor(clustering.get(i), similarity);
clusters.add(new Cluster<>(clustering.get(i), (float) clusterCommunityStrength, averageSimilarity));
communityStrength += clusterCommunityStrength;
}
}
return new ClusteringResult<>(clusters, communityStrength);
}

private static float calculateAverageSimilarityFor(Collection<Integer> cluster, RealMatrix similarityMatrix) {
var sumOfSimilarities = 0f;
var submissionIndicesWithoutIndicesAlreadyProcessed = new ArrayList<>(List.copyOf(cluster));
for (Integer indexOfSubmission1 : cluster) {
for (Integer indexOfSubmission2 : submissionIndicesWithoutIndicesAlreadyProcessed) {
if (!Objects.equals(indexOfSubmission1, indexOfSubmission2)) {
sumOfSimilarities += similarityMatrix.getEntry(indexOfSubmission1, indexOfSubmission2);
}
}
submissionIndicesWithoutIndicesAlreadyProcessed.remove(indexOfSubmission1); // remove indices we already processed from second list to avoid adding them anew unnecessary
}
nestabentum marked this conversation as resolved.
Show resolved Hide resolved
int nMinusOne = cluster.size() - 1;
float numberOfComparisons = (nMinusOne * (nMinusOne + 1)) / 2f; /* Use Gauss sum to calculate number of comparisons in cluster: Given cluster of size n we need Gauss sum of n-1 comparisons:
compare first element of cluster to all other except itself: n-1 comparisons. compare second element two all other except itself and first element
(as these two were already compared when we processed the first element), n-2 comparisons. compare third element to all other but itself and all previously compared: n-3 comparisons and so on.
when we reach the second to last element we have n-(n-1)=1 comparisons left. when we reach the last element it has already been compared to all other.
adding up all comparisons we get: (n-1) + (n-2) + (n-3) + ... + (n-(n-1)) = Gauss sum of (n-1)
*/
return sumOfSimilarities / numberOfComparisons;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,6 @@

import com.fasterxml.jackson.annotation.JsonProperty;

public class Cluster {

@JsonProperty("average_similarity")
private final float averageSimilarity;

@JsonProperty("strength")
private final float strength;

@JsonProperty("members")
private final List<String> members;

public Cluster(float averageSimilarity, float strength, List<String> members) {
this.averageSimilarity = averageSimilarity;
this.strength = strength;
this.members = List.copyOf(members);
}

public float getAverageSimilarity() {
return averageSimilarity;
}

public float getStrength() {
return strength;
}

public List<String> getMembers() {
return members;
}
/**
 * Cluster as it is written to the JSON report.
 *
 * @param averageSimilarity value serialized as {@code average_similarity}
 * @param strength value serialized as {@code strength}
 * @param members value serialized as {@code members}; held as an unmodifiable copy
 */
public record Cluster(@JsonProperty("average_similarity") float averageSimilarity,
        @JsonProperty("strength") float strength, @JsonProperty("members") List<String> members) {

    /**
     * Takes an unmodifiable snapshot of the member list, so that later changes to the list passed in cannot alter
     * this record and callers of {@link #members()} cannot modify it either.
     *
     * @throws NullPointerException if {@code members} is {@code null} or contains {@code null}
     */
    public Cluster {
        members = List.copyOf(members);
    }
}
17 changes: 4 additions & 13 deletions jplag/src/test/java/de/jplag/clustering/ClusterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,27 @@ public class ClusterTest {
private static final double EPSILON = 0.00001;
Cluster<Character> cluster;

@Test
public void testAverageSimilarity() {
cluster = new Cluster<>(List.of('a', 'b', 'c'), 0);
float averageSimilarity = cluster.averageSimilarity((a, b) -> {
return Math.abs((float) (((int) a) - ((int) b)));
});
assertEquals((1.f + 2.f + 1.f + 1.f + 2.f + 1.f) / 6, averageSimilarity, EPSILON);
}

@Test
public void testCommunityStrengthPerConnectionOneMember() {
cluster = new Cluster<>(List.of('a'), 10);
cluster = new Cluster<>(List.of('a'), 10, 0);
assertEquals(0.0, cluster.getCommunityStrengthPerConnection(), EPSILON);
}

@Test
public void testCommunityStrengthPerConnectionTwoMembers() {
cluster = new Cluster<>(List.of('a', 'b'), 10);
cluster = new Cluster<>(List.of('a', 'b'), 10, 0);
assertEquals(10.0, cluster.getCommunityStrengthPerConnection(), EPSILON);
}

@Test
public void testCommunityStrengthPerConnectionThreeMembers() {
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10);
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10, 0);
assertEquals(10.0 / 3, cluster.getCommunityStrengthPerConnection(), EPSILON);
}

@Test
public void testNormalizedCommunityStrength() {
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10);
cluster = new Cluster<>(List.of('a', 'b', 'c'), 10, 0);
@SuppressWarnings("unchecked")
ClusteringResult<Character> clusteringResult = mock(ClusteringResult.class);
when(clusteringResult.getClusters()).thenReturn(List.of(cluster, cluster));
Expand Down
Loading