forked from marmanis/yooreeka
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
H. Marmanis
committed
Dec 3, 2012
1 parent
1d728ca
commit 762d794
Showing
255 changed files
with
35,422 additions
and
0 deletions.
There are no files selected for viewing
448 changes: 448 additions & 0 deletions
448
src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java
Large diffs are not rendered by default.
Oops, something went wrong.
145 changes: 145 additions & 0 deletions
145
src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
/* | ||
* ________________________________________________________________________________________ | ||
* | ||
* Y O O R E E K A | ||
* A library for data mining, machine learning, soft computing, and mathematical analysis | ||
* ________________________________________________________________________________________ | ||
* | ||
* The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " | ||
* (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms | ||
* are valuable in any software application. | ||
* | ||
* Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko | ||
* Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. | ||
* | ||
* Certain library functions depend on other Open Source software libraries, which are covered | ||
* by different license agreements. See the NOTICE file distributed with this work for additional | ||
* information regarding copyright ownership and licensing. | ||
* | ||
* Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed under | ||
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | ||
* either express or implied. See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
*/ | ||
package org.yooreeka.algos.clustering.hierarchical; | ||
|
||
import org.yooreeka.algos.clustering.model.Cluster; | ||
import org.yooreeka.algos.clustering.model.DataPoint; | ||
import org.yooreeka.algos.clustering.utils.ObjectToIndexMapping; | ||
|
||
/** A hierarchical agglomerative clustering algorithm based on the average link */ | ||
public class AverageLinkAlgorithm { | ||
|
||
public static void main(String[] args) { | ||
// Define data | ||
DataPoint[] elements = new DataPoint[5]; | ||
elements[0] = new DataPoint("A", new double[] {}); | ||
elements[1] = new DataPoint("B", new double[] {}); | ||
elements[2] = new DataPoint("C", new double[] {}); | ||
elements[3] = new DataPoint("D", new double[] {}); | ||
elements[4] = new DataPoint("E", new double[] {}); | ||
|
||
double[][] a = new double[][] { { 0, 1, 2, 2, 3 }, { 1, 0, 2, 4, 3 }, | ||
{ 2, 2, 0, 1, 5 }, { 2, 4, 1, 0, 3 }, { 3, 3, 5, 3, 0 } }; | ||
|
||
AverageLinkAlgorithm ca = new AverageLinkAlgorithm(elements, a); | ||
Dendrogram dnd = ca.cluster(); | ||
dnd.printAll(); | ||
} | ||
private DataPoint[] elements; | ||
private double[][] a; | ||
|
||
private ClusterSet allClusters; | ||
|
||
public AverageLinkAlgorithm(DataPoint[] elements, double[][] adjacencyMatrix) { | ||
this.elements = elements; | ||
this.a = adjacencyMatrix; | ||
this.allClusters = new ClusterSet(); | ||
} | ||
|
||
public Dendrogram cluster() { | ||
|
||
Dendrogram dnd = new Dendrogram("Distance"); | ||
double d = 0.0; | ||
|
||
// initially load all elements as individual clusters | ||
for (DataPoint e : elements) { | ||
Cluster c = new Cluster(e); | ||
allClusters.add(c); | ||
} | ||
|
||
dnd.addLevel(String.valueOf(d), allClusters.getAllClusters()); | ||
|
||
d = 1.0; | ||
|
||
while (allClusters.size() > 1) { | ||
int K = allClusters.size(); | ||
mergeClusters(d); | ||
// it is possible that there were no clusters to merge for current | ||
// d. | ||
if (K > allClusters.size()) { | ||
dnd.addLevel(String.valueOf(d), allClusters.getAllClusters()); | ||
K = allClusters.size(); | ||
} | ||
|
||
d = d + 0.5; | ||
} | ||
return dnd; | ||
} | ||
|
||
private void mergeClusters(double distanceThreshold) { | ||
int nClusters = allClusters.size(); | ||
|
||
ObjectToIndexMapping<Cluster> idxMapping = new ObjectToIndexMapping<Cluster>(); | ||
|
||
double[][] clusterDistances = new double[nClusters][nClusters]; | ||
|
||
for (int i = 0, n = a.length; i < n; i++) { | ||
for (int j = i + 1, k = a.length; j < k; j++) { | ||
double d = a[i][j]; | ||
if (d > 0) { | ||
DataPoint e1 = elements[i]; | ||
DataPoint e2 = elements[j]; | ||
Cluster c1 = allClusters.findClusterByElement(e1); | ||
Cluster c2 = allClusters.findClusterByElement(e2); | ||
if (!c1.equals(c2)) { | ||
int ci = idxMapping.getIndex(c1); | ||
int cj = idxMapping.getIndex(c2); | ||
clusterDistances[ci][cj] += d; | ||
clusterDistances[cj][ci] += d; | ||
} | ||
} | ||
} | ||
} | ||
|
||
boolean[] merged = new boolean[clusterDistances.length]; | ||
for (int i = 0, n = clusterDistances.length; i < n; i++) { | ||
for (int j = i + 1, k = clusterDistances.length; j < k; j++) { | ||
Cluster ci = idxMapping.getObject(i); | ||
Cluster cj = idxMapping.getObject(j); | ||
int ni = ci.size(); | ||
int nj = cj.size(); | ||
clusterDistances[i][j] = clusterDistances[i][j] / (ni * nj); | ||
clusterDistances[j][i] = clusterDistances[i][j]; | ||
// merge clusters if distance is below the threshold | ||
if (merged[i] == false && merged[j] == false) { | ||
if (clusterDistances[i][j] <= distanceThreshold) { | ||
allClusters.remove(ci); | ||
allClusters.remove(cj); | ||
Cluster mergedCluster = new Cluster(ci, cj); | ||
allClusters.add(mergedCluster); | ||
merged[i] = true; | ||
merged[j] = true; | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} |
83 changes: 83 additions & 0 deletions
83
src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
/* | ||
* ________________________________________________________________________________________ | ||
* | ||
* Y O O R E E K A | ||
* A library for data mining, machine learning, soft computing, and mathematical analysis | ||
* ________________________________________________________________________________________ | ||
* | ||
* The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " | ||
* (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms | ||
* are valuable in any software application. | ||
* | ||
* Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko | ||
* Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. | ||
* | ||
* Certain library functions depend on other Open Source software libraries, which are covered | ||
* by different license agreements. See the NOTICE file distributed with this work for additional | ||
* information regarding copyright ownership and licensing. | ||
* | ||
* Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed under | ||
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | ||
* either express or implied. See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
*/ | ||
package org.yooreeka.algos.clustering.hierarchical; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
import org.yooreeka.algos.clustering.model.Cluster; | ||
import org.yooreeka.algos.clustering.model.DataPoint; | ||
|
||
/** | ||
* Set of clusters. | ||
*/ | ||
public class ClusterSet { | ||
|
||
private Set<Cluster> allClusters = new HashSet<Cluster>(); | ||
|
||
public boolean add(Cluster c) { | ||
return allClusters.add(c); | ||
} | ||
|
||
public Cluster findClusterByElement(DataPoint e) { | ||
Cluster cluster = null; | ||
for (Cluster c : allClusters) { | ||
if (c.contains(e)) { | ||
cluster = c; | ||
break; | ||
} | ||
} | ||
return cluster; | ||
} | ||
|
||
public List<Cluster> getAllClusters() { | ||
return new ArrayList<Cluster>(allClusters); | ||
} | ||
|
||
public boolean remove(Cluster c) { | ||
return allClusters.remove(c); | ||
} | ||
|
||
public int size() { | ||
return allClusters.size(); | ||
} | ||
|
||
// public ClusterSet copy() { | ||
// ClusterSet clusterSet = new ClusterSet(); | ||
// for(Cluster c : this.allClusters ) { | ||
// Cluster clusterCopy = c.copy(); | ||
// clusterSet.add(clusterCopy); | ||
// } | ||
// return clusterSet; | ||
// } | ||
} |
162 changes: 162 additions & 0 deletions
162
src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
/* | ||
* ________________________________________________________________________________________ | ||
* | ||
* Y O O R E E K A | ||
* A library for data mining, machine learning, soft computing, and mathematical analysis | ||
* ________________________________________________________________________________________ | ||
* | ||
* The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " | ||
* (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms | ||
* are valuable in any software application. | ||
* | ||
* Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko | ||
* Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. | ||
* | ||
* Certain library functions depend on other Open Source software libraries, which are covered | ||
* by different license agreements. See the NOTICE file distributed with this work for additional | ||
* information regarding copyright ownership and licensing. | ||
* | ||
* Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed under | ||
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | ||
* either express or implied. See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
*/ | ||
package org.yooreeka.algos.clustering.hierarchical; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
import java.util.LinkedHashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.logging.Logger; | ||
|
||
import org.yooreeka.algos.clustering.model.Cluster; | ||
import org.yooreeka.config.YooreekaConfigurator; | ||
|
||
public class Dendrogram { | ||
|
||
private static final Logger LOG = Logger.getLogger(Dendrogram.class.getName()); | ||
|
||
/* | ||
* Clusters by level. | ||
*/ | ||
private Map<Integer, ClusterSet> entryMap; | ||
private Map<Integer, String> levelLabels; | ||
private Integer nextLevel; | ||
private String levelLabelName; | ||
|
||
public Dendrogram(String levelLabelName) { | ||
|
||
LOG.setLevel(YooreekaConfigurator.getLevel(Dendrogram.class.getName())); | ||
|
||
entryMap = new LinkedHashMap<Integer, ClusterSet>(); | ||
levelLabels = new LinkedHashMap<Integer, String>(); | ||
nextLevel = 1; | ||
this.levelLabelName = levelLabelName; | ||
} | ||
|
||
public int addLevel(String label, Cluster cluster) { | ||
List<Cluster> values = new ArrayList<Cluster>(); | ||
values.add(cluster); | ||
return addLevel(label, values); | ||
} | ||
|
||
/** | ||
* Creates a new dendrogram level using copies of provided clusters. | ||
*/ | ||
public int addLevel(String label, Collection<Cluster> clusters) { | ||
|
||
ClusterSet clusterSet = new ClusterSet(); | ||
|
||
for (Cluster c : clusters) { | ||
// copy cluster before adding - over time cluster elements may | ||
// change | ||
// but for dendrogram we want to keep current state. | ||
clusterSet.add(c.copy()); | ||
} | ||
|
||
int level = nextLevel; | ||
|
||
entryMap.put(level, clusterSet); | ||
levelLabels.put(level, label); | ||
|
||
nextLevel++; | ||
return level; | ||
} | ||
|
||
public List<Integer> getAllLevels() { | ||
return new ArrayList<Integer>(entryMap.keySet()); | ||
} | ||
|
||
public List<Cluster> getClustersForLevel(int level) { | ||
ClusterSet cs = entryMap.get(level); | ||
return cs.getAllClusters(); | ||
} | ||
|
||
public String getLabelForLevel(int level) { | ||
return levelLabels.get(level); | ||
} | ||
|
||
public int getTopLevel() { | ||
return nextLevel - 1; | ||
} | ||
|
||
public void print(int level) { | ||
String label = levelLabels.get(level); | ||
ClusterSet clusters = entryMap.get(level); | ||
LOG.info("Clusters for: level=" + level + ", " | ||
+ levelLabelName + "=" + label); | ||
for (Cluster c : clusters.getAllClusters()) { | ||
if (c.getElements().size() > 1) { | ||
LOG.info("____________________________________________________________\n"); | ||
LOG.info(c.getElementsAsString()); | ||
LOG.info("____________________________________________________________\n\n"); | ||
} | ||
} | ||
} | ||
|
||
public void printAll() { | ||
for (Map.Entry<Integer, ClusterSet> e : entryMap.entrySet()) { | ||
Integer level = e.getKey(); | ||
print(level); | ||
} | ||
} | ||
|
||
/** | ||
* Replaces clusters in the specified level. If level doesn't exist it will | ||
* be created. | ||
* | ||
* @param level | ||
* dendrogram level. | ||
* @param label | ||
* level description. | ||
* @param clusters | ||
* clusters for the level. | ||
* @return | ||
*/ | ||
public void setLevel(int level, String label, Collection<Cluster> clusters) { | ||
|
||
ClusterSet clusterSet = new ClusterSet(); | ||
|
||
for (Cluster c : clusters) { | ||
clusterSet.add(c.copy()); | ||
} | ||
|
||
LOG.fine("Setting cluster level: " + level); | ||
|
||
entryMap.put(level, clusterSet); | ||
levelLabels.put(level, label); | ||
|
||
if (level >= nextLevel) { | ||
nextLevel = level + 1; | ||
} | ||
} | ||
|
||
} |
Oops, something went wrong.