Checking in the code

dnstanciu · Dec 3, 2012 · 762d794 · 762d794
1 parent 1d728ca
commit 762d794
Show file tree

Hide file tree

Showing 255 changed files with 35,422 additions and 0 deletions.
diff --git a/src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java b/src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java
diff --git a/src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java b/src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java
@@ -0,0 +1,145 @@
+/*
+ *   ________________________________________________________________________________________
+ *   
+ *   Y O O R E E K A
+ *   A library for data mining, machine learning, soft computing, and mathematical analysis
+ *   ________________________________________________________________________________________ 
+ *    
+ *   The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " 
+ *   (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms 
+ *   are valuable in any software application.
+ *  
+ *   Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ *   Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.  
+ * 
+ *   Certain library functions depend on other Open Source software libraries, which are covered 
+ *   by different license agreements. See the NOTICE file distributed with this work for additional 
+ *   information regarding copyright ownership and licensing.
+ * 
+ *   Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); 
+ *   you may not use this file except in compliance with the License.  
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software distributed under 
+ *   the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+ *   either express or implied. See the License for the specific language governing permissions and
+ *   limitations under the License.
+ *   
+ */
+package org.yooreeka.algos.clustering.hierarchical;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.algos.clustering.utils.ObjectToIndexMapping;
+
+/**
+ * A hierarchical agglomerative clustering algorithm based on the average link:
+ * two clusters are merged when the mean of the pairwise distances between their
+ * members does not exceed the current distance threshold.
+ */
+public class AverageLinkAlgorithm {
+
+	/** Threshold used for the first merge pass. */
+	private static final double INITIAL_THRESHOLD = 1.0;
+
+	/** Amount by which the threshold grows after every merge pass. */
+	private static final double THRESHOLD_STEP = 0.5;
+
+	/** Data points to cluster; position i corresponds to row/column i of {@link #a}. */
+	private final DataPoint[] elements;
+
+	/** Pairwise distances between elements. Only entries with j &gt; i are read. */
+	private final double[][] a;
+
+	/** Working set of clusters; rebuilt at the start of every {@link #cluster()} call. */
+	private ClusterSet allClusters;
+
+	/**
+	 * @param elements
+	 *            data points to cluster.
+	 * @param adjacencyMatrix
+	 *            distances between the data points, indexed like
+	 *            <code>elements</code>.
+	 */
+	public AverageLinkAlgorithm(DataPoint[] elements, double[][] adjacencyMatrix) {
+		this.elements = elements;
+		this.a = adjacencyMatrix;
+		this.allClusters = new ClusterSet();
+	}
+
+	/**
+	 * Runs the clustering and records every intermediate state.
+	 * 
+	 * The first dendrogram level (labelled "0.0") holds one cluster per element.
+	 * The threshold then starts at 1.0 and grows by 0.5 per pass until a single
+	 * cluster is left; a level labelled with the threshold is added after each
+	 * pass that merged at least one pair.
+	 * 
+	 * Distances are expected to be finite: the loop only ends once everything
+	 * has been merged into one cluster.
+	 * 
+	 * @return dendrogram with one level per pass that changed the clusters.
+	 */
+	public Dendrogram cluster() {
+
+		Dendrogram dnd = new Dendrogram("Distance");
+
+		// Start from a clean slate: reusing the set left over from a previous
+		// call would mix fresh singletons with already merged clusters.
+		allClusters = new ClusterSet();
+		for (DataPoint e : elements) {
+			allClusters.add(new Cluster(e));
+		}
+
+		dnd.addLevel(String.valueOf(0.0), allClusters.getAllClusters());
+
+		double d = INITIAL_THRESHOLD;
+		while (allClusters.size() > 1) {
+			int sizeBefore = allClusters.size();
+			mergeClusters(d);
+			// it is possible that there were no clusters to merge for current d
+			if (allClusters.size() < sizeBefore) {
+				dnd.addLevel(String.valueOf(d), allClusters.getAllClusters());
+			}
+			d = d + THRESHOLD_STEP;
+		}
+		return dnd;
+	}
+
+	/**
+	 * Performs one merge pass: sums the element distances between every pair of
+	 * current clusters, divides by the number of element pairs, and merges the
+	 * pairs whose average is at or below the threshold. A cluster takes part in
+	 * at most one merge per pass.
+	 * 
+	 * @param distanceThreshold
+	 *            largest average distance at which two clusters are merged.
+	 */
+	private void mergeClusters(double distanceThreshold) {
+		int nClusters = allClusters.size();
+
+		ObjectToIndexMapping<Cluster> idxMapping = new ObjectToIndexMapping<Cluster>();
+
+		// Look up the owning cluster of every element once, and register each
+		// cluster with the index mapping up front. Registering only while
+		// visiting pairs with a positive distance would leave clusters that are
+		// at distance zero from everything without an index.
+		// NOTE(review): relies on getIndex() handing out consecutive indices on
+		// first sight, as the pair loop below always has - confirm against
+		// ObjectToIndexMapping.
+		Cluster[] owners = new Cluster[elements.length];
+		for (int i = 0; i < elements.length; i++) {
+			owners[i] = allClusters.findClusterByElement(elements[i]);
+			idxMapping.getIndex(owners[i]);
+		}
+
+		// Sum of element-to-element distances for every pair of distinct clusters.
+		double[][] clusterDistances = new double[nClusters][nClusters];
+		for (int i = 0, n = a.length; i < n; i++) {
+			for (int j = i + 1; j < n; j++) {
+				double d = a[i][j];
+				Cluster c1 = owners[i];
+				Cluster c2 = owners[j];
+				if (d > 0 && !c1.equals(c2)) {
+					int ci = idxMapping.getIndex(c1);
+					int cj = idxMapping.getIndex(c2);
+					clusterDistances[ci][cj] += d;
+					clusterDistances[cj][ci] += d;
+				}
+			}
+		}
+
+		boolean[] merged = new boolean[nClusters];
+		for (int i = 0; i < nClusters; i++) {
+			for (int j = i + 1; j < nClusters; j++) {
+				// a cluster that was already merged in this pass is left alone
+				if (merged[i] || merged[j]) {
+					continue;
+				}
+				Cluster ci = idxMapping.getObject(i);
+				Cluster cj = idxMapping.getObject(j);
+				double averageLink = clusterDistances[i][j] / (ci.size() * cj.size());
+				// merge clusters if distance is below the threshold
+				if (averageLink <= distanceThreshold) {
+					allClusters.remove(ci);
+					allClusters.remove(cj);
+					allClusters.add(new Cluster(ci, cj));
+					merged[i] = true;
+					merged[j] = true;
+				}
+			}
+		}
+	}
+
+	/** Small demo: clusters five points from a fixed distance matrix and prints the result. */
+	public static void main(String[] args) {
+		// Define data
+		DataPoint[] elements = new DataPoint[5];
+		elements[0] = new DataPoint("A", new double[] {});
+		elements[1] = new DataPoint("B", new double[] {});
+		elements[2] = new DataPoint("C", new double[] {});
+		elements[3] = new DataPoint("D", new double[] {});
+		elements[4] = new DataPoint("E", new double[] {});
+
+		double[][] a = new double[][] { { 0, 1, 2, 2, 3 }, { 1, 0, 2, 4, 3 },
+				{ 2, 2, 0, 1, 5 }, { 2, 4, 1, 0, 3 }, { 3, 3, 5, 3, 0 } };
+
+		AverageLinkAlgorithm ca = new AverageLinkAlgorithm(elements, a);
+		Dendrogram dnd = ca.cluster();
+		dnd.printAll();
+	}
+}
diff --git a/src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java b/src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java
@@ -0,0 +1,83 @@
+/*
+ *   ________________________________________________________________________________________
+ *   
+ *   Y O O R E E K A
+ *   A library for data mining, machine learning, soft computing, and mathematical analysis
+ *   ________________________________________________________________________________________ 
+ *    
+ *   The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " 
+ *   (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms 
+ *   are valuable in any software application.
+ *  
+ *   Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ *   Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.  
+ * 
+ *   Certain library functions depend on other Open Source software libraries, which are covered 
+ *   by different license agreements. See the NOTICE file distributed with this work for additional 
+ *   information regarding copyright ownership and licensing.
+ * 
+ *   Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); 
+ *   you may not use this file except in compliance with the License.  
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software distributed under 
+ *   the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+ *   either express or implied. See the License for the specific language governing permissions and
+ *   limitations under the License.
+ *   
+ */
+package org.yooreeka.algos.clustering.hierarchical;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+
+/**
+ * A collection of clusters without duplicates, backed by a hash set.
+ */
+public class ClusterSet {
+
+	private final Set<Cluster> allClusters = new HashSet<Cluster>();
+
+	/**
+	 * Adds a cluster to the set.
+	 * 
+	 * @return <code>true</code> if the set did not already contain the cluster.
+	 */
+	public boolean add(Cluster c) {
+		return allClusters.add(c);
+	}
+
+	/**
+	 * Looks for a cluster that contains the given data point.
+	 * 
+	 * @return the first such cluster encountered, or <code>null</code> if no
+	 *         cluster in the set contains the point.
+	 */
+	public Cluster findClusterByElement(DataPoint e) {
+		for (Cluster candidate : allClusters) {
+			if (candidate.contains(e)) {
+				return candidate;
+			}
+		}
+		return null;
+	}
+
+	/**
+	 * @return a new list holding every cluster currently in the set; changes to
+	 *         the list do not affect the set.
+	 */
+	public List<Cluster> getAllClusters() {
+		List<Cluster> snapshot = new ArrayList<Cluster>(allClusters);
+		return snapshot;
+	}
+
+	/**
+	 * Removes a cluster from the set.
+	 * 
+	 * @return <code>true</code> if the cluster was present.
+	 */
+	public boolean remove(Cluster c) {
+		return allClusters.remove(c);
+	}
+
+	/**
+	 * @return number of clusters in the set.
+	 */
+	public int size() {
+		return allClusters.size();
+	}
+}
diff --git a/src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java b/src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java
@@ -0,0 +1,162 @@
+/*
+ *   ________________________________________________________________________________________
+ *   
+ *   Y O O R E E K A
+ *   A library for data mining, machine learning, soft computing, and mathematical analysis
+ *   ________________________________________________________________________________________ 
+ *    
+ *   The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " 
+ *   (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms 
+ *   are valuable in any software application.
+ *  
+ *   Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ *   Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.  
+ * 
+ *   Certain library functions depend on other Open Source software libraries, which are covered 
+ *   by different license agreements. See the NOTICE file distributed with this work for additional 
+ *   information regarding copyright ownership and licensing.
+ * 
+ *   Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); 
+ *   you may not use this file except in compliance with the License.  
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software distributed under 
+ *   the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+ *   either express or implied. See the License for the specific language governing permissions and
+ *   limitations under the License.
+ *   
+ */
+package org.yooreeka.algos.clustering.hierarchical;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Keeps the history of a clustering run as numbered levels. Every level stores
+ * a label and copies of the clusters that existed when the level was recorded.
+ */
+public class Dendrogram {
+
+	private static final Logger LOG = Logger.getLogger(Dendrogram.class.getName());
+
+	/** Clusters by level, in the order the levels were first stored. */
+	private final Map<Integer, ClusterSet> entryMap;
+
+	/** Label by level. */
+	private final Map<Integer, String> levelLabels;
+
+	/** Number that the next call to <code>addLevel</code> will assign. */
+	private int nextLevel;
+
+	/** Name written in front of a level's label when the level is printed. */
+	private final String levelLabelName;
+
+	/**
+	 * Creates an empty dendrogram whose first level will be number 1.
+	 * 
+	 * @param levelLabelName
+	 *            name of the quantity that the level labels describe.
+	 */
+	public Dendrogram(String levelLabelName) {
+
+		LOG.setLevel(YooreekaConfigurator.getLevel(Dendrogram.class.getName()));
+
+		entryMap = new LinkedHashMap<Integer, ClusterSet>();
+		levelLabels = new LinkedHashMap<Integer, String>();
+		nextLevel = 1;
+		this.levelLabelName = levelLabelName;
+	}
+
+	/**
+	 * Creates a new dendrogram level that holds a copy of a single cluster.
+	 * 
+	 * @return number of the new level.
+	 */
+	public int addLevel(String label, Cluster cluster) {
+		List<Cluster> values = new ArrayList<Cluster>();
+		values.add(cluster);
+		return addLevel(label, values);
+	}
+
+	/**
+	 * Creates a new dendrogram level using copies of provided clusters.
+	 * 
+	 * @return number of the new level.
+	 */
+	public int addLevel(String label, Collection<Cluster> clusters) {
+
+		ClusterSet clusterSet = copyOf(clusters);
+
+		int level = nextLevel;
+
+		entryMap.put(level, clusterSet);
+		levelLabels.put(level, label);
+
+		nextLevel++;
+		return level;
+	}
+
+	/**
+	 * @return numbers of all stored levels, in the order they were first stored.
+	 */
+	public List<Integer> getAllLevels() {
+		return new ArrayList<Integer>(entryMap.keySet());
+	}
+
+	/**
+	 * @return clusters stored for the level, or an empty list if the level does
+	 *         not exist.
+	 */
+	public List<Cluster> getClustersForLevel(int level) {
+		ClusterSet cs = entryMap.get(level);
+		if (cs == null) {
+			return new ArrayList<Cluster>();
+		}
+		return cs.getAllClusters();
+	}
+
+	/**
+	 * @return label stored for the level, or <code>null</code> if the level does
+	 *         not exist.
+	 */
+	public String getLabelForLevel(int level) {
+		return levelLabels.get(level);
+	}
+
+	/**
+	 * @return the highest level number assigned so far (0 for a new dendrogram).
+	 */
+	public int getTopLevel() {
+		return nextLevel - 1;
+	}
+
+	/**
+	 * Logs the clusters of one level. Only clusters with more than one element
+	 * are written. A level that does not exist produces a warning instead of a
+	 * failure.
+	 */
+	public void print(int level) {
+		ClusterSet clusters = entryMap.get(level);
+		if (clusters == null) {
+			LOG.warning("No clusters for: level=" + level);
+			return;
+		}
+		String label = levelLabels.get(level);
+		LOG.info("Clusters for: level=" + level + ", "
+				+ levelLabelName + "=" + label);
+		for (Cluster c : clusters.getAllClusters()) {
+			if (c.getElements().size() > 1) {
+				LOG.info("____________________________________________________________\n");
+				LOG.info(c.getElementsAsString());
+				LOG.info("____________________________________________________________\n\n");
+			}
+		}
+	}
+
+	/**
+	 * Logs every stored level, in the order the levels were first stored.
+	 */
+	public void printAll() {
+		for (Integer level : entryMap.keySet()) {
+			print(level);
+		}
+	}
+
+	/**
+	 * Replaces clusters in the specified level. If level doesn't exist it will
+	 * be created. Copies of the provided clusters are stored.
+	 * 
+	 * @param level
+	 *            dendrogram level.
+	 * @param label
+	 *            level description.
+	 * @param clusters
+	 *            clusters for the level.
+	 */
+	public void setLevel(int level, String label, Collection<Cluster> clusters) {
+
+		ClusterSet clusterSet = copyOf(clusters);
+
+		LOG.fine("Setting cluster level: " + level);
+
+		entryMap.put(level, clusterSet);
+		levelLabels.put(level, label);
+
+		// keep addLevel() from reusing a number that was set explicitly
+		if (level >= nextLevel) {
+			nextLevel = level + 1;
+		}
+	}
+
+	/**
+	 * Builds a cluster set from copies of the given clusters. Cluster elements
+	 * may change over time, but the dendrogram has to keep the current state.
+	 */
+	private ClusterSet copyOf(Collection<Cluster> clusters) {
+		ClusterSet clusterSet = new ClusterSet();
+		for (Cluster c : clusters) {
+			clusterSet.add(c.copy());
+		}
+		return clusterSet;
+	}
+
+}