From 762d794eae7b589b883d1bd31dcd1fad1942c51b Mon Sep 17 00:00:00 2001
From: "H. Marmanis"
Date: Mon, 3 Dec 2012 12:00:06 -0500
Subject: [PATCH] Checking in the code
---
.../clustering/dbscan/DBSCANAlgorithm.java | 448 ++++++++++++++
.../hierarchical/AverageLinkAlgorithm.java | 145 +++++
.../clustering/hierarchical/ClusterSet.java | 83 +++
.../clustering/hierarchical/Dendrogram.java | 162 ++++++
.../algos/clustering/hierarchical/MST.java | 130 +++++
.../hierarchical/MSTSingleLinkAlgorithm.java | 142 +++++
.../hierarchical/SingleLinkAlgorithm.java | 126 ++++
.../algos/clustering/model/Attribute.java | 119 ++++
.../algos/clustering/model/Cluster.java | 197 +++++++
.../algos/clustering/model/DataPoint.java | 181 ++++++
.../partitional/KMeansAlgorithm.java | 306 ++++++++++
.../partitional/NearestNeighborAlgorithm.java | 230 ++++++++
.../algos/clustering/rock/LinkMatrix.java | 195 +++++++
.../clustering/rock/MergeGoodnessMeasure.java | 92 +++
.../algos/clustering/rock/ROCKAlgorithm.java | 142 +++++
.../algos/clustering/rock/ROCKClusters.java | 205 +++++++
.../algos/clustering/rock/SimilarCluster.java | 85 +++
.../clustering/test/MyDiggSpaceData.java | 125 ++++
.../clustering/test/MyDiggSpaceDataset.java | 56 ++
.../algos/clustering/test/SFData.java | 212 +++++++
.../algos/clustering/test/SFDataset.java | 93 +++
.../algos/clustering/utils/Attributes.java | 143 +++++
.../utils/ObjectToIndexMapping.java | 90 +++
.../utils/SortedArrayClustering.java | 71 +++
.../algos/reco/collab/cache/FileStore.java | 134 +++++
.../algos/reco/collab/cache/Store.java | 72 +++
.../algos/reco/collab/data/BaseDataset.java | 431 ++++++++++++++
.../algos/reco/collab/data/ContentItem.java | 59 ++
.../algos/reco/collab/data/DiggData.java | 361 ++++++++++++
.../algos/reco/collab/data/HTMLContent.java | 99 ++++
.../algos/reco/collab/data/MovieLensData.java | 83 +++
.../reco/collab/data/MovieLensDataset.java | 385 +++++++++++++
.../algos/reco/collab/data/MusicData.java | 256 ++++++++
.../algos/reco/collab/data/MusicItem.java | 71 +++
.../algos/reco/collab/data/MusicRating.java | 52 ++
.../algos/reco/collab/data/MusicUser.java | 249 ++++++++
.../algos/reco/collab/data/NewsData.java | 202 +++++++
.../algos/reco/collab/data/NewsItem.java | 54 ++
.../algos/reco/collab/data/NewsUser.java | 82 +++
.../algos/reco/collab/data/RatingBuilder.java | 94 +++
.../evaluation/EvaluationDataProvider.java | 44 ++
.../MovieLensEvaluationDataProvider.java | 283 +++++++++
.../reco/collab/evaluation/MovieLensRMSE.java | 104 ++++
.../reco/collab/evaluation/RMSEEstimator.java | 173 ++++++
.../reco/collab/evaluation/RMSEResult.java | 85 +++
.../algos/reco/collab/model/Content.java | 182 ++++++
.../algos/reco/collab/model/Dataset.java | 142 +++++
.../algos/reco/collab/model/Item.java | 171 ++++++
.../algos/reco/collab/model/Rating.java | 127 ++++
.../reco/collab/model/RecommendationType.java | 38 ++
.../algos/reco/collab/model/SimilarItem.java | 128 ++++
.../algos/reco/collab/model/SimilarUser.java | 134 +++++
.../algos/reco/collab/model/User.java | 175 ++++++
.../algos/reco/collab/recommender/Delphi.java | 545 ++++++++++++++++++
.../reco/collab/recommender/DiggDelphi.java | 282 +++++++++
.../collab/recommender/MovieLensDelphi.java | 324 +++++++++++
.../recommender/PredictedItemRating.java | 152 +++++
.../reco/collab/recommender/Recommender.java | 88 +++
.../movielens/MovieLensItemSimilarity.java | 92 +++
.../movielens/MovieLensUserSimilarity.java | 107 ++++
.../naive/ImprovedItemBasedSimilarity.java | 120 ++++
.../naive/ImprovedUserBasedSimilarity.java | 129 +++++
.../similarity/naive/ItemBasedSimilarity.java | 110 ++++
.../naive/ItemContentBasedSimilarity.java | 92 +++
.../naive/ItemPenaltyBasedSimilarity.java | 161 ++++++
.../similarity/naive/SimilarityMatrix.java | 74 +++
.../naive/SimilarityMatrixImpl.java | 148 +++++
.../similarity/naive/UserBasedSimilarity.java | 117 ++++
.../naive/UserContentBasedSimilarity.java | 107 ++++
.../naive/UserItemContentBasedSimilarity.java | 184 ++++++
.../UpperTriangularSimilarityMatrix.java | 82 +++
.../UpperTriangularSimilarityMatrixImpl.java | 150 +++++
.../similarity/util/PearsonCorrelation.java | 170 ++++++
.../similarity/util/RatingCountMatrix.java | 131 +++++
.../util/SimilarityMatrixCache.java | 71 +++
.../util/SimilarityMatrixRepository.java | 173 ++++++
.../algos/reco/content/digg/DiggCategory.java | 83 +++
.../algos/reco/content/digg/DiggService.java | 253 ++++++++
.../reco/content/digg/DiggStoryItem.java | 109 ++++
.../algos/reco/content/digg/DiggUser.java | 45 ++
.../algos/search/data/SearchResult.java | 180 ++++++
.../search/lucene/LuceneIndexBuilder.java | 152 +++++
.../lucene/analyzer/CustomAnalyzer.java | 113 ++++
.../lucene/analyzer/TextDocumentTerms.java | 78 +++
.../search/ranking/DocRankMatrixBuilder.java | 197 +++++++
.../search/ranking/PageRankMatrixBuilder.java | 98 ++++
.../algos/search/ranking/PageRankMatrixH.java | 184 ++++++
.../yooreeka/algos/search/ranking/Rank.java | 294 ++++++++++
.../algos/search/ranking/RelevanceScore.java | 78 +++
.../algos/search/util/TermFreqMapUtils.java | 93 +++
.../algos/taxis/bayesian/NaiveBayes.java | 327 +++++++++++
.../boosting/BoostingARCX4Classifier.java | 190 ++++++
.../taxis/boosting/WeightBasedRandom.java | 80 +++
.../algos/taxis/core/AttributeValue.java | 113 ++++
.../algos/taxis/core/BaseConcept.java | 124 ++++
.../algos/taxis/core/BaseInstance.java | 239 ++++++++
.../algos/taxis/core/DoubleAttribute.java | 96 +++
.../algos/taxis/core/StringAttribute.java | 108 ++++
.../algos/taxis/core/TrainingSet.java | 173 ++++++
.../algos/taxis/core/intf/Attribute.java | 42 ++
.../algos/taxis/core/intf/Classifier.java | 51 ++
.../algos/taxis/core/intf/Concept.java | 44 ++
.../algos/taxis/core/intf/Instance.java | 46 ++
.../taxis/ensemble/ClassifierEnsemble.java | 106 ++++
.../taxis/ensemble/ConceptMajorityVoter.java | 87 +++
.../taxis/evaluation/ClassifierResults.java | 70 +++
.../algos/taxis/evaluation/CochransQTest.java | 128 ++++
.../algos/taxis/evaluation/Diff2PropTest.java | 84 +++
.../algos/taxis/evaluation/FTest.java | 182 ++++++
.../algos/taxis/evaluation/McNemarTest.java | 118 ++++
.../yooreeka/algos/taxis/evaluation/Test.java | 105 ++++
.../taxis/networks/neural/XORNetwork.java | 163 ++++++
.../taxis/networks/neural/core/BaseLayer.java | 139 +++++
.../taxis/networks/neural/core/BaseLink.java | 85 +++
.../taxis/networks/neural/core/BaseNN.java | 429 ++++++++++++++
.../taxis/networks/neural/core/BaseNode.java | 216 +++++++
.../networks/neural/core/LinearNode.java | 59 ++
.../networks/neural/core/SigmoidNode.java | 52 ++
.../networks/neural/core/intf/Layer.java | 57 ++
.../taxis/networks/neural/core/intf/Link.java | 53 ++
.../neural/core/intf/NeuralNetwork.java | 69 +++
.../taxis/networks/neural/core/intf/Node.java | 98 ++++
.../algos/taxis/tree/AttributeDefinition.java | 100 ++++
.../algos/taxis/tree/AttributeSelector.java | 145 +++++
.../algos/taxis/tree/AttributeUtils.java | 61 ++
src/org/yooreeka/algos/taxis/tree/Branch.java | 92 +++
.../algos/taxis/tree/BranchGroup.java | 130 +++++
.../algos/taxis/tree/ConceptUtils.java | 85 +++
.../taxis/tree/DecisionTreeClassifier.java | 248 ++++++++
.../yooreeka/algos/taxis/tree/InfoGain.java | 151 +++++
src/org/yooreeka/algos/taxis/tree/Node.java | 403 +++++++++++++
.../algos/taxis/tree/SplittingCriterion.java | 124 ++++
.../taxis/tree/TrueErrorRateEstimator.java | 71 +++
.../yooreeka/config/YooreekaConfigurator.java | 220 +++++++
.../credit/BaggingCreditClassifier.java | 79 +++
.../credit/BoostingCreditClassifier.java | 131 +++++
.../examples/credit/CreditConcept.java | 92 +++
.../examples/credit/CreditInstance.java | 121 ++++
.../examples/credit/DTCreditClassifier.java | 194 +++++++
.../examples/credit/NBCreditClassifier.java | 121 ++++
.../examples/credit/NNCreditClassifier.java | 406 +++++++++++++
.../examples/credit/UserCreditNN.java | 211 +++++++
.../examples/credit/data/UseCaseData.java | 194 +++++++
.../examples/credit/data/UserDataset.java | 80 +++
.../examples/credit/data/UserLoader.java | 70 +++
.../credit/data/users/BadUserType.java | 53 ++
.../credit/data/users/DangerousUserType.java | 53 ++
.../credit/data/users/ExcellentUserType.java | 53 ++
.../credit/data/users/GoodUserType.java | 53 ++
.../examples/credit/data/users/User.java | 319 ++++++++++
.../examples/credit/data/users/UserType.java | 512 ++++++++++++++++
.../credit/data/users/VeryGoodUserType.java | 53 ++
.../examples/credit/util/AttributeInfo.java | 68 +++
.../examples/credit/util/AttributeUtils.java | 88 +++
.../util/BootstrapTrainingSetBuilder.java | 121 ++++
.../credit/util/ClassifierResults.java | 70 +++
.../examples/credit/util/CreditDataUtils.java | 100 ++++
.../credit/util/CreditErrorEstimator.java | 231 ++++++++
.../examples/credit/util/DataGenerator.java | 130 +++++
.../credit/util/UserInstanceBuilder.java | 167 ++++++
.../examples/fraud/DTFraudClassifier.java | 136 +++++
.../examples/fraud/NNFraudClassifier.java | 356 ++++++++++++
.../examples/fraud/TransactionConcept.java | 92 +++
.../examples/fraud/TransactionInstance.java | 99 ++++
.../examples/fraud/TransactionNN.java | 106 ++++
.../examples/fraud/data/Transaction.java | 124 ++++
.../fraud/data/TransactionDataset.java | 134 +++++
.../data/TransactionInstanceBuilder.java | 224 +++++++
.../fraud/data/TransactionLoader.java | 59 ++
.../fraud/data/TransactionLocation.java | 94 +++
.../examples/fraud/util/DataGenerator.java | 119 ++++
.../examples/fraud/util/FraudDataUtils.java | 148 +++++
.../fraud/util/FraudErrorEstimator.java | 123 ++++
.../examples/fraud/util/TenUsersSample.java | 363 ++++++++++++
.../fraud/util/TransactionSetProfile.java | 145 +++++
.../examples/fraud/util/UserStatistics.java | 153 +++++
.../fraud/util/UserStatisticsCalculator.java | 164 ++++++
.../examples/newsgroups/NewsCrawler.java | 195 +++++++
.../recommender/MovieLensRMSESample.java | 61 ++
.../examples/recommender/RatingGrapher.java | 174 ++++++
.../examples/recommender/Recommender.java | 119 ++++
src/org/yooreeka/examples/search/DocRank.java | 57 ++
.../examples/search/LuceneIndexer.java | 87 +++
.../yooreeka/examples/search/MySearcher.java | 360 ++++++++++++
.../yooreeka/examples/search/PageRank.java | 56 ++
.../examples/spamfilter/EmailClassifier.java | 247 ++++++++
.../examples/spamfilter/EmailInstance.java | 86 +++
.../examples/spamfilter/data/Email.java | 119 ++++
.../examples/spamfilter/data/EmailData.java | 223 +++++++
.../spamfilter/data/EmailDataset.java | 137 +++++
src/org/yooreeka/util/C.java | 64 ++
src/org/yooreeka/util/P.java | 57 ++
src/org/yooreeka/util/gui/GraphGui.java | 152 +++++
src/org/yooreeka/util/gui/XyGui.java | 203 +++++++
.../util/internet/behavior/UserClick.java | 157 +++++
.../util/internet/behavior/UserQuery.java | 159 +++++
.../crawling/FetchAndProcessCrawler.java | 310 ++++++++++
.../util/internet/crawling/YCrawler.java | 197 +++++++
.../crawling/core/BasicWebCrawler.java | 332 +++++++++++
.../internet/crawling/core/CrawlData.java | 99 ++++
.../crawling/core/CrawlDataProcessor.java | 46 ++
.../crawling/core/DocumentFilter.java | 44 ++
.../internet/crawling/core/URLFilter.java | 79 +++
.../internet/crawling/core/URLNormalizer.java | 77 +++
.../internet/crawling/db/FetchedDocsDB.java | 305 ++++++++++
.../util/internet/crawling/db/KnownUrlDB.java | 279 +++++++++
.../util/internet/crawling/db/PageLinkDB.java | 163 ++++++
.../internet/crawling/db/ProcessedDocsDB.java | 413 +++++++++++++
.../crawling/model/FetchedDocument.java | 143 +++++
.../crawling/model/KnownUrlEntry.java | 77 +++
.../util/internet/crawling/model/Outlink.java | 55 ++
.../crawling/transport/common/Transport.java | 43 ++
.../transport/common/TransportException.java | 47 ++
.../transport/file/FileTransport.java | 134 +++++
.../file/FileTransportException.java | 49 ++
.../transport/http/HTTPTransport.java | 260 +++++++++
.../http/HTTPTransportException.java | 46 ++
.../crawling/transport/http/HTTPUtils.java | 142 +++++
.../crawling/util/DocumentIdUtils.java | 56 ++
.../internet/crawling/util/FileUtils.java | 130 +++++
.../util/internet/crawling/util/UrlGroup.java | 71 +++
.../util/internet/crawling/util/UrlUtils.java | 65 +++
.../crawling/util/ValueToIndexMapping.java | 93 +++
.../yooreeka/util/metrics/CosineDistance.java | 58 ++
.../util/metrics/CosineSimilarity.java | 76 +++
.../util/metrics/CosineSimilarityMeasure.java | 56 ++
.../util/metrics/EuclideanDistance.java | 55 ++
.../util/metrics/JaccardCoefficient.java | 77 +++
.../util/metrics/JaccardDistance.java | 57 ++
.../util/metrics/NumericDistance.java | 40 ++
.../util/metrics/SimilarityMeasure.java | 43 ++
.../util/metrics/TermFrequencyBuilder.java | 78 +++
.../util/parsing/common/AbstractDocument.java | 48 ++
.../util/parsing/common/DataEntry.java | 40 ++
.../util/parsing/common/DataField.java | 68 +++
.../util/parsing/common/DataType.java | 40 ++
.../util/parsing/common/DocumentParser.java | 44 ++
.../common/DocumentParserException.java | 45 ++
.../parsing/common/DocumentParserFactory.java | 68 +++
.../parsing/common/ProcessedDocument.java | 198 +++++++
.../util/parsing/csv/CSVDocument.java | 93 +++
.../yooreeka/util/parsing/csv/CSVEntry.java | 108 ++++
.../yooreeka/util/parsing/csv/CSVFile.java | 149 +++++
.../yooreeka/util/parsing/csv/CSVParser.java | 135 +++++
.../yooreeka/util/parsing/csv/CSVSchema.java | 58 ++
.../util/parsing/html/CompositeFilter.java | 64 ++
.../util/parsing/html/ElementNodeFilter.java | 61 ++
.../util/parsing/html/HTMLDocumentParser.java | 457 +++++++++++++++
.../html/HTMLDocumentParserException.java | 49 ++
.../util/parsing/html/HTMLWriter.java | 119 ++++
.../util/parsing/html/LinkNodeFilter.java | 58 ++
.../util/parsing/html/MultiFilter.java | 61 ++
.../parsing/msword/MSWordDocumentParser.java | 103 ++++
.../msword/MSWordDocumentParserException.java | 49 ++
.../util/text/AlphabetProjection.java | 313 ++++++++++
255 files changed, 35422 insertions(+)
create mode 100644 src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java
create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java
create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java
create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java
create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/MST.java
create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/MSTSingleLinkAlgorithm.java
create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/SingleLinkAlgorithm.java
create mode 100644 src/org/yooreeka/algos/clustering/model/Attribute.java
create mode 100644 src/org/yooreeka/algos/clustering/model/Cluster.java
create mode 100644 src/org/yooreeka/algos/clustering/model/DataPoint.java
create mode 100644 src/org/yooreeka/algos/clustering/partitional/KMeansAlgorithm.java
create mode 100644 src/org/yooreeka/algos/clustering/partitional/NearestNeighborAlgorithm.java
create mode 100644 src/org/yooreeka/algos/clustering/rock/LinkMatrix.java
create mode 100644 src/org/yooreeka/algos/clustering/rock/MergeGoodnessMeasure.java
create mode 100644 src/org/yooreeka/algos/clustering/rock/ROCKAlgorithm.java
create mode 100644 src/org/yooreeka/algos/clustering/rock/ROCKClusters.java
create mode 100644 src/org/yooreeka/algos/clustering/rock/SimilarCluster.java
create mode 100644 src/org/yooreeka/algos/clustering/test/MyDiggSpaceData.java
create mode 100644 src/org/yooreeka/algos/clustering/test/MyDiggSpaceDataset.java
create mode 100644 src/org/yooreeka/algos/clustering/test/SFData.java
create mode 100644 src/org/yooreeka/algos/clustering/test/SFDataset.java
create mode 100644 src/org/yooreeka/algos/clustering/utils/Attributes.java
create mode 100644 src/org/yooreeka/algos/clustering/utils/ObjectToIndexMapping.java
create mode 100644 src/org/yooreeka/algos/clustering/utils/SortedArrayClustering.java
create mode 100644 src/org/yooreeka/algos/reco/collab/cache/FileStore.java
create mode 100644 src/org/yooreeka/algos/reco/collab/cache/Store.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/BaseDataset.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/ContentItem.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/DiggData.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/HTMLContent.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/MovieLensData.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/MovieLensDataset.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/MusicData.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/MusicItem.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/MusicRating.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/MusicUser.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/NewsData.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/NewsItem.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/NewsUser.java
create mode 100644 src/org/yooreeka/algos/reco/collab/data/RatingBuilder.java
create mode 100644 src/org/yooreeka/algos/reco/collab/evaluation/EvaluationDataProvider.java
create mode 100644 src/org/yooreeka/algos/reco/collab/evaluation/MovieLensEvaluationDataProvider.java
create mode 100644 src/org/yooreeka/algos/reco/collab/evaluation/MovieLensRMSE.java
create mode 100644 src/org/yooreeka/algos/reco/collab/evaluation/RMSEEstimator.java
create mode 100644 src/org/yooreeka/algos/reco/collab/evaluation/RMSEResult.java
create mode 100644 src/org/yooreeka/algos/reco/collab/model/Content.java
create mode 100644 src/org/yooreeka/algos/reco/collab/model/Dataset.java
create mode 100644 src/org/yooreeka/algos/reco/collab/model/Item.java
create mode 100644 src/org/yooreeka/algos/reco/collab/model/Rating.java
create mode 100644 src/org/yooreeka/algos/reco/collab/model/RecommendationType.java
create mode 100644 src/org/yooreeka/algos/reco/collab/model/SimilarItem.java
create mode 100644 src/org/yooreeka/algos/reco/collab/model/SimilarUser.java
create mode 100644 src/org/yooreeka/algos/reco/collab/model/User.java
create mode 100644 src/org/yooreeka/algos/reco/collab/recommender/Delphi.java
create mode 100644 src/org/yooreeka/algos/reco/collab/recommender/DiggDelphi.java
create mode 100644 src/org/yooreeka/algos/reco/collab/recommender/MovieLensDelphi.java
create mode 100644 src/org/yooreeka/algos/reco/collab/recommender/PredictedItemRating.java
create mode 100644 src/org/yooreeka/algos/reco/collab/recommender/Recommender.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/movielens/MovieLensItemSimilarity.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/movielens/MovieLensUserSimilarity.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/ImprovedItemBasedSimilarity.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/ImprovedUserBasedSimilarity.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/ItemBasedSimilarity.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/ItemContentBasedSimilarity.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/ItemPenaltyBasedSimilarity.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/SimilarityMatrix.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/SimilarityMatrixImpl.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/UserBasedSimilarity.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/UserContentBasedSimilarity.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/UserItemContentBasedSimilarity.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/triangular/UpperTriangularSimilarityMatrix.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/triangular/UpperTriangularSimilarityMatrixImpl.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/util/PearsonCorrelation.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/util/RatingCountMatrix.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixCache.java
create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixRepository.java
create mode 100644 src/org/yooreeka/algos/reco/content/digg/DiggCategory.java
create mode 100644 src/org/yooreeka/algos/reco/content/digg/DiggService.java
create mode 100644 src/org/yooreeka/algos/reco/content/digg/DiggStoryItem.java
create mode 100644 src/org/yooreeka/algos/reco/content/digg/DiggUser.java
create mode 100644 src/org/yooreeka/algos/search/data/SearchResult.java
create mode 100644 src/org/yooreeka/algos/search/lucene/LuceneIndexBuilder.java
create mode 100644 src/org/yooreeka/algos/search/lucene/analyzer/CustomAnalyzer.java
create mode 100644 src/org/yooreeka/algos/search/lucene/analyzer/TextDocumentTerms.java
create mode 100644 src/org/yooreeka/algos/search/ranking/DocRankMatrixBuilder.java
create mode 100644 src/org/yooreeka/algos/search/ranking/PageRankMatrixBuilder.java
create mode 100644 src/org/yooreeka/algos/search/ranking/PageRankMatrixH.java
create mode 100644 src/org/yooreeka/algos/search/ranking/Rank.java
create mode 100644 src/org/yooreeka/algos/search/ranking/RelevanceScore.java
create mode 100644 src/org/yooreeka/algos/search/util/TermFreqMapUtils.java
create mode 100644 src/org/yooreeka/algos/taxis/bayesian/NaiveBayes.java
create mode 100644 src/org/yooreeka/algos/taxis/boosting/BoostingARCX4Classifier.java
create mode 100644 src/org/yooreeka/algos/taxis/boosting/WeightBasedRandom.java
create mode 100644 src/org/yooreeka/algos/taxis/core/AttributeValue.java
create mode 100644 src/org/yooreeka/algos/taxis/core/BaseConcept.java
create mode 100644 src/org/yooreeka/algos/taxis/core/BaseInstance.java
create mode 100644 src/org/yooreeka/algos/taxis/core/DoubleAttribute.java
create mode 100644 src/org/yooreeka/algos/taxis/core/StringAttribute.java
create mode 100644 src/org/yooreeka/algos/taxis/core/TrainingSet.java
create mode 100644 src/org/yooreeka/algos/taxis/core/intf/Attribute.java
create mode 100644 src/org/yooreeka/algos/taxis/core/intf/Classifier.java
create mode 100644 src/org/yooreeka/algos/taxis/core/intf/Concept.java
create mode 100644 src/org/yooreeka/algos/taxis/core/intf/Instance.java
create mode 100644 src/org/yooreeka/algos/taxis/ensemble/ClassifierEnsemble.java
create mode 100644 src/org/yooreeka/algos/taxis/ensemble/ConceptMajorityVoter.java
create mode 100644 src/org/yooreeka/algos/taxis/evaluation/ClassifierResults.java
create mode 100644 src/org/yooreeka/algos/taxis/evaluation/CochransQTest.java
create mode 100644 src/org/yooreeka/algos/taxis/evaluation/Diff2PropTest.java
create mode 100644 src/org/yooreeka/algos/taxis/evaluation/FTest.java
create mode 100644 src/org/yooreeka/algos/taxis/evaluation/McNemarTest.java
create mode 100644 src/org/yooreeka/algos/taxis/evaluation/Test.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/XORNetwork.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/BaseLayer.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/BaseLink.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/BaseNN.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/BaseNode.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/LinearNode.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/SigmoidNode.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/intf/Layer.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/intf/Link.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/intf/NeuralNetwork.java
create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/intf/Node.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/AttributeDefinition.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/AttributeSelector.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/AttributeUtils.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/Branch.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/BranchGroup.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/ConceptUtils.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/DecisionTreeClassifier.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/InfoGain.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/Node.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/SplittingCriterion.java
create mode 100644 src/org/yooreeka/algos/taxis/tree/TrueErrorRateEstimator.java
create mode 100644 src/org/yooreeka/config/YooreekaConfigurator.java
create mode 100644 src/org/yooreeka/examples/credit/BaggingCreditClassifier.java
create mode 100644 src/org/yooreeka/examples/credit/BoostingCreditClassifier.java
create mode 100644 src/org/yooreeka/examples/credit/CreditConcept.java
create mode 100644 src/org/yooreeka/examples/credit/CreditInstance.java
create mode 100644 src/org/yooreeka/examples/credit/DTCreditClassifier.java
create mode 100644 src/org/yooreeka/examples/credit/NBCreditClassifier.java
create mode 100644 src/org/yooreeka/examples/credit/NNCreditClassifier.java
create mode 100644 src/org/yooreeka/examples/credit/UserCreditNN.java
create mode 100644 src/org/yooreeka/examples/credit/data/UseCaseData.java
create mode 100644 src/org/yooreeka/examples/credit/data/UserDataset.java
create mode 100644 src/org/yooreeka/examples/credit/data/UserLoader.java
create mode 100644 src/org/yooreeka/examples/credit/data/users/BadUserType.java
create mode 100644 src/org/yooreeka/examples/credit/data/users/DangerousUserType.java
create mode 100644 src/org/yooreeka/examples/credit/data/users/ExcellentUserType.java
create mode 100644 src/org/yooreeka/examples/credit/data/users/GoodUserType.java
create mode 100644 src/org/yooreeka/examples/credit/data/users/User.java
create mode 100644 src/org/yooreeka/examples/credit/data/users/UserType.java
create mode 100644 src/org/yooreeka/examples/credit/data/users/VeryGoodUserType.java
create mode 100644 src/org/yooreeka/examples/credit/util/AttributeInfo.java
create mode 100644 src/org/yooreeka/examples/credit/util/AttributeUtils.java
create mode 100644 src/org/yooreeka/examples/credit/util/BootstrapTrainingSetBuilder.java
create mode 100644 src/org/yooreeka/examples/credit/util/ClassifierResults.java
create mode 100644 src/org/yooreeka/examples/credit/util/CreditDataUtils.java
create mode 100644 src/org/yooreeka/examples/credit/util/CreditErrorEstimator.java
create mode 100644 src/org/yooreeka/examples/credit/util/DataGenerator.java
create mode 100644 src/org/yooreeka/examples/credit/util/UserInstanceBuilder.java
create mode 100644 src/org/yooreeka/examples/fraud/DTFraudClassifier.java
create mode 100644 src/org/yooreeka/examples/fraud/NNFraudClassifier.java
create mode 100644 src/org/yooreeka/examples/fraud/TransactionConcept.java
create mode 100644 src/org/yooreeka/examples/fraud/TransactionInstance.java
create mode 100644 src/org/yooreeka/examples/fraud/TransactionNN.java
create mode 100644 src/org/yooreeka/examples/fraud/data/Transaction.java
create mode 100644 src/org/yooreeka/examples/fraud/data/TransactionDataset.java
create mode 100644 src/org/yooreeka/examples/fraud/data/TransactionInstanceBuilder.java
create mode 100644 src/org/yooreeka/examples/fraud/data/TransactionLoader.java
create mode 100644 src/org/yooreeka/examples/fraud/data/TransactionLocation.java
create mode 100644 src/org/yooreeka/examples/fraud/util/DataGenerator.java
create mode 100644 src/org/yooreeka/examples/fraud/util/FraudDataUtils.java
create mode 100644 src/org/yooreeka/examples/fraud/util/FraudErrorEstimator.java
create mode 100644 src/org/yooreeka/examples/fraud/util/TenUsersSample.java
create mode 100644 src/org/yooreeka/examples/fraud/util/TransactionSetProfile.java
create mode 100644 src/org/yooreeka/examples/fraud/util/UserStatistics.java
create mode 100644 src/org/yooreeka/examples/fraud/util/UserStatisticsCalculator.java
create mode 100644 src/org/yooreeka/examples/newsgroups/NewsCrawler.java
create mode 100644 src/org/yooreeka/examples/recommender/MovieLensRMSESample.java
create mode 100644 src/org/yooreeka/examples/recommender/RatingGrapher.java
create mode 100644 src/org/yooreeka/examples/recommender/Recommender.java
create mode 100644 src/org/yooreeka/examples/search/DocRank.java
create mode 100644 src/org/yooreeka/examples/search/LuceneIndexer.java
create mode 100644 src/org/yooreeka/examples/search/MySearcher.java
create mode 100644 src/org/yooreeka/examples/search/PageRank.java
create mode 100644 src/org/yooreeka/examples/spamfilter/EmailClassifier.java
create mode 100644 src/org/yooreeka/examples/spamfilter/EmailInstance.java
create mode 100644 src/org/yooreeka/examples/spamfilter/data/Email.java
create mode 100644 src/org/yooreeka/examples/spamfilter/data/EmailData.java
create mode 100644 src/org/yooreeka/examples/spamfilter/data/EmailDataset.java
create mode 100644 src/org/yooreeka/util/C.java
create mode 100644 src/org/yooreeka/util/P.java
create mode 100644 src/org/yooreeka/util/gui/GraphGui.java
create mode 100644 src/org/yooreeka/util/gui/XyGui.java
create mode 100644 src/org/yooreeka/util/internet/behavior/UserClick.java
create mode 100644 src/org/yooreeka/util/internet/behavior/UserQuery.java
create mode 100644 src/org/yooreeka/util/internet/crawling/FetchAndProcessCrawler.java
create mode 100644 src/org/yooreeka/util/internet/crawling/YCrawler.java
create mode 100644 src/org/yooreeka/util/internet/crawling/core/BasicWebCrawler.java
create mode 100644 src/org/yooreeka/util/internet/crawling/core/CrawlData.java
create mode 100644 src/org/yooreeka/util/internet/crawling/core/CrawlDataProcessor.java
create mode 100644 src/org/yooreeka/util/internet/crawling/core/DocumentFilter.java
create mode 100644 src/org/yooreeka/util/internet/crawling/core/URLFilter.java
create mode 100644 src/org/yooreeka/util/internet/crawling/core/URLNormalizer.java
create mode 100644 src/org/yooreeka/util/internet/crawling/db/FetchedDocsDB.java
create mode 100644 src/org/yooreeka/util/internet/crawling/db/KnownUrlDB.java
create mode 100644 src/org/yooreeka/util/internet/crawling/db/PageLinkDB.java
create mode 100644 src/org/yooreeka/util/internet/crawling/db/ProcessedDocsDB.java
create mode 100644 src/org/yooreeka/util/internet/crawling/model/FetchedDocument.java
create mode 100644 src/org/yooreeka/util/internet/crawling/model/KnownUrlEntry.java
create mode 100644 src/org/yooreeka/util/internet/crawling/model/Outlink.java
create mode 100644 src/org/yooreeka/util/internet/crawling/transport/common/Transport.java
create mode 100644 src/org/yooreeka/util/internet/crawling/transport/common/TransportException.java
create mode 100644 src/org/yooreeka/util/internet/crawling/transport/file/FileTransport.java
create mode 100644 src/org/yooreeka/util/internet/crawling/transport/file/FileTransportException.java
create mode 100644 src/org/yooreeka/util/internet/crawling/transport/http/HTTPTransport.java
create mode 100644 src/org/yooreeka/util/internet/crawling/transport/http/HTTPTransportException.java
create mode 100644 src/org/yooreeka/util/internet/crawling/transport/http/HTTPUtils.java
create mode 100644 src/org/yooreeka/util/internet/crawling/util/DocumentIdUtils.java
create mode 100644 src/org/yooreeka/util/internet/crawling/util/FileUtils.java
create mode 100644 src/org/yooreeka/util/internet/crawling/util/UrlGroup.java
create mode 100644 src/org/yooreeka/util/internet/crawling/util/UrlUtils.java
create mode 100644 src/org/yooreeka/util/internet/crawling/util/ValueToIndexMapping.java
create mode 100644 src/org/yooreeka/util/metrics/CosineDistance.java
create mode 100644 src/org/yooreeka/util/metrics/CosineSimilarity.java
create mode 100644 src/org/yooreeka/util/metrics/CosineSimilarityMeasure.java
create mode 100644 src/org/yooreeka/util/metrics/EuclideanDistance.java
create mode 100644 src/org/yooreeka/util/metrics/JaccardCoefficient.java
create mode 100644 src/org/yooreeka/util/metrics/JaccardDistance.java
create mode 100644 src/org/yooreeka/util/metrics/NumericDistance.java
create mode 100644 src/org/yooreeka/util/metrics/SimilarityMeasure.java
create mode 100644 src/org/yooreeka/util/metrics/TermFrequencyBuilder.java
create mode 100644 src/org/yooreeka/util/parsing/common/AbstractDocument.java
create mode 100644 src/org/yooreeka/util/parsing/common/DataEntry.java
create mode 100644 src/org/yooreeka/util/parsing/common/DataField.java
create mode 100644 src/org/yooreeka/util/parsing/common/DataType.java
create mode 100644 src/org/yooreeka/util/parsing/common/DocumentParser.java
create mode 100644 src/org/yooreeka/util/parsing/common/DocumentParserException.java
create mode 100644 src/org/yooreeka/util/parsing/common/DocumentParserFactory.java
create mode 100644 src/org/yooreeka/util/parsing/common/ProcessedDocument.java
create mode 100644 src/org/yooreeka/util/parsing/csv/CSVDocument.java
create mode 100644 src/org/yooreeka/util/parsing/csv/CSVEntry.java
create mode 100644 src/org/yooreeka/util/parsing/csv/CSVFile.java
create mode 100644 src/org/yooreeka/util/parsing/csv/CSVParser.java
create mode 100644 src/org/yooreeka/util/parsing/csv/CSVSchema.java
create mode 100644 src/org/yooreeka/util/parsing/html/CompositeFilter.java
create mode 100644 src/org/yooreeka/util/parsing/html/ElementNodeFilter.java
create mode 100644 src/org/yooreeka/util/parsing/html/HTMLDocumentParser.java
create mode 100644 src/org/yooreeka/util/parsing/html/HTMLDocumentParserException.java
create mode 100644 src/org/yooreeka/util/parsing/html/HTMLWriter.java
create mode 100644 src/org/yooreeka/util/parsing/html/LinkNodeFilter.java
create mode 100644 src/org/yooreeka/util/parsing/html/MultiFilter.java
create mode 100644 src/org/yooreeka/util/parsing/msword/MSWordDocumentParser.java
create mode 100644 src/org/yooreeka/util/parsing/msword/MSWordDocumentParserException.java
create mode 100644 src/org/yooreeka/util/text/AlphabetProjection.java
diff --git a/src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java b/src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java
new file mode 100644
index 0000000..82dfd83
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java
@@ -0,0 +1,448 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.dbscan;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.algos.clustering.utils.ObjectToIndexMapping;
+import org.yooreeka.util.P;
+import org.yooreeka.util.metrics.NumericDistance;
+import org.yooreeka.util.metrics.TermFrequencyBuilder;
+
+/**
+ * Implementation of DBSCAN clustering algorithm.
+ *
+ * Algorithm parameters:
+ *
+ * - Eps - threshold value to determine point neighbors. Two points are
+ * neighbors if the distance between them does not exceed this threshold value.
+ * - MinPts - minimum number of points in any cluster.
+ *
+ * Choice of parameter values depends on the data.
+ *
+ *
+ * Point types:
+ *
+ * - Core point - point that belongs to the core of the cluster. It has at
+ * least MinPts neighboring points.
+ * - Border point - is a neighbor to at least one core point but it doesn't
+ * have enough neighbors to be a core point.
+ * - Noise point - is a point that doesn't belong to any cluster because it is
+ * not close to any of the core points.
+ *
+ */
+public class DBSCANAlgorithm {
+
+ private static final Logger LOG = Logger.getLogger(DBSCANAlgorithm.class
+ .getName());
+
+ private static double[][] calculateAdjacencyMatrix(NumericDistance distance,
+ DataPoint[] points, boolean useTermFrequencies) {
+ int n = points.length;
+ double[][] a = new double[n][n];
+ for (int i = 0; i < n; i++) {
+ double[] x = points[i].getNumericAttrValues();
+ for (int j = i + 1; j < n; j++) {
+ double[] y;
+ if (useTermFrequencies) {
+ double[][] tfVectors = TermFrequencyBuilder
+ .buildTermFrequencyVectors(
+ points[i].getTextAttrValues(),
+ points[j].getTextAttrValues());
+ x = tfVectors[0];
+ y = tfVectors[1];
+ } else {
+ y = points[j].getNumericAttrValues();
+ }
+ a[i][j] = distance.getDistance(x, y);
+ a[j][i] = a[i][j];
+ }
+ a[i][i] = 0.0;
+ }
+ return a;
+ }
+
+ public static void main(String[] args) {
+
+ DataPoint[] elements = new DataPoint[5];
+ elements[0] = new DataPoint("A", new double[] {});
+ elements[1] = new DataPoint("B", new double[] {});
+ elements[2] = new DataPoint("C", new double[] {});
+ elements[3] = new DataPoint("D", new double[] {});
+ elements[4] = new DataPoint("E", new double[] {});
+
+ double[][] a = new double[][] { { 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 2 },
+ { 2, 2, 2, 11, 31 }, { 2, 2, 2, 10, 30 }, { 60, 60, 60, 0, 0 } };
+
+ double eps = 0.5;
+ int minPoints = 2;
+
+ DBSCANAlgorithm dbscan = new DBSCANAlgorithm(elements, a, eps,
+ minPoints);
+
+ printResults(dbscan.cluster(), eps,minPoints);
+ }
+
+ /*
+  * Data points for clustering.
+  */
+ private DataPoint[] points;
+
+ /*
+  * Adjacency matrix. Contains distances between points.
+  */
+ private double[][] adjacencyMatrix;
+
+ /*
+  * Threshold value. Determines which points will be considered as neighbors.
+  * Two points are neighbors if the distance between them does not exceed
+  * threshold value.
+  */
+ private double eps;
+
+ /*
+  * Identifies a set of Noise points.
+  */
+ private static final int CLUSTER_ID_NOISE = -1;
+
+ /*
+  * Identifies a set of Unclassified points.
+  */
+ private static final int CLUSTER_ID_UNCLASSIFIED = 0;
+
+ /*
+  * Sequence that is used to generate next cluster id.
+  */
+ private int nextClusterId = 1;
+
+ /*
+  * Sets of points keyed by cluster id. Initially all points will be assigned
+  * into Unclassified points set.
+  */
+ private Map<Integer, Set<DataPoint>> clusters = new LinkedHashMap<Integer, Set<DataPoint>>();
+
+ /*
+  * Number of points that should exist in the neighborhood for a point to be
+  * a core point. Best value for this parameter depends on the data set.
+  */
+ private int minPoints;
+
+ /* Translates between a data point and the index used for its row and
+  * column in the adjacency matrix. */
+ private ObjectToIndexMapping<DataPoint> idxMapping = new ObjectToIndexMapping<DataPoint>();
+
+ private boolean verbose = true;
+
+ /**
+ * Initializes algorithm with all data that it needs.
+ *
+ * @param points
+ * all points to cluster
+ * @param distance
+ * metric distance function
+ * @param eps
+ * threshold value used to calculate point neighborhood.
+ * @param minPoints
+ * number of neighbors for point to be considered a core point.
+ */
+ public DBSCANAlgorithm(DataPoint[] points, NumericDistance distance, double eps,
+ int minPoints, boolean useTermFrequencies) {
+
+ init(points, eps, minPoints);
+ this.adjacencyMatrix = calculateAdjacencyMatrix(distance, points,
+ useTermFrequencies);
+ }
+
+ /**
+ * Initializes algorithm with all data that it needs.
+ *
+ * @param points
+ * points to cluster
+ * @param adjacencyMatrix
+ * adjacency matrix with distances
+ * @param eps
+ * distance threshold value
+ * @param minPoints
+ * number of neighbors for point to be considered a core point.
+ */
+ public DBSCANAlgorithm(DataPoint[] points, double[][] adjacencyMatrix,
+ double eps, int minPoints) {
+ init(points, eps, minPoints);
+ this.adjacencyMatrix = adjacencyMatrix;
+ }
+
+ private void assignPointToCluster(DataPoint p, int clusterId) {
+     // Moves p into the group identified by clusterId, creating that group on
+     // first use. A point may leave the NOISE or UNCLASSIFIED group freely; a
+     // point that is in neither of them may only be registered as UNCLASSIFIED.
+     if (isNoise(p)) {
+         removePointFromCluster(p, CLUSTER_ID_NOISE);
+     } else if (isUnclassified(p)) {
+         removePointFromCluster(p, CLUSTER_ID_UNCLASSIFIED);
+     } else if (clusterId != CLUSTER_ID_UNCLASSIFIED) {
+         // Neither NOISE nor UNCLASSIFIED: the point is unknown or already
+         // owned by a real cluster, so re-assigning it is treated as an error.
+         throw new RuntimeException(
+                 "Trying to move point that has already been "
+                         + "assigned to some other cluster. Point: " + p
+                         + ", clusterId=" + clusterId);
+     }
+     // Otherwise a brand new point is being registered as UNCLASSIFIED.
+
+     // Named "members" so the local does not shadow the "points" field.
+     Set<DataPoint> members = clusters.get(clusterId);
+     if (members == null) {
+         members = new HashSet<DataPoint>();
+         clusters.put(clusterId, members);
+     }
+     members.add(p);
+ }
+
+ private void assignPointToCluster(Set points, int clusterId) {
+ for (DataPoint p : points) {
+ assignPointToCluster(p, clusterId);
+ }
+ }
+
+ public List<Cluster> cluster() {
+     // Runs DBSCAN over all points and returns every non-empty group as a
+     // Cluster labelled with its numeric id (the noise group included).
+     int clusterId = getNextClusterId();
+
+     for (DataPoint p : points) {
+         // Points already claimed by a cluster or marked as noise are
+         // skipped; only still-unclassified points can seed a new cluster.
+         if (isUnclassified(p)) {
+             boolean isClusterCreated = createCluster(p, clusterId);
+             if (isClusterCreated) {
+                 // The id was consumed; generate one for the next cluster.
+                 clusterId = getNextClusterId();
+             }
+         }
+     }
+
+     // Convert sets of points into clusters...
+     List<Cluster> allClusters = new ArrayList<Cluster>();
+
+     for (Map.Entry<Integer, Set<DataPoint>> e : clusters.entrySet()) {
+
+         String label = String.valueOf(e.getKey());
+         Set<DataPoint> members = e.getValue();
+
+         // Empty groups are not reported.
+         if (members != null && !members.isEmpty()) {
+             Cluster cluster = new Cluster(label, members);
+             allClusters.add(cluster);
+         }
+     }
+
+     // Group with Noise elements returned as well
+     return allClusters;
+ }
+
+ private boolean createCluster(DataPoint p, Integer clusterId) {
+     // Tries to grow a new cluster, identified by clusterId, starting from p.
+     // Returns true if p is a core point and a cluster was created; returns
+     // false if p was filed under NOISE instead.
+     Set<DataPoint> neighbors = findNeighbors(p, eps);
+
+     if (neighbors.size() < minPoints) {
+         // Assign point into "Noise" group.
+         // It will have a chance to become a border point later on.
+         assignPointToCluster(p, CLUSTER_ID_NOISE);
+
+         // return false to indicate that we didn't create any cluster
+         return false;
+     }
+
+     /*
+      * p is a core point. Its neighbors join the new cluster unless they
+      * already belong to an earlier one: a border point can lie within eps of
+      * core points of two different clusters, and assignPointToCluster throws
+      * a RuntimeException when asked to move a point out of a real cluster.
+      * Such points stay with the cluster that claimed them first.
+      */
+     Set<DataPoint> seeds = new HashSet<DataPoint>();
+     for (DataPoint neighbor : neighbors) {
+         if (isUnclassified(neighbor) || isNoise(neighbor)) {
+             seeds.add(neighbor);
+         }
+     }
+     assignPointToCluster(seeds, clusterId);
+
+     // Remove point itself.
+     seeds.remove(p);
+
+     // Process the rest of the neighbors...
+     while (!seeds.isEmpty()) {
+         // pick the first neighbor
+         DataPoint seed = seeds.iterator().next();
+
+         // process neighbor
+         Set<DataPoint> seedNeighbors = findNeighbors(seed, eps);
+
+         if (seedNeighbors.size() >= minPoints) {
+
+             // seed is another core point.
+             for (DataPoint candidate : seedNeighbors) {
+
+                 if (isNoise(candidate)) {
+                     /*
+                      * It's a border point. We know that it doesn't have
+                      * enough neighbors to be a core point. Just add it
+                      * to the cluster.
+                      */
+                     assignPointToCluster(candidate, clusterId);
+
+                 } else if (isUnclassified(candidate)) {
+                     /*
+                      * We don't know if this point has enough neighbors
+                      * to be a core point... add it to the list of
+                      * points to be checked and assign it to the cluster.
+                      */
+                     seeds.add(candidate);
+                     assignPointToCluster(candidate, clusterId);
+                 }
+             }
+         }
+         // else: the neighbor is just a border point, nothing to expand.
+
+         seeds.remove(seed);
+     }
+
+     // return true to indicate that we did create a cluster
+     return true;
+ }
+
+ private Set findNeighbors(DataPoint p, double threshold) {
+ Set neighbors = new HashSet();
+ int i = idxMapping.getIndex(p);
+ for (int j = 0, n = idxMapping.getSize(); j < n; j++) {
+ if (adjacencyMatrix[i][j] <= threshold) {
+ neighbors.add(idxMapping.getObject(j));
+ }
+ }
+ return neighbors;
+ }
+
+ private int getNextClusterId() {
+ return nextClusterId++;
+ }
+
+ private void init(DataPoint[] points, double neighborThreshold,
+ int minPoints) {
+ // Stores the parameters and registers every point as UNCLASSIFIED.
+ LOG.setLevel(Level.FINEST); //YooreekaConfigurator.getLevel(DBSCANAlgorithm.class.getName()));
+ // NOTE(review): the level is hard-coded here on the class-wide logger.
+ this.points = points;
+ this.eps = neighborThreshold;
+ this.minPoints = minPoints;
+
+ for (DataPoint p : points) {
+ // Creating a Point <-> Index mapping for all points
+ idxMapping.getIndex(p);
+ // Assign all points into "Unclassified" group
+ assignPointToCluster(p, CLUSTER_ID_UNCLASSIFIED);
+ }
+ }
+
+ private boolean isNoise(DataPoint p) {
+ return isPointInCluster(p, CLUSTER_ID_NOISE);
+ }
+
+ private boolean isPointInCluster(DataPoint p, int clusterId) {
+ boolean inCluster = false;
+ Set points = clusters.get(clusterId);
+ if (points != null) {
+ inCluster = points.contains(p);
+ }
+ return inCluster;
+ }
+
+ private boolean isUnclassified(DataPoint p) {
+ return isPointInCluster(p, CLUSTER_ID_UNCLASSIFIED);
+
+ }
+
+ public boolean isVerbose() {
+ return verbose;
+ }
+
+ public void printDistances() {
+ LOG.info("Point Similarity matrix:");
+ for (int i = 0; i < adjacencyMatrix.length; i++) {
+ LOG.info(Arrays.toString(adjacencyMatrix[i]));
+ }
+ }
+
+ public static void printResults(List<Cluster> allClusters, double eps, int minPoints) {
+     // Prints every regular cluster first and the noise group, if any, last.
+     StringBuilder sb = new StringBuilder();
+     sb.append("DBSCAN Clustering with NeighborThreshold=").append(eps);
+     sb.append(", minPoints=").append(minPoints).append("\n").append("Clusters:\n");
+     String noiseElements = "no noise elements";
+     for (Cluster c : allClusters) {
+         if (String.valueOf(CLUSTER_ID_NOISE).equals(c.getLabel())) {
+             // print noise data at the end
+             noiseElements = c.getElementsAsString();
+         } else {
+             sb.append("____________________________________________________________\n");
+             sb.append(c.getLabel()).append(": \n").append(c.getElementsAsString());
+             sb.append("____________________________________________________________\n\n");
+         }
+     }
+     sb.append("Noise Elements:\n ").append(noiseElements).append("\n");
+     P.println(sb.toString());
+ }
+ private boolean removePointFromCluster(DataPoint p, int clusterId) {
+ boolean removed = false;
+ Set points = clusters.get(clusterId);
+ if (points != null) {
+ removed = points.remove(p);
+ }
+ return removed;
+ }
+
+ public void setVerbose(boolean verbose) {
+ this.verbose = verbose;
+ }
+
+}
diff --git a/src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java b/src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java
new file mode 100644
index 0000000..e5963e0
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java
@@ -0,0 +1,145 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.hierarchical;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.algos.clustering.utils.ObjectToIndexMapping;
+
+/** A hierarchical agglomerative clustering algorithm based on the average link */
+public class AverageLinkAlgorithm {
+
+ public static void main(String[] args) {
+ // Define data
+ DataPoint[] elements = new DataPoint[5];
+ elements[0] = new DataPoint("A", new double[] {});
+ elements[1] = new DataPoint("B", new double[] {});
+ elements[2] = new DataPoint("C", new double[] {});
+ elements[3] = new DataPoint("D", new double[] {});
+ elements[4] = new DataPoint("E", new double[] {});
+
+ double[][] a = new double[][] { { 0, 1, 2, 2, 3 }, { 1, 0, 2, 4, 3 },
+ { 2, 2, 0, 1, 5 }, { 2, 4, 1, 0, 3 }, { 3, 3, 5, 3, 0 } };
+
+ AverageLinkAlgorithm ca = new AverageLinkAlgorithm(elements, a);
+ Dendrogram dnd = ca.cluster();
+ dnd.printAll();
+ }
+ private DataPoint[] elements;
+ private double[][] a;
+
+ private ClusterSet allClusters;
+
+ public AverageLinkAlgorithm(DataPoint[] elements, double[][] adjacencyMatrix) {
+ this.elements = elements;
+ this.a = adjacencyMatrix;
+ this.allClusters = new ClusterSet();
+ }
+
+ public Dendrogram cluster() {
+     // Builds the dendrogram bottom-up: the level for distance 0.0 holds one
+     // cluster per element; the threshold then grows from 1.0 in steps of 0.5
+     // until a single cluster remains.
+
+     Dendrogram dendrogram = new Dendrogram("Distance");
+
+     // initially load all elements as individual clusters
+     for (DataPoint element : elements) {
+         allClusters.add(new Cluster(element));
+     }
+
+     // record the starting state before any merge takes place
+     dendrogram.addLevel(String.valueOf(0.0), allClusters.getAllClusters());
+
+     // raise the threshold until everything has been merged
+     for (double threshold = 1.0; allClusters.size() > 1; threshold += 0.5) {
+         int sizeBefore = allClusters.size();
+         mergeClusters(threshold);
+
+         // it is possible that there were no clusters to merge for the
+         // current threshold; a level is added only when something merged
+         if (sizeBefore > allClusters.size()) {
+             dendrogram.addLevel(String.valueOf(threshold), allClusters.getAllClusters());
+         }
+     }
+
+     return dendrogram;
+ }
+
+ private void mergeClusters(double distanceThreshold) {
+     // One agglomeration pass: computes the average pairwise distance between
+     // the current clusters and merges pairs that are within the threshold.
+     int nClusters = allClusters.size();
+     ObjectToIndexMapping<Cluster> idxMapping = new ObjectToIndexMapping<Cluster>();
+     double[][] clusterDistances = new double[nClusters][nClusters];
+
+     // Sum the element-to-element distances for every pair of distinct clusters.
+     for (int i = 0, n = a.length; i < n; i++) {
+         for (int j = i + 1, k = a.length; j < k; j++) {
+             double d = a[i][j];
+             if (d > 0) {
+                 Cluster c1 = allClusters.findClusterByElement(elements[i]);
+                 Cluster c2 = allClusters.findClusterByElement(elements[j]);
+                 if (!c1.equals(c2)) {
+                     int ci = idxMapping.getIndex(c1);
+                     int cj = idxMapping.getIndex(c2);
+                     clusterDistances[ci][cj] += d;
+                     clusterDistances[cj][ci] += d;
+                 }
+             }
+         }
+     }
+
+     // NOTE(review): only clusters seen in the loop above get an index; this
+     // presumably assumes every cluster has a non-zero distance to another one.
+     boolean[] merged = new boolean[clusterDistances.length];
+     for (int i = 0, n = clusterDistances.length; i < n; i++) {
+         for (int j = i + 1, k = clusterDistances.length; j < k; j++) {
+             Cluster ci = idxMapping.getObject(i);
+             Cluster cj = idxMapping.getObject(j);
+             // Turn the accumulated sum into the average over all pairs.
+             int pairs = ci.size() * cj.size();
+             clusterDistances[i][j] = clusterDistances[i][j] / pairs;
+             clusterDistances[j][i] = clusterDistances[i][j];
+             // merge clusters if distance is below the threshold; each cluster
+             // takes part in at most one merge per pass
+             if (!merged[i] && !merged[j] && clusterDistances[i][j] <= distanceThreshold) {
+                 allClusters.remove(ci);
+                 allClusters.remove(cj);
+                 Cluster mergedCluster = new Cluster(ci, cj);
+                 allClusters.add(mergedCluster);
+                 merged[i] = true;
+                 merged[j] = true;
+             }
+         }
+     }
+ }
+}
diff --git a/src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java b/src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java
new file mode 100644
index 0000000..236fb38
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java
@@ -0,0 +1,83 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.hierarchical;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+
+/**
+ * Set of clusters.
+ */
+public class ClusterSet {
+
+     /** Clusters currently held by this set. */
+     private final Set<Cluster> allClusters = new HashSet<Cluster>();
+
+     /** Adds a cluster; returns the result of the underlying Set.add call. */
+     public boolean add(Cluster c) {
+         return allClusters.add(c);
+     }
+
+     /**
+      * Returns a cluster that contains the given element, or null when no
+      * cluster in this set contains it.
+      */
+     public Cluster findClusterByElement(DataPoint e) {
+         for (Cluster c : allClusters) {
+             if (c.contains(e)) {
+                 return c;
+             }
+         }
+         return null;
+     }
+
+     /**
+      * Returns a new list of the clusters; changing it does not affect the set.
+      */
+     public List<Cluster> getAllClusters() {
+         return new ArrayList<Cluster>(allClusters);
+     }
+
+     /** Removes a cluster; returns the result of the underlying Set.remove call. */
+     public boolean remove(Cluster c) {
+         return allClusters.remove(c);
+     }
+
+     /** Returns the number of clusters in this set. */
+     public int size() {
+         return allClusters.size();
+     }
+}
diff --git a/src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java b/src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java
new file mode 100644
index 0000000..bdfd51f
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java
@@ -0,0 +1,162 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.hierarchical;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.config.YooreekaConfigurator;
+
+public class Dendrogram {
+
+     private static final Logger LOG = Logger.getLogger(Dendrogram.class.getName());
+
+     /* Clusters by level, in the order the levels were added. */
+     private Map<Integer, ClusterSet> entryMap;
+     /* Label of each level, keyed like entryMap. */
+     private Map<Integer, String> levelLabels;
+     /* Number that the next call to addLevel will use. */
+     private int nextLevel;
+     /* Name printed in front of a level label when a level is logged. */
+     private String levelLabelName;
+
+     public Dendrogram(String levelLabelName) {
+         LOG.setLevel(YooreekaConfigurator.getLevel(Dendrogram.class.getName()));
+         entryMap = new LinkedHashMap<Integer, ClusterSet>();
+         levelLabels = new LinkedHashMap<Integer, String>();
+         nextLevel = 1;
+         this.levelLabelName = levelLabelName;
+     }
+
+     /** Creates a new dendrogram level that holds a copy of a single cluster. */
+     public int addLevel(String label, Cluster cluster) {
+         List<Cluster> values = new ArrayList<Cluster>();
+         values.add(cluster);
+         return addLevel(label, values);
+     }
+
+     /**
+      * Creates a new dendrogram level using copies of provided clusters.
+      *
+      * @return number assigned to the new level
+      */
+     public int addLevel(String label, Collection<Cluster> clusters) {
+         ClusterSet clusterSet = new ClusterSet();
+         for (Cluster c : clusters) {
+             // copy cluster before adding - over time cluster elements may
+             // change but for dendrogram we want to keep current state.
+             clusterSet.add(c.copy());
+         }
+         int level = nextLevel;
+         entryMap.put(level, clusterSet);
+         levelLabels.put(level, label);
+         nextLevel++;
+         return level;
+     }
+
+     /** Returns the numbers of all levels, in insertion order. */
+     public List<Integer> getAllLevels() {
+         return new ArrayList<Integer>(entryMap.keySet());
+     }
+
+     /** Returns the clusters of a level; an empty list if the level is unknown. */
+     public List<Cluster> getClustersForLevel(int level) {
+         ClusterSet cs = entryMap.get(level);
+         if (cs == null) {
+             return new ArrayList<Cluster>();
+         }
+         return cs.getAllClusters();
+     }
+
+     /** Returns the label of a level, or null when the level does not exist. */
+     public String getLabelForLevel(int level) {
+         return levelLabels.get(level);
+     }
+
+     /** Returns one less than the number that the next added level will get. */
+     public int getTopLevel() {
+         return nextLevel - 1;
+     }
+
+     /** Logs the clusters of a level that have more than one element. */
+     public void print(int level) {
+         ClusterSet clusters = entryMap.get(level);
+         if (clusters == null) {
+             // unknown level: nothing to print
+             return;
+         }
+         LOG.info("Clusters for: level=" + level + ", "
+                 + levelLabelName + "=" + levelLabels.get(level));
+         for (Cluster c : clusters.getAllClusters()) {
+             if (c.getElements().size() > 1) {
+                 LOG.info("____________________________________________________________\n");
+                 LOG.info(c.getElementsAsString());
+                 LOG.info("____________________________________________________________\n\n");
+             }
+         }
+     }
+
+     /** Logs every level in insertion order. */
+     public void printAll() {
+         for (Integer level : entryMap.keySet()) {
+             print(level);
+         }
+     }
+
+     /**
+      * Replaces clusters in the specified level. If level doesn't exist it will
+      * be created.
+      *
+      * @param level
+      *            dendrogram level.
+      * @param label
+      *            level description.
+      * @param clusters
+      *            clusters for the level; copies of them are stored.
+      */
+     public void setLevel(int level, String label, Collection<Cluster> clusters) {
+         ClusterSet clusterSet = new ClusterSet();
+         for (Cluster c : clusters) {
+             clusterSet.add(c.copy());
+         }
+         LOG.fine("Setting cluster level: " + level);
+         entryMap.put(level, clusterSet);
+         levelLabels.put(level, label);
+         if (level >= nextLevel) {
+             nextLevel = level + 1;
+         }
+     }
+}
\ No newline at end of file
diff --git a/src/org/yooreeka/algos/clustering/hierarchical/MST.java b/src/org/yooreeka/algos/clustering/hierarchical/MST.java
new file mode 100644
index 0000000..6b78307
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/hierarchical/MST.java
@@ -0,0 +1,130 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.hierarchical;
+
+/**
+ * Basic implementation of Prim's algorithm to build Minimal Spanning Tree
+ * (MST).
+ *
+ */
+public class MST {
+
+ static class Edge {
+     // Edge between nodes i and j with weight w; needs no MST instance.
+     private final int i;
+     private final int j;
+     private final double w;
+
+     Edge(int i, int j, double w) {
+         this.i = i;
+         this.j = j;
+         this.w = w;
+     }
+
+     public int getI() {
+         return i;
+     }
+
+     public int getJ() {
+         return j;
+     }
+
+     public double getW() {
+         return w;
+     }
+
+ }
+
+ /** The adjacency matrix of the graph */
+ private double[][] adjM;
+
+ public MST() {
+ }
+
+ public double[][] buildMST(double[][] adjM) {
+     // Builds the MST of the graph given by the adjacency matrix adjM (assumed
+     // square), growing it from node 0. Returns a matrix of the same size in
+     // which -1 marks "no edge" (0 cannot be used: it is a valid distance).
+     this.adjM = adjM;
+     int n = adjM.length;
+     double[][] mst = new double[n][n];
+     if (n == 0) {
+         // Empty graph: there is no node 0 to start from.
+         return mst;
+     }
+     for (int i = 0; i < n; i++) {
+         for (int j = 0; j < n; j++) {
+             mst[i][j] = -1;
+         }
+     }
+     // Marks nodes that belong to MST. Initial MST has only one node.
+     boolean[] allV = new boolean[n];
+     allV[0] = true;
+
+     // Repeatedly attach the closest node that is not in the tree yet.
+     Edge e;
+     while ((e = findMinimumEdge(allV)) != null) {
+         allV[e.getJ()] = true;
+         mst[e.getI()][e.getJ()] = e.getW();
+         mst[e.getJ()][e.getI()] = e.getW();
+     }
+     return mst;
+ }
+
+ private Edge findMinimumEdge(boolean[] mstV) {
+ Edge e = null;
+ double minW = Double.POSITIVE_INFINITY;
+ int minI = -1;
+ int minJ = -1;
+
+ for (int i = 0, n = adjM.length; i < n; i++) {
+ // part of MST
+ if (mstV[i] == true) {
+ for (int j = 0, k = adjM.length; j < k; j++) {
+ // not part of MST
+ if (mstV[j] == false) {
+ if (minW > adjM[i][j]) {
+ minW = adjM[i][j];
+ minI = i;
+ minJ = j;
+ }
+ }
+ }
+ }
+ }
+
+ if (minI > -1) {
+ e = new Edge(minI, minJ, minW);
+ }
+
+ return e;
+ }
+}
diff --git a/src/org/yooreeka/algos/clustering/hierarchical/MSTSingleLinkAlgorithm.java b/src/org/yooreeka/algos/clustering/hierarchical/MSTSingleLinkAlgorithm.java
new file mode 100644
index 0000000..2162623
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/hierarchical/MSTSingleLinkAlgorithm.java
@@ -0,0 +1,142 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.hierarchical;
+
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Single link hierarchical clustering that merges clusters along the edges
+ * of a minimal spanning tree, taking the edges in ascending order of weight.
+ */
+public class MSTSingleLinkAlgorithm {
+
+    private static final Logger LOG = Logger.getLogger(MSTSingleLinkAlgorithm.class.getName());
+
+    public static void main(String[] args) {
+        // Define data
+        DataPoint[] elements = new DataPoint[5];
+        elements[0] = new DataPoint("A", new double[] {});
+        elements[1] = new DataPoint("B", new double[] {});
+        elements[2] = new DataPoint("C", new double[] {});
+        elements[3] = new DataPoint("D", new double[] {});
+        elements[4] = new DataPoint("E", new double[] {});
+
+        double[][] a = new double[][] { { 0, 1, 2, 2, 3 }, { 1, 0, 2, 4, 3 },
+                { 2, 2, 0, 1, 5 }, { 2, 4, 1, 0, 3 }, { 3, 3, 5, 3, 0 } };
+
+        MSTSingleLinkAlgorithm ca = new MSTSingleLinkAlgorithm(elements, a);
+        Dendrogram dnd = ca.cluster();
+        dnd.printAll();
+    }
+
+    /** The points to cluster; element i corresponds to row/column i. */
+    private DataPoint[] elements;
+
+    /** Adjacency (distance) matrix supplied by the caller. */
+    private double[][] a;
+
+    /** MST of <code>a</code>; consumed edges are overwritten with -1. */
+    private double[][] m;
+
+    private ClusterSet allClusters;
+
+    /**
+     * @param elements
+     *            the points to cluster
+     * @param adjacencyMatrix
+     *            pairwise distances, indexed like <code>elements</code>
+     */
+    public MSTSingleLinkAlgorithm(DataPoint[] elements,
+            double[][] adjacencyMatrix) {
+
+        LOG.setLevel(YooreekaConfigurator.getLevel(MSTSingleLinkAlgorithm.class.getName()));
+
+        this.elements = elements;
+        this.a = adjacencyMatrix;
+        this.allClusters = new ClusterSet();
+    }
+
+    /**
+     * Runs the clustering. Level 0 of the dendrogram holds every element as
+     * its own cluster. Each merge either opens a new level, or replaces the
+     * latest level when it happened at the same distance as the previous
+     * merge.
+     *
+     * @return the dendrogram built from the merges. When the spanning tree
+     *         runs out of edges before everything is merged (disconnected
+     *         graph), the dendrogram ends with the clusters reached so far.
+     */
+    public Dendrogram cluster() {
+
+        m = (new MST()).buildMST(a);
+
+        // Start from a clean set so that a repeated call does not mix the
+        // clusters of the previous run into this one.
+        allClusters = new ClusterSet();
+
+        Dendrogram dnd = new Dendrogram("Distance");
+        double d = 0.0;
+
+        // initially load all elements as individual clusters
+        for (DataPoint e : elements) {
+            Cluster c = new Cluster(e);
+            allClusters.add(c);
+        }
+
+        int lastDndLevel = dnd.addLevel(String.valueOf(d),
+                allClusters.getAllClusters());
+
+        double previousD = d;
+
+        while (allClusters.size() > 1) {
+            d = mergeTwoClosestClusters();
+            if (Double.isNaN(d)) {
+                // No edge is left in the MST, so nothing can be merged any
+                // more. Looping on would never reduce the cluster count.
+                break;
+            }
+            if (previousD == d) {
+                dnd.setLevel(lastDndLevel, String.valueOf(d),
+                        allClusters.getAllClusters());
+            } else {
+                lastDndLevel = dnd.addLevel(String.valueOf(d),
+                        allClusters.getAllClusters());
+            }
+            previousD = d;
+        }
+
+        return dnd;
+    }
+
+    /**
+     * Picks the lightest edge still present in the MST, merges the clusters
+     * that hold its two end points and removes the edge from the MST.
+     *
+     * @return the weight of the edge that was used, or NaN when the MST has
+     *         no edge left
+     */
+    private double mergeTwoClosestClusters() {
+        int minI = -1;
+        int minJ = -1;
+        double minWeight = Double.POSITIVE_INFINITY;
+
+        for (int i = 0, n = m.length; i < n; i++) {
+            for (int j = 0; j < n; j++) {
+                // negative entries mark "no edge"
+                if (m[i][j] >= 0 && minWeight > m[i][j]) {
+                    minI = i;
+                    minJ = j;
+                    minWeight = m[i][j];
+                }
+            }
+        }
+
+        if (minI < 0) {
+            return Double.NaN;
+        }
+
+        Cluster c1 = allClusters.findClusterByElement(elements[minI]);
+        Cluster c2 = allClusters.findClusterByElement(elements[minJ]);
+        allClusters.remove(c1);
+        allClusters.remove(c2);
+        allClusters.add(new Cluster(c1, c2));
+
+        // remove link. Using -1 because 0 is a valid distance.
+        m[minI][minJ] = -1;
+        m[minJ][minI] = -1;
+
+        return minWeight;
+    }
+}
diff --git a/src/org/yooreeka/algos/clustering/hierarchical/SingleLinkAlgorithm.java b/src/org/yooreeka/algos/clustering/hierarchical/SingleLinkAlgorithm.java
new file mode 100644
index 0000000..a0090d4
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/hierarchical/SingleLinkAlgorithm.java
@@ -0,0 +1,126 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.hierarchical;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/** A hierarchical agglomerative clustering algorithm based on single link */
+public class SingleLinkAlgorithm {
+
+ private static final Logger LOG = Logger.getLogger(SingleLinkAlgorithm.class.getName());
+
+ public static void main(String[] args) {
+ // Define data
+ DataPoint[] elements = new DataPoint[5];
+ elements[0] = new DataPoint("A", new double[] {});
+ elements[1] = new DataPoint("B", new double[] {});
+ elements[2] = new DataPoint("C", new double[] {});
+ elements[3] = new DataPoint("D", new double[] {});
+ elements[4] = new DataPoint("E", new double[] {});
+
+ double[][] a = new double[][] { { 0, 1, 2, 2, 3 }, { 1, 0, 2, 4, 3 },
+ { 2, 2, 0, 1, 5 }, { 2, 4, 1, 0, 3 }, { 3, 3, 5, 3, 0 } };
+
+ SingleLinkAlgorithm ca = new SingleLinkAlgorithm(elements, a);
+ Dendrogram dnd = ca.cluster();
+ dnd.printAll();
+ // dnd.print(3);
+ }
+ private DataPoint[] elements;
+
+ private double[][] a;
+
+ // Hierarchical Agglomerative Algorithm
+ public SingleLinkAlgorithm(DataPoint[] elements, double[][] adjacencyMatrix) {
+
+ LOG.setLevel(YooreekaConfigurator.getLevel(SingleLinkAlgorithm.class.getName()));
+
+ this.elements = elements;
+ this.a = adjacencyMatrix;
+ }
+
+ // Implements Single Link Technique
+ private List buildClusters(double distanceThreshold) {
+ boolean[] usedElementFlags = new boolean[elements.length];
+ List clusters = new ArrayList();
+ for (int i = 0, n = a.length; i < n; i++) {
+ List clusterPoints = new ArrayList();
+ for (int j = i, k = a.length; j < k; j++) {
+ if (a[i][j] <= distanceThreshold
+ && usedElementFlags[j] == false) {
+ clusterPoints.add(elements[j]);
+ usedElementFlags[j] = true;
+ }
+ }
+ if (clusterPoints.size() > 0) {
+ Cluster c = new Cluster(clusterPoints);
+ clusters.add(c);
+ }
+ }
+ return clusters;
+ }
+
+ public Dendrogram cluster() {
+ Dendrogram dnd = new Dendrogram("Distance");
+ double d = 0;
+
+ // initially load all elements as individual clusters
+ List initialClusters = new ArrayList();
+ for (DataPoint e : elements) {
+ Cluster c = new Cluster(e);
+ initialClusters.add(c);
+ }
+
+ dnd.addLevel(String.valueOf(d), initialClusters);
+
+ d = 1.0;
+
+ int k = initialClusters.size();
+
+ while (k > 1) {
+ int oldK = k;
+ List clusters = buildClusters(d);
+ k = clusters.size();
+ if (oldK != k) {
+ dnd.addLevel(String.valueOf(d), clusters);
+ }
+
+ d = d + 1;
+ }
+ return dnd;
+ }
+}
diff --git a/src/org/yooreeka/algos/clustering/model/Attribute.java b/src/org/yooreeka/algos/clustering/model/Attribute.java
new file mode 100644
index 0000000..375f3f1
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/model/Attribute.java
@@ -0,0 +1,119 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.model;
+
+/**
+ * Attribute for text or numeric values.
+ */
+public class Attribute {
+
+ private String name;
+ private Object value;
+
+ public Attribute(String name, Double numericValue) {
+ init(name, numericValue);
+ }
+
+ public Attribute(String name, String textValue) {
+ init(name, textValue);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ final Attribute other = (Attribute) obj;
+ if (name == null) {
+ if (other.name != null)
+ return false;
+ } else if (!name.equals(other.name))
+ return false;
+ if (value == null) {
+ if (other.value != null)
+ return false;
+ } else if (!value.equals(other.value))
+ return false;
+ return true;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public Double getNumericValue() {
+ return (Double) value;
+ }
+
+ public String getTextValue() {
+ return (String) value;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((name == null) ? 0 : name.hashCode());
+ result = prime * result + ((value == null) ? 0 : value.hashCode());
+ return result;
+ }
+
+ private void init(String name, Object value) {
+ this.name = name;
+ this.value = value;
+ }
+
+ public boolean isNumeric() {
+ if (value instanceof java.lang.Double) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ public boolean isText() {
+ if (value instanceof java.lang.String) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "[name=" + this.name + ", value=" + value + ", isText="
+ + this.isText() + ", isNumeric=" + this.isNumeric() + "]";
+ }
+
+}
diff --git a/src/org/yooreeka/algos/clustering/model/Cluster.java b/src/org/yooreeka/algos/clustering/model/Cluster.java
new file mode 100644
index 0000000..5038059
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/model/Cluster.java
@@ -0,0 +1,197 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.model;
+
+import java.util.Collection;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+/**
+ * Group of data points.
+ */
+public class Cluster {
+
+    private String label;
+
+    /** Members of the cluster, kept in the order in which they were added. */
+    private Set<DataPoint> elements;
+
+    /** Empty cluster with no elements. */
+    public Cluster() {
+        init("");
+    }
+
+    /** New cluster that contains all elements from provided clusters. */
+    public Cluster(Cluster c1, Cluster c2) {
+        init("");
+        add(c1);
+        add(c2);
+    }
+
+    /** New unlabeled cluster that contains the provided elements. */
+    public Cluster(Collection<DataPoint> elements) {
+        init("");
+        for (DataPoint e : elements) {
+            add(e);
+        }
+    }
+
+    /** New unlabeled cluster that contains a single element. */
+    public Cluster(DataPoint element) {
+        init("");
+        add(element);
+    }
+
+    /** Empty cluster with the given label. */
+    public Cluster(String label) {
+        init(label);
+    }
+
+    /** New cluster with the given label that contains the provided elements. */
+    public Cluster(String label, Collection<DataPoint> elements) {
+        init(label);
+        for (DataPoint e : elements) {
+            add(e);
+        }
+    }
+
+    /**
+     * Modifies existing cluster by adding all elements from provided cluster.
+     *
+     * @param c
+     *            the cluster whose elements are added; it is not modified
+     */
+    public void add(Cluster c) {
+        elements.addAll(c.getElements());
+    }
+
+    /**
+     * Modifies existing cluster by adding a new element.
+     *
+     * @param e
+     *            the element to add; adding an element that is already
+     *            present has no effect
+     */
+    public void add(DataPoint e) {
+        elements.add(e);
+    }
+
+    /**
+     * @return true when every element of <code>c</code> is also an element
+     *         of this cluster (always true for an empty <code>c</code>)
+     */
+    public boolean contains(Cluster c) {
+        for (DataPoint e : c.getElements()) {
+            if (!contains(e)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    public boolean contains(DataPoint e) {
+        return elements.contains(e);
+    }
+
+    /**
+     * @return a new cluster with the same elements. The label is not carried
+     *         over; the copy has an empty label.
+     */
+    public Cluster copy() {
+        Cluster copy = new Cluster();
+        for (DataPoint e : this.getElements()) {
+            // DataPoint is immutable. No need to create a copy.
+            copy.add(e);
+        }
+        return copy;
+    }
+
+    /**
+     * Clusters are equal when they hold equal sets of elements. The label
+     * does not take part in the comparison.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        final Cluster other = (Cluster) obj;
+        if (elements == null) {
+            if (other.elements != null)
+                return false;
+        } else if (!elements.equals(other.elements))
+            return false;
+        return true;
+    }
+
+    /**
+     * Returns number of attributes used to define points in the cluster.
+     * The count is taken from the first element; an empty cluster reports 0.
+     */
+    public int getDimensionCount() {
+        if (elements == null || elements.isEmpty()) {
+            return 0;
+        }
+
+        return elements.iterator().next().getAttributeCount();
+    }
+
+    /**
+     * @return a copy of the element set; changes to it do not affect the
+     *         cluster
+     */
+    public Set<DataPoint> getElements() {
+        return new LinkedHashSet<DataPoint>(elements);
+    }
+
+    /**
+     * @return the labels of all elements wrapped in curly braces, with
+     *         consecutive labels separated by a comma and a line break
+     */
+    public String getElementsAsString() {
+        // StringBuilder: the buffer never leaves this method, so the
+        // synchronization of StringBuffer buys nothing.
+        StringBuilder buf = new StringBuilder("{");
+        for (DataPoint e : elements) {
+            if (buf.length() > 1) {
+                buf.append(",\n");
+            }
+            buf.append(e.getLabel());
+        }
+        buf.append("}");
+
+        return buf.toString();
+    }
+
+    public String getLabel() {
+        return label;
+    }
+
+    @Override
+    public int hashCode() {
+        final int prime = 31;
+        int result = 1;
+        result = prime * result
+                + ((elements == null) ? 0 : elements.hashCode());
+        return result;
+    }
+
+    private void init(String label) {
+        this.label = label;
+        elements = new LinkedHashSet<DataPoint>();
+    }
+
+    public int size() {
+        return elements.size();
+    }
+
+    @Override
+    public String toString() {
+        return getElementsAsString();
+    }
+
+}
diff --git a/src/org/yooreeka/algos/clustering/model/DataPoint.java b/src/org/yooreeka/algos/clustering/model/DataPoint.java
new file mode 100644
index 0000000..7ccadc6
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/model/DataPoint.java
@@ -0,0 +1,181 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.model;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.yooreeka.algos.clustering.utils.Attributes;
+import org.yooreeka.util.metrics.EuclideanDistance;
+
+/**
+ * A Thing to be clustered. Defined by a set of attributes.
+ */
+public class DataPoint {
+
+    /**
+     * Descriptive label or name. We also use it as unique ID for the instance.
+     */
+    private String label;
+
+    /**
+     * Collection of attributes that define this point.
+     */
+    private Attribute[] attributes;
+
+    /*
+     * Values derived from attributes. The numeric array is only set when all
+     * attributes are numeric and the text array only when all are text;
+     * otherwise the respective field is null (see init).
+     */
+    private String[] attributeNames;
+    private double[] numericAttributeValues;
+    private String[] textAttributeValues;
+
+    public DataPoint(String label, Attribute[] attributes) {
+        init(label, attributes);
+    }
+
+    /**
+     * Creates a new point with numerical attributes. Attribute names are
+     * auto-generated.
+     */
+    public DataPoint(String label, double[] attrValues) {
+        // create attributes with auto-generated names
+        init(label, Attributes.createAttributes(attrValues));
+    }
+
+    /**
+     * Creates a new point with text attributes. Attribute names are
+     * auto-generated.
+     */
+    public DataPoint(String label, String[] attrValues) {
+        // create attributes with auto-generated names
+        init(label, Attributes.createAttributes(attrValues));
+    }
+
+    /**
+     * Points are equal when both their labels and their attribute arrays
+     * are equal.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        final DataPoint other = (DataPoint) obj;
+        if (!Arrays.equals(attributes, other.attributes))
+            return false;
+        if (label == null) {
+            if (other.label != null)
+                return false;
+        } else if (!label.equals(other.label))
+            return false;
+        return true;
+    }
+
+    /**
+     * @return the number of attributes of this point
+     */
+    public int getAttributeCount() {
+        // Points with text or mixed attributes have no numeric value array,
+        // so fall back to the attribute array instead of failing with a
+        // NullPointerException.
+        if (numericAttributeValues != null) {
+            return numericAttributeValues.length;
+        }
+        return attributes.length;
+    }
+
+    public String[] getAttributeNames() {
+        return attributeNames;
+    }
+
+    public Attribute[] getAttributes() {
+        return attributes;
+    }
+
+    public String getLabel() {
+        return label;
+    }
+
+    /**
+     * @return the numeric attribute values, or null when at least one
+     *         attribute is not numeric
+     */
+    public double[] getNumericAttrValues() {
+        return numericAttributeValues;
+    }
+
+    /**
+     * @return the distance, as computed by EuclideanDistance, between the
+     *         origin and the numeric attribute values of this point.
+     *         NOTE(review): only meaningful for all-numeric points; the
+     *         numeric value array is null otherwise.
+     */
+    public double getR() {
+
+        EuclideanDistance euclid = new EuclideanDistance();
+
+        // a freshly allocated double array is already filled with zeros
+        double[] origin = new double[attributes.length];
+
+        return euclid.getDistance(origin, this.numericAttributeValues);
+    }
+
+    /**
+     * @return the text attribute values, or null when at least one
+     *         attribute is not text
+     */
+    public String[] getTextAttrValues() {
+        return textAttributeValues;
+    }
+
+    @Override
+    public int hashCode() {
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + Arrays.hashCode(attributes);
+        result = prime * result + ((label == null) ? 0 : label.hashCode());
+        return result;
+    }
+
+    private void init(String label, Attribute[] attributes) {
+        this.label = label;
+        this.attributes = attributes;
+        this.attributeNames = Attributes.getNames(attributes);
+        if (Attributes.allText(attributes)) {
+            this.textAttributeValues = Attributes.getTextValues(attributes);
+        } else {
+            this.textAttributeValues = null;
+        }
+        if (Attributes.allNumeric(attributes)) {
+            this.numericAttributeValues = Attributes
+                    .getNumericValues(attributes);
+        } else {
+            this.numericAttributeValues = null;
+        }
+    }
+
+    /**
+     * @return the label followed by the plain attribute values, without the
+     *         attribute names
+     */
+    public String toShortString() {
+        List<String> attrValues = new ArrayList<String>();
+        for (Attribute a : attributes) {
+            if (a.isNumeric()) {
+                attrValues.add(String.valueOf(a.getNumericValue()));
+            } else {
+                attrValues.add(a.getTextValue());
+            }
+        }
+        return label + "(" + attrValues.toString() + ")";
+    }
+
+    @Override
+    public String toString() {
+        return label + "(" + Arrays.toString(attributes) + ")";
+    }
+
+}
diff --git a/src/org/yooreeka/algos/clustering/partitional/KMeansAlgorithm.java b/src/org/yooreeka/algos/clustering/partitional/KMeansAlgorithm.java
new file mode 100644
index 0000000..43b07c1
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/partitional/KMeansAlgorithm.java
@@ -0,0 +1,306 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.partitional;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.algos.clustering.utils.Attributes;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * K-means clustering: points are assigned to the closest centroid and the
+ * centroids are recomputed as the mean of their points, until no centroid
+ * changes any more.
+ */
+public class KMeansAlgorithm {
+
+    private static final Logger LOG = Logger.getLogger(KMeansAlgorithm.class.getName());
+
+    public static void main(String[] args) {
+
+        DataPoint[] dataPoints = new DataPoint[] {
+                new DataPoint("2", new double[] { 2.0 }),
+                new DataPoint("4", new double[] { 4.0 }),
+                new DataPoint("10", new double[] { 10.0 }),
+                new DataPoint("12", new double[] { 12.0 }),
+                new DataPoint("3", new double[] { 3.0 }),
+                new DataPoint("20", new double[] { 20.0 }),
+                new DataPoint("30", new double[] { 30.0 }),
+                new DataPoint("11", new double[] { 11.0 }),
+                new DataPoint("25", new double[] { 25.0 }) };
+
+        DataPoint[] clusterMeans = new DataPoint[] {
+                new DataPoint("Mean-2", new double[] { 2.0 }),
+                new DataPoint("Mean-4", new double[] { 4.0 }) };
+
+        KMeansAlgorithm kmeans = new KMeansAlgorithm(clusterMeans, dataPoints);
+        kmeans.cluster();
+
+        kmeans.print();
+
+    }
+
+    /**
+     * Picks k distinct data points at random and turns each of them into an
+     * initial centroid.
+     *
+     * @param k
+     *            number of centroids to pick
+     * @param data
+     *            the points to pick from
+     * @return k new points labeled "Mean-i(label of the picked point)" that
+     *         carry the attribute names and numeric values of the picked
+     *         points
+     * @throws IllegalArgumentException
+     *             when k is larger than the number of data points
+     */
+    public static DataPoint[] pickInitialCentroids(int k, DataPoint[] data) {
+
+        // There are only data.length distinct indices. Asking for more would
+        // keep the selection loop below spinning forever.
+        if (k > data.length) {
+            throw new IllegalArgumentException("Cannot pick " + k
+                    + " distinct centroids from " + data.length
+                    + " data points");
+        }
+
+        Random randGen = new Random();
+        DataPoint[] centroids = new DataPoint[k];
+
+        // Calculate random mean values for each cluster based on the data
+        /**
+         * TODO: 4.2 -- Selecting the means for seeding
+         *
+         * In large datasets, the selection of the initial centroids can be
+         * important from a computational (time) complexity perspective.
+         *
+         * In general, how can we improve the seeding of the initial mean
+         * values? For example, consider the following heuristic:
+         *
+         * 1. pick randomly one node 2. calculate the distance between that node
+         * and O (10*k) other nodes 3. sort the list of nodes according to their
+         * distance from the first node 4. pick every 10th node in the sequence
+         * 5. calculate the mean distance between each one of these nodes and
+         * the original node
+         *
+         * This algorithmic choice is as ad hoc as they come, however, it does
+         * have some key principles embedded in it? What are these principles?
+         * How can you generalize this algorithm?
+         *
+         * Discuss advantages/disadvantages of the initial seeding with your
+         * friends.
+         *
+         */
+        Set<Integer> previouslyUsedIds = new HashSet<Integer>();
+        for (int i = 0; i < k; i++) {
+            // pick point index that we haven't used yet
+            int centroidId;
+            do {
+                centroidId = randGen.nextInt(data.length);
+            } while (!previouslyUsedIds.add(centroidId));
+
+            // Create DataPoint that will represent the cluster's centroid.
+            String label = "Mean-" + i + "(" + data[centroidId].getLabel()
+                    + ")";
+            double[] values = data[centroidId].getNumericAttrValues();
+            String[] attrNames = data[centroidId].getAttributeNames();
+            centroids[i] = new DataPoint(label, Attributes.createAttributes(
+                    attrNames, values));
+        }
+
+        return centroids;
+    }
+
+    /** Number of clusters; equals the number of centroids. */
+    private int k;
+
+    private DataPoint[] allCentroids;
+
+    /** Result of the latest run; entries are null until cluster() is called. */
+    private Cluster[] allClusters;
+
+    private DataPoint[] allDataPoints;
+
+    /**
+     * @param initialCentroids
+     *            - starting values for the centroids of each cluster.
+     * @param dataPoints
+     *            - the points to cluster.
+     */
+    public KMeansAlgorithm(DataPoint[] initialCentroids, DataPoint[] dataPoints) {
+        init(initialCentroids, dataPoints);
+    }
+
+    /**
+     *
+     * @param k
+     *            - desired number of clusters.
+     * @param dataPoints
+     *            - the points to cluster; the initial centroids are picked
+     *            at random among them.
+     */
+    public KMeansAlgorithm(int k, DataPoint[] dataPoints) {
+        DataPoint[] initialCentroids = KMeansAlgorithm.pickInitialCentroids(k,
+                dataPoints);
+        init(initialCentroids, dataPoints);
+    }
+
+    /**
+     * Runs the algorithm until the centroids stop changing. The outcome is
+     * available through getAllClusters() and getAllCentroids().
+     */
+    public void cluster() {
+
+        boolean centroidsChanged = true;
+
+        while (centroidsChanged) {
+            // Create a set points for each cluster
+            List<Set<DataPoint>> clusters = new ArrayList<Set<DataPoint>>(k);
+            for (int i = 0; i < k; i++) {
+                clusters.add(new HashSet<DataPoint>());
+            }
+
+            // Assign points to each set based on minimum distance from the
+            // centroids
+            for (DataPoint p : allDataPoints) {
+                int i = findClosestCentroid(allCentroids, p);
+                clusters.get(i).add(p);
+            }
+
+            for (int i = 0; i < k; i++) {
+                allClusters[i] = new Cluster(clusters.get(i));
+            }
+
+            // Calculate new cluster centroids, and
+            // check if any of the centroids has changed
+            centroidsChanged = false;
+            for (int i = 0; i < allClusters.length; i++) {
+                if (clusters.get(i).isEmpty()) {
+                    // keep mean unchanged if cluster has no elements.
+                    continue;
+                }
+                double[] newCentroidValues = findCentroid(allClusters[i]);
+                double[] oldCentroidValues = allCentroids[i]
+                        .getNumericAttrValues();
+                if (!Arrays.equals(oldCentroidValues, newCentroidValues)) {
+                    allCentroids[i] = new DataPoint(
+                            allCentroids[i].getLabel(), newCentroidValues);
+                    centroidsChanged = true;
+                }
+            }
+        }
+    }
+
+    private double distance(DataPoint x, DataPoint y) {
+        return distance(x.getNumericAttrValues(), y.getNumericAttrValues());
+    }
+
+    /**
+     * @return the Euclidean distance between x and y; y must be at least as
+     *         long as x
+     */
+    private double distance(double[] x, double[] y) {
+        double sumXY2 = 0.0;
+        for (int i = 0, n = x.length; i < n; i++) {
+            sumXY2 += Math.pow(x[i] - y[i], 2);
+        }
+        return Math.sqrt(sumXY2);
+    }
+
+    /**
+     * @return the mean of the numeric attribute values over all points of
+     *         the cluster, or an empty array for an empty cluster
+     */
+    private double[] findCentroid(Cluster c) {
+
+        Set<DataPoint> clusterPoints = c.getElements();
+        int n = clusterPoints.size();
+
+        if (n == 0) {
+            return new double[0];
+        }
+
+        int d = c.getDimensionCount();
+        double[] meanAttributes = new double[d];
+
+        for (DataPoint p : clusterPoints) {
+            double[] pointAttributes = p.getNumericAttrValues();
+            for (int i = 0; i < d; i++) {
+                meanAttributes[i] += pointAttributes[i];
+            }
+        }
+
+        for (int i = 0; i < d; i++) {
+            meanAttributes[i] = meanAttributes[i] / n;
+        }
+
+        return meanAttributes;
+    }
+
+    /**
+     * This method calculates the closest centroid for a given data point
+     *
+     * @param centroids
+     *            the candidate centroids
+     * @param x
+     *            is the <code>DataPoint</code> for which we seek the closest
+     *            centroid
+     * @return the index (from the centroids array) of the closest centroid;
+     *         on a tie the centroid with the lowest index wins. Returns -1
+     *         when no centroid is at a distance below positive infinity,
+     *         for example when <code>centroids</code> is empty.
+     */
+    private int findClosestCentroid(DataPoint[] centroids, DataPoint x) {
+        double minDistance = Double.POSITIVE_INFINITY;
+        int closestCentroid = -1;
+        for (int i = 0, n = centroids.length; i < n; i++) {
+            double d = distance(centroids[i], x);
+            // if the d == minDistance then keep current selection
+            if (d < minDistance) {
+                minDistance = d;
+                closestCentroid = i;
+            }
+
+        }
+        return closestCentroid;
+    }
+
+    public DataPoint[] getAllCentroids() {
+        return this.allCentroids;
+    }
+
+    public Cluster[] getAllClusters() {
+        return this.allClusters;
+    }
+
+    public int getK() {
+        return this.k;
+    }
+
+    private void init(DataPoint[] initialCentroids, DataPoint[] dataPoints) {
+
+        LOG.setLevel(YooreekaConfigurator.getLevel(KMeansAlgorithm.class.getName()));
+
+        this.k = initialCentroids.length;
+        this.allDataPoints = dataPoints;
+        this.allCentroids = initialCentroids;
+        this.allClusters = new Cluster[k];
+    }
+
+    /** Prints the elements of every cluster to standard output. */
+    public void print() {
+        // show results
+        Cluster[] clusters = this.getAllClusters();
+
+        System.out.println("Clusters:");
+        for (Cluster c : clusters) {
+            System.out.println(c.getElementsAsString());
+        }
+    }
+
+    /** Prints the clusters followed by the cluster means to standard output. */
+    public void printAll() {
+
+        Cluster[] clusters = this.getAllClusters();
+        System.out.println("Clusters:");
+        for (Cluster c : clusters) {
+            System.out.println(c.getElementsAsString());
+        }
+        System.out
+                .println("___________________________________________________");
+        DataPoint[] means = this.getAllCentroids();
+        System.out.println("Cluster means:");
+        for (DataPoint p : means) {
+            System.out.println(p.toString());
+        }
+    }
+
+    /** Prints the cluster means to standard output. */
+    public void printMeans() {
+        System.out.println("Cluster means:");
+        for (DataPoint mean : this.allCentroids) {
+            System.out.println(mean);
+        }
+    }
+}
diff --git a/src/org/yooreeka/algos/clustering/partitional/NearestNeighborAlgorithm.java b/src/org/yooreeka/algos/clustering/partitional/NearestNeighborAlgorithm.java
new file mode 100644
index 0000000..b1a67f6
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/partitional/NearestNeighborAlgorithm.java
@@ -0,0 +1,230 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.partitional;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.algos.clustering.utils.ObjectToIndexMapping;
+import org.yooreeka.config.YooreekaConfigurator;
+import org.yooreeka.util.metrics.NumericDistance;
+import org.yooreeka.util.metrics.EuclideanDistance;
+
+ /**
+ * Threshold-based nearest-neighbor clustering. Points are visited in array order;
+ * each joins the closest existing cluster when that distance is at most the
+ * threshold, otherwise it starts a new cluster.
+ */
+ public class NearestNeighborAlgorithm {
+
+ private static final Logger LOG = Logger.getLogger(NearestNeighborAlgorithm.class.getName());
+
+ /** Demo run: five points, a hand-written 5x5 distance matrix and threshold 2. */
+ public static void main(String[] args) {
+ DataPoint[] elements = new DataPoint[5];
+ elements[0] = new DataPoint("A", new double[] {});
+ elements[1] = new DataPoint("B", new double[] {});
+ elements[2] = new DataPoint("C", new double[] {});
+ elements[3] = new DataPoint("D", new double[] {});
+ elements[4] = new DataPoint("E", new double[] {});
+
+ double[][] a = new double[][] { { 0, 1, 2, 2, 3 }, { 1, 0, 2, 4, 3 },
+ { 2, 2, 0, 1, 5 }, { 2, 4, 1, 0, 3 }, { 3, 3, 5, 3, 0 } };
+
+ double threshold = 2;
+ NearestNeighborAlgorithm nn = new NearestNeighborAlgorithm(elements, a, threshold);
+ nn.run();
+ }
+
+ /** All elements for clustering. */
+ private DataPoint[] allDataPoints;
+
+ /** Matrix with distances between elements; run() computes it when none was supplied. */
+ private double[][] a;
+
+ /**
+ * Threshold value that is used to determine if elements will be added to
+ * one of the existing clusters or if a new cluster will be created.
+ */
+ private double t = 0.5;
+
+ /** List of clusters. */
+ private List<Cluster> allClusters;
+
+ /** Distance metric that will be used to calculate distance between elements. */
+ private NumericDistance dist = new EuclideanDistance();
+
+ /** DataPoint -> Index mapping. Used to access data in distance matrix. */
+ ObjectToIndexMapping idxMapping = null;
+
+ /** When true, run() prints the resulting clusters to stdout. */
+ private boolean verbose = true;
+
+ /**
+ * Creates an instance without a distance matrix; run() then computes one from
+ * the points' numeric attributes using the configured metric.
+ */
+ public NearestNeighborAlgorithm(DataPoint[] dataPoints, double t) {
+ this(dataPoints, null, t);
+ }
+
+ /**
+ * @param dataPoints
+ * elements to cluster. Element order should correspond to
+ * elements in distance matrix. May be null, in which case
+ * run() does nothing.
+ * @param a
+ * matrix showing distance between elements. Can be null.
+ * @param t
+ * threshold value for new cluster creation.
+ */
+ public NearestNeighborAlgorithm(DataPoint[] dataPoints, double[][] a, double t) {
+ LOG.setLevel(YooreekaConfigurator.getLevel(NearestNeighborAlgorithm.class.getName()));
+
+ this.t = t;
+ this.allDataPoints = dataPoints;
+ this.a = a;
+ this.allClusters = new ArrayList<Cluster>();
+
+ // Create DataPoint -> Index mapping for all data points, in array order.
+ idxMapping = new ObjectToIndexMapping();
+ if (dataPoints != null) {
+ // run() tolerates a null array, so construction must not throw on it either
+ for (DataPoint point : dataPoints) {
+ idxMapping.getIndex(point);
+ }
+ }
+ }
+
+ /** Adds x to the nearest cluster within the threshold, or starts a new cluster with it. */
+ private void assignPointToCluster(DataPoint x) {
+ /* find min distance between current point and all clusters */
+ double minNNDist = Double.POSITIVE_INFINITY;
+ Cluster closestCluster = null;
+ for (Cluster c : allClusters) {
+ double nnDist = getNNDistance(c, x);
+ if (nnDist < minNNDist) {
+ minNNDist = nnDist;
+ closestCluster = c;
+ }
+ }
+
+ /*
+ * Assign point to cluster based on calculated distance and threshold.
+ * The null check keeps an infinite threshold from dereferencing a
+ * cluster that was never selected (no clusters exist yet).
+ */
+ if (closestCluster != null && minNNDist <= t) {
+ closestCluster.add(x);
+ } else {
+ /* No cluster is close enough - create a new cluster. */
+ Cluster newCluster = new Cluster();
+ newCluster.add(x);
+ allClusters.add(newCluster);
+ }
+ }
+
+ /** Fills the symmetric matrix a with the pairwise distances of the points' numeric attributes. */
+ private void calculateDistanceMatrix() {
+ int n = allDataPoints.length;
+ a = new double[n][n];
+ for (int i = 0; i < n; i++) {
+ DataPoint x = allDataPoints[i];
+ for (int j = i + 1; j < n; j++) {
+ DataPoint y = allDataPoints[j];
+ a[i][j] = dist.getDistance(x.getNumericAttrValues(), y.getNumericAttrValues());
+ a[j][i] = a[i][j];
+ }
+ a[i][i] = 0.0;
+ }
+ }
+
+ /** @return the live list of clusters built so far (empty until run() is called). */
+ public List<Cluster> getAllClusters() {
+ return allClusters;
+ }
+
+ /**
+ * Calculates distance between cluster and element using Nearest Neighbor
+ * approach: 0 if x is already in c, otherwise the smallest matrix distance
+ * from x to any element of c (positive infinity for an empty cluster).
+ */
+ private double getNNDistance(Cluster c, DataPoint x) {
+ double nnDist = Double.POSITIVE_INFINITY;
+ if (c.contains(x)) {
+ nnDist = 0.0;
+ } else {
+ int i = idxMapping.getIndex(x);
+ for (DataPoint y : c.getElements()) {
+ int j = idxMapping.getIndex(y);
+ double xyDist = a[i][j];
+ nnDist = Math.min(nnDist, xyDist);
+ }
+ }
+ return nnDist;
+ }
+
+ /** Prints the threshold and the elements of every cluster to stdout. */
+ private void printResults() {
+ System.out.println("Nearest Neighbor Clustering with t = " + t);
+ System.out.println("Clusters:");
+ for (Cluster c : allClusters) {
+ System.out.println(c.getElementsAsString());
+ }
+ }
+
+ /** Clusters all data points; does nothing when there are no points. */
+ public void run() {
+ if (allDataPoints == null || allDataPoints.length == 0) {
+ return;
+ }
+
+ if (a == null) {
+ calculateDistanceMatrix();
+ }
+
+ for (int i = 0, n = allDataPoints.length; i < n; i++) {
+ assignPointToCluster(allDataPoints[i]);
+ }
+
+ if (verbose) {
+ printResults();
+ }
+ }
+
+ /** Sets the metric used by run() when it has to build the distance matrix itself. */
+ public void setDistance(NumericDistance dist) {
+ this.dist = dist;
+ }
+ }
diff --git a/src/org/yooreeka/algos/clustering/rock/LinkMatrix.java b/src/org/yooreeka/algos/clustering/rock/LinkMatrix.java
new file mode 100644
index 0000000..4728cca
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/rock/LinkMatrix.java
@@ -0,0 +1,195 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.rock;
+
+import java.util.Arrays;
+import java.util.Set;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.algos.clustering.utils.ObjectToIndexMapping;
+import org.yooreeka.config.YooreekaConfigurator;
+import org.yooreeka.util.metrics.SimilarityMeasure;
+
+/**
+ * Calculates number of links between data points.
+ */
+public class LinkMatrix {
+
+ private static final Logger LOG = Logger.getLogger(LinkMatrix.class.getName());
+
+ private double th;
+ double[][] pointSimilarityMatrix;
+ int[][] pointNeighborMatrix;
+ int[][] pointLinkMatrix;
+ private ObjectToIndexMapping objToIndexMapping;
+
+ public LinkMatrix(DataPoint[] points, double[][] similarityMatrix, double th) {
+ init(points, similarityMatrix, th);
+ }
+
+ public LinkMatrix(DataPoint[] points, SimilarityMeasure pointSim, double th) {
+
+ double[][] similarityMatrix = calculatePointSimilarities(points,
+ pointSim);
+ init(points, similarityMatrix, th);
+ }
+
+ /*
+ * Calculates similarity matrix for all points.
+ */
+ private double[][] calculatePointSimilarities(DataPoint[] points,
+ SimilarityMeasure pointSim) {
+
+ int n = points.length;
+ double[][] simMatrix = new double[n][n];
+ for (int i = 0; i < n; i++) {
+ DataPoint itemX = points[i];
+ String[] attributesX = itemX.getTextAttrValues();
+ for (int j = i + 1; j < n; j++) {
+ DataPoint itemY = points[j];
+ String[] attributesY = itemY.getTextAttrValues();
+ simMatrix[i][j] = pointSim.similarity(attributesX, attributesY);
+ simMatrix[j][i] = simMatrix[i][j];
+ }
+ simMatrix[i][i] = 1.0;
+ }
+
+ return simMatrix;
+ }
+
+ /**
+ * Calculates number of links between two clusters. Number of links between
+ * two clusters is the sum of links between all point pairs( p1, p2) where
+ * p1 belongs to the first cluster and p2 belongs to the other cluster.
+ *
+ * @param clusterX
+ * @param clusterY
+ *
+ * @return link count between two clusters.
+ */
+ public int getLinks(Cluster clusterX, Cluster clusterY) {
+ Set itemsX = clusterX.getElements();
+ Set itemsY = clusterY.getElements();
+
+ int linkSum = 0;
+
+ for (DataPoint x : itemsX) {
+ for (DataPoint y : itemsY) {
+ linkSum += getLinks(x, y);
+ }
+ }
+ return linkSum;
+ }
+
+ public int getLinks(DataPoint p1, DataPoint p2) {
+ int i = objToIndexMapping.getIndex(p1);
+ int j = objToIndexMapping.getIndex(p2);
+ return pointLinkMatrix[i][j];
+ }
+
+ private void init(DataPoint[] points, double[][] similarityMatrix, double th) {
+
+ LOG.setLevel(YooreekaConfigurator.getLevel(LinkMatrix.class.getName()));
+
+ this.th = th;
+
+ objToIndexMapping = new ObjectToIndexMapping();
+
+ // Create DataPoint <-> Index mapping.
+ for (DataPoint point : points) {
+ objToIndexMapping.getIndex(point);
+ }
+
+ pointSimilarityMatrix = similarityMatrix;
+
+ // Identify neighbors: a[i][j] == 1 if (i,j) are neighbors and 0
+ // otherwise.
+ int n = points.length;
+
+ pointNeighborMatrix = new int[n][n];
+ for (int i = 0; i < n; i++) {
+ for (int j = i + 1; j < n; j++) {
+ if (pointSimilarityMatrix[i][j] >= th) {
+ pointNeighborMatrix[i][j] = 1;
+ } else {
+ pointNeighborMatrix[i][j] = 0;
+ }
+ pointNeighborMatrix[j][i] = pointNeighborMatrix[i][j];
+ }
+ pointNeighborMatrix[i][i] = 1;
+ }
+
+ // Calculate number of links between points
+ pointLinkMatrix = new int[n][n];
+ for (int i = 0; i < n; i++) {
+ for (int j = i; j < n; j++) {
+ pointLinkMatrix[i][j] = nLinksBetweenPoints(
+ pointNeighborMatrix, i, j);
+ pointLinkMatrix[j][i] = pointLinkMatrix[i][j];
+ }
+ }
+
+ }
+
+ private int nLinksBetweenPoints(int[][] neighbors, int indexX, int indexY) {
+ int nLinks = 0;
+ for (int i = 0, n = neighbors.length; i < n; i++) {
+ nLinks += neighbors[indexX][i] * neighbors[i][indexY];
+ }
+ return nLinks;
+ }
+
+ public void printPointLinkMatrix() {
+ System.out
+ .println("Point Link matrix (th=" + String.valueOf(th) + "):");
+ for (int i = 0; i < pointLinkMatrix.length; i++) {
+ System.out.println(Arrays.toString(pointLinkMatrix[i]));
+ }
+ }
+
+ public void printPointNeighborMatrix() {
+ System.out.println("Point Neighbor matrix (th=" + String.valueOf(th)
+ + "):");
+ for (int i = 0; i < pointNeighborMatrix.length; i++) {
+ System.out.println(Arrays.toString(pointNeighborMatrix[i]));
+ }
+ }
+
+ public void printSimilarityMatrix() {
+ System.out.println("Point Similarity matrix:");
+ for (int i = 0; i < pointSimilarityMatrix.length; i++) {
+ System.out.println(Arrays.toString(pointSimilarityMatrix[i]));
+ }
+ }
+
+}
diff --git a/src/org/yooreeka/algos/clustering/rock/MergeGoodnessMeasure.java b/src/org/yooreeka/algos/clustering/rock/MergeGoodnessMeasure.java
new file mode 100644
index 0000000..21d217a
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/rock/MergeGoodnessMeasure.java
@@ -0,0 +1,92 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.rock;
+
+/**
+ * Goodness measure for merging two clusters.
+ */
+public class MergeGoodnessMeasure {
+
+ /*
+ * Threshold value that was used to identify neighbors among points.
+ */
+ private double linkThreshold;
+
+ /*
+ * Intermediate value that is used in calculation of goodness measure and
+ * stays the same for different clusters.
+ */
+ private double p;
+
+ public MergeGoodnessMeasure(double th) {
+ this.linkThreshold = th;
+ this.p = 1.0 + 2.0 * f(th);
+ }
+
+ /**
+ * This is just one of the possible implementations.
+ *
+ * @param linkThreshold
+ * threshold value that was used to identify neighbors among
+ * points.
+ */
+ private double f(double th) {
+
+ /*
+ * This implementation assumes that linkThreshold was a threshold for
+ * similarity measure (as opposed to dissimilarity/distance).
+ */
+ return (1.0 - th) / (1.0 + th);
+ }
+
+ public double g(int nLinks, int nX, int nY) {
+ double a = Math.pow(nX + nY, p);
+ double b = Math.pow(nX, p);
+ double c = Math.pow(nY, p);
+
+ return nLinks / (a - b - c);
+ }
+
+ /**
+ * @return the linkThreshold
+ */
+ public double getTh() {
+ return linkThreshold;
+ }
+
+ /**
+ * @param linkThreshold
+ * the linkThreshold to set
+ */
+ public void setTh(double th) {
+ this.linkThreshold = th;
+ }
+}
diff --git a/src/org/yooreeka/algos/clustering/rock/ROCKAlgorithm.java b/src/org/yooreeka/algos/clustering/rock/ROCKAlgorithm.java
new file mode 100644
index 0000000..2932f0c
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/rock/ROCKAlgorithm.java
@@ -0,0 +1,142 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.rock;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.hierarchical.Dendrogram;
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.config.YooreekaConfigurator;
+import org.yooreeka.util.metrics.JaccardCoefficient;
+import org.yooreeka.util.metrics.SimilarityMeasure;
+
+ /** Agglomerative ROCK clustering of data points with text attributes, driven by link counts. */
+ public class ROCKAlgorithm {
+
+ private static final Logger LOG = Logger.getLogger(ROCKAlgorithm.class.getName());
+
+ /** Demo: clusters four small documents with k = 1 and th = 0.2 and prints the dendrogram. */
+ public static void main(String[] args) {
+ // Define data
+ DataPoint[] elements = new DataPoint[4];
+ elements[0] = new DataPoint("Doc1", new String[] { "book" });
+ elements[1] = new DataPoint("Doc2", new String[] { "water", "sun",
+ "sand", "swim" });
+ elements[2] = new DataPoint("Doc3", new String[] { "water", "sun",
+ "swim", "read" });
+ elements[3] = new DataPoint("Doc4", new String[] { "read", "sand" });
+
+ int k = 1;
+ double th = 0.2;
+ ROCKAlgorithm rock = new ROCKAlgorithm(elements, k, th);
+ Dendrogram dnd = rock.cluster();
+ dnd.printAll();
+ }
+
+ private DataPoint[] points;
+ private int k;
+ private double th;
+ private SimilarityMeasure similarityMeasure;
+ private LinkMatrix linkMatrix;
+
+ /**
+ * Builds the link matrix for the points using the Jaccard coefficient.
+ *
+ * @param points data points to cluster.
+ * @param k desired number of clusters.
+ * @param th threshold value to identify neighbors among points.
+ */
+ public ROCKAlgorithm(DataPoint[] points, int k, double th) {
+ LOG.setLevel(YooreekaConfigurator.getLevel(ROCKAlgorithm.class.getName()));
+ this.points = points;
+ this.k = k;
+ this.th = th;
+ this.similarityMeasure = new JaccardCoefficient();
+ this.linkMatrix = new LinkMatrix(points, similarityMeasure, th);
+ }
+
+ /**
+ * Starts with one cluster per point and repeatedly merges the best pair
+ * until k clusters remain or no linked clusters are left to merge.
+ *
+ * @return dendrogram with one level per merge, labeled with the goodness
+ * of that merge (the initial level is labeled with infinity).
+ */
+ public Dendrogram cluster() {
+ // Create a new cluster out of every point.
+ List<Cluster> initialClusters = new ArrayList<Cluster>();
+ for (int i = 0, n = points.length; i < n; i++) {
+ Cluster cluster = new Cluster(points[i]);
+ initialClusters.add(cluster);
+ }
+ double g = Double.POSITIVE_INFINITY;
+ Dendrogram dnd = new Dendrogram("Goodness");
+ dnd.addLevel(String.valueOf(g), initialClusters);
+
+ MergeGoodnessMeasure goodnessMeasure = new MergeGoodnessMeasure(th);
+
+ ROCKClusters allClusters = new ROCKClusters(initialClusters, linkMatrix, goodnessMeasure);
+
+ int nClusters = allClusters.size();
+ while (nClusters > k) {
+ int nClustersBeforeMerge = nClusters;
+ g = allClusters.mergeBestCandidates();
+ nClusters = allClusters.size();
+ if (nClusters == nClustersBeforeMerge) {
+ // there are no linked clusters to merge
+ break;
+ }
+ dnd.addLevel(String.valueOf(g), allClusters.getAllClusters());
+ }
+
+ System.out.println("Number of clusters: " + allClusters.getAllClusters().size());
+ return dnd;
+ }
+
+ public int getK() {
+ return k;
+ }
+
+ public LinkMatrix getLinkMatrix() {
+ return linkMatrix;
+ }
+
+ public SimilarityMeasure getSimilarityMeasure() {
+ return similarityMeasure;
+ }
+
+ public double getTh() {
+ return th;
+ }
+ }
diff --git a/src/org/yooreeka/algos/clustering/rock/ROCKClusters.java b/src/org/yooreeka/algos/clustering/rock/ROCKClusters.java
new file mode 100644
index 0000000..447cb57
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/rock/ROCKClusters.java
@@ -0,0 +1,205 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.rock;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.model.Cluster;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Set of clusters and link data for ROCK implementation.
+ */
+public class ROCKClusters {
+
+ private static final Logger LOG = Logger.getLogger(ROCKClusters.class.getName());
+
+ /*
+ * Used to assign unique IDs to clusters.
+ */
+ private int nextKey;
+
+ /*
+ * Provides ID -> Cluster mapping.
+ */
+ private Map clusterMap;
+
+ /*
+ * Provides ID -> Similar Clusters mapping.
+ */
+ private Map> similarClustersMap;
+
+ /*
+ * Goodness measure between two clusters. It is used to determine cluster
+ * eligibility for merge.
+ */
+ private MergeGoodnessMeasure goodnessMeasure;
+
+ /*
+ * Links between data points and clusters.
+ */
+ private LinkMatrix linkMatrix;
+
+ public ROCKClusters(List initialClusters, LinkMatrix linkMatrix,
+ MergeGoodnessMeasure goodnessMeasure) {
+
+ LOG.setLevel(YooreekaConfigurator.getLevel(ROCKClusters.class.getName()));
+
+ this.linkMatrix = linkMatrix;
+ clusterMap = new HashMap();
+ nextKey = 0;
+ this.goodnessMeasure = goodnessMeasure;
+
+ for (Cluster c : initialClusters) {
+ addCluster(c);
+ }
+ calculateClusterSimilarities();
+ }
+
+ public int addCluster(Cluster c) {
+ int key = nextKey;
+ clusterMap.put(key, c);
+ nextKey++;
+ return key;
+ }
+
+ public void calculateClusterSimilarities() {
+ similarClustersMap = new HashMap>();
+ for (Integer clusterKey : getAllKeys()) {
+ List similarClusters = new LinkedList();
+ Cluster cluster = getCluster(clusterKey);
+ for (Integer similarClusterKey : getAllKeys()) {
+ if (clusterKey != similarClusterKey) {
+ Cluster similarCluster = getCluster(similarClusterKey);
+ int nLinks = linkMatrix.getLinks(cluster, similarCluster);
+ if (nLinks > 0) {
+ double goodness = goodnessMeasure.g(nLinks,
+ cluster.size(), similarCluster.size());
+ similarClusters.add(new SimilarCluster(
+ similarClusterKey, goodness));
+ }
+ }
+ }
+ setSimilarClusters(clusterKey, similarClusters);
+ }
+ }
+
+ /**
+ * Finds a pair of cluster indexes with the best goodness measure.
+ */
+ public List findBestMergeCandidates() {
+ Integer bestKey = null;
+ SimilarCluster bestSimilarCluster = null;
+ Double bestGoodness = Double.NEGATIVE_INFINITY;
+ for (Map.Entry> e : similarClustersMap
+ .entrySet()) {
+ List similarClusters = e.getValue();
+ if (similarClusters != null && similarClusters.size() > 0) {
+ SimilarCluster topSimilarCluster = similarClusters.get(0);
+ if (topSimilarCluster.getGoodness() > bestGoodness) {
+ bestGoodness = topSimilarCluster.getGoodness();
+ bestKey = e.getKey();
+ bestSimilarCluster = topSimilarCluster;
+ }
+ }
+ }
+ List bestMergeCandidates = new ArrayList();
+ if (bestKey != null) {
+ bestMergeCandidates.add(bestKey);
+ bestMergeCandidates.add(bestSimilarCluster.getClusterKey());
+ }
+ return bestMergeCandidates;
+ }
+
+ public Collection getAllClusters() {
+ return clusterMap.values();
+ }
+
+ public Set getAllKeys() {
+ return new HashSet(clusterMap.keySet());
+ }
+
+ public Cluster getCluster(Integer key) {
+ return clusterMap.get(key);
+ }
+
+ public double mergeBestCandidates() {
+ List mergeCandidates = findBestMergeCandidates();
+
+ double goodness = Double.NaN;
+
+ if (mergeCandidates.size() > 1) {
+
+ Integer key1 = mergeCandidates.get(0);
+ Integer key2 = mergeCandidates.get(1);
+ goodness = similarClustersMap.get(key1).get(0).getGoodness();
+
+ mergeClusters(key1, key2);
+ }
+
+ return goodness;
+ }
+
+ public Integer mergeClusters(Integer key1, Integer key2) {
+
+ Cluster cluster1 = getCluster(key1);
+ Cluster cluster2 = getCluster(key2);
+ Cluster cluster3 = new Cluster(cluster1, cluster2);
+ removeCluster(key1);
+ removeCluster(key2);
+ Integer key3 = addCluster(cluster3);
+
+ calculateClusterSimilarities();
+
+ return key3;
+ }
+
+ public Cluster removeCluster(Integer key) {
+ return clusterMap.remove(key);
+ }
+
+ private void setSimilarClusters(Integer key, List list) {
+ SimilarCluster.sortByGoodness(list);
+ similarClustersMap.put(key, list);
+ }
+
+ public int size() {
+ return clusterMap.size();
+ }
+}
diff --git a/src/org/yooreeka/algos/clustering/rock/SimilarCluster.java b/src/org/yooreeka/algos/clustering/rock/SimilarCluster.java
new file mode 100644
index 0000000..e4ad7dc
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/rock/SimilarCluster.java
@@ -0,0 +1,85 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.rock;
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+ /** A merge candidate: the key of another cluster together with the goodness of merging with it. */
+ public class SimilarCluster {
+ /**
+ * Sorts list by goodness value in descending order. Higher goodness values
+ * will be in the head of the list; NaN values are placed at the tail.
+ *
+ * @param values
+ * list to sort.
+ */
+ public static void sortByGoodness(List<SimilarCluster> values) {
+ Collections.sort(values, new Comparator<SimilarCluster>() {
+
+ public int compare(SimilarCluster f1, SimilarCluster f2) {
+ double g1 = f1.getGoodness();
+ double g2 = f2.getGoodness();
+ // NaN compares false against everything, which would break the
+ // comparator contract; order it explicitly after every number.
+ if (Double.isNaN(g1) || Double.isNaN(g2)) {
+ return Double.isNaN(g1) ? (Double.isNaN(g2) ? 0 : 1) : -1;
+ }
+ return Double.compare(g2, g1); // swapped arguments: descending order
+ }
+ });
+ }
+
+ /** Key of the candidate cluster. */
+ private Integer clusterKey;
+
+ /** Goodness of merging with the candidate cluster. */
+ private Double goodness;
+
+ public SimilarCluster(Integer clusterKey, Double goodness) {
+ this.clusterKey = clusterKey;
+ this.goodness = goodness;
+ }
+
+ public Integer getClusterKey() {
+ return clusterKey;
+ }
+
+ public Double getGoodness() {
+ return goodness;
+ }
+
+ @Override
+ public String toString() {
+ return "[clusterKey=" + this.clusterKey + ",goodness=" + this.goodness + "]";
+ }
+ }
diff --git a/src/org/yooreeka/algos/clustering/test/MyDiggSpaceData.java b/src/org/yooreeka/algos/clustering/test/MyDiggSpaceData.java
new file mode 100644
index 0000000..5489bbd
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/test/MyDiggSpaceData.java
@@ -0,0 +1,125 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.test;
+
+import java.util.List;
+
+import org.yooreeka.algos.clustering.dbscan.DBSCANAlgorithm;
+import org.yooreeka.algos.clustering.hierarchical.Dendrogram;
+import org.yooreeka.algos.clustering.model.Attribute;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.algos.clustering.rock.ROCKAlgorithm;
+import org.yooreeka.algos.clustering.utils.Attributes;
+import org.yooreeka.algos.reco.collab.data.DiggData;
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.algos.reco.content.digg.DiggStoryItem;
+import org.yooreeka.config.YooreekaConfigurator;
+import org.yooreeka.util.metrics.CosineDistance;
+
+public class MyDiggSpaceData {
+
+ private static DataPoint createDataPoint(DiggStoryItem story, int topNTerms) {
+ String storyLabel = String.valueOf(story.getId() + ":"
+ + story.getTitle());
+ String storyText = story.getTitle() + " " + story.getDescription();
+ Content content = new Content(storyLabel, storyText, topNTerms);
+ String[] terms = content.getTerms();
+ // using term as attribute name and value.
+ Attribute[] attributes = Attributes.createAttributes(terms, terms);
+ return new DataPoint(storyLabel, attributes);
+ }
+
+ public static MyDiggSpaceDataset createDataset() {
+ return createDataset(10);
+ }
+
+ public static MyDiggSpaceDataset createDataset(int topNTerms) {
+ DiggData.loadData(YooreekaConfigurator.getHome()
+ + "/data/ch04/ch4_digg_stories.csv");
+
+ List allStories = DiggData.allStories;
+
+ DataPoint[] allDataPoints = new DataPoint[allStories.size()];
+
+ for (int i = 0, n = allDataPoints.length; i < n; i++) {
+ DiggStoryItem story = allStories.get(i);
+ DataPoint di = createDataPoint(story, topNTerms);
+ allDataPoints[i] = di;
+ }
+ return new MyDiggSpaceDataset(allDataPoints);
+ }
+
+ public static MyDiggSpaceDataset createDataset(int topNTerms,
+ List allStories) {
+
+ DataPoint[] allDataPoints = new DataPoint[allStories.size()];
+
+ for (int i = 0, n = allDataPoints.length; i < n; i++) {
+
+ DiggStoryItem story = allStories.get(i);
+ story.print();
+
+ DataPoint di = createDataPoint(story, topNTerms);
+ allDataPoints[i] = di;
+ }
+ return new MyDiggSpaceDataset(allDataPoints);
+ }
+
+ public static void main(String[] args) {
+ // testRockOnDigg();
+ testDBSCAN();
+ }
+
+ private static void testDBSCAN() {
+ MyDiggSpaceDataset ds = MyDiggSpaceData.createDataset(3);
+ double eps = 0.8;
+ int minPts = 2;
+ boolean useTermFreq = true;
+ DBSCANAlgorithm dbscan = new DBSCANAlgorithm(ds.getData(),
+ new CosineDistance(), eps, minPts, useTermFreq);
+
+ dbscan.cluster();
+ // dbscan.printDistances();
+ }
+
+ public static void testRockOnDigg() {
+ MyDiggSpaceDataset ds = MyDiggSpaceData.createDataset(10);
+ ROCKAlgorithm rock = new ROCKAlgorithm(ds.getData(), 4, 0.1);
+ // rock.getLinkMatrix().printSimilarityMatrix();
+ // rock.getLinkMatrix().printPointNeighborMatrix();
+ // rock.getLinkMatrix().printPointLinkMatrix();
+ Dendrogram dnd = rock.cluster();
+ dnd.print(130); // if you get NPE here it means that level doesn't
+ // exist.
+
+ // ROCK stops clustering if there are no links between clusters.
+ }
+}
diff --git a/src/org/yooreeka/algos/clustering/test/MyDiggSpaceDataset.java b/src/org/yooreeka/algos/clustering/test/MyDiggSpaceDataset.java
new file mode 100644
index 0000000..a374d6f
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/test/MyDiggSpaceDataset.java
@@ -0,0 +1,56 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.test;
+
+import org.yooreeka.algos.clustering.model.DataPoint;
+
+public class MyDiggSpaceDataset {
+
+ private DataPoint[] data;
+
+ private boolean verbose = true;
+
+ public MyDiggSpaceDataset(DataPoint[] data) {
+ this.data = data;
+
+ if (verbose) {
+ System.out.println("\nCreated " + this.getClass().getSimpleName()
+ + " dataset with " + data.length + " items:\n");
+ for (DataPoint item : data) {
+ System.out.println(item.toShortString());
+ }
+ }
+ }
+
+ public DataPoint[] getData() {
+ return data;
+ }
+}
diff --git a/src/org/yooreeka/algos/clustering/test/SFData.java b/src/org/yooreeka/algos/clustering/test/SFData.java
new file mode 100644
index 0000000..94e4f42
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/test/SFData.java
@@ -0,0 +1,212 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.test;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.supercsv.io.CsvListReader;
+import org.supercsv.prefs.CsvPreference;
+import org.yooreeka.algos.clustering.model.Attribute;
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.algos.clustering.partitional.NearestNeighborAlgorithm;
+import org.yooreeka.config.YooreekaConfigurator;
+import org.yooreeka.util.metrics.NumericDistance;
+import org.yooreeka.util.metrics.EuclideanDistance;
+
+/**
+ * Loads the "SF" sample data from a CSV file into {@link DataPoint}s and
+ * provides a small driver that runs clustering on it.
+ */
+public class SFData {
+
+    /*
+     * All available attributes.
+     */
+    private static final String[] allAvailableAttributeNames = { "Age",
+            "IncomeRange", "Education", "Skills", "Social", "isPaid" };
+
+    /**
+     * Creates dataset that uses all available attributes.
+     */
+    public static SFDataset createDataset() {
+        return createDataset(allAvailableAttributeNames);
+    }
+
+    /**
+     * Creates dataset that uses only attributes with specified names. Other
+     * attributes will not be loaded.
+     *
+     * @param attrNames
+     *            attribute names to use.
+     * @return dataset that uses only specified attributes.
+     * @throws IllegalArgumentException
+     *             if one of the names is not an available attribute name.
+     */
+    public static SFDataset createDataset(String[] attrNames) {
+
+        // check that attribute names are valid
+        validateAttrNames(attrNames, allAvailableAttributeNames);
+
+        DataPoint[] allData = loadDataFromFile(YooreekaConfigurator.getHome()
+                + "/data/ch04/clusteringSF.dat", attrNames);
+
+        NumericDistance dist = new EuclideanDistance();
+        SFDataset sfDataset = new SFDataset(allData, dist);
+        return sfDataset;
+    }
+
+    /**
+     * Maps every requested attribute name to the position of the CSV header
+     * with the same name (case-insensitive match, first match wins).
+     *
+     * @param attrNames
+     *            requested attribute names.
+     * @param csvHeaders
+     *            headers read from the CSV file.
+     * @return for each attribute name, the index of its column.
+     * @throws IllegalStateException
+     *             if an attribute name has no matching header.
+     */
+    private static int[] resolveFieldIndexes(String[] attrNames,
+            String[] csvHeaders) {
+        int[] attrFieldIndexes = new int[attrNames.length];
+        for (int i = 0; i < attrFieldIndexes.length; i++) {
+            String header = attrNames[i];
+            int csvHeaderId = -1;
+            for (int j = 0; j < csvHeaders.length; j++) {
+                if (header.equalsIgnoreCase(csvHeaders[j])) {
+                    csvHeaderId = j;
+                    break;
+                }
+            }
+            // If there is no header found it means we have wrong attribute
+            // name or wrong file.
+            if (csvHeaderId == -1) {
+                throw new IllegalStateException(
+                        "Attribute name mismatch. "
+                                + "Failed to find attribute name: '"
+                                + header
+                                + "' among cvs file headers. All available headers: "
+                                + Arrays.toString(csvHeaders));
+            }
+            attrFieldIndexes[i] = csvHeaderId;
+        }
+        return attrFieldIndexes;
+    }
+
+    /**
+     * Reads the CSV file and builds one data point per line. The first column
+     * of each line is used as the label; the selected attribute columns are
+     * parsed as doubles.
+     *
+     * @param filename
+     *            CSV file to read.
+     * @param attrNames
+     *            names of the attributes to load.
+     * @return loaded data points in file order.
+     * @throws RuntimeException
+     *             if the file cannot be read or a line cannot be converted.
+     */
+    private static DataPoint[] loadDataFromFile(String filename,
+            String[] attrNames) {
+        // Element types are required: with raw lists neither line.get(..)
+        // nor allData.toArray(..) yields the types used below.
+        List<DataPoint> allData = new ArrayList<DataPoint>();
+        CsvListReader csvReader = null;
+        try {
+            csvReader = new CsvListReader(new BufferedReader(new FileReader(
+                    filename)), CsvPreference.EXCEL_PREFERENCE);
+
+            // Load all available headers from CSV file
+            String[] csvHeaders = csvReader.getCSVHeader(true);
+
+            // Map attribute names to field IDs from CSV file using header names
+            int[] attrFieldIndexes = resolveFieldIndexes(attrNames, csvHeaders);
+
+            // Read file and include only selected attributes
+            List<String> line = null;
+            while ((line = csvReader.read()) != null) {
+                try {
+                    String label = line.get(0);
+                    Attribute[] attributes = new Attribute[attrNames.length];
+                    for (int i = 0, n = attrNames.length; i < n; i++) {
+                        int attrFieldIndex = attrFieldIndexes[i];
+                        String value = line.get(attrFieldIndex);
+                        attributes[i] = new Attribute(attrNames[i],
+                                Double.valueOf(value));
+                    }
+                    DataPoint dataPoint = new DataPoint(label, attributes);
+                    allData.add(dataPoint);
+                } catch (Exception e) {
+                    // Keep the offending line in the message so that bad
+                    // records can be located in the file.
+                    throw new RuntimeException("Error while reading line: '"
+                            + line + "'", e);
+                }
+            }
+
+        } catch (IOException e) {
+            throw new RuntimeException(
+                    "Error while reading SF data from csv file: '" + filename
+                            + "'. ", e);
+        } finally {
+            // Closing is best effort: a failure here must not hide the
+            // outcome of the read itself.
+            try {
+                if (csvReader != null) {
+                    csvReader.close();
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+
+        System.out.println("From file: " + filename);
+        System.out.println("Using attribute names: "
+                + Arrays.toString(attrNames));
+        System.out.println("Loaded " + allData.size() + " data points.");
+
+        return allData.toArray(new DataPoint[allData.size()]);
+    }
+
+    public static void main(String[] args) {
+
+        // Creates dataset that uses all available attributes
+        SFDataset ds = SFData.createDataset();
+
+        // Creates dataset that uses only a subset of available attributes
+        // SFDataset ds = SFData.createDataset(new String[] {"IncomeRange",
+        // "Age"});
+        // SFDataset ds = SFData.createDataset(new String[] {"Age"});
+
+        ds.printDistanceMatrix();
+
+        // Dendrogram dnd = null;
+
+        // Uncomment one of these two run clustering
+
+        // // Run Single Link Clustering
+        // SingleLinkAlgorithm sla = new SingleLinkAlgorithm(ds.getData(),
+        // ds.getDistanceMatrix());
+        // dnd = sla.cluster();
+        // dnd.print();
+
+        // // Run MST Single Link Clustering
+        // MSTSingleLinkAlgorithm msla = new
+        // MSTSingleLinkAlgorithm(ds.getData(), ds.getDistanceMatrix());
+        // dnd = msla.cluster();
+        // dnd.print();
+
+        // // Run Average Link Clustering
+        // AverageLinkAlgorithm ala = new AverageLinkAlgorithm(ds.getData(),
+        // ds.getDistanceMatrix());
+        // dnd = ala.cluster();
+        // dnd.print();
+
+        // double T = 5.0;
+
+        NearestNeighborAlgorithm nna = new NearestNeighborAlgorithm(
+                ds.getData(), ds.getAdjacencyMatrix(), 5.0);
+        nna.run();
+    }
+
+    /**
+     * Verifies that every requested attribute name is one of the valid names
+     * (exact, case-sensitive match).
+     *
+     * @param actualAttrNames
+     *            names to check.
+     * @param validAttrNames
+     *            names that are accepted.
+     * @throws IllegalArgumentException
+     *             if a requested name is not among the valid names.
+     */
+    private static void validateAttrNames(String[] actualAttrNames,
+            String[] validAttrNames) {
+        List<String> validNames = Arrays.asList(validAttrNames);
+        for (String actualAttrName : actualAttrNames) {
+            if (!validNames.contains(actualAttrName)) {
+                // Report the list that was actually checked against, not the
+                // class-level default.
+                throw new IllegalArgumentException("Invalid attribute name: '"
+                        + actualAttrName + "'. " + "Valid names are: "
+                        + Arrays.toString(validAttrNames));
+            }
+        }
+    }
+}
diff --git a/src/org/yooreeka/algos/clustering/test/SFDataset.java b/src/org/yooreeka/algos/clustering/test/SFDataset.java
new file mode 100644
index 0000000..1f65d16
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/test/SFDataset.java
@@ -0,0 +1,93 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.test;
+
+import java.util.Arrays;
+
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.util.metrics.NumericDistance;
+
+/**
+ * Dataset that keeps the data points together with the matrix of pairwise
+ * distances between them. The matrix is computed once, in the constructor,
+ * with the supplied distance implementation.
+ */
+public class SFDataset {
+
+    private DataPoint[] data;
+    private NumericDistance distance;
+    private double[][] adjacencyMatrix;
+
+    /**
+     * Stores the points and the distance and immediately computes the
+     * adjacency matrix.
+     *
+     * @param data
+     *            data points of the dataset.
+     * @param distance
+     *            distance applied to the numeric attribute values of two
+     *            points.
+     */
+    public SFDataset(DataPoint[] data, NumericDistance distance) {
+        this.data = data;
+        this.distance = distance;
+        this.adjacencyMatrix = calculateAdjacencyMatrix();
+    }
+
+    /**
+     * Builds the symmetric matrix of pairwise distances. Only the pairs above
+     * the diagonal are computed; each value is mirrored into the lower half
+     * and the diagonal is set to zero.
+     *
+     * @return square matrix whose element [i][j] is the distance between
+     *         points i and j.
+     */
+    private double[][] calculateAdjacencyMatrix() {
+        final int size = data.length;
+        final double[][] matrix = new double[size][size];
+
+        for (int row = 0; row < size; row++) {
+            DataPoint first = data[row];
+            for (int col = row + 1; col < size; col++) {
+                DataPoint second = data[col];
+                double d = distance.getDistance(first.getNumericAttrValues(),
+                        second.getNumericAttrValues());
+                matrix[row][col] = d;
+                matrix[col][row] = d;
+            }
+            matrix[row][row] = 0.0;
+        }
+
+        return matrix;
+    }
+
+    // We might need to move Matrix related methods to separate class
+    // eventually.
+
+    /**
+     * @return the matrix of pairwise distances computed at construction time.
+     */
+    public double[][] getAdjacencyMatrix() {
+        return adjacencyMatrix;
+    }
+
+    /**
+     * @return the data points of this dataset.
+     */
+    public DataPoint[] getData() {
+        return data;
+    }
+
+    /**
+     * Prints the adjacency matrix to standard output, one row per line.
+     */
+    public void printDistanceMatrix() {
+        for (double[] row : adjacencyMatrix) {
+            System.out.println(Arrays.toString(row));
+        }
+    }
+}
diff --git a/src/org/yooreeka/algos/clustering/utils/Attributes.java b/src/org/yooreeka/algos/clustering/utils/Attributes.java
new file mode 100644
index 0000000..4234fd5
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/utils/Attributes.java
@@ -0,0 +1,143 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.utils;
+
+import org.yooreeka.algos.clustering.model.Attribute;
+
+/*
+ * Utility methods to simplify operations on attributes.
+ */
+public class Attributes {
+
+    /**
+     * Tells whether every attribute in the array is numeric.
+     *
+     * @param attributes
+     *            attributes to inspect.
+     * @return false as soon as one attribute is not numeric; true otherwise
+     *         (including for an empty array).
+     */
+    public static boolean allNumeric(Attribute[] attributes) {
+        for (int idx = 0; idx < attributes.length; idx++) {
+            if (!attributes[idx].isNumeric()) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Tells whether every attribute in the array is a text attribute.
+     *
+     * @param attributes
+     *            attributes to inspect.
+     * @return false as soon as one attribute is not text; true otherwise
+     *         (including for an empty array).
+     */
+    public static boolean allText(Attribute[] attributes) {
+        for (int idx = 0; idx < attributes.length; idx++) {
+            if (!attributes[idx].isText()) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Wraps numeric values into attributes named "a-0", "a-1", and so on.
+     *
+     * @param attrValues
+     *            values to wrap.
+     * @return one attribute per value, in the same order.
+     */
+    public static Attribute[] createAttributes(double[] attrValues) {
+        Attribute[] result = new Attribute[attrValues.length];
+        for (int idx = 0; idx < result.length; idx++) {
+            result[idx] = new Attribute("a-" + idx, attrValues[idx]);
+        }
+        return result;
+    }
+
+    /**
+     * Wraps text values into attributes named "a-0", "a-1", and so on.
+     *
+     * @param attrValues
+     *            values to wrap.
+     * @return one attribute per value, in the same order.
+     */
+    public static Attribute[] createAttributes(String[] attrValues) {
+        Attribute[] result = new Attribute[attrValues.length];
+        for (int idx = 0; idx < result.length; idx++) {
+            result[idx] = new Attribute("a-" + idx, attrValues[idx]);
+        }
+        return result;
+    }
+
+    /**
+     * Pairs names with numeric values position by position. The number of
+     * attributes created equals the number of names.
+     *
+     * @param names
+     *            attribute names.
+     * @param values
+     *            attribute values; read at the same positions as the names.
+     * @return one attribute per name.
+     */
+    public static Attribute[] createAttributes(String[] names, double[] values) {
+        Attribute[] result = new Attribute[names.length];
+        for (int idx = 0; idx < result.length; idx++) {
+            result[idx] = new Attribute(names[idx], values[idx]);
+        }
+        return result;
+    }
+
+    /**
+     * Pairs names with text values position by position. The number of
+     * attributes created equals the number of names.
+     *
+     * @param names
+     *            attribute names.
+     * @param values
+     *            attribute values; read at the same positions as the names.
+     * @return one attribute per name.
+     */
+    public static Attribute[] createAttributes(String[] names, String[] values) {
+        Attribute[] result = new Attribute[names.length];
+        for (int idx = 0; idx < result.length; idx++) {
+            result[idx] = new Attribute(names[idx], values[idx]);
+        }
+        return result;
+    }
+
+    /**
+     * Collects the names of the given attributes.
+     *
+     * @param attributes
+     *            attributes to read.
+     * @return names in the order of the attributes.
+     */
+    public static String[] getNames(Attribute[] attributes) {
+        String[] result = new String[attributes.length];
+        int idx = 0;
+        for (Attribute attr : attributes) {
+            result[idx++] = attr.getName();
+        }
+        return result;
+    }
+
+    /**
+     * Collects the numeric values of the given attributes.
+     *
+     * @param attributes
+     *            attributes to read.
+     * @return numeric values in the order of the attributes.
+     * @throws RuntimeException
+     *             if one of the attributes is not numeric.
+     */
+    public static double[] getNumericValues(Attribute[] attributes) {
+        double[] result = new double[attributes.length];
+        int idx = 0;
+        for (Attribute attr : attributes) {
+            if (!attr.isNumeric()) {
+                throw new RuntimeException(
+                        "Non-numeric attribute encountered. Attribute: "
+                                + attr.toString());
+            }
+            result[idx++] = attr.getNumericValue();
+        }
+        return result;
+    }
+
+    /**
+     * Collects the text values of the given attributes.
+     *
+     * @param attributes
+     *            attributes to read.
+     * @return text values in the order of the attributes.
+     * @throws RuntimeException
+     *             if one of the attributes is not a text attribute.
+     */
+    public static String[] getTextValues(Attribute[] attributes) {
+        String[] result = new String[attributes.length];
+        int idx = 0;
+        for (Attribute attr : attributes) {
+            if (!attr.isText()) {
+                throw new RuntimeException(
+                        "Non-text attribute encountered. Attribute: "
+                                + attr.toString());
+            }
+            result[idx++] = attr.getTextValue();
+        }
+        return result;
+    }
+
+}
diff --git a/src/org/yooreeka/algos/clustering/utils/ObjectToIndexMapping.java b/src/org/yooreeka/algos/clustering/utils/ObjectToIndexMapping.java
new file mode 100644
index 0000000..1ff7a1f
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/utils/ObjectToIndexMapping.java
@@ -0,0 +1,90 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.utils;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Maps object values to an index. Index is zero-based.
+ *
+ * @param <T>
+ *            type of the mapped values; they are used as keys of a
+ *            {@link HashMap}.
+ */
+public class ObjectToIndexMapping<T> implements java.io.Serializable {
+
+    private static final long serialVersionUID = 2031098306406708902L;
+
+    /*
+     * Index value that will be returned for the next new value.
+     */
+    private int nextIndex = 0;
+
+    /*
+     * Maintains mapping from object to index.
+     */
+    private Map<T, Integer> objMapping = new HashMap<T, Integer>();
+
+    /*
+     * Maintains mapping from index to value.
+     */
+    private Map<Integer, T> indexMapping = new HashMap<Integer, T>();
+
+    public ObjectToIndexMapping() {
+        // empty
+    }
+
+    /**
+     * Returns index assigned to the value. For new values new index will be
+     * assigned and returned.
+     *
+     * @param value
+     *            value to look up or register.
+     * @return index of the value; indexes are handed out sequentially
+     *         starting from zero.
+     */
+    public int getIndex(T value) {
+        Integer index = objMapping.get(value);
+        if (index == null) {
+            // First time this value is seen: register it in both directions.
+            index = nextIndex;
+            objMapping.put(value, index);
+            indexMapping.put(index, value);
+            nextIndex++;
+        }
+        return index;
+    }
+
+    /**
+     * Returns value mapped to the index or null if mapping doesn't exist.
+     *
+     * @param index
+     *            index previously returned by {@link #getIndex(Object)}.
+     */
+    public T getObject(int index) {
+        return indexMapping.get(index);
+    }
+
+    /**
+     * Current number of elements.
+     */
+    public int getSize() {
+        return objMapping.size();
+    }
+}
diff --git a/src/org/yooreeka/algos/clustering/utils/SortedArrayClustering.java b/src/org/yooreeka/algos/clustering/utils/SortedArrayClustering.java
new file mode 100644
index 0000000..87be6e8
--- /dev/null
+++ b/src/org/yooreeka/algos/clustering/utils/SortedArrayClustering.java
@@ -0,0 +1,71 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.clustering.utils;
+
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.clustering.model.DataPoint;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * @author Babis Marmanis
+ *
+ */
+public class SortedArrayClustering {
+
+    private static final Logger LOG = Logger.getLogger(SortedArrayClustering.class.getName());
+
+    /**
+     * Sorts the points in place in descending order of their
+     * <code>getR()</code> value and prints the short form of every point to
+     * standard output.
+     *
+     * @param points
+     *            points to sort; the array itself is reordered.
+     */
+    public static void cluster(DataPoint[] points) {
+
+        LOG.setLevel(YooreekaConfigurator.getLevel(SortedArrayClustering.class.getName()));
+
+        // The comparator must be typed: a raw Comparator would require
+        // compare(Object, Object) to be implemented.
+        Arrays.sort(points, new Comparator<DataPoint>() {
+            public int compare(DataPoint p1, DataPoint p2) {
+                // sort based on score value; signs are inverted so that the
+                // larger value comes first (descending order)
+                if (p1.getR() < p2.getR()) {
+                    return 1;
+                }
+                if (p1.getR() > p2.getR()) {
+                    return -1;
+                }
+                return 0;
+            }
+        });
+
+        for (int i = 0; i < points.length; i++) {
+            System.out.println(points[i].toShortString());
+        }
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/cache/FileStore.java b/src/org/yooreeka/algos/reco/collab/cache/FileStore.java
new file mode 100644
index 0000000..74feb15
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/cache/FileStore.java
@@ -0,0 +1,134 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.cache;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.logging.Logger;
+
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Implementation of <code>Store</code> interface. Uses files to store objects
+ * using java serialization. Each object instance is stored in a separate file.
+ */
+public class FileStore implements Store {
+
+    private static final Logger LOG = Logger.getLogger(FileStore.class.getName());
+
+    /** Directory that holds one file per stored object. */
+    private File dataDir;
+
+    /**
+     * Creates a new instance that will use specified directory to store
+     * objects. The directory (including missing parents) is created if it
+     * does not exist yet.
+     *
+     * @param dir
+     *            directory that should be used to store/retrieve objects.
+     */
+    public FileStore(File dir) {
+
+        LOG.setLevel(YooreekaConfigurator.getLevel(FileStore.class.getName()));
+
+        if (!dir.exists()) {
+            dir.mkdirs();
+        }
+        this.dataDir = dir;
+    }
+
+    /**
+     * Creates a new instance that will use specified directory to store
+     * objects.
+     *
+     * @param dir
+     *            directory that should be used to store/retrieve objects.
+     */
+    public FileStore(String dir) {
+        this(new File(dir));
+    }
+
+    /**
+     * Checks whether a file for the key exists in the data directory.
+     *
+     * @param key
+     *            object id.
+     * @return true if the file derived from the key exists.
+     */
+    public boolean exists(String key) {
+        File f = getFile(key);
+        return f.exists();
+    }
+
+    /**
+     * Loads the object stored under the key using java serialization.
+     *
+     * @param key
+     *            object id.
+     * @return deserialized object, or null if there is no file for the key.
+     * @throws RuntimeException
+     *             if the file exists but cannot be read or deserialized.
+     */
+    public Object get(String key) {
+        // NOTE(review): readObject() must only be used on files this store
+        // wrote itself; java deserialization of untrusted data is unsafe.
+        Object o = null;
+        try {
+            File f = getFile(key);
+            if (f.exists()) {
+                FileInputStream fInStream = new FileInputStream(f);
+                try {
+                    BufferedInputStream bufInStream = new BufferedInputStream(
+                            fInStream);
+                    ObjectInputStream objInStream = new ObjectInputStream(
+                            bufInStream);
+                    o = objInStream.readObject();
+                } finally {
+                    // Always release the file handle, also when the stream
+                    // header or the object itself cannot be read.
+                    fInStream.close();
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(
+                    "Error while loading data from file (dir: '" + dataDir
+                            + "', filename: '" + key + "').", e);
+        }
+        return o;
+    }
+
+    /*
+     * Derives filename from the key and returns instance of File
+     */
+    private File getFile(String key) {
+        // key is used as a filename
+        return new File(dataDir, key + ".tmp");
+    }
+
+    /**
+     * Serializes the object into the file derived from the key, replacing any
+     * previous content.
+     *
+     * @param key
+     *            object id.
+     * @param o
+     *            object to store.
+     * @throws RuntimeException
+     *             if the object cannot be written.
+     */
+    public void put(String key, Object o) {
+        try {
+            File f = getFile(key);
+            FileOutputStream foutStream = new FileOutputStream(f);
+            try {
+                BufferedOutputStream boutStream = new BufferedOutputStream(
+                        foutStream);
+                ObjectOutputStream objOutputStream = new ObjectOutputStream(
+                        boutStream);
+                objOutputStream.writeObject(o);
+                // flush() pushes the data through the buffered stream down to
+                // the file before the handle is closed below.
+                objOutputStream.flush();
+            } finally {
+                // Always release the file handle, also when serialization
+                // fails half way.
+                foutStream.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(
+                    "Error while saving data into file (dir: '" + dataDir
+                            + "', filename: '" + key + "').", e);
+        }
+    }
+
+    /**
+     * Deletes the file derived from the key if it exists.
+     *
+     * @param key
+     *            object id.
+     */
+    public void remove(String key) {
+        File f = getFile(key);
+        if (f.exists()) {
+            f.delete();
+        }
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/cache/Store.java b/src/org/yooreeka/algos/reco/collab/cache/Store.java
new file mode 100644
index 0000000..fdecebd
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/cache/Store.java
@@ -0,0 +1,72 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.cache;
+
+/**
+ * A <code>Store</code> provides service for persisting pre-calculated data.
+ */
+public interface Store {
+ /**
+ * Checks if key already exists.
+ *
+ * @param key
+ * object id.
+ * @return true if the key already exists.
+ */
+ public boolean exists(String key);
+
+ /**
+ * Retrieves object by key.
+ *
+ * @param key
+ * identifies data to retrieve.
+ * @return the object stored under the given key.
+ */
+ public Object get(String key);
+
+ /**
+ * Persists object. Overwrites previously stored data with the same id.
+ *
+ * @param key
+ * id to identify the object.
+ * @param o
+ * object to be stored.
+ */
+ public void put(String key, Object o);
+
+ /**
+ * Deletes object.
+ *
+ * @param key
+ * identifies object to delete.
+ */
+ public void remove(String key);
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/BaseDataset.java b/src/org/yooreeka/algos/reco/collab/data/BaseDataset.java
new file mode 100644
index 0000000..e57ebac
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/BaseDataset.java
@@ -0,0 +1,431 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.User;
+
+/**
+ * Dataset implementation that we will use to work with sample data.
+ *
+ * @author Babis Marmanis
+ */
+public class BaseDataset implements Serializable, Dataset {
+
+ // private static final Logger logger = Logger.getLogger(BaseDataset.class);
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 8414181723065929475L;
+
+ /**
+  * Reads a serialized dataset from the given file.
+  *
+  * NOTE(review): this uses Java-native deserialization, so the file should
+  * come from a trusted source.
+  *
+  * @param file
+  *            path of the file to read.
+  * @return the dataset stored in the file.
+  * @throws IllegalArgumentException
+  *             if the file does not exist.
+  * @throws RuntimeException
+  *             if the file cannot be read or deserialized; the original
+  *             exception is attached as the cause.
+  */
+ public static BaseDataset load(String file) {
+     File f = new File(file);
+     if (!f.exists()) {
+         throw new IllegalArgumentException("File doesn't exist: '" + file
+                 + "'.");
+     }
+
+     Object o = null;
+     FileInputStream fInStream = null;
+     try {
+         fInStream = new FileInputStream(f);
+         ObjectInputStream objInStream = new ObjectInputStream(
+                 new BufferedInputStream(fInStream));
+         o = objInStream.readObject();
+     } catch (Exception e) {
+         throw new RuntimeException(
+                 "Error while loading data from file: '" + file + "'", e);
+     } finally {
+         // Release the file handle even when deserialization fails.
+         if (fInStream != null) {
+             try {
+                 fInStream.close();
+             } catch (IOException ignored) {
+                 // a failed close after reading does not affect the result
+             }
+         }
+     }
+     System.out.println("loaded dataset from file");
+     return (BaseDataset) o;
+ }
+
+ /**
+  * Serializes the given dataset into a file, replacing any existing
+  * content of that file.
+  *
+  * @param file
+  *            path of the file to write.
+  * @param o
+  *            dataset to be written with Java serialization.
+  * @throws RuntimeException
+  *             if the file cannot be written; the original IOException is
+  *             attached as the cause.
+  */
+ public static void save(String file, BaseDataset o) {
+     FileOutputStream foutStream = null;
+     try {
+         File f = new File(file);
+         foutStream = new FileOutputStream(f);
+         ObjectOutputStream objOutputStream = new ObjectOutputStream(
+                 new BufferedOutputStream(foutStream));
+         objOutputStream.writeObject(o);
+         // Closing the outermost stream flushes every layer; a failure here
+         // is reported to the caller like any other write error.
+         objOutputStream.close();
+     } catch (IOException e) {
+         throw new RuntimeException("Error while saving data into file: '"
+                 + file + "'", e);
+     } finally {
+         // Covers the paths where the wrappers were never built or the
+         // write failed; closing an already closed stream has no effect.
+         if (foutStream != null) {
+             try {
+                 foutStream.close();
+             } catch (IOException ignored) {
+                 // the outcome of the write has already been decided above
+             }
+         }
+     }
+ }
+
+ /*
+  * Dataset name
+  */
+ private String name = getClass().getSimpleName()
+         + System.currentTimeMillis();
+
+ /*
+  * All item ratings.
+  */
+ private List<Rating> allRatings = new ArrayList<Rating>();
+
+ /*
+  * Map of all users, keyed by user id.
+  */
+ private Map<Integer, User> allUsers = new HashMap<Integer, User>();
+
+ /*
+  * Map of all items, keyed by item id.
+  */
+ private Map<Integer, Item> allItems = new HashMap<Integer, Item>();
+
+ /*
+  * Map of item ratings by user id.
+  */
+ Map<Integer, List<Rating>> ratingsByUserId = new HashMap<Integer, List<Rating>>();
+
+ /*
+  * Distinct terms collected through updateTerms(String[]).
+  */
+ Set<String> allTermsSet = new HashSet<String>();
+
+ /**
+  * Auxiliary method for loading users one by one. This is for demonstration
+  * purposes. Use other kind of loaders for loading data en masse.
+  *
+  * @param u
+  *            denotes a User who has rated certain items and we want to add
+  *            his ratings in this dataset
+  * @return true if no errors occurred and all data have been added.
+  *         Otherwise, return false but do add whatever we can.
+  */
+ public boolean add(User u) {
+
+     boolean addedUser = true;
+
+     // Snapshot of the user's ratings, taken before the dataset is touched
+     Collection<Rating> urc = u.getAllRatings();
+     Rating[] uRatings = urc.toArray(new Rating[urc.size()]);
+
+     // Add the user (and the terms of the user's content) only once per id
+     if (!allUsers.containsKey(u.getId())) {
+         this.allUsers.put(u.getId(), u);
+
+         for (Content content : u.getUserContent()) {
+             updateTerms(content.getTerms());
+         }
+     }
+
+     for (Rating r : uRatings) {
+         if (!this.allRatings.add(r)) {
+             System.out.println("________________________________");
+             System.out.println("ERROR >> Could not add rating! ");
+             System.out.println(" >> User ID: " + r.getUserId());
+             System.out.println(" >> Item ID: " + r.getItemId());
+             System.out.println(" >> Rating : " + r.getRating());
+             System.out.println("________________________________");
+
+             addedUser = false;
+         }
+
+         Item item = r.getItem();
+
+         /*
+          * Register the item only if its id is not known yet, so that an
+          * item stored by a previously added user is never replaced by a
+          * different instance with the same id.
+          */
+         if (!allItems.containsKey(item.getId())) {
+             this.allItems.put(item.getId(), item);
+         }
+
+         // Populate item ratings if item doesn't have them
+         // Note that here we rely on all users/ratings sharing the same
+         // instance of an item.
+         if (item.getUserRating(u.getId()) == null) {
+             item.addUserRating(r);
+         }
+     }
+
+     return addedUser;
+ }
+
+ /*
+ * Auxiliary method for loading items one by one. This is for demonstration
+ * purposes. Can be used when we want to link users and items using item
+ * content instead of rating. In such cases ratings won't be available and
+ * as a result <code>add(User)</code> won't be able to derive any Items
+ * through user ratings.
+ *
+ * Returns true if the item was added, or false if an item with the same id
+ * was already present (in which case nothing is changed).
+ */
+ public boolean addItem(Item item) {
+ boolean addedItem = false;
+ if (!allItems.containsKey(item.getId())) {
+ this.allItems.put(item.getId(), item);
+ addedItem = true;
+
+ Content content = item.getItemContent();
+ updateTerms(content.getTerms());
+ }
+ return addedItem;
+ }
+
+ /**
+  * Looks up an item by name, ignoring case.
+  *
+  * @param name
+  *            name to look for.
+  * @return the first item encountered whose name matches, or null if there
+  *         is none.
+  */
+ public Item findItemByName(String name) {
+     for (Item candidate : this.allItems.values()) {
+         if (name.equalsIgnoreCase(candidate.getName())) {
+             return candidate;
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Looks up a user by name, ignoring case.
+  *
+  * @param name
+  *            name to look for.
+  * @return the first user encountered whose name matches, or null if there
+  *         is none.
+  */
+ public User findUserByName(String name) {
+     for (User candidate : this.allUsers.values()) {
+         if (name.equalsIgnoreCase(candidate.getName())) {
+             return candidate;
+         }
+     }
+     return null;
+ }
+
+ public String[] getAllTerms() {
+ return allTermsSet.toArray(new String[allTermsSet.size()]);
+ }
+
+ public double getAverageItemRating(int itemId) {
+ return getItem(itemId).getAverageRating();
+ }
+
+ public double getAverageUserRating(int userId) {
+ return getUser(userId).getAverageRating();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.yooreeka.algos.reco.collab.model.Dataset#getItem(java.lang.Integer)
+ */
+ public Item getItem(Integer itemId) {
+ return allItems.get(itemId);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.yooreeka.algos.reco.collab.model.Dataset#getItemCount()
+ */
+ public int getItemCount() {
+ return allItems.size();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see iweb2.ch3.collaborative.model.Dataset#getItems()
+ */
+ public Collection- getItems() {
+ return allItems.values();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.yooreeka.algos.reco.collab.model.Dataset#getName()
+ */
+ public String getName() {
+ return name;
+ }
+
+ public List
- getRatedItems(Integer userId) {
+ List
- ratedItems = new ArrayList
- ();
+ User user = getUser(userId);
+ Collection userRatings = user.getAllRatings();
+ for (Rating r : userRatings) {
+ Item ratedItem = getItem(r.getItemId());
+ ratedItems.add(ratedItem);
+ }
+ return ratedItems;
+ }
+
+ /*
+  * (non-Javadoc)
+  *
+  * @see org.yooreeka.algos.reco.collab.model.Dataset#getRatings()
+  */
+ public Collection<Rating> getRatings() {
+     // the internal list itself is handed out, not a copy
+     return this.allRatings;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.yooreeka.algos.reco.collab.model.Dataset#getRatingsCount()
+ */
+ public int getRatingsCount() {
+ return allRatings.size();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.yooreeka.algos.reco.collab.model.Dataset#getUser(java.lang.Integer)
+ */
+ public User getUser(Integer userId) {
+ return allUsers.get(userId);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.yooreeka.algos.reco.collab.model.Dataset#getUserCount()
+ */
+ public int getUserCount() {
+ return allUsers.size();
+ }
+
+ /*
+  * (non-Javadoc)
+  *
+  * @see org.yooreeka.algos.reco.collab.model.Dataset#getUsers()
+  */
+ public Collection<User> getUsers() {
+     // live view of the map values, not a copy
+     return allUsers.values();
+ }
+
+ /*
+ * This implementation always answers true.
+ */
+ public boolean isIdMappingRequired() {
+ return true;
+ }
+
+ /**
+  * Finds an item by exact (case-sensitive) name and wraps it into a new
+  * ContentItem.
+  *
+  * @param name
+  *            name to look for.
+  * @return a new ContentItem built from the first matching item, or null
+  *         if no item has that name.
+  */
+ public ContentItem pickContentItem(String name) {
+     for (Item anItem : allItems.values()) {
+         if (name.equals(anItem.getName())) {
+             return new ContentItem(anItem);
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Finds an item by exact (case-sensitive) name.
+  *
+  * @param name
+  *            name to look for.
+  * @return the first matching item, or null if no item has that name.
+  */
+ public Item pickItem(String name) {
+     for (Item anItem : allItems.values()) {
+         if (name.equals(anItem.getName())) {
+             return anItem;
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Finds a user by exact (case-sensitive) name.
+  *
+  * @param name
+  *            name to look for.
+  * @return the first matching user, or null if no user has that name.
+  */
+ public User pickUser(String name) {
+     for (User aUser : allUsers.values()) {
+         if (name.equals(aUser.getName())) {
+             return aUser;
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Writes every item to standard output, followed by one line for each
+  * rating the item has received.
+  */
+ public void printItemRatings() {
+     System.out.println("\nItem ratings:\n");
+     for (Item current : allItems.values()) {
+         System.out.println("Item: " + current.getName());
+         for (Rating rating : current.getAllRatings()) {
+             User rater = this.allUsers.get(rating.getUserId());
+             String line = " Rated by " + rater.getName() + " as "
+                     + rating.getRating();
+             System.out.println(line);
+         }
+     }
+ }
+
+ /**
+  * Writes every user to standard output, followed by one line for each
+  * rating the user has given.
+  */
+ public void printUserRatings() {
+     System.out.println("\nUser ratings:\n");
+     for (User current : allUsers.values()) {
+         System.out.println("User: " + current.getName());
+         for (Rating rating : current.getAllRatings()) {
+             Item rated = allItems.get(rating.getItemId());
+             String line = " Rated " + rated.getName() + " as "
+                     + rating.getRating();
+             System.out.println(line);
+         }
+     }
+ }
+
+ /*
+ * Serializes this dataset into the given file through the static
+ * save(String, BaseDataset) and prints a confirmation to standard output.
+ */
+ public void save(String file) {
+ BaseDataset.save(file, this);
+ System.out.println("saved dataset into file");
+ }
+
+ /*
+  * Adds every element of the array to the set of known terms.
+  */
+ private void updateTerms(String[] terms) {
+     for (int i = 0; i < terms.length; i++) {
+         allTermsSet.add(terms[i]);
+     }
+ }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/ContentItem.java b/src/org/yooreeka/algos/reco/collab/data/ContentItem.java
new file mode 100644
index 0000000..d754247
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/ContentItem.java
@@ -0,0 +1,59 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Item for news dataset.
+ */
+public class ContentItem extends Item {
+
+    /**
+     * SVUID
+     */
+    private static final long serialVersionUID = 6349342365379966975L;
+
+    /**
+     * Creates an item with the given id, name and content and an empty list
+     * of ratings.
+     */
+    public ContentItem(int id, String name, Content content) {
+        super(id, name, new ArrayList<Rating>(3));
+        setItemContent(content);
+    }
+
+    /**
+     * Creates an item that takes id, name and content from the given item.
+     * Ratings are not copied: the new item starts with an empty list.
+     */
+    public ContentItem(Item val) {
+        super(val.getId(), val.getName(), new ArrayList<Rating>(3));
+        this.setItemContent(val.getItemContent());
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/DiggData.java b/src/org/yooreeka/algos/reco/collab/data/DiggData.java
new file mode 100644
index 0000000..7da7fad
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/DiggData.java
@@ -0,0 +1,361 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.supercsv.io.CsvListReader;
+import org.supercsv.io.CsvListWriter;
+import org.supercsv.prefs.CsvPreference;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.RecommendationType;
+import org.yooreeka.algos.reco.collab.model.SimilarItem;
+import org.yooreeka.algos.reco.collab.recommender.Delphi;
+import org.yooreeka.algos.reco.content.digg.DiggService;
+import org.yooreeka.algos.reco.content.digg.DiggStoryItem;
+import org.yooreeka.algos.reco.content.digg.DiggUser;
+
+public class DiggData {
+
+ // Users loaded by the most recent loadData/loadDataFromDigg call.
+ public static List<DiggUser> allUsers = new ArrayList<DiggUser>();
+ // Stories (items) loaded by the most recent loadData/loadDataFromDigg call.
+ public static List<DiggStoryItem> allStories = new ArrayList<DiggStoryItem>();
+
+ // Column names of the csv file, in the order the columns are written/read.
+ private static final String[] CSV_ITEM_HEADERS = new String[] { "id",
+         "username", "title", "category", "topic", "description", "link",
+         "userid" };
+
+ /**
+  * Builds a dataset from the currently loaded users and stories. For every
+  * story of a user, items similar to that story are looked up and a random
+  * 70% of them receive generated ratings from the user. The rating range
+  * depends on the first letter of the user name.
+  *
+  * @return dataset that contains all loaded users together with the
+  *         generated ratings.
+  */
+ public static BaseDataset createDataset() {
+
+     BaseDataset ds = new BaseDataset();
+
+     Delphi delphiIC = createItemContentDelphi();
+     int topN = 10;
+     for (DiggUser user : allUsers) {
+         List<DiggStoryItem> userItems = findItemsByUsername(user.getName());
+         for (DiggStoryItem item : userItems) {
+
+             // similar items across all categories
+             SimilarItem[] similarItems = delphiIC.findSimilarItems(item,
+                     topN);
+
+             // Create a set of biased ratings for user using a subset from
+             // similar items
+             int lowRating = 0;
+             int highRating = 0;
+             if (user.getName().toLowerCase().charAt(0) <= 'd') {
+                 // range of ratings for users whose name starts from A to D
+                 lowRating = 4;
+                 highRating = 5;
+             } else {
+                 // range of ratings for users whose name starts from E to Z
+                 lowRating = 1;
+                 highRating = 3;
+             }
+
+             // select 70% of similar items
+             Item[] randomItems = pickRandomItems(similarItems, 0.7);
+             RatingBuilder ratingBuilder = new RatingBuilder();
+             List<Rating> ratings = ratingBuilder.createBiasedRatings(
+                     user.getId(), randomItems, lowRating, highRating);
+             for (Rating r : ratings) {
+                 user.addRating(r);
+             }
+         }
+         ds.add(user);
+         System.out.println("Generated " + user.getAllRatings().size()
+                 + " ratings for user id: " + user.getId() + ", name: "
+                 + user.getName() + ", average rating: "
+                 + user.getAverageRating());
+     }
+
+     System.out.println("Created Dataset with " + ds.getUserCount()
+             + " users, " + ds.getItemCount() + " items, "
+             + ds.getRatingsCount() + " ratings.");
+
+     return ds;
+ }
+
+ /*
+  * Builds an item-content based Delphi over a fresh dataset that holds all
+  * currently loaded users and stories.
+  */
+ private static Delphi createItemContentDelphi() {
+     BaseDataset contentDataset = new BaseDataset();
+
+     for (DiggUser diggUser : allUsers) {
+         contentDataset.add(diggUser);
+     }
+
+     for (DiggStoryItem story : allStories) {
+         System.out.println("Description:" + story.getDescription());
+         contentDataset.addItem(story);
+     }
+
+     return new Delphi(contentDataset,
+             RecommendationType.ITEM_CONTENT_BASED, true);
+ }
+
+ /*
+  * Returns the loaded stories whose username equals the given one, in the
+  * order in which they appear in allStories.
+  */
+ private static List<DiggStoryItem> findItemsByUsername(String username) {
+     List<DiggStoryItem> items = new ArrayList<DiggStoryItem>();
+     for (DiggStoryItem item : allStories) {
+         if (item.getUsername().equals(username)) {
+             items.add(item);
+         }
+     }
+     return items;
+ }
+
+ /*
+  * Returns the first loaded user whose name equals the given username, or
+  * null if there is none.
+  */
+ private static DiggUser findUserByUsername(String username) {
+     for (DiggUser candidate : allUsers) {
+         if (candidate.getName().equals(username)) {
+             return candidate;
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Load data from csv file. The previously loaded users and stories are
+  * discarded, the file is read row by row (columns as listed in
+  * CSV_ITEM_HEADERS), and a dataset is built from the result.
+  *
+  * @param filename
+  *            path of the csv file to read.
+  * @return dataset created by createDataset() from the loaded data.
+  * @throws RuntimeException
+  *             if the file cannot be read or a row cannot be converted.
+  */
+ public static BaseDataset loadData(String filename) {
+
+     allStories = new ArrayList<DiggStoryItem>();
+     allUsers = new ArrayList<DiggUser>();
+
+     CsvListReader csvReader = null;
+     try {
+         csvReader = new CsvListReader(new BufferedReader(new FileReader(
+                 filename)), CsvPreference.EXCEL_PREFERENCE);
+
+         // the result of the header call is not used
+         csvReader.getCSVHeader(true);
+
+         List<String> line = null;
+         while ((line = csvReader.read()) != null) {
+             try {
+                 int id = Integer.valueOf(line.get(0));
+                 String username = line.get(1);
+                 String title = line.get(2);
+                 String category = line.get(3);
+                 String topic = line.get(4);
+                 String description = line.get(5);
+                 String link = line.get(6);
+                 int userid = Integer.valueOf(line.get(7));
+
+                 // one DiggUser per distinct username
+                 DiggUser user = findUserByUsername(username);
+                 if (user == null) {
+                     user = new DiggUser(userid, username);
+                     allUsers.add(user);
+                 }
+
+                 DiggStoryItem item = new DiggStoryItem(id, title,
+                         description);
+                 item.setUsername(username);
+                 item.setCategory(category);
+                 item.setTopic(topic);
+                 item.setLink(link);
+                 allStories.add(item);
+
+                 // adding item content to the user
+                 user.addUserContent(item.getItemContent());
+             } catch (Exception e) {
+                 throw new RuntimeException("Error while reading item: ", e);
+             }
+         }
+     } catch (IOException e) {
+         throw new RuntimeException(
+                 "Error while reading digg items from csv file.", e);
+     } finally {
+         try {
+             if (csvReader != null) {
+                 csvReader.close();
+             }
+         } catch (IOException e) {
+             e.printStackTrace();
+         }
+     }
+
+     System.out.println("From file: " + filename);
+     System.out.println("Loaded " + allUsers.size() + " users.");
+     System.out.println("Loaded " + allStories.size() + " stories (items).");
+
+     return DiggData.createDataset();
+ }
+
+ /**
+  * Loading data from Digg. Users are taken from the top stories returned
+  * by DiggService; for each of them up to 5 stories are retrieved. The
+  * result is saved with saveData(String) before the dataset is built.
+  *
+  * @param filename
+  *            file that will be used to save the data.
+  * @return dataset created by createDataset() from the loaded data.
+  */
+ public static BaseDataset loadDataFromDigg(String filename) {
+
+     allUsers.clear();
+     allStories.clear();
+
+     Set<String> allKnownUsers = new HashSet<String>();
+     Set<Integer> allKnownStories = new HashSet<Integer>();
+
+     DiggService news = new DiggService();
+     news.setItemCountPerCategory(5);
+     // Top stories across a set of categories (Technology, Sports, ...)
+     List<DiggStoryItem> topStories = news.getAllStories();
+
+     // used to assign unique id to each user
+     int nextUserId = 1;
+
+     // iterate through top stories and collect a set of users
+     for (DiggStoryItem item : topStories) {
+         String username = item.getUsername();
+         if (!allKnownUsers.contains(username)) {
+             allKnownUsers.add(username);
+             int userId = nextUserId++;
+             DiggUser diggUser = new DiggUser(userId, username);
+             allUsers.add(diggUser);
+         }
+     }
+
+     // for every user retrieve up to 5 stories
+     int maxStories = 5;
+     for (DiggUser user : allUsers) {
+         List<DiggStoryItem> userItems = news.getUserStories(
+                 user.getName(), maxStories);
+
+         for (DiggStoryItem i : userItems) {
+             if (!allKnownStories.contains(i.getId())) {
+                 allStories.add(i);
+                 allKnownStories.add(i.getId());
+             } else {
+                 System.out.println("Duplicate story: id=" + i.getId()
+                         + ", name=" + i.getName());
+             }
+             // adding item content to the user (also for duplicate stories)
+             user.addUserContent(i.getItemContent());
+         }
+     }
+     System.out.println("From Digg:");
+     System.out.println("Loaded " + allUsers.size() + " users.");
+     System.out.println("Loaded " + allStories.size() + " stories (items).");
+
+     DiggData.saveData(filename);
+     return DiggData.createDataset();
+ }
+
+ /*
+  * Picks a random sample of distinct items (distinct by item id).
+  *
+  * The requested sample size is percentOfAllItems * items.length, rounded
+  * to the nearest integer. It is capped at the number of distinct item ids,
+  * because the sampling loop below could otherwise never finish when the
+  * input contains several entries with the same id.
+  *
+  * Throws IllegalArgumentException if percentOfAllItems is outside [0, 1].
+  */
+ private static Item[] pickRandomItems(SimilarItem[] items,
+         double percentOfAllItems) {
+
+     if (percentOfAllItems < 0.0 || percentOfAllItems > 1.0) {
+         throw new IllegalArgumentException(
+                 "Value for 'percentOfAllItems' argument should be between 0 and 1.");
+     }
+
+     Set<Integer> distinctIds = new HashSet<Integer>();
+     for (SimilarItem candidate : items) {
+         distinctIds.add(candidate.getItem().getId());
+     }
+     int sampleSize = Math.min(
+             (int) Math.round(percentOfAllItems * items.length),
+             distinctIds.size());
+
+     Random rand = new Random();
+     Map<Integer, Item> pickedItems = new HashMap<Integer, Item>();
+     while (pickedItems.size() < sampleSize) {
+         Item item = items[rand.nextInt(items.length)].getItem();
+         // an id that was already drawn is skipped and another draw is made
+         if (!pickedItems.containsKey(item.getId())) {
+             pickedItems.put(item.getId(), item);
+         }
+     }
+
+     return pickedItems.values().toArray(new Item[pickedItems.size()]);
+ }
+
+ /**
+ * Save data into csv file. Writes the header row followed by one row per
+ * story in allStories; the last column holds the id of the user whose name
+ * matches the story's username.
+ *
+ * @param filename
+ * path of the csv file to write.
+ */
+ public static void saveData(String filename) {
+ // one buffer that is refilled and written for every story
+ String[] data = new String[CSV_ITEM_HEADERS.length];
+
+ CsvListWriter csvWriter = null;
+ try {
+ csvWriter = new CsvListWriter(new BufferedWriter(new FileWriter(
+ filename)), CsvPreference.EXCEL_PREFERENCE);
+
+ csvWriter.writeHeader(CSV_ITEM_HEADERS);
+
+ for (DiggStoryItem item : allStories) {
+ try {
+ data[0] = String.valueOf(item.getId());
+ data[1] = item.getUsername();
+ data[2] = item.getTitle();
+ data[3] = item.getCategory();
+ data[4] = item.getTopic();
+ data[5] = item.getDescription();
+ data[6] = item.getLink();
+ // a story whose username matches no loaded user fails here and is
+ // reported through the catch block below
+ DiggUser user = findUserByUsername(item.getUsername());
+ data[7] = String.valueOf(user.getId());
+ csvWriter.write(data);
+ } catch (Exception e) {
+ throw new RuntimeException("Error while writing item "
+ + item.getName() + ": ", e);
+ }
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(
+ "Error while writing digg items into csv file.", e);
+ } finally {
+ try {
+ if (csvWriter != null) {
+ csvWriter.close();
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ System.out.println("Saved data into file: " + filename);
+ System.out.println("saved " + allUsers.size() + " users.");
+ System.out.println("saved " + allStories.size() + " stories (items).");
+
+ }
+
+ public static void showUsers() {
+ System.out.println("All Users:");
+ for (DiggUser user : allUsers) {
+ System.out.println("User id:" + user.getId() + ", name: "
+ + user.getName());
+ }
+
+ }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/HTMLContent.java b/src/org/yooreeka/algos/reco/collab/data/HTMLContent.java
new file mode 100644
index 0000000..479e35d
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/HTMLContent.java
@@ -0,0 +1,99 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.util.parsing.html.HTMLDocumentParser;
+import org.yooreeka.util.parsing.html.HTMLDocumentParserException;
+
+public class HTMLContent extends Content {
+
+    /**
+     * SVUID
+     */
+    private static final long serialVersionUID = -354667863913509004L;
+
+    /**
+     * Reads the given HTML file and returns the text that
+     * HTMLDocumentParser extracts from it.
+     *
+     * NOTE(review): the file is decoded with the platform default charset.
+     *
+     * @param htmlFile
+     *            HTML document to read.
+     * @return text of the parsed document.
+     * @throws RuntimeException
+     *             if the file cannot be read or parsed; the original
+     *             exception is attached as the cause.
+     */
+    private static String extractContentFromHtmlDoc(File htmlFile) {
+
+        FileInputStream fis = null;
+
+        try {
+            fis = new FileInputStream(htmlFile);
+            Reader reader = new InputStreamReader(new BufferedInputStream(fis));
+            HTMLDocumentParser htmlParser = new HTMLDocumentParser(reader);
+
+            return htmlParser.getHtmlDoc().getText();
+
+        } catch (IOException e) {
+
+            throw new RuntimeException(e);
+
+        } catch (HTMLDocumentParserException e) {
+            // Report parse failures like read failures instead of handing
+            // null text to the Content constructor.
+            throw new RuntimeException(
+                    "Error while parsing HTML document: '" + htmlFile + "'",
+                    e);
+        } finally {
+            if (fis != null) {
+                try {
+                    fis.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+    }
+
+    /**
+     * Creates content from the text of the given HTML file.
+     */
+    public HTMLContent(String id, File htmlDocFile) {
+        super(id, extractContentFromHtmlDoc(htmlDocFile));
+    }
+
+    /**
+     * Creates content from the text of the given HTML file; topNTerms is
+     * passed through to the Content constructor.
+     */
+    public HTMLContent(String id, File htmlDocFile, int topNTerms) {
+        super(id, extractContentFromHtmlDoc(htmlDocFile), topNTerms);
+    }
+
+    /**
+     * Creates content from the text of the HTML file with the given name.
+     */
+    public HTMLContent(String id, String htmlDocFilename) {
+        super(id, extractContentFromHtmlDoc(new File(htmlDocFilename)));
+    }
+
+    /**
+     * Creates content from the text of the HTML file with the given name;
+     * topNTerms is passed through to the Content constructor.
+     */
+    public HTMLContent(String id, String htmlDocFilename, int topNTerms) {
+        super(id, extractContentFromHtmlDoc(new File(htmlDocFilename)),
+                topNTerms);
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MovieLensData.java b/src/org/yooreeka/algos/reco/collab/data/MovieLensData.java
new file mode 100644
index 0000000..6ad655c
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MovieLensData.java
@@ -0,0 +1,83 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.File;
+
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Factory helpers that build a {@link MovieLensDataset} from the MovieLens
+ * data files and print a short summary of what was loaded.
+ */
+public class MovieLensData {
+
+    /**
+     * Loads the MovieLens dataset from the default directory with no test
+     * ratings requested.
+     *
+     * @return the loaded dataset.
+     */
+    public static MovieLensDataset createDataset() {
+        return createDataset(0);
+    }
+
+    /**
+     * Loads the MovieLens dataset from the directory named by the
+     * <code>iweb2.movielens.data.dir</code> configuration property.
+     *
+     * @param numOfTestRatings
+     *            number of test ratings handed to the dataset.
+     * @return the loaded dataset.
+     */
+    public static MovieLensDataset createDataset(int numOfTestRatings) {
+        String defaultDir = YooreekaConfigurator
+                .getProperty("iweb2.movielens.data.dir");
+        return createDataset(defaultDir, numOfTestRatings);
+    }
+
+    /**
+     * Loads the MovieLens dataset from the specified directory and reports
+     * progress and totals on standard output.
+     *
+     * @param dataDir
+     *            directory that contains the MovieLens files.
+     * @param numOfTestRatings
+     *            number of test ratings handed to the dataset.
+     * @return the loaded dataset.
+     */
+    public static MovieLensDataset createDataset(String dataDir,
+            int numOfTestRatings) {
+        File usersFile = new File(dataDir, MovieLensDataset.USERS_FILENAME);
+        File itemsFile = new File(dataDir, MovieLensDataset.ITEMS_FILENAME);
+        File ratingsFile = new File(dataDir, MovieLensDataset.RATINGS_FILENAME);
+
+        System.out.println("*** Loading MovieLens dataset...");
+        System.out.println("make sure that you are using at least: -Xmx1024m");
+
+        MovieLensDataset loaded = new MovieLensDataset(usersFile, itemsFile,
+                ratingsFile, numOfTestRatings);
+        printSummary(loaded);
+
+        return loaded;
+    }
+
+    /**
+     * Prints the user, movie, rating and test rating counts of a dataset.
+     */
+    private static void printSummary(MovieLensDataset dataSet) {
+        System.out.println("\n*** Loaded MovieLens dataset.");
+        System.out.println("users: " + dataSet.getUserCount());
+        System.out.println("movies: " + dataSet.getItemCount());
+        System.out.println("ratings: " + dataSet.getRatingsCount());
+        System.out.println("test ratings: " + dataSet.getTestRatings().size());
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MovieLensDataset.java b/src/org/yooreeka/algos/reco/collab/data/MovieLensDataset.java
new file mode 100644
index 0000000..74af7cf
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MovieLensDataset.java
@@ -0,0 +1,385 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.User;
+
+/**
+ * Dataset implementation that we will use to work with MovieLens data. All data
+ * is loaded from three files: users, movies (items), and ratings.
+ */
+public class MovieLensDataset implements Dataset {
+
+ public static final String USERS_FILENAME = "users.dat";
+ public static final String ITEMS_FILENAME = "movies.dat";
+ public static final String RATINGS_FILENAME = "ratings.dat";
+
+ /*
+ * Delimiter that is used by MovieLens data files.
+ */
+ private static final String FIELD_DELIMITER = "::";
+
+ /**
+ * Saves provided ratings into a new file. Used to split ratings provided as
+ * part of MovieLens data set into files that represent various rating sets
+ * for training and testing.
+ *
+ * @param f
+ * file to write to.
+ * @param ratings
+ * ratings to save.
+ */
+ public static void createNewRatingsFile(File f, Collection ratings) {
+ try {
+ PrintWriter pw = new PrintWriter(new BufferedWriter(new FileWriter(
+ f)));
+ for (Rating rating : ratings) {
+ pw.println(rating.getUserId() + FIELD_DELIMITER
+ + rating.getItemId() + FIELD_DELIMITER
+ + rating.getRating());
+ }
+ pw.flush();
+ pw.close();
+ } catch (IOException e) {
+ throw new RuntimeException(
+ "Failed to save rating into file (file: '"
+ + f.getAbsolutePath() + "').", e);
+ }
+ }
+
+ private static BufferedReader getReader(File f)
+ throws FileNotFoundException {
+ return new BufferedReader(new FileReader(f));
+ }
+
+ public static List loadRatings(File f) {
+ List allRatings = new ArrayList();
+
+ BufferedReader reader = null;
+ String line = null;
+ try {
+ reader = getReader(f);
+ while ((line = reader.readLine()) != null) {
+ String[] tokens = parseLine(line);
+ int userId = Integer.parseInt(tokens[0]);
+ int itemId = Integer.parseInt(tokens[1]);
+ int rating = Integer.parseInt(tokens[2]);
+ allRatings.add(new Rating(userId, itemId, rating));
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(
+ "Failed to load rating from file (file: '"
+ + f.getAbsolutePath() + "'): ", e);
+ } finally {
+ if (reader != null) {
+ try {
+ reader.close();
+ } catch (Exception e) {
+ System.out.println("ERROR: \n");
+ System.out.println(e.getMessage()
+ + "\n while closing file reader (file: '"
+ + f.getAbsolutePath() + "'): ");
+ }
+ }
+ }
+
+ return allRatings;
+ }
+
+ private static String[] parseLine(String line) {
+ // possible field delimiters: "::", "\t", "|"
+ return line.split("::|\t|\\|");
+ }
+ /*
+ * All item ratings.
+ */
+ private List allRatings = new ArrayList();
+
+ /*
+ * Map of all users.
+ */
+ private Map allUsers = new HashMap();
+
+ /*
+ * Map of all items.
+ */
+ private Map allItems = new HashMap();
+
+ /*
+ * Parameters for test dataset
+ */
+ private int numberOfTestRatings = 0;
+
+ private List testRatings = new ArrayList();
+
+ /*
+ * Map of item ratings by item id.
+ */
+ private Map> ratingsByItemId = new HashMap>();
+
+ /*
+ * Map of item ratings by user id.
+ */
+ Map> ratingsByUserId = new HashMap>();
+
+ private String name;
+
+ public MovieLensDataset(File users, File movies, File ratings) {
+ name = getClass().getSimpleName() + System.currentTimeMillis();
+ loadData(users, movies, ratings, null);
+ }
+
+ public MovieLensDataset(File users, File movies, File ratings,
+ int numOfTestRatings) {
+ name = getClass().getSimpleName() + System.currentTimeMillis();
+ this.numberOfTestRatings = numOfTestRatings;
+ loadData(users, movies, ratings, null);
+ }
+
+ public MovieLensDataset(String name, File users, File movies, File ratings) {
+
+ this.name = name;
+ loadData(users, movies, ratings, null);
+ }
+
+ public MovieLensDataset(String name, File users, File items,
+ List ratings) {
+
+ this.name = name;
+ loadData(users, items, null, ratings);
+ }
+
+ private void addRatingToMap(Map> map, Integer key,
+ Rating rating) {
+ List ratingsForKey = map.get(key);
+ if (ratingsForKey == null) {
+ ratingsForKey = new ArrayList();
+ map.put(key, ratingsForKey);
+ }
+ ratingsForKey.add(rating);
+ }
+
+ private Item createNewItem(int itemId, String name) {
+ List ratings = ratingsByItemId.get(itemId);
+ if (ratings == null) {
+ ratings = new ArrayList();
+ }
+
+ Item item = new Item(itemId, name, ratings);
+
+ // establish link between rating and item
+ for (Rating r : ratings) {
+ r.setItem(item);
+ }
+
+ return item;
+ }
+
+ public String[] getAllTerms() {
+ return new String[0];
+ }
+
+ public double getAverageItemRating(int itemId) {
+ return getItem(itemId).getAverageRating();
+ }
+
+ public double getAverageUserRating(int userId) {
+ return getUser(userId).getAverageRating();
+ }
+
+ public Item getItem(Integer itemId) {
+ return allItems.get(itemId);
+ }
+
+ public int getItemCount() {
+ return allItems.size();
+ }
+
+ public Collection
- getItems() {
+ return allItems.values();
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public Collection getRatings() {
+ return this.allRatings;
+ }
+
+ public int getRatingsCount() {
+ return allRatings.size();
+ }
+
+ public Collection getTestRatings() {
+ return this.testRatings;
+ }
+
+ public User getUser(Integer userId) {
+ return allUsers.get(userId);
+ }
+
+ public int getUserCount() {
+ return allUsers.size();
+ }
+
+ public Collection getUsers() {
+ return allUsers.values();
+ }
+
+ public boolean isIdMappingRequired() {
+ return false;
+ }
+
+ private void loadData(File usersFile, File itemsFile, File ratingsFile,
+ List ratings) {
+ try {
+ /* Load all available ratings */
+ if (ratings == null) {
+ allRatings = loadRatings(ratingsFile);
+ } else {
+ allRatings = ratings;
+ }
+
+ /* Exclude ratings if needed */
+ withholdRatings();
+
+ /* build maps that provide access to ratings by userId or itemId */
+ for (Rating rating : allRatings) {
+ addRatingToMap(ratingsByItemId, rating.getItemId(), rating);
+ addRatingToMap(ratingsByUserId, rating.getUserId(), rating);
+ }
+ /*
+ * load users and item. Each instance will have a set of ratings
+ * relevant to it
+ */
+ allUsers = loadUsers(usersFile);
+ allItems = loadItems(itemsFile);
+ } catch (IOException e) {
+ throw new RuntimeException("Failed to load MovieLens data: ", e);
+ }
+ }
+
+ private Map loadItems(File moviesFile) throws IOException {
+
+ Map items = new HashMap();
+
+ BufferedReader reader = getReader(moviesFile);
+ String line = null;
+ int lastId = 0;
+ while ((line = reader.readLine()) != null) {
+
+ String[] tokens = parseLine(line);
+
+ /* at the moment we are only interested in movie id */
+ int itemId = Integer.parseInt(tokens[0]);
+ String title = tokens[1];
+
+ /*
+ * In some cases we need to create items for missing ids. Movies
+ * file from MovieLens dataset skips over some ids. To keep things
+ * simple we made assumption that user and movie (item) ids are
+ * sequences without gaps that start with 1.
+ */
+ if (itemId > lastId + 1) {
+
+ for (int i = lastId + 1; i < itemId; i++) {
+ // System.out.println("DEBUG:\n");
+ // System.out.println("Movies file has a gap in ID sequence. ");
+ // System.out.println("Creating artificial item for ID: " +
+ // i);
+
+ Item missingItem = createNewItem(i, "Missing-Item-" + i);
+ items.put(missingItem.getId(), missingItem);
+ }
+ }
+
+ Item item = createNewItem(itemId, title);
+
+ items.put(item.getId(), item);
+ lastId = item.getId();
+ }
+ return items;
+ }
+
+ private Map loadUsers(File usersFile) throws IOException {
+ Map users = new HashMap();
+
+ BufferedReader reader = getReader(usersFile);
+ String line = null;
+
+ while ((line = reader.readLine()) != null) {
+ String[] tokens = parseLine(line);
+ /* at the moment we are only interested in user id */
+ int userId = Integer.parseInt(tokens[0]);
+ List userRatings = ratingsByUserId.get(userId);
+ if (userRatings == null) {
+ userRatings = new ArrayList();
+ }
+ User user = new User(userId, userRatings);
+ users.put(user.getId(), user);
+ }
+
+ return users;
+ }
+
+ public void setTestRatingsCount(int numberOfRatings) {
+ this.numberOfTestRatings = numberOfRatings;
+ }
+
+ private void withholdRatings() {
+ Random rnd = new Random();
+ while (testRatings.size() < this.numberOfTestRatings) {
+ int randomIndex = rnd.nextInt(allRatings.size());
+ Rating rating = allRatings.remove(randomIndex);
+ testRatings.add(rating);
+ }
+ }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MusicData.java b/src/org/yooreeka/algos/reco/collab/data/MusicData.java
new file mode 100644
index 0000000..d3682a8
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MusicData.java
@@ -0,0 +1,256 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Utility class that we use as the source for Music data.
+ */
+public class MusicData {
+
+    public static final String[] USERS = { "Albert", "Alexandra", "Athena",
+            "Aurora", "Babis", "Bill", "Bob", "Carl", "Catherine", "Charlie",
+            "Constantine", "Dmitry", "Elena", "Eric", "Frank", "George",
+            "Jack", "John", "Maria", "Lukas", "Nick", "Terry", "Todd" };
+
+    public static final String[] MUSIC_SAMPLES = {
+            "You've Lost That Lovin' Feelin'", "Mrs. Robinson",
+            "Wind Beneath My Wings", "Fiddler On The Roof", "La Bamba",
+            "Wizard Of Oz", "White Christmas", "Let It Be", "Yesterday",
+            "Singing In The Rain", "Sunday, Bloody Sunday", "Tears In Heaven",
+            "Beethoven: Symphony No. 9 in D minor",
+            "Bach: The Brandenburg Concerti", "Mozart: Symphony #41 (Jupiter)",
+            "What A Wonderful World", "I Love Rock And Roll",
+            "Albinoni: Adagio In G Minor", "Vivaldi: Four Seasons" };
+
+    /**
+     * Builds data set with all the users where each user rates 80% of all the
+     * songs. The rating bounds handed to the rating builder depend on the
+     * user name:
+     * <ul>
+     * <li>Users whose name starts from A to D get the bounds 4 and 5.</li>
+     * <li>Users whose name starts from E to Z get the bounds 1 and 3.</li>
+     * </ul>
+     * User ids are the indexes of the names in {@link #USERS}.
+     *
+     * @return dataset that contains one user per entry of {@link #USERS}.
+     */
+    public static BaseDataset createDataset() {
+        BaseDataset ds = new BaseDataset();
+
+        double percentOfAllSongs = 0.80;
+
+        /* Create items first */
+        MusicItem[] allItems = loadAllMusicItems();
+
+        for (int i = 0, n = USERS.length; i < n; i++) {
+            int userId = i;
+            String userName = USERS[i];
+            int lowRating;
+            int highRating;
+
+            if (userName.toLowerCase().charAt(0) <= 'd') {
+                // range of ratings for users whose name starts from A to D
+                lowRating = 4;
+                highRating = 5;
+            } else {
+                // range of ratings for users whose name starts from E to Z
+                lowRating = 1;
+                highRating = 3;
+            }
+            MusicItem[] items = pickRandomSongs(allItems, percentOfAllSongs);
+
+            RatingBuilder ratingBuider = new RatingBuilder();
+            List<Rating> ratings = ratingBuider.createBiasedRatings(userId,
+                    items, lowRating, highRating);
+
+            MusicUser mu = new MusicUser(userId, userName, ratings);
+
+            ds.add(mu);
+        }
+        return ds;
+    }
+
+    /**
+     * Creates the item for a predefined song. The item id is the index of the
+     * song in {@link #MUSIC_SAMPLES}; the name is matched ignoring case.
+     *
+     * @throws IllegalArgumentException
+     *             if the song is not listed in {@link #MUSIC_SAMPLES}.
+     */
+    private static MusicItem createItem(String song) {
+        int id = -1;
+        for (int i = 0, n = MUSIC_SAMPLES.length; i < n; i++) {
+            if (MUSIC_SAMPLES[i].equalsIgnoreCase(song)) {
+                id = i;
+                break;
+            }
+        }
+        if (id < 0) {
+            throw new IllegalArgumentException("Invalid song name: '" + song
+                    + "'. This song is not on the list of predefined songs.");
+        }
+
+        return new MusicItem(id, MUSIC_SAMPLES[id]);
+    }
+
+    /**
+     * Creates the user for a predefined name. The user id is the index of the
+     * name in {@link #USERS}; the name is matched ignoring case.
+     *
+     * @throws IllegalArgumentException
+     *             if the name is not listed in {@link #USERS}.
+     */
+    private static MusicUser createUser(String name) {
+        int id = -1;
+        for (int i = 0, n = USERS.length; i < n; i++) {
+            if (USERS[i].equalsIgnoreCase(name)) {
+                id = i;
+                break;
+            }
+        }
+        if (id < 0) {
+            throw new IllegalArgumentException("Invalid user name: '" + name
+                    + "'. Name is not on the list of predefined user names.");
+        }
+
+        return new MusicUser(id, name);
+    }
+
+    /**
+     * Returns an array of new MusicItem instances, one for every song listed
+     * in the <code>MUSIC_SAMPLES</code> array. Item ids are array indexes.
+     */
+    private static MusicItem[] loadAllMusicItems() {
+        MusicItem[] allItems = new MusicItem[MusicData.MUSIC_SAMPLES.length];
+        for (int i = 0, n = allItems.length; i < n; i++) {
+            int id = i;
+            String name = MusicData.MUSIC_SAMPLES[i];
+            MusicItem item = new MusicItem(id, name);
+            allItems[i] = item;
+        }
+        return allItems;
+    }
+
+    /**
+     * Builds a fixed example of three users (Frank, Constantine and
+     * Catherine) with seven hand-picked ratings each.
+     *
+     * @return the three users, in that order.
+     */
+    public static MusicUser[] loadExample() {
+        MusicUser[] mu = new MusicUser[3];
+
+        mu[0] = createUser("Frank");
+        mu[1] = createUser("Constantine");
+        mu[2] = createUser("Catherine");
+
+        MusicItem[] mi = new MusicItem[11];
+
+        mi[0] = createItem("Tears In Heaven");
+        mi[1] = createItem("La Bamba");
+        mi[2] = createItem("Mrs. Robinson");
+        mi[3] = createItem("Yesterday");
+        mi[4] = createItem("Wizard Of Oz");
+        mi[5] = createItem("Mozart: Symphony #41 (Jupiter)");
+        mi[6] = createItem("Beethoven: Symphony No. 9 in D minor");
+        mi[7] = createItem("Fiddler On The Roof");
+        mi[8] = createItem("What A Wonderful World");
+        mi[9] = createItem("Let It Be");
+        mi[10] = createItem("Sunday, Bloody Sunday");
+
+        ArrayList<Rating> mr0 = new ArrayList<Rating>();
+        ArrayList<Rating> mr1 = new ArrayList<Rating>();
+        ArrayList<Rating> mr2 = new ArrayList<Rating>();
+
+        /*
+         * Frank rates:
+         *   Tears In Heaven                  <- 0
+         *   La Bamba                         <- 1
+         *   Mrs. Robinson                    <- 2
+         *   Yesterday                        <- 3
+         *   Wizard Of Oz                     <- 4
+         *   Mozart: Symphony #41 (Jupiter)   <- 5
+         *   Beethoven: Symphony No. 9 in D   <- 6
+         */
+        mr0.add(new MusicRating(mu[0].getId(), mi[0].getId(), 5));
+        mr0.add(new MusicRating(mu[0].getId(), mi[1].getId(), 4));
+        mr0.add(new MusicRating(mu[0].getId(), mi[2].getId(), 5));
+        mr0.add(new MusicRating(mu[0].getId(), mi[3].getId(), 4));
+        mr0.add(new MusicRating(mu[0].getId(), mi[4].getId(), 5));
+        mr0.add(new MusicRating(mu[0].getId(), mi[5].getId(), 4));
+        mr0.add(new MusicRating(mu[0].getId(), mi[6].getId(), 5));
+
+        /*
+         * Constantine rates:
+         *   Tears In Heaven                  <- 0
+         *   Fiddler On The Roof              <- 7
+         *   Mrs. Robinson                    <- 2
+         *   What A Wonderful World           <- 8
+         *   Wizard Of Oz                     <- 4
+         *   Let It Be                        <- 9
+         *   Mozart: Symphony #41 (Jupiter)   <- 5
+         */
+        mr1.add(new MusicRating(mu[1].getId(), mi[0].getId(), 5));
+        mr1.add(new MusicRating(mu[1].getId(), mi[7].getId(), 5));
+        mr1.add(new MusicRating(mu[1].getId(), mi[2].getId(), 5));
+        mr1.add(new MusicRating(mu[1].getId(), mi[8].getId(), 4));
+        mr1.add(new MusicRating(mu[1].getId(), mi[4].getId(), 4));
+        mr1.add(new MusicRating(mu[1].getId(), mi[9].getId(), 5));
+        mr1.add(new MusicRating(mu[1].getId(), mi[5].getId(), 5));
+
+        /*
+         * Catherine rates:
+         *   Tears In Heaven                  <- 0
+         *   Mrs. Robinson                    <- 2
+         *   Yesterday                        <- 3
+         *   Beethoven: Symphony No. 9 in D   <- 6
+         *   Sunday, Bloody Sunday            <- 10
+         *   Yesterday                        <- 3
+         *   Let It Be                        <- 9
+         *
+         * NOTE(review): "Yesterday" is rated twice (2, then 1) -- confirm
+         * whether the second entry was meant to be a different song.
+         */
+        mr2.add(new MusicRating(mu[2].getId(), mi[0].getId(), 1));
+        mr2.add(new MusicRating(mu[2].getId(), mi[2].getId(), 2));
+        mr2.add(new MusicRating(mu[2].getId(), mi[3].getId(), 2));
+        mr2.add(new MusicRating(mu[2].getId(), mi[6].getId(), 3));
+        mr2.add(new MusicRating(mu[2].getId(), mi[10].getId(), 1));
+        mr2.add(new MusicRating(mu[2].getId(), mi[3].getId(), 1));
+        mr2.add(new MusicRating(mu[2].getId(), mi[9].getId(), 2));
+
+        mu[0].setRatings(mr0);
+        mu[1].setRatings(mr1);
+        mu[2].setRatings(mr2);
+
+        return mu;
+    }
+
+    /**
+     * Returns a random selection of songs without repetitions.
+     *
+     * @param songs
+     *            list of songs to pick from
+     * @param percentOfAllSongs
+     *            fraction (0 to 1) of <code>songs</code> to return; the
+     *            resulting count is rounded to the nearest integer.
+     *
+     * @return array of songs.
+     * @throws IllegalArgumentException
+     *             if <code>percentOfAllSongs</code> is outside 0 to 1.
+     */
+    private static MusicItem[] pickRandomSongs(MusicItem[] songs,
+            double percentOfAllSongs) {
+
+        if (percentOfAllSongs < 0.0 || percentOfAllSongs > 1.0) {
+            throw new IllegalArgumentException(
+                    "Value for 'percentOfAllSongs' argument should be between 0 and 1.");
+        }
+        Random rand = new Random();
+        int sampleSize = (int) Math.round(percentOfAllSongs * songs.length);
+        // keyed by song id so that a song drawn twice is kept only once
+        Map<Integer, MusicItem> pickedItems = new HashMap<Integer, MusicItem>();
+        while (pickedItems.size() < sampleSize) {
+            int songId = rand.nextInt(songs.length);
+            MusicItem song = songs[songId];
+            if (!pickedItems.containsKey(song.getId())) {
+                pickedItems.put(song.getId(), song);
+            }
+        }
+
+        return pickedItems.values().toArray(new MusicItem[pickedItems.size()]);
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MusicItem.java b/src/org/yooreeka/algos/reco/collab/data/MusicItem.java
new file mode 100644
index 0000000..ab5e823
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MusicItem.java
@@ -0,0 +1,71 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Item for music dataset.
+ *
+ * @author Babis Marmanis
+ */
+public class MusicItem extends Item {
+
+    /**
+     * Serialization version identifier.
+     */
+    private static final long serialVersionUID = 3219691524340585231L;
+
+    /*
+     * Not set by the constructor; stays null until setArtist is called.
+     */
+    String artist;
+
+    /**
+     * Creates an item that starts with an empty list of ratings.
+     *
+     * @param id
+     *            item id, passed through to {@link Item}.
+     * @param name
+     *            item name, passed through to {@link Item}.
+     */
+    public MusicItem(int id, String name) {
+        super(id, name, new ArrayList<Rating>(3));
+    }
+
+    /**
+     * @return the artist
+     */
+    public String getArtist() {
+        return artist;
+    }
+
+    /**
+     * @param artist
+     *            the artist to set
+     */
+    public void setArtist(String artist) {
+        this.artist = artist;
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MusicRating.java b/src/org/yooreeka/algos/reco/collab/data/MusicRating.java
new file mode 100644
index 0000000..9046889
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MusicRating.java
@@ -0,0 +1,52 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * A {@link Rating} that belongs to the music dataset.
+ *
+ * @author Babis Marmanis
+ */
+public class MusicRating extends Rating {
+
+    /** Serialization version identifier. */
+    private static final long serialVersionUID = 4015578066768031191L;
+
+    /**
+     * Creates a rating by handing the three values straight to
+     * {@link Rating}.
+     *
+     * @param userId
+     *            id of the user who gave the rating.
+     * @param songId
+     *            id of the rated song.
+     * @param rating
+     *            rating value.
+     */
+    public MusicRating(int userId, int songId, int rating) {
+        super(userId, songId, rating);
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MusicUser.java b/src/org/yooreeka/algos/reco/collab/data/MusicUser.java
new file mode 100644
index 0000000..1c8e89a
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MusicUser.java
@@ -0,0 +1,249 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.User;
+import org.yooreeka.util.gui.XyGui;
+
+/**
+ * User for music dataset.
+ *
+ * @author Babis Marmanis
+ */
+public class MusicUser extends User {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 4866915806848833932L;
+
+ public MusicUser(int userId, String name) {
+ super(userId, name);
+ }
+
+ public MusicUser(int userId, String name, List ratings) {
+ super(userId, name, ratings);
+ }
+
+ public double getSimilarity(MusicUser u, int simType) {
+
+ double sim = 0.0d;
+ int commonItems = 0;
+
+ /**
+ * TODO: 3.1 -- Types of similarity (Book section 3.1.2)
+ *
+ * In the following switch, we include two types of similarity You can
+ * extend the functionality of this method by adding more types. For
+ * example, the Jaccard similarity could be defined as the ratio of the
+ * intersection over the union of the items between two users. In other
+ * words, Number of songs in common Jaccard Similarity =
+ * ------------------------------------------- Number of all songs
+ * listened by either user
+ *
+ * Are more complicated similarity metrics more accurate?
+ */
+
+ switch (simType) {
+
+ case 0:
+ for (Rating r : this.ratingsByItemId.values()) {
+ for (Rating r2 : u.ratingsByItemId.values()) {
+
+ // Find the same item
+ if (r.getItemId() == r2.getItemId()) {
+ commonItems++;
+ sim += Math.pow((r.getRating() - r2.getRating()), 2);
+ }
+ }
+ }
+
+ // If there are not common items, we cannot tell whether
+ // the users are similar or not. So, we let it return 0.
+ if (commonItems > 0) {
+
+ // This is the RMSE, which is more like the distance
+ sim = Math.sqrt(sim / commonItems);
+
+ // Similarity should be between 0 and 1
+ // For the value 0, the two users are as dissimilar as they come
+ // For the value 1, their preferences (based on the available
+ // data) are identical.
+ //
+ // Here is a function that accomplishes exactly that
+ sim = 1.0d - Math.tanh(sim);
+ }
+
+ break;
+
+ // ---------------------------------------------------------
+ case 1:
+ for (Rating r : this.ratingsByItemId.values()) {
+ for (Rating r2 : u.ratingsByItemId.values()) {
+
+ // Find the same item
+ if (r.getItemId() == r2.getItemId()) {
+ commonItems++;
+ sim += Math.pow((r.getRating() - r2.getRating()), 2);
+ }
+ }
+ }
+
+ // If there are not common items, we cannot tell whether
+ // the users are similar or not. So, we let it return 0.
+ if (commonItems > 0) {
+ // Same as before (case 0)
+ sim = Math.sqrt(sim / commonItems);
+
+ // Similarity should be between 0 and 1
+ // For the value 0, the two users are as disimilar as they come
+ // For the value 1, their preferences (based on the available
+ // data) are identical.
+ //
+ // Here is a function that accomplishes exactly that
+ sim = 1.0d - Math.tanh(sim);
+
+ // However, the above calculation takes into account only the
+ // common items
+ // It does not account for the number of items that could have
+ // in common
+ // So, let us consider the following
+
+ // This is the maximum number of items that the two users can
+ // have in common
+ int maxCommonItems = Math.min(this.ratingsByItemId.size(),
+ u.ratingsByItemId.size());
+
+ // Adjust the similarity to account for the importance of the
+ // common terms
+ // through the ratio of the common items over the number of all
+ // possible common items
+
+ sim = sim * ((double) commonItems / (double) maxCommonItems);
+ }
+
+ break;
+ }
+
+ // Let us know what it is
+ System.out.print("\n"); // Just for pretty printing in the Shell
+ System.out.print(" User Similarity between");
+ System.out.print(" " + this.getName());
+ System.out.print(" and " + u.getName());
+ System.out.println(" is equal to " + sim);
+ System.out.print("\n"); // Just for pretty printing in the Shell
+
+ return sim;
+ }
+
+ /**
+  * Plots this user's ratings, one point per rated item.
+  */
+ public void plot() {
+     int n = this.ratingsByItemId.size();
+     double[] x = new double[n];
+     double[] y = new double[n];
+     // x is the ordinal position of the item, y is the rating it received
+     int i = 0;
+     for (Integer itemId : this.ratingsByItemId.keySet()) {
+         x[i] = i;
+         y[i] = this.getItemRating(itemId).getRating();
+         i++; // without this every point would be written to index 0
+     }
+
+     XyGui gui = new XyGui("", x, y);
+     gui.plot();
+ }
+
+ /**
+  * Plots the ratings of this user against the ratings of another user for
+  * the items that both users have rated, ordered by increasing difference
+  * of opinion.
+  *
+  * @param anotherUser
+  *            the user whose ratings are compared with this user's ratings
+  */
+ public void plot(MusicUser anotherUser) {
+     // ratings for items rated by both users
+     List<Rating[]> sharedRatings = new ArrayList<Rating[]>();
+
+     // collect the items that the other user rated as well
+     for (Rating r : ratingsByItemId.values()) {
+         Rating anotherUserRating = anotherUser.getItemRating(r.getItemId());
+         if (anotherUserRating != null) {
+             // item was rated by both users. Add both ratings to the list
+             sharedRatings.add(new Rating[] { r, anotherUserRating });
+         }
+     }
+
+     // sort shared ratings based on the difference of opinions
+     Collections.sort(sharedRatings, new Comparator<Rating[]>() {
+         public int compare(Rating[] x, Rating[] y) {
+             double xDiff = Math.abs(x[0].getRating() - x[1].getRating());
+             double yDiff = Math.abs(y[0].getRating() - y[1].getRating());
+
+             if (xDiff < yDiff) {
+                 return -1;
+             }
+             if (xDiff > yDiff) {
+                 return 1;
+             }
+             return 0;
+         }
+     });
+
+     double[] data1 = new double[sharedRatings.size()];
+     double[] data2 = new double[sharedRatings.size()];
+     String[] itemNames = new String[sharedRatings.size()];
+     for (int i = 0, n = itemNames.length; i < n; i++) {
+         Rating[] itemRatings = sharedRatings.get(i);
+         // Right now there is no way to get to Item from User or Rating.
+         // Only itemId is available from User or Rating instance.
+         // I'll change loading to include Item in Rating if we need to show
+         // song name on the chart.
+         itemNames[i] = String.valueOf(itemRatings[0].getItemId());
+         data1[i] = itemRatings[0].getRating();
+         data2[i] = itemRatings[1].getRating();
+     }
+
+     XyGui gui = new XyGui("User Similarity", this.getName(),
+             anotherUser.getName(), itemNames, data1, data2);
+
+     gui.plot();
+ }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/NewsData.java b/src/org/yooreeka/algos/reco/collab/data/NewsData.java
new file mode 100644
index 0000000..4077e68
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/NewsData.java
@@ -0,0 +1,202 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Utility class that we use as the source for News data.
+ */
+public class NewsData {
+
+ public static final String[] USERS = { "Albert", "Alexandra", "Athena",
+ "Aurora", "Babis", "Bill", "Bob", "Carl", "Catherine", "Charlie",
+ "Constantine", "Dmitry", "Elena", "Eric", "Frank", "George",
+ "Jack", "John", "Maria", "Lukas", "Nick", "Terry", "Todd" };
+
+ public static final String[] DOC_SAMPLES = { "biz-01.html", "biz-02.html",
+ "biz-03.html", "biz-04.html", "biz-05.html", "biz-06.html",
+ "biz-07.html", "sport-01.html", "sport-02.html", "sport-03.html",
+ "usa-01.html", "usa-02.html", "usa-03.html", "usa-04.html",
+ "world-01.html", "world-02.html", "world-03.html", "world-04.html",
+ "world-05.html" };
+
+ /**
+ * Builds data set with all the users where each user is assigned 80% of all
+ * the eligible content, as defined below:
+ *
+ * - Users whose name starts from A to D will have 'business' and 'sport'
+ * content.
+ * - Users whose name starts from E to Z will have 'usa' and 'world'
+ * content.
+ *
+ */
+ public static BaseDataset createDataset() {
+     BaseDataset dataset = new BaseDataset();
+
+     // Register every news item with the dataset first.
+     ContentItem[] newsItems = loadAllNewsItems();
+     for (ContentItem newsItem : newsItems) {
+         dataset.addItem(newsItem);
+     }
+
+     // Share of the eligible documents that is handed to every user.
+     final double percentOfDocs = 0.80;
+
+     for (int userId = 0; userId < USERS.length; userId++) {
+         String userName = USERS[userId];
+
+         // Names whose lower-cased first character is not after 'd' get
+         // business and sport documents; all others get usa and world.
+         boolean readsBizAndSport = userName.toLowerCase().charAt(0) <= 'd';
+         String[] prefixes;
+         if (readsBizAndSport) {
+             prefixes = new String[] { "biz", "sport" };
+         } else {
+             prefixes = new String[] { "usa", "world" };
+         }
+
+         ContentItem[] eligibleDocs = selectEligibleDocs(newsItems, prefixes);
+
+         // Random sample of the eligible documents for this user.
+         ContentItem[] docs = pickRandomDocs(eligibleDocs, percentOfDocs);
+
+         NewsUser user = new NewsUser(userId, userName);
+         for (ContentItem doc : docs) {
+             user.addUserContent(doc.getItemContent());
+         }
+
+         dataset.add(user);
+     }
+
+     return dataset;
+ }
+
+ // private static Item createItem(String docName) {
+ // int id = -1;
+ // for(int i = 0, n = DOC_SAMPLES.length; i < n; i++) {
+ // if( DOC_SAMPLES[i].equals(docName)) {
+ // id = i;
+ // break;
+ // }
+ // }
+ //
+ // if( id < 0 ) {
+ // throw new IllegalArgumentException("Invalid document name: '" + docName +
+ // "'. This document is not on the list of predefined documents.");
+ // }
+ //
+ // return createDocItem(id, docName);
+ // }
+
+ private static ContentItem createNewsItem(int docId, String docName) {
+     // Build the content for the document first; the item receives it
+     // through its constructor.
+     Content docContent = loadContent(docName);
+     return new ContentItem(docId, docName, docContent);
+ }
+
+ /**
+ * Returns array of new ContentItem instances for every document listed in
+ * the DOC_SAMPLES array.
+ */
+ private static ContentItem[] loadAllNewsItems() {
+     ContentItem[] loaded = new ContentItem[NewsData.DOC_SAMPLES.length];
+     // The position of a document in DOC_SAMPLES doubles as its item ID.
+     int docId = 0;
+     for (String docName : NewsData.DOC_SAMPLES) {
+         loaded[docId] = createNewsItem(docId, docName);
+         docId++;
+     }
+     return loaded;
+ }
+
+ private static Content loadContent(String docName) {
+     String docPath = YooreekaConfigurator.getHome() + "/data/ch02/" + docName; // location under the configured home
+     return new HTMLContent(docName, docPath);
+ }
+
+ /**
+  * Returns a random selection of documents.
+  *
+  * @param newsItems
+  *            list of documents to pick from
+  * @param percentOfDocs
+  *            fraction (between 0 and 1) of the documents to select
+  *
+  * @return array of the selected documents.
+  */
+ private static ContentItem[] pickRandomDocs(ContentItem[] newsItems,
+         double percentOfDocs) {
+     // The negated form also rejects NaN, which fails both range comparisons.
+     if (!(percentOfDocs >= 0.0 && percentOfDocs <= 1.0)) {
+         throw new IllegalArgumentException(
+                 "Value for 'percentOfDocs' argument should be "
+                         + "between 0 and 1.");
+     }
+
+     Random rand = new Random();
+     int sampleSize = (int) Math.round(percentOfDocs * newsItems.length);
+     Map<Integer, Item> pickedItems = new HashMap<Integer, Item>();
+     while (pickedItems.size() < sampleSize) {
+         int itemId = rand.nextInt(newsItems.length);
+         Item item = newsItems[itemId];
+         if (!pickedItems.containsKey(item.getId())) {
+             pickedItems.put(item.getId(), item);
+         }
+     }
+
+     return pickedItems.values()
+             .toArray(new ContentItem[pickedItems.size()]);
+ }
+
+ /** Returns the documents whose name starts with at least one of the prefixes. */
+ private static ContentItem[] selectEligibleDocs(ContentItem[] docs, String[] prefixes) {
+     List<ContentItem> eligibleDocs = new ArrayList<ContentItem>();
+     for (ContentItem doc : docs) {
+         for (String prefix : prefixes) {
+             if (doc.getName().startsWith(prefix)) {
+                 eligibleDocs.add(doc);
+                 break; // one matching prefix is enough; avoids adding the doc twice
+             }
+         }
+     }
+     return eligibleDocs.toArray(new ContentItem[eligibleDocs.size()]);
+ }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/NewsItem.java b/src/org/yooreeka/algos/reco/collab/data/NewsItem.java
new file mode 100644
index 0000000..7dedef4
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/NewsItem.java
@@ -0,0 +1,54 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Item for news dataset.
+ */
+public class NewsItem extends Item {
+
+ /** SVUID */
+ private static final long serialVersionUID = 6349342365379966975L;
+
+ /**
+  * Creates a news item that starts with an empty list of ratings.
+  */
+ public NewsItem(int id, String name, Content content) {
+     super(id, name, new ArrayList<Rating>(3));
+     setItemContent(content);
+ }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/NewsUser.java b/src/org/yooreeka/algos/reco/collab/data/NewsUser.java
new file mode 100644
index 0000000..ae316b5
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/NewsUser.java
@@ -0,0 +1,82 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.Serializable;
+import java.util.List;
+
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.User;
+
+/**
+ * @author Babis Marmanis
+ *
+ */
+public class NewsUser extends User implements Serializable {
+
+ /**
+  * SVUID
+  */
+ private static final long serialVersionUID = 3415187707158663184L;
+
+ /**
+  * @param id identifier of the user
+  */
+ public NewsUser(int id) {
+     super(id);
+ }
+
+ /**
+  * @param id identifier of the user
+  * @param ratings ratings associated with the user
+  */
+ public NewsUser(int id, List<Rating> ratings) {
+     super(id, ratings);
+ }
+
+ /**
+  * @param id identifier of the user
+  * @param name name of the user
+  */
+ public NewsUser(int id, String name) {
+     super(id, name);
+ }
+
+ /**
+  * @param id identifier of the user
+  * @param name name of the user
+  * @param ratings ratings associated with the user
+  */
+ public NewsUser(int id, String name, List<Rating> ratings) {
+     super(id, name, ratings);
+ }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/RatingBuilder.java b/src/org/yooreeka/algos/reco/collab/data/RatingBuilder.java
new file mode 100644
index 0000000..0491ae2
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/RatingBuilder.java
@@ -0,0 +1,94 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Utility class to generate random ratings.
+ */
+class RatingBuilder {
+
+ private final Random rand;
+
+ public RatingBuilder() {
+     rand = new Random();
+ }
+
+ /**
+  * Creates biased ratings for all items.
+  *
+  * @param userId
+  *            rating user.
+  * @param items
+  *            to create ratings for.
+  * @param lowerBias
+  *            lowest rating value that can be generated (inclusive)
+  * @param upperBias
+  *            highest rating value that can be generated (inclusive)
+  * @return one rating per item, in the order of the items
+  */
+ public List<Rating> createBiasedRatings(int userId, Item[] items,
+         int lowerBias, int upperBias) {
+     List<Rating> ratings = new ArrayList<Rating>(items.length);
+     for (Item item : items) {
+         int biasedRandomRating = getRandomRating(lowerBias, upperBias);
+         Rating rating = new Rating(userId, item.getId(), biasedRandomRating);
+         rating.setItem(item);
+         ratings.add(rating);
+     }
+     return ratings;
+ }
+
+ public int getRandomRating() {
+     // No bias: any value from 1 to 5
+     return getRandomRating(5);
+ }
+
+ public int getRandomRating(int upperBias) {
+
+     // Lower bias is 1
+     return getRandomRating(1, upperBias);
+ }
+
+ public int getRandomRating(int lowerBias, int upperBias) {
+     // Result lies in lowerBias..upperBias, both bounds included.
+     // We add 1 at the end because the nextInt(n) call excludes n
+     int n = (upperBias - lowerBias) + 1;
+     return (lowerBias + rand.nextInt(n));
+ }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/evaluation/EvaluationDataProvider.java b/src/org/yooreeka/algos/reco/collab/evaluation/EvaluationDataProvider.java
new file mode 100644
index 0000000..c742f7a
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/evaluation/EvaluationDataProvider.java
@@ -0,0 +1,44 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.evaluation;
+
+import java.util.List;
+
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Interface to access previously generated evaluation data.
+ */
+public interface EvaluationDataProvider {
+ List<Rating> loadTestRatings(int testSize, int testSequence); // ratings held out for testing
+
+ List<Rating> loadTrainingRatings(int testSize, int testSequence); // ratings left for training
+}
diff --git a/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensEvaluationDataProvider.java b/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensEvaluationDataProvider.java
new file mode 100644
index 0000000..70a45f4
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensEvaluationDataProvider.java
@@ -0,0 +1,283 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.evaluation;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+
+import org.yooreeka.algos.reco.collab.data.MovieLensDataset;
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+public class MovieLensEvaluationDataProvider implements EvaluationDataProvider {
+
+ /*
+ * Location for files with test and training data.
+ */
+ private String evaluationDataDir;
+
+ /*
+ * Provides data that will be used to produce training and test files.
+ */
+ private Dataset dataset;
+
+ /*
+ * Prefix that will be used in filename for files with test ratings.
+ */
+ private String testFilenamePrefix;
+
+ /*
+ * Prefix that will be used in filename for files with training ratings.
+ */
+ private String trainingFilenamePrefix;
+
+ public MovieLensEvaluationDataProvider(Dataset dataset) {
+ this.dataset = dataset;
+ }
+
+ public void createData(int testSize) {
+ createData(testSize, 1);
+ }
+
+ /**
+ * Creates evaluation data by splitting the original item rating set into two
+ * sets: a training set and a test set. Test set is built by randomly selecting
+ * ratings from the original ratings set. Training set is built by selecting
+ * everything that is left from the original set.
+ *
+ * @param testSize
+ * number of ratings in test set.
+ * @param sequence
+ * becomes part of both file names, which allows generating multiple
+ * test sets with the same number of ratings.
+ */
+ public void createData(int testSize, int sequence) {
+
+ /* start with complete list of all available ratings */
+ List allRatings = new ArrayList(dataset.getRatings());
+
+ /* extract required number of ratings and use them as testing set */
+ List testRatings = removeRatings(allRatings, testSize);
+ /* use the rest of the ratings as a training set */
+ List trainingRatings = allRatings;
+
+ String testRatingsFilename = createFilename(testFilenamePrefix,
+ testSize, sequence);
+
+ String trainingRatingsFilename = createFilename(trainingFilenamePrefix,
+ testSize, sequence);
+
+ saveRatings(testRatingsFilename, testRatings);
+ saveRatings(trainingRatingsFilename, trainingRatings);
+ }
+
+ /**
+ * Builds unique filename for file that contains ratings for training or
+ * test.
+ *
+ * @param namePrefix
+ * identifies source of the data and the purpose (testing or
+ * training) of the file.
+ * @param n
+ * number of ratings that were randomly selected from the
+ * original set of ratings and put in test file. Both training
+ * and test files are identified by this number.
+ * @param sequence
+ * random selection sequence. In some cases when we need to
+ * generate multiple test files with the same number of ratings
+ * but with different selection every time. Defaults to 1.
+ *
+ * Example:
+ *
+ * MovieLensRatingsTrainingN16000Rnd1.dat - first training file
+ * that was obtained by removing 16000 ratings from original
+ * ratings file. MovieLensRatingsTestN16000Rnd1.dat - first test
+ * file with 16000 ratings that were removed from original
+ * ratings file. MovieLensRatingsTrainingN16000Rnd2.dat - second
+ * training file that was obtained by removing 16000 ratings from
+ * original ratings file. MovieLensRatingsTestN16000Rnd2.dat -
+ * second test file with 16000 ratings that were removed from
+ * original ratings file.
+ */
+ public String createFilename(String namePrefix, int n, int sequence) {
+ return namePrefix + "N" + n + "Rnd" + sequence + ".dat";
+ }
+
+ public String getEvaluationDataDir() {
+ return evaluationDataDir;
+ }
+
+ public String getTestFilenamePrefix() {
+ return testFilenamePrefix;
+ }
+
+ public String getTrainingFilenamePrefix() {
+ return trainingFilenamePrefix;
+ }
+
+ public List loadTestRatings(int testSize, int testSequence) {
+ String filename = createFilename(testFilenamePrefix, testSize,
+ testSequence);
+ File f = new File(evaluationDataDir, filename);
+
+ return MovieLensDataset.loadRatings(f);
+ }
+
+ public List loadTrainingRatings(int testSize, int testSequence) {
+ String filename = createFilename(trainingFilenamePrefix, testSize,
+ testSequence);
+ File f = new File(evaluationDataDir, filename);
+ return MovieLensDataset.loadRatings(f);
+ }
+
+ /**
+ * Creates a set of training and test data.
+ *
+ * @param testSize
+ * number of ratings that will be used to create testing set.
+ * Size of training set is defined as AllAvailableRatings -
+ * testSize
+ */
+ public void prepareTestData(int testSize) {
+ prepareTestData(testSize, 1);
+ }
+
+ /**
+ * Creates multiple sets of training and test data. Should be used when we
+ * need to create multiple test files for the same tests.
+ *
+ * @param testSize
+ * number of test ratings.
+ * @param sequence
+ * test sequence.
+ */
+ public void prepareTestData(int testSize, int sequence) {
+ if (!testDataExist(testSize, sequence)) {
+ createData(testSize, sequence);
+ }
+ }
+
+ private void removeFile(String filename) {
+ File f = new File(evaluationDataDir, filename);
+ if (f.exists()) {
+ f.delete();
+ }
+ }
+
+ private List<Rating> removeRatings(List<Rating> allRatings, int n) {
+     if (n > allRatings.size()) { // fail before allRatings is modified
+         throw new IllegalArgumentException("Cannot remove " + n + " ratings from " + allRatings.size());
+     }
+     List<Rating> removedRatings = new ArrayList<Rating>(); // ratings moved out of allRatings
+     Random rnd = new Random();
+     while (removedRatings.size() < n) {
+         removedRatings.add(allRatings.remove(rnd.nextInt(allRatings.size())));
+     }
+     return removedRatings;
+ }
+
+ /**
+ * Deletes test data. Defaults sequence to 1.
+ *
+ * @param testSize
+ */
+ public void removeTestData(int testSize) {
+ removeTestData(testSize, 1);
+ }
+
+ /**
+ * Deletes test data.
+ *
+ * @param testSize
+ * @param sequence
+ */
+ public void removeTestData(int testSize, int sequence) {
+ String testFilename = createFilename(testFilenamePrefix, testSize,
+ sequence);
+ removeFile(testFilename);
+
+ String trainingFilename = createFilename(trainingFilenamePrefix,
+ testSize, sequence);
+ removeFile(trainingFilename);
+ }
+
+ private void saveRatings(String filename, Collection ratings) {
+ File f = new File(evaluationDataDir, filename);
+ MovieLensDataset.createNewRatingsFile(f, ratings);
+ }
+
+ public void setEvaluationDataDir(String value) {
+ this.evaluationDataDir = value;
+ }
+
+ public void setTestFilenamePrefix(String testFilenamePrefix) {
+ this.testFilenamePrefix = testFilenamePrefix;
+ }
+
+ public void setTrainingFilenamePrefix(String trainingFilenamePrefix) {
+ this.trainingFilenamePrefix = trainingFilenamePrefix;
+ }
+
+ public boolean testDataExist(int testSize) {
+ return testDataExist(testSize, 1);
+ }
+
+ /**
+ * Checks if both the test file and the training file already exist. Also
+ * attempts to create the evaluation data directory when it is missing.
+ * @param testSize number of test ratings, as used in the file names
+ * @param sequence test sequence, as used in the file names
+ * @return true only when both files exist
+ */
+ public boolean testDataExist(int testSize, int sequence) {
+ // Side effect: creates the data directory if missing; the mkdirs() result is not checked.
+ File tmpDirFile = new File(evaluationDataDir);
+ if (!tmpDirFile.exists()) {
+ tmpDirFile.mkdirs();
+ }
+
+ boolean filesExist = false;
+ String testFilename = createFilename(testFilenamePrefix, testSize,
+ sequence);
+ String trainingFilename = createFilename(trainingFilenamePrefix,
+ testSize, sequence);
+ if (new File(evaluationDataDir, testFilename).exists()
+ && new File(evaluationDataDir, trainingFilename).exists()) {
+ filesExist = true;
+ }
+
+ return filesExist;
+ }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensRMSE.java b/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensRMSE.java
new file mode 100644
index 0000000..d33c7d7
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensRMSE.java
@@ -0,0 +1,104 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.evaluation;
+
+import java.io.File;
+import java.util.List;
+
+import org.yooreeka.algos.reco.collab.data.MovieLensDataset;
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.RecommendationType;
+import org.yooreeka.algos.reco.collab.recommender.Delphi;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ *
+ * @deprecated use {@link RMSEEstimator} instead.
+ */
+public class MovieLensRMSE {
+
+ /** Configuration property that names the MovieLens data directory. */
+ private static final String DATA_DIR_PROPERTY = "iweb2.movielens.data.dir";
+
+ public static void main(String[] args) {
+ new MovieLensRMSE().calculate();
+ }
+
+ public MovieLensRMSE() {
+ }
+
+ /**
+ * Evaluates an item-based Delphi recommender on the data splits 1 to 5 with a
+ * similarity threshold of 0.50 and prints each split's RMSE to standard output.
+ *
+ * @return RMSE per split; index i - 1 holds the value for split i.
+ */
+ public double[] calculate() {
+ final double similarityThreshold = 0.50;
+ final int splitCount = 5;
+
+ double[] rmse = new double[splitCount];
+ RMSEEstimator rmseEstimator = new RMSEEstimator();
+
+ for (int i = 1; i <= splitCount; i++) {
+ Dataset ds = createTrainingDataset(i);
+ Delphi delphi = new Delphi(ds, RecommendationType.ITEM_BASED);
+ delphi.setSimilarityThreshold(similarityThreshold);
+ List<Rating> testRatings = createTestRatings(i);
+ double rmseValue = rmseEstimator.calculateRMSE(delphi, testRatings);
+ System.out.println(i + ": rmse = " + rmseValue);
+ rmse[i - 1] = rmseValue;
+ }
+
+ return rmse;
+ }
+
+ /** Loads the test ratings of split n from the file "u" + n + ".test" in the data directory. */
+ public List<Rating> createTestRatings(int n) {
+ File ratings = new File(dataDir(), "u" + n + ".test");
+ return MovieLensDataset.loadRatings(ratings);
+ }
+
+ /** Builds the training dataset of split n from "u" + n + ".base" plus the users and items files. */
+ public MovieLensDataset createTrainingDataset(int n) {
+ String dataDir = dataDir();
+ File users = new File(dataDir, MovieLensDataset.USERS_FILENAME);
+ File items = new File(dataDir, MovieLensDataset.ITEMS_FILENAME);
+ File ratings = new File(dataDir, "u" + n + ".base");
+ return new MovieLensDataset(users, items, ratings);
+ }
+
+ /** Reads the MovieLens data directory from the Yooreeka configuration. */
+ private static String dataDir() {
+ return YooreekaConfigurator.getProperty(DATA_DIR_PROPERTY);
+ }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/evaluation/RMSEEstimator.java b/src/org/yooreeka/algos/reco/collab/evaluation/RMSEEstimator.java
new file mode 100644
index 0000000..34b053a
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/evaluation/RMSEEstimator.java
@@ -0,0 +1,173 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.evaluation;
+
+import java.util.Collection;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.reco.collab.data.MovieLensDataset;
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.User;
+import org.yooreeka.algos.reco.collab.recommender.Recommender;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Calculates Root Mean Squared Error for the recommender.
+ */
+public class RMSEEstimator {
+
+ private static final Logger LOG = Logger.getLogger(RMSEEstimator.class.getName());
+
+ /** Largest rating value; predictions above it are clamped before scoring. */
+ private static final double MAX_RATING = 5.0;
+
+ public RMSEEstimator() {
+ LOG.setLevel(YooreekaConfigurator.getLevel(RMSEEstimator.class.getName()));
+ }
+
+ /**
+ * Calculates Root Mean Squared Error for the recommender. Uses test rating
+ * values returned by recommender's dataset.
+ *
+ * @param delphi
+ * recommender; its dataset is cast to MovieLensDataset.
+ * @return root mean squared error value.
+ */
+ public double calculateRMSE(Recommender delphi) {
+ MovieLensDataset ds = (MovieLensDataset) delphi.getDataset();
+ Collection<Rating> testRatings = ds.getTestRatings();
+ return calculateRMSE(delphi, testRatings);
+ }
+
+ /**
+ * Calculates Root Mean Squared Error for the recommender. Predictions
+ * above 5.0 are clamped to 5.0 before the squared error is accumulated.
+ *
+ * @param delphi
+ * recommender to evaluate.
+ * @param testRatings
+ * ratings that will be used to calculate the error.
+ * @return root mean squared error; NaN when testRatings is empty.
+ */
+ public double calculateRMSE(Recommender delphi,
+ Collection<Rating> testRatings) {
+
+ double sum = 0.0;
+ Dataset ds = delphi.getDataset();
+ int totalSamples = testRatings.size();
+
+ LOG.fine("Calculating RMSE ...");
+ LOG.fine("Training ratings count: " + ds.getRatingsCount());
+ LOG.fine("Test ratings count: " + testRatings.size());
+
+ for (Rating r : testRatings) {
+ User user = ds.getUser(r.getUserId());
+ Item item = ds.getItem(r.getItemId());
+ double predictedItemRating = clamp(delphi.predictRating(user, item));
+ logPrediction(r, predictedItemRating);
+ sum += Math.pow((predictedItemRating - r.getRating()), 2);
+ }
+
+ // An empty test set yields 0.0 / 0, i.e. NaN.
+ double rmse = Math.sqrt(sum / totalSamples);
+ LOG.fine("RMSE:" + rmse);
+ return rmse;
+ }
+
+ /**
+ * Prints the recommender's RMSE next to two average-based baselines, using
+ * the test ratings returned by the recommender's dataset.
+ *
+ * @param delphi
+ * recommender; its dataset is cast to MovieLensDataset.
+ */
+ public void compareRMSEs(Recommender delphi) {
+ MovieLensDataset ds = (MovieLensDataset) delphi.getDataset();
+ Collection<Rating> testRatings = ds.getTestRatings();
+ compareRMSEs(delphi, testRatings);
+ }
+
+ /**
+ * Prints to standard output the RMSE of the recommender's predictions and
+ * of the item-average and user-average predictions for the same ratings.
+ *
+ * @param delphi
+ * recommender to evaluate.
+ * @param testRatings
+ * ratings that will be used to calculate the errors.
+ */
+ public void compareRMSEs(Recommender delphi, Collection<Rating> testRatings) {
+
+ double sum = 0.0;
+ double sumAvgItem = 0.0;
+ double sumAvgUser = 0.0;
+ Dataset ds = delphi.getDataset();
+ int totalSamples = testRatings.size();
+
+ LOG.fine("Calculating RMSE ...");
+ LOG.fine("Training ratings count: "+ds.getRatingsCount());
+ LOG.fine("Test ratings count: " + testRatings.size());
+
+ for (Rating r : testRatings) {
+ User user = ds.getUser(r.getUserId());
+ Item item = ds.getItem(r.getItemId());
+ double rawItemRating = delphi.predictRating(user, item);
+ double predictedAvgItemRating = delphi.predictBasedOnItemAverage(item);
+ double predictedAvgUserRating = delphi.predictBasedOnUserAverage(user);
+ double predictedItemRating = clamp(rawItemRating);
+ logPrediction(r, predictedItemRating);
+ sum += Math.pow((predictedItemRating - r.getRating()), 2);
+ sumAvgItem += Math.pow((predictedAvgItemRating - r.getRating()), 2);
+ sumAvgUser += Math.pow((predictedAvgUserRating - r.getRating()), 2);
+ }
+
+ System.out.println("RMSE:" + Math.sqrt(sum / totalSamples));
+ System.out.println("RMSE (based on avg. Item rating):" + Math.sqrt(sumAvgItem / totalSamples));
+ System.out.println("RMSE (based on avg. User rating):" + Math.sqrt(sumAvgUser / totalSamples));
+ }
+
+ /** Caps a prediction at MAX_RATING; the raw value is logged when it is cut, since the capped one is always 5.0. */
+ private static double clamp(double predicted) {
+ if (predicted > MAX_RATING) {
+ LOG.finest("Predicted item rating: " + predicted);
+ return MAX_RATING;
+ }
+ return predicted;
+ }
+
+ /** Logs one actual/predicted rating pair at FINEST level. */
+ private static void logPrediction(Rating r, double predicted) {
+ LOG.finest("user: " + r.getUserId() + ", item: " + r.getItemId()
+ + ", actual rating: " + r.getRating() + ", predicted: " + String.valueOf(predicted));
+ }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/evaluation/RMSEResult.java b/src/org/yooreeka/algos/reco/collab/evaluation/RMSEResult.java
new file mode 100644
index 0000000..298b6be
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/evaluation/RMSEResult.java
@@ -0,0 +1,85 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.evaluation;
+
+public class RMSEResult {
+ private double error;
+ private double similarityThreshold;
+ private long testSize;
+ private String type;
+
+ public RMSEResult(String type, long testSize, double simThreshold,
+ double error) {
+ this.error = error;
+ this.similarityThreshold = simThreshold;
+ this.testSize = testSize;
+ this.type = type;
+ }
+
+ public String getType() {
+ return this.type;
+ }
+
+ public void setType(String type) {
+ this.type = type;
+ }
+
+ public long getTestSize() {
+ return this.testSize;
+ }
+
+ public void setTestSize(long testSize) {
+ this.testSize = testSize;
+ }
+
+ public double getSimilarityThreshold() {
+ return this.similarityThreshold;
+ }
+
+ public void setSimilarityThreshold(double similarityThreshold) {
+ this.similarityThreshold = similarityThreshold;
+ }
+
+ public double getError() {
+ return this.error;
+ }
+
+ public void setError(double error) {
+ this.error = error;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder text = new StringBuilder("RMSE (testSize=").append(getTestSize());
+ text.append(", type=").append(getType()).append(", similarityThreshold=");
+ return text.append(getSimilarityThreshold()).append("): ").append(getError()).toString();
+ }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/model/Content.java b/src/org/yooreeka/algos/reco/collab/model/Content.java
new file mode 100644
index 0000000..995eca6
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/model/Content.java
@@ -0,0 +1,182 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.model;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.search.lucene.analyzer.TextDocumentTerms;
+import org.yooreeka.config.YooreekaConfigurator;
+
+public class Content implements java.io.Serializable {
+
+ /**
+ * SVUID
+ */
+ private static final long serialVersionUID = 1098727290087922462L;
+ private static final Logger LOG = Logger.getLogger(Content.class.getName());
+
+ private String id;
+ private String text;
+ private String[] terms;
+ private int[] termFrequencies;
+ private Map<String, Integer> tfMap;
+
+ /**
+ * Creates content that keeps the 10 most frequent terms of the text.
+ *
+ * @param id
+ * content id.
+ * @param text
+ * text to analyze.
+ */
+ public Content(String id, String text) {
+ this(id, text, 10);
+ }
+
+ /**
+ * Creates content that keeps only the most frequent terms of the text,
+ * taken from the map returned by TextDocumentTerms.getTf(). The kept terms
+ * are also exposed as index-aligned arrays.
+ *
+ * @param id
+ * content id.
+ * @param text
+ * text to analyze.
+ * @param topNTerms
+ * maximum number of terms to keep; values below 1 keep none.
+ */
+ public Content(String id, String text, int topNTerms) {
+
+ LOG.setLevel(YooreekaConfigurator.getLevel(Content.class.getName()));
+
+ this.id = id;
+ this.text = text;
+
+ Map<String, Integer> allTermFrequencyMap = (new TextDocumentTerms(text))
+ .getTf();
+ tfMap = getTopNTermFrequencies(allTermFrequencyMap, topNTerms);
+
+ terms = new String[tfMap.size()];
+ termFrequencies = new int[tfMap.size()];
+
+ // terms[i] and termFrequencies[i] describe the same map entry.
+ int i = 0;
+ for (Map.Entry<String, Integer> e : tfMap.entrySet()) {
+ terms[i] = e.getKey();
+ termFrequencies[i] = e.getValue();
+ i++;
+ }
+ }
+
+ /** @return id given at construction. */
+ public String getId() {
+ return id;
+ }
+
+ /** @return frequencies of the kept terms, index-aligned with getTerms(). */
+ public int[] getTermFrequencies() {
+ return termFrequencies;
+ }
+
+ /** @return the kept terms, index-aligned with getTermFrequencies(). */
+ public String[] getTerms() {
+ return terms;
+ }
+
+ /**
+ * Builds a presence vector for the given terms.
+ *
+ * @param terms
+ * terms that define the vector positions.
+ * @return array holding 1 where this content kept terms[i] and 0 elsewhere.
+ */
+ public double[] getTermVector(String[] terms) {
+ double[] termVector = new double[terms.length];
+ for (int i = 0, n = terms.length; i < n; i++) {
+ termVector[i] = tfMap.containsKey(terms[i]) ? 1 : 0;
+ }
+ return termVector;
+ }
+
+ /** @return text given at construction. */
+ public String getText() {
+ return text;
+ }
+
+ /** @return the map of kept terms to their frequencies (the internal map, not a copy). */
+ public Map<String, Integer> getTFMap() {
+ return this.tfMap;
+ }
+
+ /**
+ * Selects the entries with the highest frequencies.
+ *
+ * @param termFrequencyMap
+ * frequencies of all terms.
+ * @param topN
+ * maximum number of entries to return; values below 1 yield an
+ * empty map.
+ * @return new map with at most topN entries.
+ */
+ private Map<String, Integer> getTopNTermFrequencies(
+ Map<String, Integer> termFrequencyMap, int topN) {
+
+ List<Map.Entry<String, Integer>> terms = new ArrayList<Map.Entry<String, Integer>>(
+ termFrequencyMap.entrySet());
+
+ // Highest frequency first. Different terms can have the same frequency;
+ // the stable sort keeps such ties in the map's iteration order.
+ Collections.sort(terms, new Comparator<Map.Entry<String, Integer>>() {
+ public int compare(Map.Entry<String, Integer> e1,
+ Map.Entry<String, Integer> e2) {
+ return e2.getValue().compareTo(e1.getValue());
+ }
+ });
+
+ Map<String, Integer> topNTermsFrequencyMap = new HashMap<String, Integer>();
+ for (Map.Entry<String, Integer> term : terms) {
+ // Check before inserting so that topN < 1 really keeps nothing.
+ if (topNTermsFrequencyMap.size() >= topN) {
+ break;
+ }
+ topNTermsFrequencyMap.put(term.getKey(), term.getValue());
+ }
+
+ return topNTermsFrequencyMap;
+ }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/model/Dataset.java b/src/org/yooreeka/algos/reco/collab/model/Dataset.java
new file mode 100644
index 0000000..c1256df
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/model/Dataset.java
@@ -0,0 +1,142 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.model;
+
+import java.util.Collection;
+
+/**
+ * Defines service that provides access to all users, items, and ratings.
+ * Recommender and similarity implementations rely on this service to access
+ * data.
+ */
+public interface Dataset {
+
+ /**
+ * For content-based dataset returns array of terms that represent document
+ * space.
+ *
+ * @return terms of the document space.
+ */
+ public String[] getAllTerms();
+
+ /**
+ * Provides the average rating for this item
+ *
+ * @param itemId item id.
+ * @return average rating of the item.
+ */
+ public double getAverageItemRating(int itemId);
+
+ /**
+ * Provides the average rating for this user
+ *
+ * @param userId user id.
+ * @return average rating given by the user.
+ */
+ public double getAverageUserRating(int userId);
+
+ /**
+ * Retrieves a specific item.
+ *
+ * @param itemId
+ * item id.
+ * @return item.
+ */
+ public Item getItem(Integer itemId);
+
+ /**
+ * Total number of all available items.
+ *
+ * @return number of items.
+ */
+ public int getItemCount();
+
+ /**
+ * Retrieves all items.
+ *
+ * @return collection of all items.
+ */
+ public Collection<Item> getItems();
+
+ /**
+ * Logical name for the dataset instance.
+ *
+ * @return name
+ */
+ public String getName();
+
+ /**
+ * Provides access to all ratings.
+ *
+ * @return collection of ratings.
+ */
+ public Collection<Rating> getRatings();
+
+ /**
+ * Total number of all available item ratings.
+ *
+ * @return number of item ratings by users.
+ */
+ public int getRatingsCount();
+
+ /**
+ * Retrieves a specific user.
+ *
+ * @param userId
+ * user id.
+ * @return user.
+ */
+ public User getUser(Integer userId);
+
+ /**
+ * Total number of all available users.
+ *
+ * @return number of users.
+ */
+ public int getUserCount();
+
+ /**
+ * Retrieves all users.
+ *
+ * @return collection of users.
+ */
+ public Collection<User> getUsers();
+
+ /**
+ * Provides information about user and item ids returned by this dataset.
+ *
+ * @return true if ids aren't in sequence and can't be used as array
+ * indexes. false if user or items ids can be treated as sequences
+ * that start with 1. In this case index will be derived from id:
+ * index = id - 1.
+ */
+ public boolean isIdMappingRequired();
+}
diff --git a/src/org/yooreeka/algos/reco/collab/model/Item.java b/src/org/yooreeka/algos/reco/collab/model/Item.java
new file mode 100644
index 0000000..1c152ec
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/model/Item.java
@@ -0,0 +1,171 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.model;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Generic representation of product or service that users can rate.
+ */
+public class Item implements java.io.Serializable {
+
+ /** SVUID */
+ private static final long serialVersionUID = 6119040388138010186L;
+
+ /** Collects the ids of the users who rated both items, in the iteration
+ * order of x's ratings. */
+ public static Integer[] getSharedUserIds(Item x, Item y) {
+ List<Integer> sharedUsers = new ArrayList<Integer>();
+ for (Rating r : x.getAllRatings()) {
+ // same user rated the item
+ if (y.getUserRating(r.getUserId()) != null) {
+ sharedUsers.add(r.getUserId());
+ }
+ }
+ return sharedUsers.toArray(new Integer[sharedUsers.size()]);
+ }
+
+ /*
+ * Unique id in the dataset.
+ */
+ private int id;
+
+ /*
+ * Name.
+ */
+ private String name;
+
+ /*
+ * All ratings for this item. Supports only one rating per item for a user.
+ * Mapping: userId -> rating
+ */
+ private Map<Integer, Rating> ratingsByUserId;
+
+ private Content itemContent;
+
+ public Item(Integer id, List<Rating> ratings) {
+ this(id, String.valueOf(id), ratings);
+ }
+
+ public Item(Integer id, String name) {
+ this(id, name, new ArrayList<Rating>(3));
+ }
+
+ public Item(Integer id, String name, List<Rating> ratings) {
+ this.id = id;
+ this.name = name;
+ // load ratings into userId -> rating map; a later rating by the same user replaces an earlier one.
+ ratingsByUserId = new HashMap<Integer, Rating>(ratings.size());
+ for (Rating r : ratings) {
+ ratingsByUserId.put(r.getUserId(), r);
+ }
+ }
+
+ /**
+ * Updates existing user rating or adds a new user rating for this item.
+ *
+ * @param r
+ * rating to add.
+ */
+ public void addUserRating(Rating r) {
+ ratingsByUserId.put(r.getUserId(), r);
+ }
+
+ /**
+ * Returns all ratings that we have for this item.
+ *
+ * @return the values view of the internal userId -> rating map (not a copy).
+ */
+ public Collection<Rating> getAllRatings() {
+ return ratingsByUserId.values();
+ }
+
+ public double getAverageRating() {
+ double allRatingsSum = 0.0;
+ Collection<Rating> allItemRatings = ratingsByUserId.values();
+ for (Rating rating : allItemRatings) {
+ allRatingsSum += rating.getRating();
+ }
+ // use 2.5 if there are no ratings.
+ return allItemRatings.size() > 0 ? allRatingsSum
+ / allItemRatings.size() : 2.5;
+ }
+
+ public int getId() {
+ return id;
+ }
+
+ public Content getItemContent() {
+ return itemContent;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ /*
+ * Utility method to extract array of ratings based on array of user ids.
+ */
+ public double[] getRatingsForItemList(Integer[] userIds) {
+ double[] ratings = new double[userIds.length];
+ for (int i = 0, n = userIds.length; i < n; i++) {
+ Rating r = getUserRating(userIds[i]);
+ if (r == null) {
+ throw new IllegalArgumentException(
+ "Item doesn't have rating by specified user id ("
+ + "userId=" + userIds[i] + ", itemId="
+ + getId() + ")");
+ }
+ ratings[i] = r.getRating();
+ }
+ return ratings;
+ }
+
+ /**
+ * Returns rating that specified user gave to the item.
+ *
+ * @param userId
+ * user
+ * @return user rating or null if user hasn't rated this item.
+ */
+ public Rating getUserRating(Integer userId) {
+ return ratingsByUserId.get(userId);
+ }
+
+ public void setItemContent(Content content) {
+ this.itemContent = content;
+ }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/model/Rating.java b/src/org/yooreeka/algos/reco/collab/model/Rating.java
new file mode 100644
index 0000000..2a3f665
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/model/Rating.java
@@ -0,0 +1,127 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.model;
+
+/**
+ * Generic representation of a rating given by user to a product (item).
+ */
+public class Rating implements java.io.Serializable {
+
+ /**
+ * SVUID
+ */
+ private static final long serialVersionUID = 1438346522502387789L;
+
+ protected Item item;
+
+ private int userId;
+ private int itemId;
+ private int rating;
+
+ /** Creates a rating with the given user id, item id (bookId) and rating value. */
+ public Rating(int userId, int bookId, int rating) {
+ this.userId = userId;
+ this.itemId = bookId;
+ this.rating = rating;
+ }
+
+ /**
+ * Two ratings are equal when they are of the same class and agree on user
+ * id, item id and rating value; the item reference is not compared.
+ */
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null || getClass() != obj.getClass()) {
+ return false;
+ }
+ final Rating that = (Rating) obj;
+ return itemId == that.itemId && rating == that.rating
+ && userId == that.userId;
+ }
+
+ /**
+ * @return the item
+ */
+ public Item getItem() {
+ return item;
+ }
+
+ public int getItemId() {
+ return itemId;
+ }
+
+ public int getRating() {
+ return rating;
+ }
+
+ public int getUserId() {
+ return userId;
+ }
+
+ /** Hash over item id, rating value and user id, the same fields equals compares. */
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int hash = prime + itemId;
+ hash = prime * hash + rating;
+ return prime * hash + userId;
+ }
+
+ /**
+ * @param item
+ * the item to set
+ */
+ public void setItem(Item item) {
+ this.item = item;
+ }
+
+ public void setItemId(int bookId) {
+ this.itemId = bookId;
+ }
+
+ public void setRating(int rating) {
+ this.rating = rating;
+ }
+
+ public void setUserId(int userId) {
+ this.userId = userId;
+ }
+
+ @Override
+ public String toString() {
+ String className = this.getClass().getSimpleName();
+ return className + "[userId: " + userId + ", itemId: " + itemId
+ + ", rating: " + rating + "]";
+ }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/model/RecommendationType.java b/src/org/yooreeka/algos/reco/collab/model/RecommendationType.java
new file mode 100644
index 0000000..7a78128
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/model/RecommendationType.java
@@ -0,0 +1,38 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.model;
+
+/** Enumerates the available recommendation types. */
+public enum RecommendationType {
+    USER_BASED, ITEM_BASED, IMPROVED_USER_BASED, ITEM_PENALTY_BASED,
+    USER_CONTENT_BASED, ITEM_CONTENT_BASED,
+    USER_ITEM_CONTENT_BASED
+}
\ No newline at end of file
diff --git a/src/org/yooreeka/algos/reco/collab/model/SimilarItem.java b/src/org/yooreeka/algos/reco/collab/model/SimilarItem.java
new file mode 100644
index 0000000..aadd2cc
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/model/SimilarItem.java
@@ -0,0 +1,128 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.model;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+/**
+ * @author Babis Marmanis
+ *
+ */
+public class SimilarItem {
+
+ public static SimilarItem[] getTopSimilarItems(
+ List similarItems, int topN) {
+
+ // sort friends based on itemAgreement
+ SimilarItem.sort(similarItems);
+
+ // select top N friends
+ List