From 762d794eae7b589b883d1bd31dcd1fad1942c51b Mon Sep 17 00:00:00 2001 From: "H. Marmanis" Date: Mon, 3 Dec 2012 12:00:06 -0500 Subject: [PATCH] Checking in the code --- .../clustering/dbscan/DBSCANAlgorithm.java | 448 ++++++++++++++ .../hierarchical/AverageLinkAlgorithm.java | 145 +++++ .../clustering/hierarchical/ClusterSet.java | 83 +++ .../clustering/hierarchical/Dendrogram.java | 162 ++++++ .../algos/clustering/hierarchical/MST.java | 130 +++++ .../hierarchical/MSTSingleLinkAlgorithm.java | 142 +++++ .../hierarchical/SingleLinkAlgorithm.java | 126 ++++ .../algos/clustering/model/Attribute.java | 119 ++++ .../algos/clustering/model/Cluster.java | 197 +++++++ .../algos/clustering/model/DataPoint.java | 181 ++++++ .../partitional/KMeansAlgorithm.java | 306 ++++++++++ .../partitional/NearestNeighborAlgorithm.java | 230 ++++++++ .../algos/clustering/rock/LinkMatrix.java | 195 +++++++ .../clustering/rock/MergeGoodnessMeasure.java | 92 +++ .../algos/clustering/rock/ROCKAlgorithm.java | 142 +++++ .../algos/clustering/rock/ROCKClusters.java | 205 +++++++ .../algos/clustering/rock/SimilarCluster.java | 85 +++ .../clustering/test/MyDiggSpaceData.java | 125 ++++ .../clustering/test/MyDiggSpaceDataset.java | 56 ++ .../algos/clustering/test/SFData.java | 212 +++++++ .../algos/clustering/test/SFDataset.java | 93 +++ .../algos/clustering/utils/Attributes.java | 143 +++++ .../utils/ObjectToIndexMapping.java | 90 +++ .../utils/SortedArrayClustering.java | 71 +++ .../algos/reco/collab/cache/FileStore.java | 134 +++++ .../algos/reco/collab/cache/Store.java | 72 +++ .../algos/reco/collab/data/BaseDataset.java | 431 ++++++++++++++ .../algos/reco/collab/data/ContentItem.java | 59 ++ .../algos/reco/collab/data/DiggData.java | 361 ++++++++++++ .../algos/reco/collab/data/HTMLContent.java | 99 ++++ .../algos/reco/collab/data/MovieLensData.java | 83 +++ .../reco/collab/data/MovieLensDataset.java | 385 +++++++++++++ .../algos/reco/collab/data/MusicData.java | 256 ++++++++ 
.../algos/reco/collab/data/MusicItem.java | 71 +++ .../algos/reco/collab/data/MusicRating.java | 52 ++ .../algos/reco/collab/data/MusicUser.java | 249 ++++++++ .../algos/reco/collab/data/NewsData.java | 202 +++++++ .../algos/reco/collab/data/NewsItem.java | 54 ++ .../algos/reco/collab/data/NewsUser.java | 82 +++ .../algos/reco/collab/data/RatingBuilder.java | 94 +++ .../evaluation/EvaluationDataProvider.java | 44 ++ .../MovieLensEvaluationDataProvider.java | 283 +++++++++ .../reco/collab/evaluation/MovieLensRMSE.java | 104 ++++ .../reco/collab/evaluation/RMSEEstimator.java | 173 ++++++ .../reco/collab/evaluation/RMSEResult.java | 85 +++ .../algos/reco/collab/model/Content.java | 182 ++++++ .../algos/reco/collab/model/Dataset.java | 142 +++++ .../algos/reco/collab/model/Item.java | 171 ++++++ .../algos/reco/collab/model/Rating.java | 127 ++++ .../reco/collab/model/RecommendationType.java | 38 ++ .../algos/reco/collab/model/SimilarItem.java | 128 ++++ .../algos/reco/collab/model/SimilarUser.java | 134 +++++ .../algos/reco/collab/model/User.java | 175 ++++++ .../algos/reco/collab/recommender/Delphi.java | 545 ++++++++++++++++++ .../reco/collab/recommender/DiggDelphi.java | 282 +++++++++ .../collab/recommender/MovieLensDelphi.java | 324 +++++++++++ .../recommender/PredictedItemRating.java | 152 +++++ .../reco/collab/recommender/Recommender.java | 88 +++ .../movielens/MovieLensItemSimilarity.java | 92 +++ .../movielens/MovieLensUserSimilarity.java | 107 ++++ .../naive/ImprovedItemBasedSimilarity.java | 120 ++++ .../naive/ImprovedUserBasedSimilarity.java | 129 +++++ .../similarity/naive/ItemBasedSimilarity.java | 110 ++++ .../naive/ItemContentBasedSimilarity.java | 92 +++ .../naive/ItemPenaltyBasedSimilarity.java | 161 ++++++ .../similarity/naive/SimilarityMatrix.java | 74 +++ .../naive/SimilarityMatrixImpl.java | 148 +++++ .../similarity/naive/UserBasedSimilarity.java | 117 ++++ .../naive/UserContentBasedSimilarity.java | 107 ++++ 
.../naive/UserItemContentBasedSimilarity.java | 184 ++++++ .../UpperTriangularSimilarityMatrix.java | 82 +++ .../UpperTriangularSimilarityMatrixImpl.java | 150 +++++ .../similarity/util/PearsonCorrelation.java | 170 ++++++ .../similarity/util/RatingCountMatrix.java | 131 +++++ .../util/SimilarityMatrixCache.java | 71 +++ .../util/SimilarityMatrixRepository.java | 173 ++++++ .../algos/reco/content/digg/DiggCategory.java | 83 +++ .../algos/reco/content/digg/DiggService.java | 253 ++++++++ .../reco/content/digg/DiggStoryItem.java | 109 ++++ .../algos/reco/content/digg/DiggUser.java | 45 ++ .../algos/search/data/SearchResult.java | 180 ++++++ .../search/lucene/LuceneIndexBuilder.java | 152 +++++ .../lucene/analyzer/CustomAnalyzer.java | 113 ++++ .../lucene/analyzer/TextDocumentTerms.java | 78 +++ .../search/ranking/DocRankMatrixBuilder.java | 197 +++++++ .../search/ranking/PageRankMatrixBuilder.java | 98 ++++ .../algos/search/ranking/PageRankMatrixH.java | 184 ++++++ .../yooreeka/algos/search/ranking/Rank.java | 294 ++++++++++ .../algos/search/ranking/RelevanceScore.java | 78 +++ .../algos/search/util/TermFreqMapUtils.java | 93 +++ .../algos/taxis/bayesian/NaiveBayes.java | 327 +++++++++++ .../boosting/BoostingARCX4Classifier.java | 190 ++++++ .../taxis/boosting/WeightBasedRandom.java | 80 +++ .../algos/taxis/core/AttributeValue.java | 113 ++++ .../algos/taxis/core/BaseConcept.java | 124 ++++ .../algos/taxis/core/BaseInstance.java | 239 ++++++++ .../algos/taxis/core/DoubleAttribute.java | 96 +++ .../algos/taxis/core/StringAttribute.java | 108 ++++ .../algos/taxis/core/TrainingSet.java | 173 ++++++ .../algos/taxis/core/intf/Attribute.java | 42 ++ .../algos/taxis/core/intf/Classifier.java | 51 ++ .../algos/taxis/core/intf/Concept.java | 44 ++ .../algos/taxis/core/intf/Instance.java | 46 ++ .../taxis/ensemble/ClassifierEnsemble.java | 106 ++++ .../taxis/ensemble/ConceptMajorityVoter.java | 87 +++ .../taxis/evaluation/ClassifierResults.java | 70 +++ 
.../algos/taxis/evaluation/CochransQTest.java | 128 ++++ .../algos/taxis/evaluation/Diff2PropTest.java | 84 +++ .../algos/taxis/evaluation/FTest.java | 182 ++++++ .../algos/taxis/evaluation/McNemarTest.java | 118 ++++ .../yooreeka/algos/taxis/evaluation/Test.java | 105 ++++ .../taxis/networks/neural/XORNetwork.java | 163 ++++++ .../taxis/networks/neural/core/BaseLayer.java | 139 +++++ .../taxis/networks/neural/core/BaseLink.java | 85 +++ .../taxis/networks/neural/core/BaseNN.java | 429 ++++++++++++++ .../taxis/networks/neural/core/BaseNode.java | 216 +++++++ .../networks/neural/core/LinearNode.java | 59 ++ .../networks/neural/core/SigmoidNode.java | 52 ++ .../networks/neural/core/intf/Layer.java | 57 ++ .../taxis/networks/neural/core/intf/Link.java | 53 ++ .../neural/core/intf/NeuralNetwork.java | 69 +++ .../taxis/networks/neural/core/intf/Node.java | 98 ++++ .../algos/taxis/tree/AttributeDefinition.java | 100 ++++ .../algos/taxis/tree/AttributeSelector.java | 145 +++++ .../algos/taxis/tree/AttributeUtils.java | 61 ++ src/org/yooreeka/algos/taxis/tree/Branch.java | 92 +++ .../algos/taxis/tree/BranchGroup.java | 130 +++++ .../algos/taxis/tree/ConceptUtils.java | 85 +++ .../taxis/tree/DecisionTreeClassifier.java | 248 ++++++++ .../yooreeka/algos/taxis/tree/InfoGain.java | 151 +++++ src/org/yooreeka/algos/taxis/tree/Node.java | 403 +++++++++++++ .../algos/taxis/tree/SplittingCriterion.java | 124 ++++ .../taxis/tree/TrueErrorRateEstimator.java | 71 +++ .../yooreeka/config/YooreekaConfigurator.java | 220 +++++++ .../credit/BaggingCreditClassifier.java | 79 +++ .../credit/BoostingCreditClassifier.java | 131 +++++ .../examples/credit/CreditConcept.java | 92 +++ .../examples/credit/CreditInstance.java | 121 ++++ .../examples/credit/DTCreditClassifier.java | 194 +++++++ .../examples/credit/NBCreditClassifier.java | 121 ++++ .../examples/credit/NNCreditClassifier.java | 406 +++++++++++++ .../examples/credit/UserCreditNN.java | 211 +++++++ 
.../examples/credit/data/UseCaseData.java | 194 +++++++ .../examples/credit/data/UserDataset.java | 80 +++ .../examples/credit/data/UserLoader.java | 70 +++ .../credit/data/users/BadUserType.java | 53 ++ .../credit/data/users/DangerousUserType.java | 53 ++ .../credit/data/users/ExcellentUserType.java | 53 ++ .../credit/data/users/GoodUserType.java | 53 ++ .../examples/credit/data/users/User.java | 319 ++++++++++ .../examples/credit/data/users/UserType.java | 512 ++++++++++++++++ .../credit/data/users/VeryGoodUserType.java | 53 ++ .../examples/credit/util/AttributeInfo.java | 68 +++ .../examples/credit/util/AttributeUtils.java | 88 +++ .../util/BootstrapTrainingSetBuilder.java | 121 ++++ .../credit/util/ClassifierResults.java | 70 +++ .../examples/credit/util/CreditDataUtils.java | 100 ++++ .../credit/util/CreditErrorEstimator.java | 231 ++++++++ .../examples/credit/util/DataGenerator.java | 130 +++++ .../credit/util/UserInstanceBuilder.java | 167 ++++++ .../examples/fraud/DTFraudClassifier.java | 136 +++++ .../examples/fraud/NNFraudClassifier.java | 356 ++++++++++++ .../examples/fraud/TransactionConcept.java | 92 +++ .../examples/fraud/TransactionInstance.java | 99 ++++ .../examples/fraud/TransactionNN.java | 106 ++++ .../examples/fraud/data/Transaction.java | 124 ++++ .../fraud/data/TransactionDataset.java | 134 +++++ .../data/TransactionInstanceBuilder.java | 224 +++++++ .../fraud/data/TransactionLoader.java | 59 ++ .../fraud/data/TransactionLocation.java | 94 +++ .../examples/fraud/util/DataGenerator.java | 119 ++++ .../examples/fraud/util/FraudDataUtils.java | 148 +++++ .../fraud/util/FraudErrorEstimator.java | 123 ++++ .../examples/fraud/util/TenUsersSample.java | 363 ++++++++++++ .../fraud/util/TransactionSetProfile.java | 145 +++++ .../examples/fraud/util/UserStatistics.java | 153 +++++ .../fraud/util/UserStatisticsCalculator.java | 164 ++++++ .../examples/newsgroups/NewsCrawler.java | 195 +++++++ .../recommender/MovieLensRMSESample.java | 61 ++ 
.../examples/recommender/RatingGrapher.java | 174 ++++++ .../examples/recommender/Recommender.java | 119 ++++ src/org/yooreeka/examples/search/DocRank.java | 57 ++ .../examples/search/LuceneIndexer.java | 87 +++ .../yooreeka/examples/search/MySearcher.java | 360 ++++++++++++ .../yooreeka/examples/search/PageRank.java | 56 ++ .../examples/spamfilter/EmailClassifier.java | 247 ++++++++ .../examples/spamfilter/EmailInstance.java | 86 +++ .../examples/spamfilter/data/Email.java | 119 ++++ .../examples/spamfilter/data/EmailData.java | 223 +++++++ .../spamfilter/data/EmailDataset.java | 137 +++++ src/org/yooreeka/util/C.java | 64 ++ src/org/yooreeka/util/P.java | 57 ++ src/org/yooreeka/util/gui/GraphGui.java | 152 +++++ src/org/yooreeka/util/gui/XyGui.java | 203 +++++++ .../util/internet/behavior/UserClick.java | 157 +++++ .../util/internet/behavior/UserQuery.java | 159 +++++ .../crawling/FetchAndProcessCrawler.java | 310 ++++++++++ .../util/internet/crawling/YCrawler.java | 197 +++++++ .../crawling/core/BasicWebCrawler.java | 332 +++++++++++ .../internet/crawling/core/CrawlData.java | 99 ++++ .../crawling/core/CrawlDataProcessor.java | 46 ++ .../crawling/core/DocumentFilter.java | 44 ++ .../internet/crawling/core/URLFilter.java | 79 +++ .../internet/crawling/core/URLNormalizer.java | 77 +++ .../internet/crawling/db/FetchedDocsDB.java | 305 ++++++++++ .../util/internet/crawling/db/KnownUrlDB.java | 279 +++++++++ .../util/internet/crawling/db/PageLinkDB.java | 163 ++++++ .../internet/crawling/db/ProcessedDocsDB.java | 413 +++++++++++++ .../crawling/model/FetchedDocument.java | 143 +++++ .../crawling/model/KnownUrlEntry.java | 77 +++ .../util/internet/crawling/model/Outlink.java | 55 ++ .../crawling/transport/common/Transport.java | 43 ++ .../transport/common/TransportException.java | 47 ++ .../transport/file/FileTransport.java | 134 +++++ .../file/FileTransportException.java | 49 ++ .../transport/http/HTTPTransport.java | 260 +++++++++ .../http/HTTPTransportException.java 
| 46 ++ .../crawling/transport/http/HTTPUtils.java | 142 +++++ .../crawling/util/DocumentIdUtils.java | 56 ++ .../internet/crawling/util/FileUtils.java | 130 +++++ .../util/internet/crawling/util/UrlGroup.java | 71 +++ .../util/internet/crawling/util/UrlUtils.java | 65 +++ .../crawling/util/ValueToIndexMapping.java | 93 +++ .../yooreeka/util/metrics/CosineDistance.java | 58 ++ .../util/metrics/CosineSimilarity.java | 76 +++ .../util/metrics/CosineSimilarityMeasure.java | 56 ++ .../util/metrics/EuclideanDistance.java | 55 ++ .../util/metrics/JaccardCoefficient.java | 77 +++ .../util/metrics/JaccardDistance.java | 57 ++ .../util/metrics/NumericDistance.java | 40 ++ .../util/metrics/SimilarityMeasure.java | 43 ++ .../util/metrics/TermFrequencyBuilder.java | 78 +++ .../util/parsing/common/AbstractDocument.java | 48 ++ .../util/parsing/common/DataEntry.java | 40 ++ .../util/parsing/common/DataField.java | 68 +++ .../util/parsing/common/DataType.java | 40 ++ .../util/parsing/common/DocumentParser.java | 44 ++ .../common/DocumentParserException.java | 45 ++ .../parsing/common/DocumentParserFactory.java | 68 +++ .../parsing/common/ProcessedDocument.java | 198 +++++++ .../util/parsing/csv/CSVDocument.java | 93 +++ .../yooreeka/util/parsing/csv/CSVEntry.java | 108 ++++ .../yooreeka/util/parsing/csv/CSVFile.java | 149 +++++ .../yooreeka/util/parsing/csv/CSVParser.java | 135 +++++ .../yooreeka/util/parsing/csv/CSVSchema.java | 58 ++ .../util/parsing/html/CompositeFilter.java | 64 ++ .../util/parsing/html/ElementNodeFilter.java | 61 ++ .../util/parsing/html/HTMLDocumentParser.java | 457 +++++++++++++++ .../html/HTMLDocumentParserException.java | 49 ++ .../util/parsing/html/HTMLWriter.java | 119 ++++ .../util/parsing/html/LinkNodeFilter.java | 58 ++ .../util/parsing/html/MultiFilter.java | 61 ++ .../parsing/msword/MSWordDocumentParser.java | 103 ++++ .../msword/MSWordDocumentParserException.java | 49 ++ .../util/text/AlphabetProjection.java | 313 ++++++++++ 255 files changed, 
35422 insertions(+) create mode 100644 src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/MST.java create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/MSTSingleLinkAlgorithm.java create mode 100644 src/org/yooreeka/algos/clustering/hierarchical/SingleLinkAlgorithm.java create mode 100644 src/org/yooreeka/algos/clustering/model/Attribute.java create mode 100644 src/org/yooreeka/algos/clustering/model/Cluster.java create mode 100644 src/org/yooreeka/algos/clustering/model/DataPoint.java create mode 100644 src/org/yooreeka/algos/clustering/partitional/KMeansAlgorithm.java create mode 100644 src/org/yooreeka/algos/clustering/partitional/NearestNeighborAlgorithm.java create mode 100644 src/org/yooreeka/algos/clustering/rock/LinkMatrix.java create mode 100644 src/org/yooreeka/algos/clustering/rock/MergeGoodnessMeasure.java create mode 100644 src/org/yooreeka/algos/clustering/rock/ROCKAlgorithm.java create mode 100644 src/org/yooreeka/algos/clustering/rock/ROCKClusters.java create mode 100644 src/org/yooreeka/algos/clustering/rock/SimilarCluster.java create mode 100644 src/org/yooreeka/algos/clustering/test/MyDiggSpaceData.java create mode 100644 src/org/yooreeka/algos/clustering/test/MyDiggSpaceDataset.java create mode 100644 src/org/yooreeka/algos/clustering/test/SFData.java create mode 100644 src/org/yooreeka/algos/clustering/test/SFDataset.java create mode 100644 src/org/yooreeka/algos/clustering/utils/Attributes.java create mode 100644 src/org/yooreeka/algos/clustering/utils/ObjectToIndexMapping.java create mode 100644 src/org/yooreeka/algos/clustering/utils/SortedArrayClustering.java create mode 100644 
src/org/yooreeka/algos/reco/collab/cache/FileStore.java create mode 100644 src/org/yooreeka/algos/reco/collab/cache/Store.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/BaseDataset.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/ContentItem.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/DiggData.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/HTMLContent.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/MovieLensData.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/MovieLensDataset.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/MusicData.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/MusicItem.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/MusicRating.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/MusicUser.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/NewsData.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/NewsItem.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/NewsUser.java create mode 100644 src/org/yooreeka/algos/reco/collab/data/RatingBuilder.java create mode 100644 src/org/yooreeka/algos/reco/collab/evaluation/EvaluationDataProvider.java create mode 100644 src/org/yooreeka/algos/reco/collab/evaluation/MovieLensEvaluationDataProvider.java create mode 100644 src/org/yooreeka/algos/reco/collab/evaluation/MovieLensRMSE.java create mode 100644 src/org/yooreeka/algos/reco/collab/evaluation/RMSEEstimator.java create mode 100644 src/org/yooreeka/algos/reco/collab/evaluation/RMSEResult.java create mode 100644 src/org/yooreeka/algos/reco/collab/model/Content.java create mode 100644 src/org/yooreeka/algos/reco/collab/model/Dataset.java create mode 100644 src/org/yooreeka/algos/reco/collab/model/Item.java create mode 100644 src/org/yooreeka/algos/reco/collab/model/Rating.java create mode 100644 src/org/yooreeka/algos/reco/collab/model/RecommendationType.java 
create mode 100644 src/org/yooreeka/algos/reco/collab/model/SimilarItem.java create mode 100644 src/org/yooreeka/algos/reco/collab/model/SimilarUser.java create mode 100644 src/org/yooreeka/algos/reco/collab/model/User.java create mode 100644 src/org/yooreeka/algos/reco/collab/recommender/Delphi.java create mode 100644 src/org/yooreeka/algos/reco/collab/recommender/DiggDelphi.java create mode 100644 src/org/yooreeka/algos/reco/collab/recommender/MovieLensDelphi.java create mode 100644 src/org/yooreeka/algos/reco/collab/recommender/PredictedItemRating.java create mode 100644 src/org/yooreeka/algos/reco/collab/recommender/Recommender.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/movielens/MovieLensItemSimilarity.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/movielens/MovieLensUserSimilarity.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/ImprovedItemBasedSimilarity.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/ImprovedUserBasedSimilarity.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/ItemBasedSimilarity.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/ItemContentBasedSimilarity.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/ItemPenaltyBasedSimilarity.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/SimilarityMatrix.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/SimilarityMatrixImpl.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/UserBasedSimilarity.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/UserContentBasedSimilarity.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/naive/UserItemContentBasedSimilarity.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/triangular/UpperTriangularSimilarityMatrix.java create mode 100644 
src/org/yooreeka/algos/reco/collab/similarity/triangular/UpperTriangularSimilarityMatrixImpl.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/util/PearsonCorrelation.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/util/RatingCountMatrix.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixCache.java create mode 100644 src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixRepository.java create mode 100644 src/org/yooreeka/algos/reco/content/digg/DiggCategory.java create mode 100644 src/org/yooreeka/algos/reco/content/digg/DiggService.java create mode 100644 src/org/yooreeka/algos/reco/content/digg/DiggStoryItem.java create mode 100644 src/org/yooreeka/algos/reco/content/digg/DiggUser.java create mode 100644 src/org/yooreeka/algos/search/data/SearchResult.java create mode 100644 src/org/yooreeka/algos/search/lucene/LuceneIndexBuilder.java create mode 100644 src/org/yooreeka/algos/search/lucene/analyzer/CustomAnalyzer.java create mode 100644 src/org/yooreeka/algos/search/lucene/analyzer/TextDocumentTerms.java create mode 100644 src/org/yooreeka/algos/search/ranking/DocRankMatrixBuilder.java create mode 100644 src/org/yooreeka/algos/search/ranking/PageRankMatrixBuilder.java create mode 100644 src/org/yooreeka/algos/search/ranking/PageRankMatrixH.java create mode 100644 src/org/yooreeka/algos/search/ranking/Rank.java create mode 100644 src/org/yooreeka/algos/search/ranking/RelevanceScore.java create mode 100644 src/org/yooreeka/algos/search/util/TermFreqMapUtils.java create mode 100644 src/org/yooreeka/algos/taxis/bayesian/NaiveBayes.java create mode 100644 src/org/yooreeka/algos/taxis/boosting/BoostingARCX4Classifier.java create mode 100644 src/org/yooreeka/algos/taxis/boosting/WeightBasedRandom.java create mode 100644 src/org/yooreeka/algos/taxis/core/AttributeValue.java create mode 100644 src/org/yooreeka/algos/taxis/core/BaseConcept.java create mode 100644 
src/org/yooreeka/algos/taxis/core/BaseInstance.java create mode 100644 src/org/yooreeka/algos/taxis/core/DoubleAttribute.java create mode 100644 src/org/yooreeka/algos/taxis/core/StringAttribute.java create mode 100644 src/org/yooreeka/algos/taxis/core/TrainingSet.java create mode 100644 src/org/yooreeka/algos/taxis/core/intf/Attribute.java create mode 100644 src/org/yooreeka/algos/taxis/core/intf/Classifier.java create mode 100644 src/org/yooreeka/algos/taxis/core/intf/Concept.java create mode 100644 src/org/yooreeka/algos/taxis/core/intf/Instance.java create mode 100644 src/org/yooreeka/algos/taxis/ensemble/ClassifierEnsemble.java create mode 100644 src/org/yooreeka/algos/taxis/ensemble/ConceptMajorityVoter.java create mode 100644 src/org/yooreeka/algos/taxis/evaluation/ClassifierResults.java create mode 100644 src/org/yooreeka/algos/taxis/evaluation/CochransQTest.java create mode 100644 src/org/yooreeka/algos/taxis/evaluation/Diff2PropTest.java create mode 100644 src/org/yooreeka/algos/taxis/evaluation/FTest.java create mode 100644 src/org/yooreeka/algos/taxis/evaluation/McNemarTest.java create mode 100644 src/org/yooreeka/algos/taxis/evaluation/Test.java create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/XORNetwork.java create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/BaseLayer.java create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/BaseLink.java create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/BaseNN.java create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/BaseNode.java create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/LinearNode.java create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/SigmoidNode.java create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/intf/Layer.java create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/intf/Link.java create mode 100644 
src/org/yooreeka/algos/taxis/networks/neural/core/intf/NeuralNetwork.java create mode 100644 src/org/yooreeka/algos/taxis/networks/neural/core/intf/Node.java create mode 100644 src/org/yooreeka/algos/taxis/tree/AttributeDefinition.java create mode 100644 src/org/yooreeka/algos/taxis/tree/AttributeSelector.java create mode 100644 src/org/yooreeka/algos/taxis/tree/AttributeUtils.java create mode 100644 src/org/yooreeka/algos/taxis/tree/Branch.java create mode 100644 src/org/yooreeka/algos/taxis/tree/BranchGroup.java create mode 100644 src/org/yooreeka/algos/taxis/tree/ConceptUtils.java create mode 100644 src/org/yooreeka/algos/taxis/tree/DecisionTreeClassifier.java create mode 100644 src/org/yooreeka/algos/taxis/tree/InfoGain.java create mode 100644 src/org/yooreeka/algos/taxis/tree/Node.java create mode 100644 src/org/yooreeka/algos/taxis/tree/SplittingCriterion.java create mode 100644 src/org/yooreeka/algos/taxis/tree/TrueErrorRateEstimator.java create mode 100644 src/org/yooreeka/config/YooreekaConfigurator.java create mode 100644 src/org/yooreeka/examples/credit/BaggingCreditClassifier.java create mode 100644 src/org/yooreeka/examples/credit/BoostingCreditClassifier.java create mode 100644 src/org/yooreeka/examples/credit/CreditConcept.java create mode 100644 src/org/yooreeka/examples/credit/CreditInstance.java create mode 100644 src/org/yooreeka/examples/credit/DTCreditClassifier.java create mode 100644 src/org/yooreeka/examples/credit/NBCreditClassifier.java create mode 100644 src/org/yooreeka/examples/credit/NNCreditClassifier.java create mode 100644 src/org/yooreeka/examples/credit/UserCreditNN.java create mode 100644 src/org/yooreeka/examples/credit/data/UseCaseData.java create mode 100644 src/org/yooreeka/examples/credit/data/UserDataset.java create mode 100644 src/org/yooreeka/examples/credit/data/UserLoader.java create mode 100644 src/org/yooreeka/examples/credit/data/users/BadUserType.java create mode 100644 
src/org/yooreeka/examples/credit/data/users/DangerousUserType.java create mode 100644 src/org/yooreeka/examples/credit/data/users/ExcellentUserType.java create mode 100644 src/org/yooreeka/examples/credit/data/users/GoodUserType.java create mode 100644 src/org/yooreeka/examples/credit/data/users/User.java create mode 100644 src/org/yooreeka/examples/credit/data/users/UserType.java create mode 100644 src/org/yooreeka/examples/credit/data/users/VeryGoodUserType.java create mode 100644 src/org/yooreeka/examples/credit/util/AttributeInfo.java create mode 100644 src/org/yooreeka/examples/credit/util/AttributeUtils.java create mode 100644 src/org/yooreeka/examples/credit/util/BootstrapTrainingSetBuilder.java create mode 100644 src/org/yooreeka/examples/credit/util/ClassifierResults.java create mode 100644 src/org/yooreeka/examples/credit/util/CreditDataUtils.java create mode 100644 src/org/yooreeka/examples/credit/util/CreditErrorEstimator.java create mode 100644 src/org/yooreeka/examples/credit/util/DataGenerator.java create mode 100644 src/org/yooreeka/examples/credit/util/UserInstanceBuilder.java create mode 100644 src/org/yooreeka/examples/fraud/DTFraudClassifier.java create mode 100644 src/org/yooreeka/examples/fraud/NNFraudClassifier.java create mode 100644 src/org/yooreeka/examples/fraud/TransactionConcept.java create mode 100644 src/org/yooreeka/examples/fraud/TransactionInstance.java create mode 100644 src/org/yooreeka/examples/fraud/TransactionNN.java create mode 100644 src/org/yooreeka/examples/fraud/data/Transaction.java create mode 100644 src/org/yooreeka/examples/fraud/data/TransactionDataset.java create mode 100644 src/org/yooreeka/examples/fraud/data/TransactionInstanceBuilder.java create mode 100644 src/org/yooreeka/examples/fraud/data/TransactionLoader.java create mode 100644 src/org/yooreeka/examples/fraud/data/TransactionLocation.java create mode 100644 src/org/yooreeka/examples/fraud/util/DataGenerator.java create mode 100644 
src/org/yooreeka/examples/fraud/util/FraudDataUtils.java create mode 100644 src/org/yooreeka/examples/fraud/util/FraudErrorEstimator.java create mode 100644 src/org/yooreeka/examples/fraud/util/TenUsersSample.java create mode 100644 src/org/yooreeka/examples/fraud/util/TransactionSetProfile.java create mode 100644 src/org/yooreeka/examples/fraud/util/UserStatistics.java create mode 100644 src/org/yooreeka/examples/fraud/util/UserStatisticsCalculator.java create mode 100644 src/org/yooreeka/examples/newsgroups/NewsCrawler.java create mode 100644 src/org/yooreeka/examples/recommender/MovieLensRMSESample.java create mode 100644 src/org/yooreeka/examples/recommender/RatingGrapher.java create mode 100644 src/org/yooreeka/examples/recommender/Recommender.java create mode 100644 src/org/yooreeka/examples/search/DocRank.java create mode 100644 src/org/yooreeka/examples/search/LuceneIndexer.java create mode 100644 src/org/yooreeka/examples/search/MySearcher.java create mode 100644 src/org/yooreeka/examples/search/PageRank.java create mode 100644 src/org/yooreeka/examples/spamfilter/EmailClassifier.java create mode 100644 src/org/yooreeka/examples/spamfilter/EmailInstance.java create mode 100644 src/org/yooreeka/examples/spamfilter/data/Email.java create mode 100644 src/org/yooreeka/examples/spamfilter/data/EmailData.java create mode 100644 src/org/yooreeka/examples/spamfilter/data/EmailDataset.java create mode 100644 src/org/yooreeka/util/C.java create mode 100644 src/org/yooreeka/util/P.java create mode 100644 src/org/yooreeka/util/gui/GraphGui.java create mode 100644 src/org/yooreeka/util/gui/XyGui.java create mode 100644 src/org/yooreeka/util/internet/behavior/UserClick.java create mode 100644 src/org/yooreeka/util/internet/behavior/UserQuery.java create mode 100644 src/org/yooreeka/util/internet/crawling/FetchAndProcessCrawler.java create mode 100644 src/org/yooreeka/util/internet/crawling/YCrawler.java create mode 100644 
src/org/yooreeka/util/internet/crawling/core/BasicWebCrawler.java create mode 100644 src/org/yooreeka/util/internet/crawling/core/CrawlData.java create mode 100644 src/org/yooreeka/util/internet/crawling/core/CrawlDataProcessor.java create mode 100644 src/org/yooreeka/util/internet/crawling/core/DocumentFilter.java create mode 100644 src/org/yooreeka/util/internet/crawling/core/URLFilter.java create mode 100644 src/org/yooreeka/util/internet/crawling/core/URLNormalizer.java create mode 100644 src/org/yooreeka/util/internet/crawling/db/FetchedDocsDB.java create mode 100644 src/org/yooreeka/util/internet/crawling/db/KnownUrlDB.java create mode 100644 src/org/yooreeka/util/internet/crawling/db/PageLinkDB.java create mode 100644 src/org/yooreeka/util/internet/crawling/db/ProcessedDocsDB.java create mode 100644 src/org/yooreeka/util/internet/crawling/model/FetchedDocument.java create mode 100644 src/org/yooreeka/util/internet/crawling/model/KnownUrlEntry.java create mode 100644 src/org/yooreeka/util/internet/crawling/model/Outlink.java create mode 100644 src/org/yooreeka/util/internet/crawling/transport/common/Transport.java create mode 100644 src/org/yooreeka/util/internet/crawling/transport/common/TransportException.java create mode 100644 src/org/yooreeka/util/internet/crawling/transport/file/FileTransport.java create mode 100644 src/org/yooreeka/util/internet/crawling/transport/file/FileTransportException.java create mode 100644 src/org/yooreeka/util/internet/crawling/transport/http/HTTPTransport.java create mode 100644 src/org/yooreeka/util/internet/crawling/transport/http/HTTPTransportException.java create mode 100644 src/org/yooreeka/util/internet/crawling/transport/http/HTTPUtils.java create mode 100644 src/org/yooreeka/util/internet/crawling/util/DocumentIdUtils.java create mode 100644 src/org/yooreeka/util/internet/crawling/util/FileUtils.java create mode 100644 src/org/yooreeka/util/internet/crawling/util/UrlGroup.java create mode 100644 
src/org/yooreeka/util/internet/crawling/util/UrlUtils.java create mode 100644 src/org/yooreeka/util/internet/crawling/util/ValueToIndexMapping.java create mode 100644 src/org/yooreeka/util/metrics/CosineDistance.java create mode 100644 src/org/yooreeka/util/metrics/CosineSimilarity.java create mode 100644 src/org/yooreeka/util/metrics/CosineSimilarityMeasure.java create mode 100644 src/org/yooreeka/util/metrics/EuclideanDistance.java create mode 100644 src/org/yooreeka/util/metrics/JaccardCoefficient.java create mode 100644 src/org/yooreeka/util/metrics/JaccardDistance.java create mode 100644 src/org/yooreeka/util/metrics/NumericDistance.java create mode 100644 src/org/yooreeka/util/metrics/SimilarityMeasure.java create mode 100644 src/org/yooreeka/util/metrics/TermFrequencyBuilder.java create mode 100644 src/org/yooreeka/util/parsing/common/AbstractDocument.java create mode 100644 src/org/yooreeka/util/parsing/common/DataEntry.java create mode 100644 src/org/yooreeka/util/parsing/common/DataField.java create mode 100644 src/org/yooreeka/util/parsing/common/DataType.java create mode 100644 src/org/yooreeka/util/parsing/common/DocumentParser.java create mode 100644 src/org/yooreeka/util/parsing/common/DocumentParserException.java create mode 100644 src/org/yooreeka/util/parsing/common/DocumentParserFactory.java create mode 100644 src/org/yooreeka/util/parsing/common/ProcessedDocument.java create mode 100644 src/org/yooreeka/util/parsing/csv/CSVDocument.java create mode 100644 src/org/yooreeka/util/parsing/csv/CSVEntry.java create mode 100644 src/org/yooreeka/util/parsing/csv/CSVFile.java create mode 100644 src/org/yooreeka/util/parsing/csv/CSVParser.java create mode 100644 src/org/yooreeka/util/parsing/csv/CSVSchema.java create mode 100644 src/org/yooreeka/util/parsing/html/CompositeFilter.java create mode 100644 src/org/yooreeka/util/parsing/html/ElementNodeFilter.java create mode 100644 src/org/yooreeka/util/parsing/html/HTMLDocumentParser.java create mode 100644 
src/org/yooreeka/util/parsing/html/HTMLDocumentParserException.java create mode 100644 src/org/yooreeka/util/parsing/html/HTMLWriter.java create mode 100644 src/org/yooreeka/util/parsing/html/LinkNodeFilter.java create mode 100644 src/org/yooreeka/util/parsing/html/MultiFilter.java create mode 100644 src/org/yooreeka/util/parsing/msword/MSWordDocumentParser.java create mode 100644 src/org/yooreeka/util/parsing/msword/MSWordDocumentParserException.java create mode 100644 src/org/yooreeka/util/text/AlphabetProjection.java diff --git a/src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java b/src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java new file mode 100644 index 0000000..82dfd83 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/dbscan/DBSCANAlgorithm.java @@ -0,0 +1,448 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.dbscan; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.algos.clustering.utils.ObjectToIndexMapping; +import org.yooreeka.util.P; +import org.yooreeka.util.metrics.NumericDistance; +import org.yooreeka.util.metrics.TermFrequencyBuilder; + +/** + * Implementation of DBSCAN clustering algorithm. + *

<p>
+ * Algorithm parameters: + * <ol>
+ * <li>Eps - threshold value to determine point neighbors. Two points are + * neighbors if the distance between them does not exceed this threshold value.</li>
+ * <li>MinPts - minimum number of points in any cluster.</li>
+ * </ol>
+ * Choice of parameter values depends on the data. + * </p>
+ * <p>
+ * Point types: + * <ol>
+ * <li>Core point - point that belongs to the core of the cluster. It has at + * least MinPts neighboring points.</li>
+ * <li>Border point - is a neighbor to at least one core point but it doesn't + * have enough neighbors to be a core point.</li>
+ * <li>Noise point - is a point that doesn't belong to any cluster because it is + * not close to any of the core points.</li>
+ * </ol>
+ */ +public class DBSCANAlgorithm { + + private static final Logger LOG = Logger.getLogger(DBSCANAlgorithm.class + .getName()); + + private static double[][] calculateAdjacencyMatrix(NumericDistance distance, + DataPoint[] points, boolean useTermFrequencies) { + int n = points.length; + double[][] a = new double[n][n]; + for (int i = 0; i < n; i++) { + double[] x = points[i].getNumericAttrValues(); + for (int j = i + 1; j < n; j++) { + double[] y; + if (useTermFrequencies) { + double[][] tfVectors = TermFrequencyBuilder + .buildTermFrequencyVectors( + points[i].getTextAttrValues(), + points[j].getTextAttrValues()); + x = tfVectors[0]; + y = tfVectors[1]; + } else { + y = points[j].getNumericAttrValues(); + } + a[i][j] = distance.getDistance(x, y); + a[j][i] = a[i][j]; + } + a[i][i] = 0.0; + } + return a; + } + + public static void main(String[] args) { + + DataPoint[] elements = new DataPoint[5]; + elements[0] = new DataPoint("A", new double[] {}); + elements[1] = new DataPoint("B", new double[] {}); + elements[2] = new DataPoint("C", new double[] {}); + elements[3] = new DataPoint("D", new double[] {}); + elements[4] = new DataPoint("E", new double[] {}); + + double[][] a = new double[][] { { 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 2 }, + { 2, 2, 2, 11, 31 }, { 2, 2, 2, 10, 30 }, { 60, 60, 60, 0, 0 } }; + + double eps = 0.5; + int minPoints = 2; + + DBSCANAlgorithm dbscan = new DBSCANAlgorithm(elements, a, eps, + minPoints); + + printResults(dbscan.cluster(), eps,minPoints); + } + + /* + * Data points for clustering. + */ + private DataPoint[] points; + + /* + * Adjacency matrix. Contains distances between points. + */ + private double[][] adjacencyMatrix; + + /* + * Threshold value. Determines which points will be considered as neighbors. + * Two points are neighbors if the distance between them does not exceed + * threshold value. + */ + private double eps; + + /* + * Identifies a set of Noise points. 
+ */ + private static int CLUSTER_ID_NOISE = -1; + + /* + * Identifies a set of Unclassified points. + */ + private int CLUSTER_ID_UNCLASSIFIED = 0; + + /* + * Sequence that is used to generate next cluster id. + */ + private int nextClusterId = 1; + + /* + * Sets of points. Initially all points will be assigned into Unclassified + * points set. + */ + private Map> clusters = new LinkedHashMap>(); + + /* + * Number of points that should exist in the neighborhood for a point to be + * a core point. + * + * Best value for this parameter depends on the data set. + */ + private int minPoints; + + private ObjectToIndexMapping idxMapping = new ObjectToIndexMapping(); + + private boolean verbose = true; + + /** + * Initializes algorithm with all data that it needs. + * + * @param points + * all points to cluster + * @param distance + * metric distance function + * @param eps + * threshold value used to calculate point neighborhood. + * @param minPoints + * number of neighbors for point to be considered a core point. + */ + public DBSCANAlgorithm(DataPoint[] points, NumericDistance distance, double eps, + int minPoints, boolean useTermFrequencies) { + + init(points, eps, minPoints); + this.adjacencyMatrix = calculateAdjacencyMatrix(distance, points, + useTermFrequencies); + } + + /** + * Initializes algorithm with all data that it needs. + * + * @param points + * points to cluster + * @param adjacencyMatrix + * adjacency matrix with distances + * @param eps + * distance threshold value + * @param minPoints + * number of neighbors for point to be considered a core point. + */ + public DBSCANAlgorithm(DataPoint[] points, double[][] adjacencyMatrix, + double eps, int minPoints) { + init(points, eps, minPoints); + this.adjacencyMatrix = adjacencyMatrix; + } + + private void assignPointToCluster(DataPoint p, int clusterId) { + + // Remove point from the group that it currently belongs to... 
+ if (isNoise(p)) { + removePointFromCluster(p, CLUSTER_ID_NOISE); + } else if (isUnclassified(p)) { + removePointFromCluster(p, CLUSTER_ID_UNCLASSIFIED); + } else { + if (clusterId != CLUSTER_ID_UNCLASSIFIED) { + throw new RuntimeException( + "Trying to move point that has already been" + + "assigned to some other cluster. Point: " + p + + ", clusterId=" + clusterId); + } else { + // do nothing. we are registering a brand new point in + // UNCLASSIFIED set. + } + } + + Set points = clusters.get(clusterId); + if (points == null) { + points = new HashSet(); + clusters.put(clusterId, points); + } + points.add(p); + } + + private void assignPointToCluster(Set points, int clusterId) { + for (DataPoint p : points) { + assignPointToCluster(p, clusterId); + } + } + + public List cluster() { + int clusterId = getNextClusterId(); + + for (DataPoint p : points) { + if (isUnclassified(p)) { + + boolean isClusterCreated = createCluster(p, clusterId); + + if (isClusterCreated) { + // Generate id for the next cluster + clusterId = getNextClusterId(); + } + } + } + + // Convert sets of points into clusters... + List allClusters = new ArrayList(); + + for (Map.Entry> e : clusters.entrySet()) { + + String label = String.valueOf(e.getKey()); + + Set points = e.getValue(); + + if (points != null && !points.isEmpty()) { + + Cluster cluster = new Cluster(label, e.getValue()); + + allClusters.add(cluster); + } + } + + // Group with Noise elements returned as well + return allClusters; + } + + private boolean createCluster(DataPoint p, Integer clusterId) { + + boolean isClusterCreated = false; + + Set nPoints = findNeighbors(p, eps); + + if (nPoints.size() < minPoints) { + // Assign point into "Noise" group. + // It will have a chance to become a border point later on. + assignPointToCluster(p, CLUSTER_ID_NOISE); + + // return false to indicate that we didn't create any cluster + isClusterCreated = false; + + } else { + + // All points are reachable from the core point... 
+ assignPointToCluster(nPoints, clusterId); + + // Remove point itself. + nPoints.remove(p); + + // Process the rest of the neighbors... + while (nPoints.size() > 0) { + + // pick the first neighbor + DataPoint nPoint = nPoints.iterator().next(); + + // process neighbor + Set nnPoints = findNeighbors(nPoint, eps); + + if (nnPoints.size() >= minPoints) { + + // nPoint is another core point. + for (DataPoint nnPoint : nnPoints) { + + if (isNoise(nnPoint)) { + + /* + * It's a border point. We know that it doesn't have + * enough neighbors to be a core point. Just add it + * to the cluster. + */ + assignPointToCluster(nnPoint, clusterId); + + } else if (isUnclassified(nnPoint)) { + + /* + * We don't know if this point has enough neighbors + * to be a core point... add it to the list of + * points to be checked. + */ + nPoints.add(nnPoint); + + /* + * And assign it to the cluster + */ + assignPointToCluster(nnPoint, clusterId); + } + } + } else { + // do nothing. The neighbor is just a border point. 
+ } + + nPoints.remove(nPoint); + } + + // return true to indicate that we did create a cluster + isClusterCreated = true; + } + + return isClusterCreated; + } + + private Set findNeighbors(DataPoint p, double threshold) { + Set neighbors = new HashSet(); + int i = idxMapping.getIndex(p); + for (int j = 0, n = idxMapping.getSize(); j < n; j++) { + if (adjacencyMatrix[i][j] <= threshold) { + neighbors.add(idxMapping.getObject(j)); + } + } + return neighbors; + } + + private int getNextClusterId() { + return nextClusterId++; + } + + private void init(DataPoint[] points, double neighborThreshold, + int minPoints) { + + LOG.setLevel(Level.FINEST); //YooreekaConfigurator.getLevel(DBSCANAlgorithm.class.getName())); + + this.points = points; + this.eps = neighborThreshold; + this.minPoints = minPoints; + + for (DataPoint p : points) { + // Creating a Point <-> Index mappping for all points + idxMapping.getIndex(p); + // Assign all points into "Unclassified" group + assignPointToCluster(p, CLUSTER_ID_UNCLASSIFIED); + } + } + + private boolean isNoise(DataPoint p) { + return isPointInCluster(p, CLUSTER_ID_NOISE); + } + + private boolean isPointInCluster(DataPoint p, int clusterId) { + boolean inCluster = false; + Set points = clusters.get(clusterId); + if (points != null) { + inCluster = points.contains(p); + } + return inCluster; + } + + private boolean isUnclassified(DataPoint p) { + return isPointInCluster(p, CLUSTER_ID_UNCLASSIFIED); + + } + + public boolean isVerbose() { + return verbose; + } + + public void printDistances() { + LOG.info("Point Similarity matrix:"); + for (int i = 0; i < adjacencyMatrix.length; i++) { + LOG.info(Arrays.toString(adjacencyMatrix[i])); + } + } + + public static void printResults(List allClusters, double eps, int minPoints) { + StringBuilder sb = new StringBuilder(); + sb.append("DBSCAN Clustering with NeighborThreshold=").append(eps); + sb.append(", minPoints=").append(minPoints).append("\n"); + sb.append("Clusters:\n"); + String 
noiseElements = "no noise elements"; + for (Cluster c : allClusters) { + if (String.valueOf(CLUSTER_ID_NOISE).equals(c.getLabel())) { + // print noise data at the end + noiseElements = c.getElementsAsString(); + } else { + sb.append("____________________________________________________________\n"); + sb.append(c.getLabel()).append(": \n").append(c.getElementsAsString()); + sb.append("____________________________________________________________\n\n"); + } + } + sb.append("Noise Elements:\n ").append(noiseElements).append("\n"); + P.println(sb.toString()); + } + private boolean removePointFromCluster(DataPoint p, int clusterId) { + boolean removed = false; + Set points = clusters.get(clusterId); + if (points != null) { + removed = points.remove(p); + } + return removed; + } + + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + +} diff --git a/src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java b/src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java new file mode 100644 index 0000000..e5963e0 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/hierarchical/AverageLinkAlgorithm.java @@ -0,0 +1,145 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.hierarchical; + +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.algos.clustering.utils.ObjectToIndexMapping; + +/** A hierarchical agglomerative clustering algorithm based on the average link */ +public class AverageLinkAlgorithm { + + public static void main(String[] args) { + // Define data + DataPoint[] elements = new DataPoint[5]; + elements[0] = new DataPoint("A", new double[] {}); + elements[1] = new DataPoint("B", new double[] {}); + elements[2] = new DataPoint("C", new double[] {}); + elements[3] = new DataPoint("D", new double[] {}); + elements[4] = new DataPoint("E", new double[] {}); + + double[][] a = new double[][] { { 0, 1, 2, 2, 3 }, { 1, 0, 2, 4, 3 }, + { 2, 2, 0, 1, 5 }, { 2, 4, 1, 0, 3 }, { 3, 3, 5, 3, 0 } }; + + AverageLinkAlgorithm ca = new AverageLinkAlgorithm(elements, a); + Dendrogram dnd = ca.cluster(); + dnd.printAll(); + } + private DataPoint[] elements; + private double[][] a; + + private ClusterSet allClusters; + + public AverageLinkAlgorithm(DataPoint[] elements, 
double[][] adjacencyMatrix) { + this.elements = elements; + this.a = adjacencyMatrix; + this.allClusters = new ClusterSet(); + } + + public Dendrogram cluster() { + + Dendrogram dnd = new Dendrogram("Distance"); + double d = 0.0; + + // initially load all elements as individual clusters + for (DataPoint e : elements) { + Cluster c = new Cluster(e); + allClusters.add(c); + } + + dnd.addLevel(String.valueOf(d), allClusters.getAllClusters()); + + d = 1.0; + + while (allClusters.size() > 1) { + int K = allClusters.size(); + mergeClusters(d); + // it is possible that there were no clusters to merge for current + // d. + if (K > allClusters.size()) { + dnd.addLevel(String.valueOf(d), allClusters.getAllClusters()); + K = allClusters.size(); + } + + d = d + 0.5; + } + return dnd; + } + + private void mergeClusters(double distanceThreshold) { + int nClusters = allClusters.size(); + + ObjectToIndexMapping idxMapping = new ObjectToIndexMapping(); + + double[][] clusterDistances = new double[nClusters][nClusters]; + + for (int i = 0, n = a.length; i < n; i++) { + for (int j = i + 1, k = a.length; j < k; j++) { + double d = a[i][j]; + if (d > 0) { + DataPoint e1 = elements[i]; + DataPoint e2 = elements[j]; + Cluster c1 = allClusters.findClusterByElement(e1); + Cluster c2 = allClusters.findClusterByElement(e2); + if (!c1.equals(c2)) { + int ci = idxMapping.getIndex(c1); + int cj = idxMapping.getIndex(c2); + clusterDistances[ci][cj] += d; + clusterDistances[cj][ci] += d; + } + } + } + } + + boolean[] merged = new boolean[clusterDistances.length]; + for (int i = 0, n = clusterDistances.length; i < n; i++) { + for (int j = i + 1, k = clusterDistances.length; j < k; j++) { + Cluster ci = idxMapping.getObject(i); + Cluster cj = idxMapping.getObject(j); + int ni = ci.size(); + int nj = cj.size(); + clusterDistances[i][j] = clusterDistances[i][j] / (ni * nj); + clusterDistances[j][i] = clusterDistances[i][j]; + // merge clusters if distance is below the threshold + if (merged[i] == 
false && merged[j] == false) { + if (clusterDistances[i][j] <= distanceThreshold) { + allClusters.remove(ci); + allClusters.remove(cj); + Cluster mergedCluster = new Cluster(ci, cj); + allClusters.add(mergedCluster); + merged[i] = true; + merged[j] = true; + } + } + } + } + } +} diff --git a/src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java b/src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java new file mode 100644 index 0000000..236fb38 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/hierarchical/ClusterSet.java @@ -0,0 +1,83 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.hierarchical; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.algos.clustering.model.DataPoint; + +/** + * Set of clusters. + */ +public class ClusterSet { + + private Set allClusters = new HashSet(); + + public boolean add(Cluster c) { + return allClusters.add(c); + } + + public Cluster findClusterByElement(DataPoint e) { + Cluster cluster = null; + for (Cluster c : allClusters) { + if (c.contains(e)) { + cluster = c; + break; + } + } + return cluster; + } + + public List getAllClusters() { + return new ArrayList(allClusters); + } + + public boolean remove(Cluster c) { + return allClusters.remove(c); + } + + public int size() { + return allClusters.size(); + } + + // public ClusterSet copy() { + // ClusterSet clusterSet = new ClusterSet(); + // for(Cluster c : this.allClusters ) { + // Cluster clusterCopy = c.copy(); + // clusterSet.add(clusterCopy); + // } + // return clusterSet; + // } +} diff --git a/src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java b/src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java new file mode 100644 index 0000000..bdfd51f --- /dev/null +++ b/src/org/yooreeka/algos/clustering/hierarchical/Dendrogram.java @@ -0,0 +1,162 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical 
analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.hierarchical; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Logger; + +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.config.YooreekaConfigurator; + +public class Dendrogram { + + private static final Logger LOG = Logger.getLogger(Dendrogram.class.getName()); + + /* + * Clusters by level. 
+ */ + private Map entryMap; + private Map levelLabels; + private Integer nextLevel; + private String levelLabelName; + + public Dendrogram(String levelLabelName) { + + LOG.setLevel(YooreekaConfigurator.getLevel(Dendrogram.class.getName())); + + entryMap = new LinkedHashMap(); + levelLabels = new LinkedHashMap(); + nextLevel = 1; + this.levelLabelName = levelLabelName; + } + + public int addLevel(String label, Cluster cluster) { + List values = new ArrayList(); + values.add(cluster); + return addLevel(label, values); + } + + /** + * Creates a new dendrogram level using copies of provided clusters. + */ + public int addLevel(String label, Collection clusters) { + + ClusterSet clusterSet = new ClusterSet(); + + for (Cluster c : clusters) { + // copy cluster before adding - over time cluster elements may + // change + // but for dendrogram we want to keep current state. + clusterSet.add(c.copy()); + } + + int level = nextLevel; + + entryMap.put(level, clusterSet); + levelLabels.put(level, label); + + nextLevel++; + return level; + } + + public List getAllLevels() { + return new ArrayList(entryMap.keySet()); + } + + public List getClustersForLevel(int level) { + ClusterSet cs = entryMap.get(level); + return cs.getAllClusters(); + } + + public String getLabelForLevel(int level) { + return levelLabels.get(level); + } + + public int getTopLevel() { + return nextLevel - 1; + } + + public void print(int level) { + String label = levelLabels.get(level); + ClusterSet clusters = entryMap.get(level); + LOG.info("Clusters for: level=" + level + ", " + + levelLabelName + "=" + label); + for (Cluster c : clusters.getAllClusters()) { + if (c.getElements().size() > 1) { + LOG.info("____________________________________________________________\n"); + LOG.info(c.getElementsAsString()); + LOG.info("____________________________________________________________\n\n"); + } + } + } + + public void printAll() { + for (Map.Entry e : entryMap.entrySet()) { + Integer level = e.getKey(); + 
print(level); + } + } + + /** + * Replaces clusters in the specified level. If level doesn't exist it will + * be created. + * + * @param level + * dendrogram level. + * @param label + * level description. + * @param clusters + * clusters for the level. + * @return + */ + public void setLevel(int level, String label, Collection clusters) { + + ClusterSet clusterSet = new ClusterSet(); + + for (Cluster c : clusters) { + clusterSet.add(c.copy()); + } + + LOG.fine("Setting cluster level: " + level); + + entryMap.put(level, clusterSet); + levelLabels.put(level, label); + + if (level >= nextLevel) { + nextLevel = level + 1; + } + } + +} \ No newline at end of file diff --git a/src/org/yooreeka/algos/clustering/hierarchical/MST.java b/src/org/yooreeka/algos/clustering/hierarchical/MST.java new file mode 100644 index 0000000..6b78307 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/hierarchical/MST.java @@ -0,0 +1,130 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.hierarchical; + +/** + * Basic implementation of Prim's algorithm to build Minimal Spanning Tree + * (MST). + * + */ +public class MST { + + class Edge { + + private int i; + private int j; + private double w; + + Edge(int i, int j, double w) { + this.i = i; + this.j = j; + this.w = w; + } + + public int getI() { + return i; + } + + public int getJ() { + return j; + } + + public double getW() { + return w; + } + + } + + /** The adjacency matrix of the graph */ + private double[][] adjM; + + public MST() { + } + + public double[][] buildMST(double[][] adjM) { + + this.adjM = adjM; + + // Marks nodes that belong to MST. Initial MST has only one node. + boolean[] allV = new boolean[adjM.length]; + allV[0] = true; + + // Adjacency matrix defining MST + double[][] mst = new double[adjM.length][adjM.length]; + for (int i = 0, n = mst.length; i < n; i++) { + for (int j = 0; j < n; j++) { + /* + * Using -1 to indicate that there is no edge between nodes i + * and j. Can't use 0 because it is a valid distance. 
+ */ + mst[i][j] = -1; + } + } + + Edge e = null; + while ((e = findMinimumEdge(allV)) != null) { + allV[e.getJ()] = true; + mst[e.getI()][e.getJ()] = e.getW(); + mst[e.getJ()][e.getI()] = e.getW(); + } + + return mst; + } + + private Edge findMinimumEdge(boolean[] mstV) { + Edge e = null; + double minW = Double.POSITIVE_INFINITY; + int minI = -1; + int minJ = -1; + + for (int i = 0, n = adjM.length; i < n; i++) { + // part of MST + if (mstV[i] == true) { + for (int j = 0, k = adjM.length; j < k; j++) { + // not part of MST + if (mstV[j] == false) { + if (minW > adjM[i][j]) { + minW = adjM[i][j]; + minI = i; + minJ = j; + } + } + } + } + } + + if (minI > -1) { + e = new Edge(minI, minJ, minW); + } + + return e; + } +} diff --git a/src/org/yooreeka/algos/clustering/hierarchical/MSTSingleLinkAlgorithm.java b/src/org/yooreeka/algos/clustering/hierarchical/MSTSingleLinkAlgorithm.java new file mode 100644 index 0000000..2162623 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/hierarchical/MSTSingleLinkAlgorithm.java @@ -0,0 +1,142 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. 
See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.hierarchical; + +import java.util.logging.Logger; + +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.config.YooreekaConfigurator; + +public class MSTSingleLinkAlgorithm { + + private static final Logger LOG = Logger.getLogger(MSTSingleLinkAlgorithm.class.getName()); + + public static void main(String[] args) { + // Define data + DataPoint[] elements = new DataPoint[5]; + elements[0] = new DataPoint("A", new double[] {}); + elements[1] = new DataPoint("B", new double[] {}); + elements[2] = new DataPoint("C", new double[] {}); + elements[3] = new DataPoint("D", new double[] {}); + elements[4] = new DataPoint("E", new double[] {}); + + double[][] a = new double[][] { { 0, 1, 2, 2, 3 }, { 1, 0, 2, 4, 3 }, + { 2, 2, 0, 1, 5 }, { 2, 4, 1, 0, 3 }, { 3, 3, 5, 3, 0 } }; + + MSTSingleLinkAlgorithm ca = new MSTSingleLinkAlgorithm(elements, a); + Dendrogram dnd = ca.cluster(); + dnd.printAll(); + } + private DataPoint[] elements; + private double[][] a; + private double[][] m; + + private ClusterSet allClusters; + + public MSTSingleLinkAlgorithm(DataPoint[] elements, + double[][] adjacencyMatrix) { + + 
LOG.setLevel(YooreekaConfigurator.getLevel(MSTSingleLinkAlgorithm.class.getName())); + + this.elements = elements; + this.a = adjacencyMatrix; + this.allClusters = new ClusterSet(); + } + + public Dendrogram cluster() { + + m = (new MST()).buildMST(a); + + Dendrogram dnd = new Dendrogram("Distance"); + double d = 0.0; + + // initially load all elements as individual clusters + for (DataPoint e : elements) { + Cluster c = new Cluster(e); + allClusters.add(c); + } + + int lastDndLevel = dnd.addLevel(String.valueOf(d), + allClusters.getAllClusters()); + + double previousD = d; + + while (allClusters.size() > 1) { + d = mergeTwoClosestClusters(); + if (previousD == d) { + dnd.setLevel(lastDndLevel, String.valueOf(d), + allClusters.getAllClusters()); + } else { + lastDndLevel = dnd.addLevel(String.valueOf(d), + allClusters.getAllClusters()); + } + previousD = d; + } + + return dnd; + } + + private double mergeTwoClosestClusters() { + int minI = -1; + int minJ = -1; + double minWeight = Double.POSITIVE_INFINITY; + + for (int i = 0, n = m.length; i < n; i++) { + for (int j = 0, k = m.length; j < k; j++) { + if (m[i][j] >= 0 && minWeight > m[i][j]) { + minI = i; + minJ = j; + minWeight = m[i][j]; + } + } + } + + double d = Double.NaN; + if (minI > -1) { + DataPoint e1 = elements[minI]; + Cluster c1 = allClusters.findClusterByElement(e1); + DataPoint e2 = elements[minJ]; + Cluster c2 = allClusters.findClusterByElement(e2); + allClusters.remove(c1); + allClusters.remove(c2); + d = minWeight; + Cluster mergedCluster = new Cluster(c1, c2); + allClusters.add(mergedCluster); + m[minI][minJ] = -1; // remove link. Using -1 because 0 is a valid + // distance. + m[minJ][minI] = -1; // remove link. Using -1 because 0 is a valid + // distance. 
+ } + + return d; + } +} diff --git a/src/org/yooreeka/algos/clustering/hierarchical/SingleLinkAlgorithm.java b/src/org/yooreeka/algos/clustering/hierarchical/SingleLinkAlgorithm.java new file mode 100644 index 0000000..a0090d4 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/hierarchical/SingleLinkAlgorithm.java @@ -0,0 +1,126 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.clustering.hierarchical; + +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.config.YooreekaConfigurator; + +/** A hierarchical agglomerative clustering algorithm based on single link */ +public class SingleLinkAlgorithm { + + private static final Logger LOG = Logger.getLogger(SingleLinkAlgorithm.class.getName()); + + public static void main(String[] args) { + // Define data + DataPoint[] elements = new DataPoint[5]; + elements[0] = new DataPoint("A", new double[] {}); + elements[1] = new DataPoint("B", new double[] {}); + elements[2] = new DataPoint("C", new double[] {}); + elements[3] = new DataPoint("D", new double[] {}); + elements[4] = new DataPoint("E", new double[] {}); + + double[][] a = new double[][] { { 0, 1, 2, 2, 3 }, { 1, 0, 2, 4, 3 }, + { 2, 2, 0, 1, 5 }, { 2, 4, 1, 0, 3 }, { 3, 3, 5, 3, 0 } }; + + SingleLinkAlgorithm ca = new SingleLinkAlgorithm(elements, a); + Dendrogram dnd = ca.cluster(); + dnd.printAll(); + // dnd.print(3); + } + private DataPoint[] elements; + + private double[][] a; + + // Hierarchical Agglomerative Algorithm + public SingleLinkAlgorithm(DataPoint[] elements, double[][] adjacencyMatrix) { + + LOG.setLevel(YooreekaConfigurator.getLevel(SingleLinkAlgorithm.class.getName())); + + this.elements = elements; + this.a = adjacencyMatrix; + } + + // Implements Single Link Technique + private List buildClusters(double distanceThreshold) { + boolean[] usedElementFlags = new boolean[elements.length]; + List clusters = new ArrayList(); + for (int i = 0, n = a.length; i < n; i++) { + List clusterPoints = new ArrayList(); + for (int j = i, k = a.length; j < k; j++) { + if (a[i][j] <= distanceThreshold + && usedElementFlags[j] == false) { + clusterPoints.add(elements[j]); + usedElementFlags[j] = true; + } + } + if (clusterPoints.size() > 0) 
{ + Cluster c = new Cluster(clusterPoints); + clusters.add(c); + } + } + return clusters; + } + + public Dendrogram cluster() { + Dendrogram dnd = new Dendrogram("Distance"); + double d = 0; + + // initially load all elements as individual clusters + List initialClusters = new ArrayList(); + for (DataPoint e : elements) { + Cluster c = new Cluster(e); + initialClusters.add(c); + } + + dnd.addLevel(String.valueOf(d), initialClusters); + + d = 1.0; + + int k = initialClusters.size(); + + while (k > 1) { + int oldK = k; + List clusters = buildClusters(d); + k = clusters.size(); + if (oldK != k) { + dnd.addLevel(String.valueOf(d), clusters); + } + + d = d + 1; + } + return dnd; + } +} diff --git a/src/org/yooreeka/algos/clustering/model/Attribute.java b/src/org/yooreeka/algos/clustering/model/Attribute.java new file mode 100644 index 0000000..375f3f1 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/model/Attribute.java @@ -0,0 +1,119 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
/**
 * A named attribute whose value is either a text or a numeric (Double) value.
 *
 * <p>
 * Which of the two it is can be queried with {@link #isText()} and
 * {@link #isNumeric()}; both return {@code false} for a {@code null} value.
 */
public class Attribute {

    private final String name;
    private final Object value;

    /** Creates a numeric attribute. */
    public Attribute(String name, Double numericValue) {
        this.name = name;
        this.value = numericValue;
    }

    /** Creates a text attribute. */
    public Attribute(String name, String textValue) {
        this.name = name;
        this.value = textValue;
    }

    public String getName() {
        return name;
    }

    /**
     * @return the value as a Double.
     * @throws ClassCastException
     *             if this attribute holds a text value.
     */
    public Double getNumericValue() {
        return (Double) value;
    }

    /**
     * @return the value as a String.
     * @throws ClassCastException
     *             if this attribute holds a numeric value.
     */
    public String getTextValue() {
        return (String) value;
    }

    public boolean isNumeric() {
        return value instanceof java.lang.Double;
    }

    public boolean isText() {
        return value instanceof java.lang.String;
    }

    /** Two attributes are equal when both name and value are equal. */
    @Override
    public boolean equals(Object obj) {
        if (obj == this) {
            return true;
        }
        if (obj == null || obj.getClass() != getClass()) {
            return false;
        }
        final Attribute that = (Attribute) obj;
        boolean sameName = (name == null) ? that.name == null
                : name.equals(that.name);
        if (!sameName) {
            return false;
        }
        return (value == null) ? that.value == null
                : value.equals(that.value);
    }

    @Override
    public int hashCode() {
        int nameHash = (name == null) ? 0 : name.hashCode();
        int valueHash = (value == null) ? 0 : value.hashCode();
        return 31 * (31 + nameHash) + valueHash;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("[name=").append(name);
        text.append(", value=").append(value);
        text.append(", isText=").append(isText());
        text.append(", isNumeric=").append(isNumeric());
        text.append("]");
        return text.toString();
    }

}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.model; + +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.Set; + +/* + * Group of data points. + */ +public class Cluster { + + private String label; + + private Set elements; + + // Empty cluster with no elements. + public Cluster() { + init(""); + } + + // New cluster that contains all elements from provided clusters. + public Cluster(Cluster c1, Cluster c2) { + init(""); + add(c1); + add(c2); + } + + public Cluster(Collection elements) { + init(""); + for (DataPoint e : elements) { + add(e); + } + } + + public Cluster(DataPoint element) { + init(""); + add(element); + } + + public Cluster(String label) { + init(label); + } + + public Cluster(String label, Collection elements) { + init(label); + for (DataPoint e : elements) { + add(e); + } + } + + /** + * Modifies existing cluster by adding all elements from provided cluster. + * + * @param c + */ + public void add(Cluster c) { + for (DataPoint e : c.getElements()) { + elements.add(e); + } + } + + /** + * Modifies existing cluster by adding a new element. + * + * @param e + */ + public void add(DataPoint e) { + elements.add(e); + } + + public boolean contains(Cluster c) { + boolean result = true; + for (DataPoint e : c.getElements()) { + if (!contains(e)) { + result = false; + break; + } + } + return result; + } + + public boolean contains(DataPoint e) { + return elements.contains(e); + } + + public Cluster copy() { + Cluster copy = new Cluster(); + for (DataPoint e : this.getElements()) { + // DataPoint is immutable. 
No need to create a copy. + copy.add(e); + } + return copy; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final Cluster other = (Cluster) obj; + if (elements == null) { + if (other.elements != null) + return false; + } else if (!elements.equals(other.elements)) + return false; + return true; + } + + /* + * Returns number of attributes used to define points in the cluster. + */ + public int getDimensionCount() { + if (elements == null || elements.isEmpty()) { + return 0; + } + + return elements.iterator().next().getAttributeCount(); + } + + public Set getElements() { + return new LinkedHashSet(elements); + } + + public String getElementsAsString() { + StringBuffer buf = new StringBuffer("{"); + for (DataPoint e : elements) { + if (buf.length() > 1) { + buf.append(",\n"); + } + buf.append(e.getLabel()); + } + buf.append("}"); + + return buf.toString(); + } + + public String getLabel() { + return label; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + + ((elements == null) ? 
0 : elements.hashCode()); + return result; + } + + private void init(String label) { + this.label = label; + elements = new LinkedHashSet(); + } + + public int size() { + return elements.size(); + } + + @Override + public String toString() { + return getElementsAsString(); + } + +} diff --git a/src/org/yooreeka/algos/clustering/model/DataPoint.java b/src/org/yooreeka/algos/clustering/model/DataPoint.java new file mode 100644 index 0000000..7ccadc6 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/model/DataPoint.java @@ -0,0 +1,181 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.model; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.yooreeka.algos.clustering.utils.Attributes; +import org.yooreeka.util.metrics.EuclideanDistance; + +/** + * A Thing to be clustered. Defined by a set of attributes. + */ +public class DataPoint { + + /** + * Descriptive label or name. We also use it as unique ID for the instance. + */ + private String label; + + /** + * Collection of attributes that define this point. + */ + private Attribute[] attributes; + + /* + * Values derived from attributes. + */ + private String[] attributeNames; + private double[] numericAttributeValues; + private String[] textAttributeValues; + + public DataPoint(String label, Attribute[] attributes) { + init(label, attributes); + } + + /** + * Creates a new point with numerical attributes. Attribute names are + * auto-generated. 
+ */ + public DataPoint(String label, double[] attrValues) { + // create attributes with auto-generated names + init(label, Attributes.createAttributes(attrValues)); + } + + public DataPoint(String label, String[] attrValues) { + // create attributes with auto-generated names + init(label, Attributes.createAttributes(attrValues)); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final DataPoint other = (DataPoint) obj; + if (!Arrays.equals(attributes, other.attributes)) + return false; + if (label == null) { + if (other.label != null) + return false; + } else if (!label.equals(other.label)) + return false; + return true; + } + + public int getAttributeCount() { + return numericAttributeValues.length; + } + + public String[] getAttributeNames() { + return attributeNames; + } + + public Attribute[] getAttributes() { + return attributes; + } + + public String getLabel() { + return label; + } + + public double[] getNumericAttrValues() { + return numericAttributeValues; + } + + public double getR() { + + EuclideanDistance euclid = new EuclideanDistance(); + + int n = attributes.length; + + double[] x = new double[n]; + + for (int i = 0; i < n; i++) { + x[i] = 0d; + } + + return euclid.getDistance(x, this.numericAttributeValues); + } + + public String[] getTextAttrValues() { + return textAttributeValues; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + Arrays.hashCode(attributes); + result = prime * result + ((label == null) ? 
0 : label.hashCode()); + return result; + } + + private void init(String label, Attribute[] attributes) { + this.label = label; + this.attributes = attributes; + this.attributeNames = Attributes.getNames(attributes); + if (Attributes.allText(attributes)) { + this.textAttributeValues = Attributes.getTextValues(attributes); + } else { + this.textAttributeValues = null; + } + if (Attributes.allNumeric(attributes)) { + this.numericAttributeValues = Attributes + .getNumericValues(attributes); + } else { + this.numericAttributeValues = null; + } + } + + public String toShortString() { + List attrValues = new ArrayList(); + for (Attribute a : attributes) { + if (a.isNumeric()) { + attrValues.add(String.valueOf(a.getNumericValue())); + } else { + attrValues.add(a.getTextValue()); + } + } + return label + "(" + attrValues.toString() + ")"; + } + + @Override + public String toString() { + return label + "(" + Arrays.toString(attributes) + ")"; + } + +} diff --git a/src/org/yooreeka/algos/clustering/partitional/KMeansAlgorithm.java b/src/org/yooreeka/algos/clustering/partitional/KMeansAlgorithm.java new file mode 100644 index 0000000..43b07c1 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/partitional/KMeansAlgorithm.java @@ -0,0 +1,306 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.partitional; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.logging.Logger; + +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.algos.clustering.utils.Attributes; +import org.yooreeka.config.YooreekaConfigurator; + +public class KMeansAlgorithm { + + private static final Logger LOG = Logger.getLogger(KMeansAlgorithm.class.getName()); + + public static void main(String[] args) { + + DataPoint[] dataPoints = new DataPoint[] { + new DataPoint("2", new double[] { 2.0 }), + new DataPoint("4", new double[] { 4.0 }), + new DataPoint("10", new double[] { 10.0 }), + new DataPoint("12", new double[] { 12.0 }), + new DataPoint("3", new double[] { 3.0 }), + new DataPoint("20", new double[] { 20.0 }), + new DataPoint("30", new double[] { 30.0 }), + new DataPoint("11", new double[] { 11.0 }), + new DataPoint("25", new double[] { 25.0 }) }; + + DataPoint[] clusterMeans = new DataPoint[] { 
+ new DataPoint("Mean-2", new double[] { 2.0 }), + new DataPoint("Mean-4", new double[] { 4.0 }) }; + + KMeansAlgorithm kmeans = new KMeansAlgorithm(clusterMeans, dataPoints); + kmeans.cluster(); + + kmeans.print(); + + } + public static DataPoint[] pickInitialCentroids(int k, DataPoint[] data) { + + Random randGen = new Random(); + DataPoint[] centroids = new DataPoint[k]; + + // Calculate random mean values for each cluster based on the data + /** + * TODO: 4.2 -- Selecting the means for seeding + * + * In large datasets, the selection of the initial centroids can be + * important from a computational (time) complexity perspective. + * + * In general, how can we improve the seeding of the initial mean + * values? For example, consider the following heuristic: + * + * 1. pick randomly one node 2. calculate the distance between that node + * and O (10*k) other nodes 3. sort the list of nodes according to their + * distance from the first node 4. pick every 10th node in the sequence + * 5. calculate the mean distance between each one of these nodes and + * the original node + * + * This algorithmic choice is as ad hoc as they come, however, it does + * have some key principles embedded in it? What are these principles? + * How can you generalize this algorithm? + * + * Discuss advantages/disadvantages of the initial seeding with your + * friends. + * + */ + Set previouslyUsedIds = new HashSet(); + for (int i = 0; i < k; i++) { + // pick point index that we haven't used yet + int centroidId; + do { + centroidId = randGen.nextInt(data.length); + } while (previouslyUsedIds.add(centroidId) == false); + + // Create DataPoint that will represent the cluster's centroid. 
+ String label = "Mean-" + i + "(" + data[centroidId].getLabel() + + ")"; + double[] values = data[centroidId].getNumericAttrValues(); + String[] attrNames = data[centroidId].getAttributeNames(); + centroids[i] = new DataPoint(label, Attributes.createAttributes( + attrNames, values)); + } + + return centroids; + } + private int k; + private DataPoint[] allCentroids; + + private Cluster[] allClusters; + + private DataPoint[] allDataPoints; + + /** + * @param initialCentroids + * - starting values for the centroids of each cluster. + */ + public KMeansAlgorithm(DataPoint[] initialCentroids, DataPoint[] dataPoints) { + init(initialCentroids, dataPoints); + } + + /** + * + * @param k + * - desired number of clusters. + * + */ + public KMeansAlgorithm(int k, DataPoint[] dataPoints) { + DataPoint[] initialCentroids = KMeansAlgorithm.pickInitialCentroids(k, + dataPoints); + init(initialCentroids, dataPoints); + } + + public void cluster() { + + boolean centroidsChanged = true; + + while (centroidsChanged == true) { + // Create a set points for each cluster + List> clusters = new ArrayList>(k); + for (int i = 0; i < k; i++) { + clusters.add(new HashSet()); + } + + // Assign points to each set based on minimum distance from the + // centroids + for (DataPoint p : allDataPoints) { + int i = findClosestCentroid(allCentroids, p); + clusters.get(i).add(p); + } + + for (int i = 0; i < k; i++) { + allClusters[i] = new Cluster(clusters.get(i)); + } + + // Calculate new cluster centroids, and + // check if any of the centroids has changed + centroidsChanged = false; + for (int i = 0; i < allClusters.length; i++) { + if (clusters.get(i).size() > 0) { + double[] newCentroidValues = findCentroid(allClusters[i]); + double[] oldCentroidValues = allCentroids[i] + .getNumericAttrValues(); + if (!Arrays.equals(oldCentroidValues, newCentroidValues)) { + allCentroids[i] = new DataPoint( + allCentroids[i].getLabel(), newCentroidValues); + centroidsChanged = true; + } + } else { + // keep mean 
unchanged if cluster has no elements. + } + } + } + } + + private double distance(DataPoint x, DataPoint y) { + return distance(x.getNumericAttrValues(), y.getNumericAttrValues()); + } + + private double distance(double[] x, double[] y) { + double sumXY2 = 0.0; + for (int i = 0, n = x.length; i < n; i++) { + sumXY2 += Math.pow(x[i] - y[i], 2); + } + return Math.sqrt(sumXY2); + } + + private double[] findCentroid(Cluster c) { + + Set clusterPoints = c.getElements(); + int n = clusterPoints.size(); + + if (n == 0) { + return new double[0]; + } + + int d = c.getDimensionCount(); + double[] meanAttributes = new double[d]; + + for (DataPoint p : clusterPoints) { + double[] pointAttributes = p.getNumericAttrValues(); + for (int i = 0; i < d; i++) { + meanAttributes[i] += pointAttributes[i]; + } + } + + for (int i = 0; i < d; i++) { + meanAttributes[i] = meanAttributes[i] / n; + } + + return meanAttributes; + } + + /** + * This method calculates the closest centroid for a given data point + * + * @param centroids + * @param x + * is the DataPoint for which we seek the closest + * centroid + * @return the index (from the centroids array) of the closest centroid + */ + private int findClosestCentroid(DataPoint[] centroids, DataPoint x) { + double minDistance = Double.POSITIVE_INFINITY; + int closestCentroid = -1; + for (int i = 0, n = centroids.length; i < n; i++) { + double d = distance(centroids[i], x); + // if the d == minDistance then keep current selection + if (d < minDistance) { + minDistance = d; + closestCentroid = i; + } + + } + return closestCentroid; + } + + public DataPoint[] getAllCentroids() { + return this.allCentroids; + } + + public Cluster[] getAllClusters() { + return this.allClusters; + } + + public int getK() { + return this.k; + } + + private void init(DataPoint[] initialCentroids, DataPoint[] dataPoints) { + + LOG.setLevel(YooreekaConfigurator.getLevel(KMeansAlgorithm.class.getName())); + + this.k = initialCentroids.length; + this.allDataPoints = 
dataPoints; + this.allCentroids = initialCentroids; + this.allClusters = new Cluster[k]; + } + + public void print() { + // show results + Cluster[] clusters = this.getAllClusters(); + + System.out.println("Clusters:"); + for (Cluster c : clusters) { + System.out.println(c.getElementsAsString()); + } + } + + public void printAll() { + + Cluster[] clusters = this.getAllClusters(); + System.out.println("Clusters:"); + for (Cluster c : clusters) { + System.out.println(c.getElementsAsString()); + } + System.out + .println("___________________________________________________"); + DataPoint[] means = this.getAllCentroids(); + System.out.println("Cluster means:"); + for (DataPoint p : means) { + System.out.println(p.toString()); + } + } + + public void printMeans() { + System.out.println("Cluster means:"); + for (DataPoint mean : this.allCentroids) { + System.out.println(mean); + } + } +} diff --git a/src/org/yooreeka/algos/clustering/partitional/NearestNeighborAlgorithm.java b/src/org/yooreeka/algos/clustering/partitional/NearestNeighborAlgorithm.java new file mode 100644 index 0000000..b1a67f6 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/partitional/NearestNeighborAlgorithm.java @@ -0,0 +1,230 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.partitional; + +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.algos.clustering.utils.ObjectToIndexMapping; +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.util.metrics.NumericDistance; +import org.yooreeka.util.metrics.EuclideanDistance; + +public class NearestNeighborAlgorithm { + + private static final Logger LOG = Logger.getLogger(NearestNeighborAlgorithm.class.getName()); + + public static void main(String[] args) { + + DataPoint[] elements = new DataPoint[5]; + elements[0] = new DataPoint("A", new double[] {}); + elements[1] = new DataPoint("B", new double[] {}); + elements[2] = new DataPoint("C", new double[] {}); + elements[3] = new DataPoint("D", new double[] {}); + elements[4] = new DataPoint("E", new double[] {}); + + double[][] a = new double[][] { { 0, 1, 2, 2, 3 }, { 1, 0, 2, 4, 3 }, + { 2, 2, 0, 1, 5 }, { 2, 4, 1, 0, 3 }, { 3, 3, 5, 3, 0 } }; + + double threshold = 2; + + 
NearestNeighborAlgorithm nn = new NearestNeighborAlgorithm(elements, a, + threshold); + + nn.run(); + } + + /* + * All elements for clustering. + */ + private DataPoint[] allDataPoints; + + /* + * Matrix with distances between elements. + */ + private double[][] a; + + /* + * Threshold value that is used to determine if elements will be added to + * one of the existing clusters or if a new cluster will be created. + */ + private double t = 0.5; + + /* + * List of clusters. + */ + private List allClusters; + + /* + * Distance metric that will be used to calculate distance between elements. + */ + private NumericDistance dist = new EuclideanDistance(); + + /* + * DataPoint -> Index mapping. Used to access data in distance matrix. + */ + ObjectToIndexMapping idxMapping = null; + + private boolean verbose = true; + + public NearestNeighborAlgorithm(DataPoint[] dataPoints, double t) { + this(dataPoints, null, t); + } + + /** + * + * @param dataPoints + * elements to cluster. Element order should correspond to + * elements in distance matrix. + * @param a + * matrix showing distance between elements. Can be null. + * @param t + * threshold value for new cluster creation. + */ + public NearestNeighborAlgorithm(DataPoint[] dataPoints, double[][] a, + double t) { + + LOG.setLevel(YooreekaConfigurator.getLevel(NearestNeighborAlgorithm.class.getName())); + + this.t = t; + this.allDataPoints = dataPoints; + this.a = a; + this.allClusters = new ArrayList(); + + /* + * Create DataPoint -> Index mapping for all data points. 
+ */ + idxMapping = new ObjectToIndexMapping(); + + for (int i = 0, n = dataPoints.length; i < n; i++) { + idxMapping.getIndex(dataPoints[i]); + } + + } + + private void assignPointToCluster(DataPoint x) { + + /* find min distance between current point and all clusters */ + double minNNDist = Double.POSITIVE_INFINITY; + Cluster closestCluster = null; + for (Cluster c : allClusters) { + double nnDist = getNNDistance(c, x); + if (nnDist < minNNDist) { + minNNDist = nnDist; + closestCluster = c; + } + } + + /* Assign point to cluster based on calculated distance and threshold */ + if (minNNDist <= t) { + closestCluster.add(x); + } else { + /* Best distance exceeds the threshold - create a new cluster. */ + Cluster newCluster = new Cluster(); + newCluster.add(x); + allClusters.add(newCluster); + } + } + + private void calculateDistanceMatrix() { + a = new double[allDataPoints.length][allDataPoints.length]; + for (int i = 0, n = allDataPoints.length; i < n; i++) { + DataPoint x = allDataPoints[i]; + for (int j = i + 1; j < n; j++) { + DataPoint y = allDataPoints[j]; + a[i][j] = dist.getDistance(x.getNumericAttrValues(), + y.getNumericAttrValues()); + a[j][i] = a[i][j]; + } + a[i][i] = 0.0; + } + } + + public List getAllClusters() { + return allClusters; + } + + /** + * Calculates distance between cluster and element using Nearest Neighbor + * approach. 
+ */ + private double getNNDistance(Cluster c, DataPoint x) { + + double nnDist = Double.POSITIVE_INFINITY; + + if (c.contains(x)) { + nnDist = 0.0; + } else { + int i = idxMapping.getIndex(x); + for (DataPoint y : c.getElements()) { + int j = idxMapping.getIndex(y); + double xyDist = a[i][j]; + nnDist = Math.min(nnDist, xyDist); + } + } + + return nnDist; + } + + private void printResults() { + System.out.println("Nearest Neighbor Clustering with t = " + t); + System.out.println("Clusters:"); + for (Cluster c : allClusters) { + System.out.println(c.getElementsAsString()); + } + } + + public void run() { + + if (allDataPoints == null || allDataPoints.length == 0) { + return; + } + + if (a == null) { + calculateDistanceMatrix(); + } + + for (int i = 0, n = allDataPoints.length; i < n; i++) { + assignPointToCluster(allDataPoints[i]); + } + + if (verbose) { + printResults(); + } + } + + public void setDistance(NumericDistance dist) { + this.dist = dist; + } +} diff --git a/src/org/yooreeka/algos/clustering/rock/LinkMatrix.java b/src/org/yooreeka/algos/clustering/rock/LinkMatrix.java new file mode 100644 index 0000000..4728cca --- /dev/null +++ b/src/org/yooreeka/algos/clustering/rock/LinkMatrix.java @@ -0,0 +1,195 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.rock; + +import java.util.Arrays; +import java.util.Set; +import java.util.logging.Logger; + +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.algos.clustering.utils.ObjectToIndexMapping; +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.util.metrics.SimilarityMeasure; + +/** + * Calculates number of links between data points. 
+ */ +public class LinkMatrix { + + private static final Logger LOG = Logger.getLogger(LinkMatrix.class.getName()); + + private double th; + double[][] pointSimilarityMatrix; + int[][] pointNeighborMatrix; + int[][] pointLinkMatrix; + private ObjectToIndexMapping objToIndexMapping; + + public LinkMatrix(DataPoint[] points, double[][] similarityMatrix, double th) { + init(points, similarityMatrix, th); + } + + public LinkMatrix(DataPoint[] points, SimilarityMeasure pointSim, double th) { + + double[][] similarityMatrix = calculatePointSimilarities(points, + pointSim); + init(points, similarityMatrix, th); + } + + /* + * Calculates similarity matrix for all points. + */ + private double[][] calculatePointSimilarities(DataPoint[] points, + SimilarityMeasure pointSim) { + + int n = points.length; + double[][] simMatrix = new double[n][n]; + for (int i = 0; i < n; i++) { + DataPoint itemX = points[i]; + String[] attributesX = itemX.getTextAttrValues(); + for (int j = i + 1; j < n; j++) { + DataPoint itemY = points[j]; + String[] attributesY = itemY.getTextAttrValues(); + simMatrix[i][j] = pointSim.similarity(attributesX, attributesY); + simMatrix[j][i] = simMatrix[i][j]; + } + simMatrix[i][i] = 1.0; + } + + return simMatrix; + } + + /** + * Calculates number of links between two clusters. Number of links between + * two clusters is the sum of links between all point pairs( p1, p2) where + * p1 belongs to the first cluster and p2 belongs to the other cluster. + * + * @param clusterX + * @param clusterY + * + * @return link count between two clusters. 
+ */ + public int getLinks(Cluster clusterX, Cluster clusterY) { + Set itemsX = clusterX.getElements(); + Set itemsY = clusterY.getElements(); + + int linkSum = 0; + + for (DataPoint x : itemsX) { + for (DataPoint y : itemsY) { + linkSum += getLinks(x, y); + } + } + return linkSum; + } + + public int getLinks(DataPoint p1, DataPoint p2) { + int i = objToIndexMapping.getIndex(p1); + int j = objToIndexMapping.getIndex(p2); + return pointLinkMatrix[i][j]; + } + + private void init(DataPoint[] points, double[][] similarityMatrix, double th) { + + LOG.setLevel(YooreekaConfigurator.getLevel(LinkMatrix.class.getName())); + + this.th = th; + + objToIndexMapping = new ObjectToIndexMapping(); + + // Create DataPoint <-> Index mapping. + for (DataPoint point : points) { + objToIndexMapping.getIndex(point); + } + + pointSimilarityMatrix = similarityMatrix; + + // Identify neighbors: a[i][j] == 1 if (i,j) are neighbors and 0 + // otherwise. + int n = points.length; + + pointNeighborMatrix = new int[n][n]; + for (int i = 0; i < n; i++) { + for (int j = i + 1; j < n; j++) { + if (pointSimilarityMatrix[i][j] >= th) { + pointNeighborMatrix[i][j] = 1; + } else { + pointNeighborMatrix[i][j] = 0; + } + pointNeighborMatrix[j][i] = pointNeighborMatrix[i][j]; + } + pointNeighborMatrix[i][i] = 1; + } + + // Calculate number of links between points + pointLinkMatrix = new int[n][n]; + for (int i = 0; i < n; i++) { + for (int j = i; j < n; j++) { + pointLinkMatrix[i][j] = nLinksBetweenPoints( + pointNeighborMatrix, i, j); + pointLinkMatrix[j][i] = pointLinkMatrix[i][j]; + } + } + + } + + private int nLinksBetweenPoints(int[][] neighbors, int indexX, int indexY) { + int nLinks = 0; + for (int i = 0, n = neighbors.length; i < n; i++) { + nLinks += neighbors[indexX][i] * neighbors[i][indexY]; + } + return nLinks; + } + + public void printPointLinkMatrix() { + System.out + .println("Point Link matrix (th=" + String.valueOf(th) + "):"); + for (int i = 0; i < pointLinkMatrix.length; i++) { + 
System.out.println(Arrays.toString(pointLinkMatrix[i])); + } + } + + public void printPointNeighborMatrix() { + System.out.println("Point Neighbor matrix (th=" + String.valueOf(th) + + "):"); + for (int i = 0; i < pointNeighborMatrix.length; i++) { + System.out.println(Arrays.toString(pointNeighborMatrix[i])); + } + } + + public void printSimilarityMatrix() { + System.out.println("Point Similarity matrix:"); + for (int i = 0; i < pointSimilarityMatrix.length; i++) { + System.out.println(Arrays.toString(pointSimilarityMatrix[i])); + } + } + +} diff --git a/src/org/yooreeka/algos/clustering/rock/MergeGoodnessMeasure.java b/src/org/yooreeka/algos/clustering/rock/MergeGoodnessMeasure.java new file mode 100644 index 0000000..21d217a --- /dev/null +++ b/src/org/yooreeka/algos/clustering/rock/MergeGoodnessMeasure.java @@ -0,0 +1,92 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
/**
 * Goodness measure for merging two clusters.
 */
public class MergeGoodnessMeasure {

    /*
     * Threshold value that was used to identify neighbors among points.
     */
    private double linkThreshold;

    /*
     * Intermediate value that is used in calculation of goodness measure and
     * stays the same for different clusters. It is derived from
     * linkThreshold as 1 + 2 * f(linkThreshold) and must be refreshed
     * whenever the threshold changes.
     */
    private double p;

    /**
     * @param th
     *            threshold value that was used to identify neighbors among
     *            points.
     */
    public MergeGoodnessMeasure(double th) {
        this.linkThreshold = th;
        this.p = exponentFor(th);
    }

    /*
     * Exponent used by g() for the given threshold.
     */
    private double exponentFor(double th) {
        return 1.0 + 2.0 * f(th);
    }

    /**
     * This is just one of the possible implementations.
     *
     * @param th
     *            threshold value that was used to identify neighbors among
     *            points.
     */
    private double f(double th) {

        /*
         * This implementation assumes that linkThreshold was a threshold for
         * similarity measure (as opposed to dissimilarity/distance).
         */
        return (1.0 - th) / (1.0 + th);
    }

    /**
     * Goodness of merging two clusters: the link count divided by
     * (nX + nY)^p - nX^p - nY^p.
     *
     * The denominator is zero when p is 1 (threshold of 1.0), in which case
     * the floating-point division yields an infinite or NaN result rather
     * than an exception.
     *
     * @param nLinks
     *            number of links between the two clusters.
     * @param nX
     *            size of the first cluster.
     * @param nY
     *            size of the second cluster.
     * @return goodness value; larger means a better merge candidate.
     */
    public double g(int nLinks, int nX, int nY) {
        double a = Math.pow(nX + nY, p);
        double b = Math.pow(nX, p);
        double c = Math.pow(nY, p);

        return nLinks / (a - b - c);
    }

    /**
     * @return the linkThreshold
     */
    public double getTh() {
        return linkThreshold;
    }

    /**
     * Changes the threshold and refreshes the exponent derived from it, so
     * that subsequent calls to {@link #g(int, int, int)} use the new value.
     *
     * @param th
     *            the linkThreshold to set
     */
    public void setTh(double th) {
        this.linkThreshold = th;
        this.p = exponentFor(th);
    }
}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.rock; + +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import org.yooreeka.algos.clustering.hierarchical.Dendrogram; +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.util.metrics.JaccardCoefficient; +import org.yooreeka.util.metrics.SimilarityMeasure; + +public class ROCKAlgorithm { + + private static final Logger LOG = Logger.getLogger(ROCKAlgorithm.class.getName()); + + public static void main(String[] args) { + // Define data + DataPoint[] elements = new DataPoint[4]; + elements[0] = new DataPoint("Doc1", new String[] { "book" }); + elements[1] = new DataPoint("Doc2", new String[] { "water", "sun", + "sand", "swim" }); + elements[2] = new DataPoint("Doc3", new String[] { "water", "sun", + "swim", "read" }); + elements[3] = new DataPoint("Doc4", new String[] { "read", "sand" }); + + int k = 1; + double th = 0.2; + ROCKAlgorithm rock = new ROCKAlgorithm(elements, k, th); + Dendrogram dnd = rock.cluster(); + dnd.printAll(); + } + private DataPoint[] points; + private int k; + + private double th; + + private SimilarityMeasure similarityMeasure; + + private LinkMatrix linkMatrix; + + /** + * + * @param k + * desired number of clusters. + * @param th + * threshold value to identify neighbors among points. 
+ */ + public ROCKAlgorithm(DataPoint[] points, int k, double th) { + + LOG.setLevel(YooreekaConfigurator.getLevel(ROCKAlgorithm.class.getName())); + + this.points = points; + this.k = k; + this.th = th; + this.similarityMeasure = new JaccardCoefficient(); + // this.similarityMeasure = new CosineSimilarity(); + this.linkMatrix = new LinkMatrix(points, similarityMeasure, th); + } + + public Dendrogram cluster() { + + // Create a new cluster out of every point. + List initialClusters = new ArrayList(); + for (int i = 0, n = points.length; i < n; i++) { + Cluster cluster = new Cluster(points[i]); + initialClusters.add(cluster); + } + double g = Double.POSITIVE_INFINITY; + Dendrogram dnd = new Dendrogram("Goodness"); + dnd.addLevel(String.valueOf(g), initialClusters); + + MergeGoodnessMeasure goodnessMeasure = new MergeGoodnessMeasure(th); + + ROCKClusters allClusters = new ROCKClusters(initialClusters, + linkMatrix, goodnessMeasure); + + int nClusters = allClusters.size(); + while (nClusters > k) { + int nClustersBeforeMerge = nClusters; + g = allClusters.mergeBestCandidates(); + nClusters = allClusters.size(); + if (nClusters == nClustersBeforeMerge) { + // there are no linked clusters to merge + break; + } + dnd.addLevel(String.valueOf(g), allClusters.getAllClusters()); + } + + System.out.println("Number of clusters: " + + allClusters.getAllClusters().size()); + return dnd; + } + + public int getK() { + return k; + } + + public LinkMatrix getLinkMatrix() { + return linkMatrix; + } + + public SimilarityMeasure getSimilarityMeasure() { + return similarityMeasure; + } + + public double getTh() { + return th; + } + +} diff --git a/src/org/yooreeka/algos/clustering/rock/ROCKClusters.java b/src/org/yooreeka/algos/clustering/rock/ROCKClusters.java new file mode 100644 index 0000000..447cb57 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/rock/ROCKClusters.java @@ -0,0 +1,205 @@ +/* + * 
________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.clustering.rock; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; + +import org.yooreeka.algos.clustering.model.Cluster; +import org.yooreeka.config.YooreekaConfigurator; + +/** + * Set of clusters and link data for ROCK implementation. + */ +public class ROCKClusters { + + private static final Logger LOG = Logger.getLogger(ROCKClusters.class.getName()); + + /* + * Used to assign unique IDs to clusters. + */ + private int nextKey; + + /* + * Provides ID -> Cluster mapping. + */ + private Map clusterMap; + + /* + * Provides ID -> Similar Clusters mapping. + */ + private Map> similarClustersMap; + + /* + * Goodness measure between two clusters. It is used to determine cluster + * eligibility for merge. + */ + private MergeGoodnessMeasure goodnessMeasure; + + /* + * Links between data points and clusters. 
+ */ + private LinkMatrix linkMatrix; + + public ROCKClusters(List initialClusters, LinkMatrix linkMatrix, + MergeGoodnessMeasure goodnessMeasure) { + + LOG.setLevel(YooreekaConfigurator.getLevel(ROCKClusters.class.getName())); + + this.linkMatrix = linkMatrix; + clusterMap = new HashMap(); + nextKey = 0; + this.goodnessMeasure = goodnessMeasure; + + for (Cluster c : initialClusters) { + addCluster(c); + } + calculateClusterSimilarities(); + } + + public int addCluster(Cluster c) { + int key = nextKey; + clusterMap.put(key, c); + nextKey++; + return key; + } + + public void calculateClusterSimilarities() { + similarClustersMap = new HashMap>(); + for (Integer clusterKey : getAllKeys()) { + List similarClusters = new LinkedList(); + Cluster cluster = getCluster(clusterKey); + for (Integer similarClusterKey : getAllKeys()) { + if (clusterKey != similarClusterKey) { + Cluster similarCluster = getCluster(similarClusterKey); + int nLinks = linkMatrix.getLinks(cluster, similarCluster); + if (nLinks > 0) { + double goodness = goodnessMeasure.g(nLinks, + cluster.size(), similarCluster.size()); + similarClusters.add(new SimilarCluster( + similarClusterKey, goodness)); + } + } + } + setSimilarClusters(clusterKey, similarClusters); + } + } + + /** + * Finds a pair of cluster indexes with the best goodness measure. 
+ */ + public List findBestMergeCandidates() { + Integer bestKey = null; + SimilarCluster bestSimilarCluster = null; + Double bestGoodness = Double.NEGATIVE_INFINITY; + for (Map.Entry> e : similarClustersMap + .entrySet()) { + List similarClusters = e.getValue(); + if (similarClusters != null && similarClusters.size() > 0) { + SimilarCluster topSimilarCluster = similarClusters.get(0); + if (topSimilarCluster.getGoodness() > bestGoodness) { + bestGoodness = topSimilarCluster.getGoodness(); + bestKey = e.getKey(); + bestSimilarCluster = topSimilarCluster; + } + } + } + List bestMergeCandidates = new ArrayList(); + if (bestKey != null) { + bestMergeCandidates.add(bestKey); + bestMergeCandidates.add(bestSimilarCluster.getClusterKey()); + } + return bestMergeCandidates; + } + + public Collection getAllClusters() { + return clusterMap.values(); + } + + public Set getAllKeys() { + return new HashSet(clusterMap.keySet()); + } + + public Cluster getCluster(Integer key) { + return clusterMap.get(key); + } + + public double mergeBestCandidates() { + List mergeCandidates = findBestMergeCandidates(); + + double goodness = Double.NaN; + + if (mergeCandidates.size() > 1) { + + Integer key1 = mergeCandidates.get(0); + Integer key2 = mergeCandidates.get(1); + goodness = similarClustersMap.get(key1).get(0).getGoodness(); + + mergeClusters(key1, key2); + } + + return goodness; + } + + public Integer mergeClusters(Integer key1, Integer key2) { + + Cluster cluster1 = getCluster(key1); + Cluster cluster2 = getCluster(key2); + Cluster cluster3 = new Cluster(cluster1, cluster2); + removeCluster(key1); + removeCluster(key2); + Integer key3 = addCluster(cluster3); + + calculateClusterSimilarities(); + + return key3; + } + + public Cluster removeCluster(Integer key) { + return clusterMap.remove(key); + } + + private void setSimilarClusters(Integer key, List list) { + SimilarCluster.sortByGoodness(list); + similarClustersMap.put(key, list); + } + + public int size() { + return 
clusterMap.size(); + } +} diff --git a/src/org/yooreeka/algos/clustering/rock/SimilarCluster.java b/src/org/yooreeka/algos/clustering/rock/SimilarCluster.java new file mode 100644 index 0000000..e4ad7dc --- /dev/null +++ b/src/org/yooreeka/algos/clustering/rock/SimilarCluster.java @@ -0,0 +1,85 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * A candidate partner cluster, identified by its key, together with the
 * goodness of merging with it.
 */
public class SimilarCluster {

    /*
     * Orders by goodness, highest first. Double.compare gives a consistent
     * total order (including NaN), which Collections.sort requires.
     */
    private static final Comparator<SimilarCluster> BY_GOODNESS_DESCENDING = new Comparator<SimilarCluster>() {

        public int compare(SimilarCluster f1, SimilarCluster f2) {
            // arguments swapped: order in the decreasing order of goodness value
            return Double.compare(f2.getGoodness(), f1.getGoodness());
        }
    };

    /**
     * Sorts list by goodness value in descending order. Higher goodness values
     * will be in the head of the list.
     *
     * @param values
     *            list to sort.
     */
    public static void sortByGoodness(List<SimilarCluster> values) {
        Collections.sort(values, BY_GOODNESS_DESCENDING);
    }

    private Integer clusterKey;

    private Double goodness;

    /**
     * @param clusterKey
     *            key of the similar cluster.
     * @param goodness
     *            goodness of merging with that cluster.
     */
    public SimilarCluster(Integer clusterKey, Double goodness) {
        this.clusterKey = clusterKey;
        this.goodness = goodness;
    }

    public Integer getClusterKey() {
        return clusterKey;
    }

    public Double getGoodness() {
        return goodness;
    }

    @Override
    public String toString() {
        return "[clusterKey=" + this.clusterKey + ",goodness=" + this.goodness
                + "]";
    }
}
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.clustering.test; + +import java.util.List; + +import org.yooreeka.algos.clustering.dbscan.DBSCANAlgorithm; +import org.yooreeka.algos.clustering.hierarchical.Dendrogram; +import org.yooreeka.algos.clustering.model.Attribute; +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.algos.clustering.rock.ROCKAlgorithm; +import org.yooreeka.algos.clustering.utils.Attributes; +import org.yooreeka.algos.reco.collab.data.DiggData; +import org.yooreeka.algos.reco.collab.model.Content; +import org.yooreeka.algos.reco.content.digg.DiggStoryItem; +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.util.metrics.CosineDistance; + +public class MyDiggSpaceData { + + private static DataPoint createDataPoint(DiggStoryItem story, int topNTerms) { + String storyLabel = String.valueOf(story.getId() + ":" + + story.getTitle()); + String storyText = story.getTitle() + " " + story.getDescription(); + Content content = new Content(storyLabel, storyText, topNTerms); + String[] terms = content.getTerms(); + // using term as attribute name and value. 
+ Attribute[] attributes = Attributes.createAttributes(terms, terms); + return new DataPoint(storyLabel, attributes); + } + + public static MyDiggSpaceDataset createDataset() { + return createDataset(10); + } + + public static MyDiggSpaceDataset createDataset(int topNTerms) { + DiggData.loadData(YooreekaConfigurator.getHome() + + "/data/ch04/ch4_digg_stories.csv"); + + List allStories = DiggData.allStories; + + DataPoint[] allDataPoints = new DataPoint[allStories.size()]; + + for (int i = 0, n = allDataPoints.length; i < n; i++) { + DiggStoryItem story = allStories.get(i); + DataPoint di = createDataPoint(story, topNTerms); + allDataPoints[i] = di; + } + return new MyDiggSpaceDataset(allDataPoints); + } + + public static MyDiggSpaceDataset createDataset(int topNTerms, + List allStories) { + + DataPoint[] allDataPoints = new DataPoint[allStories.size()]; + + for (int i = 0, n = allDataPoints.length; i < n; i++) { + + DiggStoryItem story = allStories.get(i); + story.print(); + + DataPoint di = createDataPoint(story, topNTerms); + allDataPoints[i] = di; + } + return new MyDiggSpaceDataset(allDataPoints); + } + + public static void main(String[] args) { + // testRockOnDigg(); + testDBSCAN(); + } + + private static void testDBSCAN() { + MyDiggSpaceDataset ds = MyDiggSpaceData.createDataset(3); + double eps = 0.8; + int minPts = 2; + boolean useTermFreq = true; + DBSCANAlgorithm dbscan = new DBSCANAlgorithm(ds.getData(), + new CosineDistance(), eps, minPts, useTermFreq); + + dbscan.cluster(); + // dbscan.printDistances(); + } + + public static void testRockOnDigg() { + MyDiggSpaceDataset ds = MyDiggSpaceData.createDataset(10); + ROCKAlgorithm rock = new ROCKAlgorithm(ds.getData(), 4, 0.1); + // rock.getLinkMatrix().printSimilarityMatrix(); + // rock.getLinkMatrix().printPointNeighborMatrix(); + // rock.getLinkMatrix().printPointLinkMatrix(); + Dendrogram dnd = rock.cluster(); + dnd.print(130); // if you get NPE here it means that level doesn't + // exist. 
+ + // ROCK stops clustering if there are no links between clusters. + } +} diff --git a/src/org/yooreeka/algos/clustering/test/MyDiggSpaceDataset.java b/src/org/yooreeka/algos/clustering/test/MyDiggSpaceDataset.java new file mode 100644 index 0000000..a374d6f --- /dev/null +++ b/src/org/yooreeka/algos/clustering/test/MyDiggSpaceDataset.java @@ -0,0 +1,56 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.clustering.test; + +import org.yooreeka.algos.clustering.model.DataPoint; + +public class MyDiggSpaceDataset { + + private DataPoint[] data; + + private boolean verbose = true; + + public MyDiggSpaceDataset(DataPoint[] data) { + this.data = data; + + if (verbose) { + System.out.println("\nCreated " + this.getClass().getSimpleName() + + " dataset with " + data.length + " items:\n"); + for (DataPoint item : data) { + System.out.println(item.toShortString()); + } + } + } + + public DataPoint[] getData() { + return data; + } +} diff --git a/src/org/yooreeka/algos/clustering/test/SFData.java b/src/org/yooreeka/algos/clustering/test/SFData.java new file mode 100644 index 0000000..94e4f42 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/test/SFData.java @@ -0,0 +1,212 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.test; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.supercsv.io.CsvListReader; +import org.supercsv.prefs.CsvPreference; +import org.yooreeka.algos.clustering.model.Attribute; +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.algos.clustering.partitional.NearestNeighborAlgorithm; +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.util.metrics.NumericDistance; +import org.yooreeka.util.metrics.EuclideanDistance; + +public class SFData { + + /* + * All available attributes. + */ + private static String[] allAvailableAttributeNames = { "Age", + "IncomeRange", "Education", "Skills", "Social", "isPaid" }; + + public static SFDataset createDataset() { + return createDataset(allAvailableAttributeNames); + } + + /** + * Creates dataset that uses only attributes with specified names. Other + * attributes will not be loaded. + * + * @param attrNames + * attribute names to use. + * @return dataset that uses only specified attributes. 
+ */ + public static SFDataset createDataset(String[] attrNames) { + + // check that attribute names are valid + validateAttrNames(attrNames, allAvailableAttributeNames); + + DataPoint[] allData = loadDataFromFile(YooreekaConfigurator.getHome() + + "/data/ch04/clusteringSF.dat", attrNames); + + NumericDistance dist = new EuclideanDistance(); + SFDataset sfDataset = new SFDataset(allData, dist); + return sfDataset; + } + + private static DataPoint[] loadDataFromFile(String filename, + String[] attrNames) { + List allData = new ArrayList(); + CsvListReader csvReader = null; + try { + csvReader = new CsvListReader(new BufferedReader(new FileReader( + filename)), CsvPreference.EXCEL_PREFERENCE); + + // Load all available headers from CSV file + String[] csvHeaders = csvReader.getCSVHeader(true); + + // Map attribute names to field IDs from CSV file using header names + int[] attrFieldIndexes = new int[attrNames.length]; + for (int i = 0; i < attrFieldIndexes.length; i++) { + String header = attrNames[i]; + int csvHeaderId = -1; + for (int j = 0; j < csvHeaders.length; j++) { + if (header.equalsIgnoreCase(csvHeaders[j])) { + csvHeaderId = j; + break; + } + } + // If there is no header found it means we have wrong attribute + // name or wrong file. + if (csvHeaderId == -1) { + throw new IllegalStateException( + "Attribute name mismatch. " + + "Failed to find attribute name: '" + + header + + "' among cvs file headers. 
All available headers: " + + Arrays.toString(csvHeaders)); + } else { + attrFieldIndexes[i] = csvHeaderId; + } + } + + // Read file and include only selected attributes + List line = null; + while ((line = csvReader.read()) != null) { + try { + String label = line.get(0); + Attribute[] attributes = new Attribute[attrNames.length]; + for (int i = 0, n = attrNames.length; i < n; i++) { + int attrFieldIndex = attrFieldIndexes[i]; + String value = line.get(attrFieldIndex); + attributes[i] = new Attribute(attrNames[i], + Double.valueOf(value)); + } + DataPoint dataPoint = new DataPoint(label, attributes); + allData.add(dataPoint); + } catch (Exception e) { + throw new RuntimeException("Error while reading line: '" + + line + "'", e); + } + } + + } catch (IOException e) { + throw new RuntimeException( + "Error while reading SF data from csv file: '" + filename + + "'. ", e); + } finally { + try { + if (csvReader != null) { + csvReader.close(); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + System.out.println("From file: " + filename); + System.out.println("Using attribute names: " + + Arrays.toString(attrNames)); + System.out.println("Loaded " + allData.size() + " data points."); + + return allData.toArray(new DataPoint[allData.size()]); + } + + public static void main(String[] args) { + + // Creates dataset that uses all available attributes + SFDataset ds = SFData.createDataset(); + + // Creates dataset that uses only a subset of available attributes + // SFDataset ds = SFData.createDataset(new String[] {"IncomeRange", + // "Age"}); + // SFDataset ds = SFData.createDataset(new String[] {"Age"}); + + ds.printDistanceMatrix(); + + // Dendrogram dnd = null; + + // Uncomment one of these two run clustering + + // // Run Single Link Clustering + // SingleLinkAlgorithm sla = new SingleLinkAlgorithm(ds.getData(), + // ds.getDistanceMatrix()); + // dnd = sla.cluster(); + // dnd.print(); + + // // Run MST Single Link Clustering + // MSTSingleLinkAlgorithm 
msla = new + // MSTSingleLinkAlgorithm(ds.getData(), ds.getDistanceMatrix()); + // dnd = msla.cluster(); + // dnd.print(); + + // // Run Average Link Clustering + // AverageLinkAlgorithm ala = new AverageLinkAlgorithm(ds.getData(), + // ds.getDistanceMatrix()); + // dnd = ala.cluster(); + // dnd.print(); + + // double T = 5.0; + + NearestNeighborAlgorithm nna = new NearestNeighborAlgorithm( + ds.getData(), ds.getAdjacencyMatrix(), 5.0); + nna.run(); + } + + private static void validateAttrNames(String[] actualAttrNames, + String[] validAttrNames) { + List validNames = Arrays.asList(validAttrNames); + for (String actualAttrName : actualAttrNames) { + if (!validNames.contains(actualAttrName)) { + throw new IllegalArgumentException("Invalid attribute name: '" + + actualAttrName + "'. " + "Valid names are: " + + Arrays.toString(allAvailableAttributeNames)); + } + } + } +} diff --git a/src/org/yooreeka/algos/clustering/test/SFDataset.java b/src/org/yooreeka/algos/clustering/test/SFDataset.java new file mode 100644 index 0000000..1f65d16 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/test/SFDataset.java @@ -0,0 +1,93 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.test; + +import java.util.Arrays; + +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.util.metrics.NumericDistance; + +public class SFDataset { + + private DataPoint[] data; + private NumericDistance distance; + private double[][] adjacencyMatrix; + + public SFDataset(DataPoint[] data, NumericDistance distance) { + this.data = data; + this.distance = distance; + this.adjacencyMatrix = calculateAdjacencyMatrix(); + } + + /** + * Adjacency matrix for all data instances in the dataset. Each element + * represents distance between corresponding elements. 
+ * + * @return + */ + private double[][] calculateAdjacencyMatrix() { + int n = data.length; + double[][] adjMatrix = new double[n][n]; + + DataPoint x = null; + DataPoint y = null; + + for (int i = 0; i < n; i++) { + x = data[i]; + for (int j = i + 1; j < n; j++) { + y = data[j]; + adjMatrix[i][j] = distance.getDistance( + x.getNumericAttrValues(), y.getNumericAttrValues()); + adjMatrix[j][i] = adjMatrix[i][j]; + } + adjMatrix[i][i] = 0.0; + } + + return adjMatrix; + } + + // We might need to move Matrix related methods to separate class + // eventually. + + public double[][] getAdjacencyMatrix() { + return adjacencyMatrix; + } + + public DataPoint[] getData() { + return data; + } + + public void printDistanceMatrix() { + for (int i = 0, n = adjacencyMatrix.length; i < n; i++) { + System.out.println(Arrays.toString(adjacencyMatrix[i])); + } + } +} diff --git a/src/org/yooreeka/algos/clustering/utils/Attributes.java b/src/org/yooreeka/algos/clustering/utils/Attributes.java new file mode 100644 index 0000000..4234fd5 --- /dev/null +++ b/src/org/yooreeka/algos/clustering/utils/Attributes.java @@ -0,0 +1,143 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. 
See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.clustering.utils; + +import org.yooreeka.algos.clustering.model.Attribute; + +/* + * Utility methods to simplify operations on attributes. + */ +public class Attributes { + + public static boolean allNumeric(Attribute[] attributes) { + boolean allNumeric = true; + for (Attribute a : attributes) { + if (a.isNumeric() == false) { + allNumeric = false; + break; + } + } + return allNumeric; + } + + public static boolean allText(Attribute[] attributes) { + boolean allText = true; + for (Attribute a : attributes) { + if (a.isText() == false) { + allText = false; + break; + } + } + return allText; + } + + public static Attribute[] createAttributes(double[] attrValues) { + int n = attrValues.length; + Attribute[] attrs = new Attribute[n]; + for (int i = 0; i < n; i++) { + String attrName = "a-" + i; + Attribute a = new Attribute(attrName, attrValues[i]); + attrs[i] = a; + } + return attrs; + } + + public static Attribute[] createAttributes(String[] attrValues) { + int n = attrValues.length; + Attribute[] attrs = new Attribute[n]; + for (int i = 0; i < n; i++) { + String attrName = "a-" + i; + Attribute a = new Attribute(attrName, attrValues[i]); + attrs[i] = a; + } + return attrs; + } + + public static Attribute[] 
createAttributes(String[] names, double[] values) { + int n = names.length; + Attribute[] attributes = new Attribute[n]; + for (int i = 0; i < n; i++) { + attributes[i] = new Attribute(names[i], values[i]); + } + return attributes; + } + + public static Attribute[] createAttributes(String[] names, String[] values) { + int n = names.length; + Attribute[] attributes = new Attribute[n]; + for (int i = 0; i < n; i++) { + attributes[i] = new Attribute(names[i], values[i]); + } + return attributes; + } + + public static String[] getNames(Attribute[] attributes) { + int n = attributes.length; + String[] names = new String[n]; + for (int i = 0; i < n; i++) { + Attribute a = attributes[i]; + names[i] = a.getName(); + } + return names; + } + + public static double[] getNumericValues(Attribute[] attributes) { + int n = attributes.length; + double[] values = new double[n]; + for (int i = 0; i < n; i++) { + Attribute a = attributes[i]; + if (a.isNumeric()) { + values[i] = a.getNumericValue(); + } else { + throw new RuntimeException( + "Non-numeric attribute encountered. " + "Attribute: " + + a.toString()); + } + } + return values; + } + + public static String[] getTextValues(Attribute[] attributes) { + int n = attributes.length; + String[] values = new String[n]; + for (int i = 0; i < n; i++) { + Attribute a = attributes[i]; + if (a.isText()) { + values[i] = a.getTextValue(); + } else { + throw new RuntimeException("Non-text attribute encountered. 
" + + "Attribute: " + a.toString()); + } + } + return values; + } + +} diff --git a/src/org/yooreeka/algos/clustering/utils/ObjectToIndexMapping.java b/src/org/yooreeka/algos/clustering/utils/ObjectToIndexMapping.java new file mode 100644 index 0000000..1ff7a1f --- /dev/null +++ b/src/org/yooreeka/algos/clustering/utils/ObjectToIndexMapping.java @@ -0,0 +1,90 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Maps object values to an index. Index is zero-based and assigned in the
 * order in which new values are first seen.
 *
 * @param <T>
 *            type of the mapped values; used as a hash map key, so it needs
 *            consistent {@code equals}/{@code hashCode}.
 */
public class ObjectToIndexMapping<T> implements java.io.Serializable {

    private static final long serialVersionUID = 2031098306406708902L;

    /*
     * Index value that will be returned for the next new value.
     */
    private int nextIndex = 0;

    /*
     * Maintains mapping from object to index.
     */
    private Map<T, Integer> objMapping = new HashMap<T, Integer>();

    /*
     * Maintains mapping from index to value.
     */
    private Map<Integer, T> indexMapping = new HashMap<Integer, T>();

    public ObjectToIndexMapping() {
        // empty
    }

    /**
     * Returns index assigned to the value. For new values new index will be
     * assigned and returned.
     *
     * @param value
     *            value to look up or register.
     * @return zero-based index of the value.
     */
    public int getIndex(T value) {
        Integer index = objMapping.get(value);
        if (index == null) {
            index = nextIndex;
            objMapping.put(value, index);
            indexMapping.put(index, value);
            nextIndex++;
        }
        return index;
    }

    /**
     * Returns value mapped to the index or null if mapping doesn't exist.
     */
    public T getObject(int index) {
        return indexMapping.get(index);
    }

    /**
     * Current number of elements.
     */
    public int getSize() {
        return objMapping.size();
    }
}
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.clustering.utils; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.logging.Logger; + +import org.yooreeka.algos.clustering.model.DataPoint; +import org.yooreeka.config.YooreekaConfigurator; + +/** + * @author Babis Marmanis + * + */ +public class SortedArrayClustering { + + private static final Logger LOG = Logger.getLogger(SortedArrayClustering.class.getName()); + + public static void cluster(DataPoint[] points) { + + LOG.setLevel(YooreekaConfigurator.getLevel(SortedArrayClustering.class.getName())); + + Arrays.sort(points, new Comparator() { + public int compare(DataPoint p1, DataPoint p2) { + int result = 0; + // sort based on score value + if (p1.getR() < p2.getR()) { + result = 1; // sorting in descending order + } else if (p1.getR() > p2.getR()) { + result = -1; + } else { + result = 0; + } + return result; + } + }); + + for (int i = 0; i < points.length; i++) { + System.out.println(points[i].toShortString()); + } + } +} diff --git a/src/org/yooreeka/algos/reco/collab/cache/FileStore.java b/src/org/yooreeka/algos/reco/collab/cache/FileStore.java new file mode 100644 index 0000000..74feb15 --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/cache/FileStore.java @@ -0,0 +1,134 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.cache; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.logging.Logger; + +import org.yooreeka.config.YooreekaConfigurator; + +/** + * Implementation of Store interface. Uses files to store objects + * using java serialization. Each object instance is stored in a separate file. + */ +public class FileStore implements Store { + + private static final Logger LOG = Logger.getLogger(FileStore.class.getName()); + + private File dataDir; + + public FileStore(File dir) { + + LOG.setLevel(YooreekaConfigurator.getLevel(FileStore.class.getName())); + + if (!dir.exists()) { + dir.mkdirs(); + } + this.dataDir = dir; + } + + /** + * Creates a new instance that will use specified directory to store + * objects. + * + * @param dir + * directory that should be used to store/retrieve objects. 
+ */ + public FileStore(String dir) { + this(new File(dir)); + } + + public boolean exists(String key) { + File f = getFile(key); + return f.exists(); + } + + public Object get(String key) { + Object o = null; + try { + File f = getFile(key); + if (f.exists()) { + FileInputStream fInStream = new FileInputStream(f); + BufferedInputStream bufInStream = new BufferedInputStream( + fInStream); + ObjectInputStream objInStream = new ObjectInputStream( + bufInStream); + o = objInStream.readObject(); + objInStream.close(); + } + } catch (Exception e) { + throw new RuntimeException( + "Error while loading data from file (dir: '" + dataDir + + "', filename: '" + key + "').", e); + } + return o; + } + + /* + * Derives filename from the key and returns instance of File + */ + private File getFile(String key) { + // key is used as a filename + return new File(dataDir, key + ".tmp"); + } + + public void put(String key, Object o) { + try { + File f = getFile(key); + FileOutputStream foutStream = new FileOutputStream(f); + BufferedOutputStream boutStream = new BufferedOutputStream( + foutStream); + ObjectOutputStream objOutputStream = new ObjectOutputStream( + boutStream); + objOutputStream.writeObject(o); + objOutputStream.flush(); + boutStream.close(); + } catch (IOException e) { + throw new RuntimeException( + "Error while saving data into file (dir: '" + dataDir + + "', filename: '" + key + "').", e); + } + } + + public void remove(String key) { + File f = getFile(key); + if (f.exists()) { + f.delete(); + } + } +} diff --git a/src/org/yooreeka/algos/reco/collab/cache/Store.java b/src/org/yooreeka/algos/reco/collab/cache/Store.java new file mode 100644 index 0000000..fdecebd --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/cache/Store.java @@ -0,0 +1,72 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * 
/**
 * A Store provides service for persisting pre-calculated data. Objects are
 * addressed by a string key.
 */
public interface Store {

    /**
     * Tells whether something is stored under the key.
     *
     * @param key
     *            object id.
     * @return true if the key already exists.
     */
    boolean exists(String key);

    /**
     * Looks up the object stored under the key.
     *
     * @param key
     *            identifies data to retrieve.
     * @return the stored object; what is returned for an unknown key is up to
     *         the implementation.
     */
    Object get(String key);

    /**
     * Stores the object under the key, replacing anything previously stored
     * with the same id.
     *
     * @param key
     *            id to identify the object.
     * @param o
     *            object to be stored.
     */
    void put(String key, Object o);

    /**
     * Removes the object stored under the key.
     *
     * @param key
     *            identifies object to delete.
     */
    void remove(String key);
}
See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.User;
+
+/**
+ * Dataset implementation that we will use to work with sample data. Users,
+ * items and ratings are kept in memory; the whole dataset can be written to
+ * and read back from a file through Java serialization.
+ *
+ * @author Babis Marmanis
+ */
+public class BaseDataset implements Serializable, Dataset {
+
+    // private static final Logger logger = Logger.getLogger(BaseDataset.class);
+
+    /**
+     * Serialization version id.
+     */
+    private static final long serialVersionUID = 8414181723065929475L;
+
+    /**
+     * Reads a dataset that was previously written with
+     * {@link #save(String, BaseDataset)}.
+     *
+     * @param file
+     *            path of the serialized dataset.
+     * @return the deserialized dataset.
+     * @throws IllegalArgumentException
+     *             if the file does not exist.
+     * @throws RuntimeException
+     *             if the file cannot be read or deserialized.
+     */
+    public static BaseDataset load(String file) {
+        Object o = null;
+        File f = new File(file);
+        if (f.exists()) {
+            FileInputStream fInStream = null;
+            try {
+                fInStream = new FileInputStream(f);
+                BufferedInputStream bufInStream = new BufferedInputStream(
+                        fInStream);
+                ObjectInputStream objInStream = new ObjectInputStream(
+                        bufInStream);
+                o = objInStream.readObject();
+                objInStream.close();
+            } catch (Exception e) {
+                throw new RuntimeException(
+                        "Error while loading data from file: '" + file + "'", e);
+            } finally {
+                // Release the file handle even when reading failed half-way;
+                // closing an already closed stream is harmless.
+                if (fInStream != null) {
+                    try {
+                        fInStream.close();
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+                }
+            }
+        } else {
+            throw new IllegalArgumentException("File doesn't exist: '" + file
+                    + "'.");
+        }
+        System.out.println("loaded dataset from file");
+        return (BaseDataset) o;
+    }
+
+    /**
+     * Serializes a dataset into a file, replacing any previous content.
+     *
+     * @param file
+     *            path of the file to write.
+     * @param o
+     *            dataset to serialize.
+     * @throws RuntimeException
+     *             if the file cannot be written.
+     */
+    public static void save(String file, BaseDataset o) {
+        FileOutputStream foutStream = null;
+        try {
+            File f = new File(file);
+            foutStream = new FileOutputStream(f);
+            BufferedOutputStream boutStream = new BufferedOutputStream(
+                    foutStream);
+            ObjectOutputStream objOutputStream = new ObjectOutputStream(
+                    boutStream);
+            objOutputStream.writeObject(o);
+            objOutputStream.flush();
+            boutStream.close();
+        } catch (IOException e) {
+            throw new RuntimeException("Error while saving data into file: '"
+                    + file + "'", e);
+        } finally {
+            // Release the file handle when writing failed before the regular
+            // close above; closing an already closed stream is harmless.
+            if (foutStream != null) {
+                try {
+                    foutStream.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+    }
+
+    /*
+     * Dataset name
+     */
+    private String name = getClass().getSimpleName()
+            + System.currentTimeMillis();
+
+    /*
+     * All item ratings.
+     */
+    private List<Rating> allRatings = new ArrayList<Rating>();
+
+    /*
+     * Map of all users, keyed by user id.
+     */
+    private Map<Integer, User> allUsers = new HashMap<Integer, User>();
+
+    /*
+     * Map of all items, keyed by item id.
+     */
+    private Map<Integer, Item> allItems = new HashMap<Integer, Item>();
+
+    /*
+     * Map of item ratings by user id.
+     */
+    Map<Integer, List<Rating>> ratingsByUserId = new HashMap<Integer, List<Rating>>();
+
+    /*
+     * All terms collected from user and item content.
+     */
+    Set<String> allTermsSet = new HashSet<String>();
+
+    /**
+     * Auxiliary method for loading users one by one. This is for demonstration
+     * purposes. Use other kind of loaders for loading data en masse.
+     *
+     * @param u
+     *            denotes a User who has rated certain items and we want to add
+     *            his ratings in this dataset
+     * @return true if no errors occurred and all data have been added.
+     *         Otherwise, return false but do add whatever we can.
+     */
+    public boolean add(User u) {
+
+        boolean addedUser = true;
+
+        // Auxiliary
+        Item item;
+
+        // Add the ratings
+        Collection<Rating> urc = u.getAllRatings();
+        Rating[] uRatings = urc.toArray(new Rating[urc.size()]);
+
+        // Add the user
+        if (!allUsers.containsKey(u.getId())) {
+            this.allUsers.put(u.getId(), u);
+
+            for (Content content : u.getUserContent()) {
+                updateTerms(content.getTerms());
+            }
+        }
+
+        for (Rating r : uRatings) {
+            if (!this.allRatings.add(r)) {
+                System.out.println("________________________________");
+                System.out.println("ERROR >> Could not add rating! ");
+                System.out.println(" >> User ID: " + r.getUserId());
+                System.out.println(" >> Item ID: " + r.getItemId());
+                System.out.println(" >> Rating : " + r.getRating());
+                System.out.println("________________________________");
+
+                addedUser = false;
+            }
+
+            item = r.getItem();
+
+            /*
+             * Reuse existing item if it is available. Existing item contains
+             * ratings of previously added users and we don't want to overwrite
+             * them in case new item is a different instance with the same id.
+             */
+            if (!allItems.containsKey(item.getId())) {
+                this.allItems.put(item.getId(), item);
+            }
+
+            // Populate item ratings if item doesn't have them
+            // Note that here we rely on all users/ratings sharing the same
+            // instance of an item.
+            if (item.getUserRating(u.getId()) == null) {
+                item.addUserRating(r);
+            }
+        }
+
+        return addedUser;
+    }
+
+    /*
+     * Auxiliary method for loading items one by one. This is for demonstration
+     * purposes. Can be used when we want to link users and items using item
+     * content instead of rating. In such cases ratings won't be available and
+     * as a result add(User) won't be able to derive any Items
+     * through user ratings.
+     */
+    public boolean addItem(Item item) {
+        boolean addedItem = false;
+        if (!allItems.containsKey(item.getId())) {
+            this.allItems.put(item.getId(), item);
+            addedItem = true;
+
+            Content content = item.getItemContent();
+            updateTerms(content.getTerms());
+        }
+        return addedItem;
+    }
+
+    /**
+     * Returns the first item whose name matches, ignoring case, or null when
+     * there is none.
+     */
+    public Item findItemByName(String name) {
+        Item matchedItem = null;
+        for (Item item : this.allItems.values()) {
+            if (name.equalsIgnoreCase(item.getName())) {
+                matchedItem = item;
+                break;
+            }
+        }
+        return matchedItem;
+
+    }
+
+    /**
+     * Returns the first user whose name matches, ignoring case, or null when
+     * there is none.
+     */
+    public User findUserByName(String name) {
+        User matchedUser = null;
+        for (User user : this.allUsers.values()) {
+            if (name.equalsIgnoreCase(user.getName())) {
+                matchedUser = user;
+                break;
+            }
+        }
+        return matchedUser;
+    }
+
+    /**
+     * Returns a copy of all terms collected so far, in no particular order.
+     */
+    public String[] getAllTerms() {
+        return allTermsSet.toArray(new String[allTermsSet.size()]);
+    }
+
+    /**
+     * Average rating of the item with the given id; the item must exist.
+     */
+    public double getAverageItemRating(int itemId) {
+        return getItem(itemId).getAverageRating();
+    }
+
+    /**
+     * Average rating of the user with the given id; the user must exist.
+     */
+    public double getAverageUserRating(int userId) {
+        return getUser(userId).getAverageRating();
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.yooreeka.algos.reco.collab.model.Dataset#getItem(java.lang.Integer)
+     */
+    public Item getItem(Integer itemId) {
+        return allItems.get(itemId);
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.yooreeka.algos.reco.collab.model.Dataset#getItemCount()
+     */
+    public int getItemCount() {
+        return allItems.size();
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.yooreeka.algos.reco.collab.model.Dataset#getItems()
+     */
+    public Collection<Item> getItems() {
+        return allItems.values();
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.yooreeka.algos.reco.collab.model.Dataset#getName()
+     */
+    public String getName() {
+        return name;
+    }
+
+    /**
+     * Returns the items rated by the given user, one entry per rating. The
+     * user must exist; an entry is null when the rated item is not part of
+     * this dataset.
+     */
+    public List<Item> getRatedItems(Integer userId) {
+        List<Item> ratedItems = new ArrayList<Item>();
+        User user = getUser(userId);
+        Collection<Rating> userRatings = user.getAllRatings();
+        for (Rating r : userRatings) {
+            Item ratedItem = getItem(r.getItemId());
+            ratedItems.add(ratedItem);
+        }
+        return ratedItems;
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.yooreeka.algos.reco.collab.model.Dataset#getRatings()
+     */
+    public Collection<Rating> getRatings() {
+        return this.allRatings;
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.yooreeka.algos.reco.collab.model.Dataset#getRatingsCount()
+     */
+    public int getRatingsCount() {
+        return allRatings.size();
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.yooreeka.algos.reco.collab.model.Dataset#getUser(java.lang.Integer)
+     */
+    public User getUser(Integer userId) {
+        return allUsers.get(userId);
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.yooreeka.algos.reco.collab.model.Dataset#getUserCount()
+     */
+    public int getUserCount() {
+        return allUsers.size();
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.yooreeka.algos.reco.collab.model.Dataset#getUsers()
+     */
+    public Collection<User> getUsers() {
+        return allUsers.values();
+    }
+
+    /**
+     * Always true for this implementation.
+     */
+    public boolean isIdMappingRequired() {
+        return true;
+    }
+
+    /**
+     * Builds a new ContentItem from the first item whose name equals the
+     * argument (case-sensitive); returns null when there is none.
+     */
+    public ContentItem pickContentItem(String name) {
+        ContentItem contentItem = null;
+
+        for (Map.Entry<Integer, Item> entry : allItems.entrySet()) {
+            Item anItem = entry.getValue();
+            if (name.equals(anItem.getName())) {
+                contentItem = new ContentItem(entry.getValue());
+                break;
+            }
+        }
+        return contentItem;
+    }
+
+    /**
+     * Returns the first item whose name equals the argument (case-sensitive),
+     * or null when there is none.
+     */
+    public Item pickItem(String name) {
+        Item item = null;
+        for (Map.Entry<Integer, Item> entry : allItems.entrySet()) {
+            Item anItem = entry.getValue();
+            if (name.equals(anItem.getName())) {
+                item = entry.getValue();
+                break;
+            }
+        }
+        return item;
+    }
+
+    /**
+     * Returns the first user whose name equals the argument (case-sensitive),
+     * or null when there is none.
+     */
+    public User pickUser(String name) {
+        User user = null;
+        for (Map.Entry<Integer, User> entry : allUsers.entrySet()) {
+            User aUser = entry.getValue();
+            if (name.equals(aUser.getName())) {
+                user = entry.getValue();
+                break;
+            }
+        }
+        return user;
+    }
+
+    /**
+     * Prints all ratings by item.
+     */
+    public void printItemRatings() {
+        System.out.println("\nItem ratings:\n");
+        for (Item item : allItems.values()) {
+            System.out.println("Item: " + item.getName());
+            for (Rating r : item.getAllRatings()) {
+                User user = this.allUsers.get(r.getUserId());
+                System.out.println(" Rated by " + user.getName() + " as "
+                        + r.getRating());
+            }
+        }
+    }
+
+    /**
+     * Prints all ratings by user.
+     */
+    public void printUserRatings() {
+        System.out.println("\nUser ratings:\n");
+        for (User user : allUsers.values()) {
+            System.out.println("User: " + user.getName());
+            for (Rating r : user.getAllRatings()) {
+                Item item = allItems.get(r.getItemId());
+                System.out.println(" Rated " + item.getName() + " as "
+                        + r.getRating());
+            }
+        }
+    }
+
+    /**
+     * Serializes this dataset into the given file.
+     */
+    public void save(String file) {
+        BaseDataset.save(file, this);
+        System.out.println("saved dataset into file");
+    }
+
+    /*
+     * Adds the given terms to the set of all known terms.
+     */
+    private void updateTerms(String[] terms) {
+        for (String term : terms) {
+            allTermsSet.add(term);
+        }
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/ContentItem.java b/src/org/yooreeka/algos/reco/collab/data/ContentItem.java
new file mode 100644
index 0000000..d754247
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/ContentItem.java
@@ -0,0 +1,59 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Item for news dataset. A ContentItem is described by its content; it always
+ * starts out with an empty list of ratings.
+ */
+public class ContentItem extends Item {
+
+    /**
+     * SVUID
+     */
+    private static final long serialVersionUID = 6349342365379966975L;
+
+    /**
+     * Creates an item with the given id, name and content and no ratings.
+     */
+    public ContentItem(int id, String name, Content content) {
+        super(id, name, new ArrayList<Rating>(3));
+        setItemContent(content);
+    }
+
+    /**
+     * Creates an item that shares the id, name and content of the given item.
+     * The ratings of the given item are not copied.
+     */
+    public ContentItem(Item val) {
+        super(val.getId(), val.getName(), new ArrayList<Rating>(3));
+        this.setItemContent(val.getItemContent());
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/DiggData.java b/src/org/yooreeka/algos/reco/collab/data/DiggData.java
new file mode 100644
index 0000000..7da7fad
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/DiggData.java
@@ -0,0 +1,361 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.supercsv.io.CsvListReader;
+import org.supercsv.io.CsvListWriter;
+import org.supercsv.prefs.CsvPreference;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.RecommendationType;
+import org.yooreeka.algos.reco.collab.model.SimilarItem;
+import org.yooreeka.algos.reco.collab.recommender.Delphi;
+import org.yooreeka.algos.reco.content.digg.DiggService;
+import org.yooreeka.algos.reco.content.digg.DiggStoryItem;
+import org.yooreeka.algos.reco.content.digg.DiggUser;
+
+/**
+ * Loads Digg users and stories (from a csv file or from the Digg service) into
+ * the static lists below and builds a BaseDataset with generated ratings.
+ */
+public class DiggData {
+
+    public static List<DiggUser> allUsers = new ArrayList<DiggUser>();
+    public static List<DiggStoryItem> allStories = new ArrayList<DiggStoryItem>();
+
+    private static final String[] CSV_ITEM_HEADERS = new String[] { "id",
+            "username", "title", "category", "topic", "description", "link",
+            "userid" };
+
+    /**
+     * Builds a dataset from the currently loaded users and stories. For every
+     * story of a user, a random 70% of the content-wise similar stories
+     * receive a generated rating from that user.
+     *
+     * @return dataset with all users and their generated ratings.
+     */
+    public static BaseDataset createDataset() {
+
+        BaseDataset ds = new BaseDataset();
+
+        Delphi delphiIC = createItemContentDelphi();
+        int topN = 10;
+        for (DiggUser user : allUsers) {
+            List<DiggStoryItem> userItems = findItemsByUsername(user.getName());
+            for (DiggStoryItem item : userItems) {
+
+                // similar items across all categories
+                SimilarItem[] similarItems = delphiIC.findSimilarItems(item,
+                        topN);
+
+                // Create a set of biased ratings for user using a subset from
+                // similar items
+                int lowRating = 0;
+                int highRating = 0;
+                if (user.getName().toLowerCase().charAt(0) <= 'd') {
+                    // range of ratings for users whose name starts from A to D
+                    // (any first character that sorts before 'e' lands here)
+                    lowRating = 4;
+                    highRating = 5;
+                } else {
+                    // range of ratings for users whose name starts from E to Z
+                    lowRating = 1;
+                    highRating = 3;
+                }
+
+                // select 70% of similar items
+                Item[] randomItems = pickRandomItems(similarItems, 0.7);
+                RatingBuilder ratingBuilder = new RatingBuilder();
+                List<Rating> ratings = ratingBuilder.createBiasedRatings(
+                        user.getId(), randomItems, lowRating, highRating);
+                for (Rating r : ratings) {
+                    user.addRating(r);
+                }
+            }
+            ds.add(user);
+            System.out.println("Generated " + user.getAllRatings().size()
+                    + " ratings for user id: " + user.getId() + ", name: "
+                    + user.getName() + ", average rating: "
+                    + user.getAverageRating());
+        }
+
+        System.out.println("Created Dataset with " + ds.getUserCount()
+                + " users, " + ds.getItemCount() + " items, "
+                + ds.getRatingsCount() + " ratings.");
+
+        return ds;
+    }
+
+    /*
+     * Creates an item-content based recommender over all loaded users and
+     * stories.
+     */
+    private static Delphi createItemContentDelphi() {
+        BaseDataset ds = new BaseDataset();
+        for (DiggUser user : allUsers) {
+            ds.add(user);
+        }
+
+        for (DiggStoryItem item : allStories) {
+            System.out.println("Description:" + item.getDescription());
+            ds.addItem(item);
+        }
+
+        return new Delphi(ds, RecommendationType.ITEM_CONTENT_BASED, true);
+    }
+
+    /*
+     * Returns all loaded stories that were submitted by the given username.
+     */
+    private static List<DiggStoryItem> findItemsByUsername(String username) {
+        List<DiggStoryItem> items = new ArrayList<DiggStoryItem>();
+        for (DiggStoryItem item : allStories) {
+            if (item.getUsername().equals(username)) {
+                items.add(item);
+            }
+        }
+        return items;
+    }
+
+    /*
+     * Returns the loaded user with the given name, or null if there is none.
+     */
+    private static DiggUser findUserByUsername(String username) {
+        DiggUser matchedUser = null;
+        for (DiggUser u : allUsers) {
+            if (u.getName().equals(username)) {
+                matchedUser = u;
+                break;
+            }
+        }
+        return matchedUser;
+    }
+
+    /**
+     * Load data from csv file. Replaces the previously loaded users and
+     * stories.
+     *
+     * @param filename
+     *            csv file as written by {@link #saveData(String)}.
+     * @return dataset created from the loaded data.
+     */
+    public static BaseDataset loadData(String filename) {
+
+        allStories = new ArrayList<DiggStoryItem>();
+        allUsers = new ArrayList<DiggUser>();
+
+        CsvListReader csvReader = null;
+        try {
+            csvReader = new CsvListReader(new BufferedReader(new FileReader(
+                    filename)), CsvPreference.EXCEL_PREFERENCE);
+
+            csvReader.getCSVHeader(true);
+
+            List<String> line = null;
+            while ((line = csvReader.read()) != null) {
+                try {
+                    int id = Integer.valueOf(line.get(0));
+                    String username = line.get(1);
+                    String title = line.get(2);
+                    String category = line.get(3);
+                    String topic = line.get(4);
+                    String description = line.get(5);
+                    String link = line.get(6);
+                    int userid = Integer.valueOf(line.get(7));
+
+                    DiggUser user = findUserByUsername(username);
+                    if (user == null) {
+                        user = new DiggUser(userid, username);
+                        allUsers.add(user);
+                    }
+
+                    DiggStoryItem item = new DiggStoryItem(id, title,
+                            description);
+                    item.setUsername(username);
+                    item.setCategory(category);
+                    item.setTopic(topic);
+                    item.setLink(link);
+                    allStories.add(item);
+
+                    // adding item content to the user
+                    user.addUserContent(item.getItemContent());
+                } catch (Exception e) {
+                    throw new RuntimeException("Error while reading item: ", e);
+                }
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(
+                    "Error while reading digg items from csv file.", e);
+        } finally {
+            try {
+                if (csvReader != null) {
+                    csvReader.close();
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+
+        System.out.println("From file: " + filename);
+        System.out.println("Loaded " + allUsers.size() + " users.");
+        System.out.println("Loaded " + allStories.size() + " stories (items).");
+
+        return DiggData.createDataset();
+    }
+
+    /**
+     * Loading data from Digg.
+     *
+     * @param filename
+     *            file that will be used to save the data.
+     * @return dataset created from the loaded data.
+     */
+    public static BaseDataset loadDataFromDigg(String filename) {
+
+        allUsers.clear();
+        allStories.clear();
+
+        Set<String> allKnownUsers = new HashSet<String>();
+        Set<Integer> allKnownStories = new HashSet<Integer>();
+
+        DiggService news = new DiggService();
+        news.setItemCountPerCategory(5);
+        // Top stories across a set of categories (Technology, Sports, ...)
+        List<DiggStoryItem> topStories = news.getAllStories();
+
+        // used to assign unique id to each user
+        int nextUserId = 1;
+
+        // iterate through top stories and collect a set of users
+        for (DiggStoryItem item : topStories) {
+            String username = item.getUsername();
+            if (!allKnownUsers.contains(username)) {
+                allKnownUsers.add(username);
+                int userId = nextUserId++;
+                DiggUser diggUser = new DiggUser(userId, username);
+                allUsers.add(diggUser);
+            }
+        }
+
+        // for every user retrieve up to 5 stories
+        int maxStories = 5;
+        for (DiggUser user : allUsers) {
+            List<DiggStoryItem> userItems = news.getUserStories(user.getName(),
+                    maxStories);
+
+            for (DiggStoryItem i : userItems) {
+                if (!allKnownStories.contains(i.getId())) {
+                    allStories.add(i);
+                    allKnownStories.add(i.getId());
+                } else {
+                    System.out.println("Duplicate story: id=" + i.getId()
+                            + ", name=" + i.getName());
+                }
+                // adding item content to the user
+                user.addUserContent(i.getItemContent());
+            }
+        }
+        System.out.println("From Digg:");
+        System.out.println("Loaded " + allUsers.size() + " users.");
+        System.out.println("Loaded " + allStories.size() + " stories (items).");
+
+        DiggData.saveData(filename);
+        return DiggData.createDataset();
+    }
+
+    /*
+     * Picks a random sample of distinct items (distinct by item id).
+     *
+     * The sample size is percentOfAllItems * items.length, rounded, but never
+     * more than the number of distinct item ids among the candidates. Without
+     * that cap the sampling loop could never finish when the candidates
+     * contain the same item id more than once.
+     */
+    private static Item[] pickRandomItems(SimilarItem[] items,
+            double percentOfAllItems) {
+
+        if (percentOfAllItems < 0.0 || percentOfAllItems > 1.0) {
+            throw new IllegalArgumentException(
+                    "Value for 'percentOfAllItems' argument should be between 0 and 1.");
+        }
+
+        // Count the distinct ids first so that the target size is reachable.
+        Set<Integer> distinctIds = new HashSet<Integer>();
+        for (SimilarItem similar : items) {
+            distinctIds.add(similar.getItem().getId());
+        }
+
+        Random rand = new Random();
+        int sampleSize = Math.min(
+                (int) Math.round(percentOfAllItems * items.length),
+                distinctIds.size());
+        Map<Integer, Item> pickedItems = new HashMap<Integer, Item>();
+        while (pickedItems.size() < sampleSize) {
+            int itemId = rand.nextInt(items.length);
+            Item item = items[itemId].getItem();
+            if (!pickedItems.containsKey(item.getId())) {
+                pickedItems.put(item.getId(), item);
+            }
+        }
+
+        return pickedItems.values().toArray(new Item[pickedItems.size()]);
+    }
+
+    /**
+     * Save data into csv file. One row is written per loaded story.
+     *
+     * @param filename
+     *            file to write; an existing file is overwritten.
+     */
+    public static void saveData(String filename) {
+        String[] data = new String[CSV_ITEM_HEADERS.length];
+
+        CsvListWriter csvWriter = null;
+        try {
+            csvWriter = new CsvListWriter(new BufferedWriter(new FileWriter(
+                    filename)), CsvPreference.EXCEL_PREFERENCE);
+
+            csvWriter.writeHeader(CSV_ITEM_HEADERS);
+
+            for (DiggStoryItem item : allStories) {
+                try {
+                    data[0] = String.valueOf(item.getId());
+                    data[1] = item.getUsername();
+                    data[2] = item.getTitle();
+                    data[3] = item.getCategory();
+                    data[4] = item.getTopic();
+                    data[5] = item.getDescription();
+                    data[6] = item.getLink();
+                    DiggUser user = findUserByUsername(item.getUsername());
+                    data[7] = String.valueOf(user.getId());
+                    csvWriter.write(data);
+                } catch (Exception e) {
+                    throw new RuntimeException("Error while writing item "
+                            + item.getName() + ": ", e);
+                }
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(
+                    "Error while writing digg items into csv file.", e);
+        } finally {
+            try {
+                if (csvWriter != null) {
+                    csvWriter.close();
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+
+        System.out.println("Saved data into file: " + filename);
+        System.out.println("saved " + allUsers.size() + " users.");
+        System.out.println("saved " + allStories.size() + " stories (items).");
+
+    }
+
+    /**
+     * Prints id and name of every loaded user.
+     */
+    public static void showUsers() {
+        System.out.println("All Users:");
+        for (DiggUser user : allUsers) {
+            System.out.println("User id:" + user.getId() + ", name: "
+                    + user.getName());
+        }
+
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/HTMLContent.java b/src/org/yooreeka/algos/reco/collab/data/HTMLContent.java
new file mode 100644
index 0000000..479e35d
--- /dev/null
+++
b/src/org/yooreeka/algos/reco/collab/data/HTMLContent.java
@@ -0,0 +1,99 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.util.parsing.html.HTMLDocumentParser;
+import org.yooreeka.util.parsing.html.HTMLDocumentParserException;
+
+/**
+ * Content whose text is extracted from an HTML document on disk.
+ */
+public class HTMLContent extends Content {
+
+    /**
+     * SVUID
+     */
+    private static final long serialVersionUID = -354667863913509004L;
+
+    /*
+     * Reads the file with the platform default charset, parses it as HTML and
+     * returns the text of the parsed document.
+     *
+     * Both read failures and parse failures are reported as RuntimeException
+     * with the original exception as cause. A parse failure used to be
+     * printed and swallowed, which handed a null text to the Content
+     * constructor.
+     */
+    private static String extractContentFromHtmlDoc(File htmlFile) {
+
+        String htmlText = null;
+        FileInputStream fis = null;
+
+        try {
+            fis = new FileInputStream(htmlFile);
+            Reader reader = new InputStreamReader(new BufferedInputStream(fis));
+            HTMLDocumentParser htmlParser = new HTMLDocumentParser(reader);
+
+            htmlText = htmlParser.getHtmlDoc().getText();
+
+        } catch (IOException e) {
+
+            throw new RuntimeException(e);
+
+        } catch (HTMLDocumentParserException e) {
+            throw new RuntimeException("Failed to parse HTML document: '"
+                    + htmlFile + "'", e);
+        } finally {
+            if (fis != null) {
+                try {
+                    fis.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+        return htmlText;
+    }
+
+    /**
+     * Creates content with the given id from the text of an HTML file.
+     */
+    public HTMLContent(String id, File htmlDocFile) {
+        super(id, extractContentFromHtmlDoc(htmlDocFile));
+    }
+
+    /**
+     * Same as {@link #HTMLContent(String, File)}; topNTerms is passed through
+     * to the Content constructor.
+     */
+    public HTMLContent(String id, File htmlDocFile, int topNTerms) {
+        super(id, extractContentFromHtmlDoc(htmlDocFile), topNTerms);
+    }
+
+    /**
+     * Creates content with the given id from the HTML file with this name.
+     */
+    public HTMLContent(String id, String htmlDocFilename) {
+        super(id, extractContentFromHtmlDoc(new File(htmlDocFilename)));
+    }
+
+    /**
+     * Same as {@link #HTMLContent(String, String)}; topNTerms is passed
+     * through to the Content constructor.
+     */
+    public HTMLContent(String id, String htmlDocFilename, int topNTerms) {
+        super(id, extractContentFromHtmlDoc(new File(htmlDocFilename)),
+                topNTerms);
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MovieLensData.java b/src/org/yooreeka/algos/reco/collab/data/MovieLensData.java
new file mode 100644
index 0000000..6ad655c
--- /dev/null
+++
b/src/org/yooreeka/algos/reco/collab/data/MovieLensData.java
@@ -0,0 +1,83 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.File;
+
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Factory methods that build a MovieLensDataset from the three MovieLens data
+ * files found in one directory.
+ */
+public class MovieLensData {
+
+    /**
+     * Builds the dataset from the configured default directory, with no test
+     * ratings requested.
+     */
+    public static MovieLensDataset createDataset() {
+        return createDataset(0);
+    }
+
+    /**
+     * Builds the dataset from the directory configured under the
+     * "iweb2.movielens.data.dir" property.
+     *
+     * @param numOfTestRatings
+     *            passed through to the MovieLensDataset constructor.
+     */
+    public static MovieLensDataset createDataset(int numOfTestRatings) {
+        String defaultDir = YooreekaConfigurator
+                .getProperty("iweb2.movielens.data.dir");
+        return createDataset(defaultDir, numOfTestRatings);
+    }
+
+    /**
+     * Builds the dataset from the given directory and prints a short summary
+     * of what was loaded.
+     *
+     * @param dataDir
+     *            directory that holds the users, movies and ratings files.
+     * @param numOfTestRatings
+     *            passed through to the MovieLensDataset constructor.
+     * @return the loaded dataset.
+     */
+    public static MovieLensDataset createDataset(String dataDir,
+            int numOfTestRatings) {
+        System.out.println("*** Loading MovieLens dataset...");
+        System.out.println("make sure that you are using at least: -Xmx1024m");
+
+        MovieLensDataset loaded = new MovieLensDataset(
+                new File(dataDir, MovieLensDataset.USERS_FILENAME),
+                new File(dataDir, MovieLensDataset.ITEMS_FILENAME),
+                new File(dataDir, MovieLensDataset.RATINGS_FILENAME),
+                numOfTestRatings);
+
+        System.out.println("\n*** Loaded MovieLens dataset.");
+        System.out.println("users: " + loaded.getUserCount());
+        System.out.println("movies: " + loaded.getItemCount());
+        System.out.println("ratings: " + loaded.getRatingsCount());
+        System.out.println("test ratings: " + loaded.getTestRatings().size());
+
+        return loaded;
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MovieLensDataset.java b/src/org/yooreeka/algos/reco/collab/data/MovieLensDataset.java
new file mode 100644
index 0000000..74af7cf
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MovieLensDataset.java
@@ -0,0 +1,385 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the
Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.data; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.model.Item; +import org.yooreeka.algos.reco.collab.model.Rating; +import org.yooreeka.algos.reco.collab.model.User; + +/** + * Dataset implementation that we will use to work with MovieLens data. 
+ * All data is loaded from three files: users, movies (items), and ratings.
+ * <p>
+ * Ratings are read first; users and items are created afterwards so that each
+ * of them can be handed the ratings that refer to it. Optionally, a number of
+ * randomly selected ratings is withheld from the main rating list and kept
+ * apart as test ratings.
+ */
+public class MovieLensDataset implements Dataset {
+
+    public static final String USERS_FILENAME = "users.dat";
+    public static final String ITEMS_FILENAME = "movies.dat";
+    public static final String RATINGS_FILENAME = "ratings.dat";
+
+    /*
+     * Delimiter that is used by MovieLens data files.
+     */
+    private static final String FIELD_DELIMITER = "::";
+
+    /**
+     * Saves provided ratings into a new file. Used to split ratings provided as
+     * part of MovieLens data set into files that represent various rating sets
+     * for training and testing. Every rating is written on its own line as
+     * <code>userId::itemId::rating</code>.
+     *
+     * @param f
+     *            file to write to.
+     * @param ratings
+     *            ratings to save.
+     * @throws RuntimeException
+     *             if the file cannot be opened or written.
+     */
+    public static void createNewRatingsFile(File f, Collection<Rating> ratings) {
+        PrintWriter pw = null;
+        try {
+            pw = new PrintWriter(new BufferedWriter(new FileWriter(f)));
+            for (Rating rating : ratings) {
+                pw.println(rating.getUserId() + FIELD_DELIMITER
+                        + rating.getItemId() + FIELD_DELIMITER
+                        + rating.getRating());
+            }
+            // PrintWriter never throws on a failed write, it only records the
+            // failure; checkError() flushes and reports it.
+            if (pw.checkError()) {
+                throw new IOException("error while writing ratings");
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(
+                    "Failed to save rating into file (file: '"
+                            + f.getAbsolutePath() + "').", e);
+        } finally {
+            if (pw != null) {
+                pw.close();
+            }
+        }
+    }
+
+    /**
+     * Opens a buffered reader on the given file.
+     */
+    private static BufferedReader getReader(File f)
+            throws FileNotFoundException {
+        return new BufferedReader(new FileReader(f));
+    }
+
+    /**
+     * Closes the reader. A failure to close is reported on standard output and
+     * not propagated, so that it cannot hide the outcome of the read itself.
+     *
+     * @param reader
+     *            reader to close; may be <code>null</code>.
+     * @param f
+     *            file the reader was opened on (used for the message only).
+     */
+    private static void closeReader(BufferedReader reader, File f) {
+        if (reader == null) {
+            return;
+        }
+        try {
+            reader.close();
+        } catch (Exception e) {
+            System.out.println("ERROR: \n");
+            System.out.println(e.getMessage()
+                    + "\n while closing file reader (file: '"
+                    + f.getAbsolutePath() + "'): ");
+        }
+    }
+
+    /**
+     * Reads all ratings from a file. The first three fields of every line are
+     * parsed as integers: user id, item id and rating.
+     *
+     * @param f
+     *            ratings file.
+     * @return ratings in file order.
+     * @throws RuntimeException
+     *             if the file cannot be read.
+     */
+    public static List<Rating> loadRatings(File f) {
+        List<Rating> allRatings = new ArrayList<Rating>();
+
+        BufferedReader reader = null;
+        try {
+            reader = getReader(f);
+            String line = null;
+            while ((line = reader.readLine()) != null) {
+                String[] tokens = parseLine(line);
+                int userId = Integer.parseInt(tokens[0]);
+                int itemId = Integer.parseInt(tokens[1]);
+                int rating = Integer.parseInt(tokens[2]);
+                allRatings.add(new Rating(userId, itemId, rating));
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(
+                    "Failed to load rating from file (file: '"
+                            + f.getAbsolutePath() + "'): ", e);
+        } finally {
+            closeReader(reader, f);
+        }
+
+        return allRatings;
+    }
+
+    /**
+     * Splits a data line into its fields.
+     */
+    private static String[] parseLine(String line) {
+        // possible field delimiters: "::", "\t", "|"
+        return line.split("::|\t|\\|");
+    }
+
+    /*
+     * All item ratings.
+     */
+    private List<Rating> allRatings = new ArrayList<Rating>();
+
+    /*
+     * Map of all users.
+     */
+    private Map<Integer, User> allUsers = new HashMap<Integer, User>();
+
+    /*
+     * Map of all items.
+     */
+    private Map<Integer, Item> allItems = new HashMap<Integer, Item>();
+
+    /*
+     * Parameters for test dataset
+     */
+    private int numberOfTestRatings = 0;
+
+    private List<Rating> testRatings = new ArrayList<Rating>();
+
+    /*
+     * Map of item ratings by item id.
+     */
+    private Map<Integer, List<Rating>> ratingsByItemId = new HashMap<Integer, List<Rating>>();
+
+    /*
+     * Map of item ratings by user id.
+     */
+    Map<Integer, List<Rating>> ratingsByUserId = new HashMap<Integer, List<Rating>>();
+
+    private String name;
+
+    /**
+     * Loads the dataset from the three files; the dataset name is derived from
+     * the class name and the current time. No ratings are withheld.
+     */
+    public MovieLensDataset(File users, File movies, File ratings) {
+        name = getClass().getSimpleName() + System.currentTimeMillis();
+        loadData(users, movies, ratings, null);
+    }
+
+    /**
+     * Same as {@link #MovieLensDataset(File, File, File)} but moves
+     * <code>numOfTestRatings</code> randomly chosen ratings into the test set.
+     */
+    public MovieLensDataset(File users, File movies, File ratings,
+            int numOfTestRatings) {
+        name = getClass().getSimpleName() + System.currentTimeMillis();
+        this.numberOfTestRatings = numOfTestRatings;
+        loadData(users, movies, ratings, null);
+    }
+
+    /**
+     * Loads the dataset from the three files under an explicit name.
+     */
+    public MovieLensDataset(String name, File users, File movies, File ratings) {
+
+        this.name = name;
+        loadData(users, movies, ratings, null);
+    }
+
+    /**
+     * Loads users and items from files but uses the supplied ratings instead of
+     * reading a ratings file. The supplied list is used as is (not copied).
+     */
+    public MovieLensDataset(String name, File users, File items,
+            List<Rating> ratings) {
+
+        this.name = name;
+        loadData(users, items, null, ratings);
+    }
+
+    /**
+     * Appends the rating to the list stored under the key, creating the list on
+     * first use.
+     */
+    private void addRatingToMap(Map<Integer, List<Rating>> map, Integer key,
+            Rating rating) {
+        List<Rating> ratingsForKey = map.get(key);
+        if (ratingsForKey == null) {
+            ratingsForKey = new ArrayList<Rating>();
+            map.put(key, ratingsForKey);
+        }
+        ratingsForKey.add(rating);
+    }
+
+    /**
+     * Creates an item with the ratings already collected for its id (an empty
+     * list if there are none) and links every one of those ratings back to the
+     * new item.
+     */
+    private Item createNewItem(int itemId, String name) {
+        List<Rating> ratings = ratingsByItemId.get(itemId);
+        if (ratings == null) {
+            ratings = new ArrayList<Rating>();
+        }
+
+        Item item = new Item(itemId, name, ratings);
+
+        // establish link between rating and item
+        for (Rating r : ratings) {
+            r.setItem(item);
+        }
+
+        return item;
+    }
+
+    /** Always returns an empty array. */
+    public String[] getAllTerms() {
+        return new String[0];
+    }
+
+    public double getAverageItemRating(int itemId) {
+        return getItem(itemId).getAverageRating();
+    }
+
+    public double getAverageUserRating(int userId) {
+        return getUser(userId).getAverageRating();
+    }
+
+    /** @return the item with the given id, or <code>null</code> if unknown. */
+    public Item getItem(Integer itemId) {
+        return allItems.get(itemId);
+    }
+
+    public int getItemCount() {
+        return allItems.size();
+    }
+
+    public Collection<Item> getItems() {
+        return allItems.values();
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    /** @return the ratings that were not withheld for testing. */
+    public Collection<Rating> getRatings() {
+        return this.allRatings;
+    }
+
+    public int getRatingsCount() {
+        return allRatings.size();
+    }
+
+    /** @return the ratings that were withheld for testing. */
+    public Collection<Rating> getTestRatings() {
+        return this.testRatings;
+    }
+
+    /** @return the user with the given id, or <code>null</code> if unknown. */
+    public User getUser(Integer userId) {
+        return allUsers.get(userId);
+    }
+
+    public int getUserCount() {
+        return allUsers.size();
+    }
+
+    public Collection<User> getUsers() {
+        return allUsers.values();
+    }
+
+    public boolean isIdMappingRequired() {
+        return false;
+    }
+
+    /**
+     * Populates the dataset. Ratings come from <code>ratings</code> when it is
+     * not <code>null</code>, otherwise from <code>ratingsFile</code>.
+     *
+     * @throws RuntimeException
+     *             if the users or items file cannot be read.
+     */
+    private void loadData(File usersFile, File itemsFile, File ratingsFile,
+            List<Rating> ratings) {
+        try {
+            /* Load all available ratings */
+            if (ratings == null) {
+                allRatings = loadRatings(ratingsFile);
+            } else {
+                allRatings = ratings;
+            }
+
+            /* Exclude ratings if needed */
+            withholdRatings();
+
+            /* build maps that provide access to ratings by userId or itemId */
+            for (Rating rating : allRatings) {
+                addRatingToMap(ratingsByItemId, rating.getItemId(), rating);
+                addRatingToMap(ratingsByUserId, rating.getUserId(), rating);
+            }
+
+            /*
+             * load users and item. Each instance will have a set of ratings
+             * relevant to it
+             */
+            allUsers = loadUsers(usersFile);
+            allItems = loadItems(itemsFile);
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to load MovieLens data: ", e);
+        }
+    }
+
+    /**
+     * Reads the movies file and creates one item per line, plus placeholder
+     * items for ids that the file skips.
+     */
+    private Map<Integer, Item> loadItems(File moviesFile) throws IOException {
+
+        Map<Integer, Item> items = new HashMap<Integer, Item>();
+
+        BufferedReader reader = getReader(moviesFile);
+        try {
+            String line = null;
+            int lastId = 0;
+            while ((line = reader.readLine()) != null) {
+
+                String[] tokens = parseLine(line);
+
+                /* only the movie id and the title are used */
+                int itemId = Integer.parseInt(tokens[0]);
+                String title = tokens[1];
+
+                /*
+                 * In some cases we need to create items for missing ids.
+                 * Movies file from MovieLens dataset skips over some ids. To
+                 * keep things simple we made assumption that user and movie
+                 * (item) ids are sequences without gaps that start with 1.
+                 */
+                for (int i = lastId + 1; i < itemId; i++) {
+                    Item missingItem = createNewItem(i, "Missing-Item-" + i);
+                    items.put(missingItem.getId(), missingItem);
+                }
+
+                Item item = createNewItem(itemId, title);
+
+                items.put(item.getId(), item);
+                lastId = item.getId();
+            }
+        } finally {
+            // the reader used to be left open
+            closeReader(reader, moviesFile);
+        }
+        return items;
+    }
+
+    /**
+     * Reads the users file and creates one user per line, handing each user
+     * the ratings collected for its id (an empty list if there are none).
+     */
+    private Map<Integer, User> loadUsers(File usersFile) throws IOException {
+        Map<Integer, User> users = new HashMap<Integer, User>();
+
+        BufferedReader reader = getReader(usersFile);
+        try {
+            String line = null;
+            while ((line = reader.readLine()) != null) {
+                String[] tokens = parseLine(line);
+                /* at the moment we are only interested in user id */
+                int userId = Integer.parseInt(tokens[0]);
+                List<Rating> userRatings = ratingsByUserId.get(userId);
+                if (userRatings == null) {
+                    userRatings = new ArrayList<Rating>();
+                }
+                User user = new User(userId, userRatings);
+                users.put(user.getId(), user);
+            }
+        } finally {
+            // the reader used to be left open
+            closeReader(reader, usersFile);
+        }
+
+        return users;
+    }
+
+    /**
+     * Sets the number of test ratings. Within this class the value is only
+     * read while the data is loaded by a constructor, so calling this method on
+     * an already constructed dataset does not move any ratings.
+     */
+    public void setTestRatingsCount(int numberOfRatings) {
+        this.numberOfTestRatings = numberOfRatings;
+    }
+
+    /**
+     * Moves randomly chosen ratings from the main list into the test list
+     * until the requested number is reached. If fewer ratings are available
+     * than requested, all of them are moved; previously this case ended in an
+     * exception from <code>Random.nextInt(0)</code>.
+     */
+    private void withholdRatings() {
+        Random rnd = new Random();
+        while (testRatings.size() < this.numberOfTestRatings
+                && !allRatings.isEmpty()) {
+            int randomIndex = rnd.nextInt(allRatings.size());
+            testRatings.add(allRatings.remove(randomIndex));
+        }
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MusicData.java b/src/org/yooreeka/algos/reco/collab/data/MusicData.java
new file mode 100644
index 0000000..d3682a8
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MusicData.java
@@ -0,0 +1,256 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009).
+ * Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Utility class that we use as the source for Music data.
+ */
+public class MusicData {
+
+    public static final String[] USERS = { "Albert", "Alexandra", "Athena",
+            "Aurora", "Babis", "Bill", "Bob", "Carl", "Catherine", "Charlie",
+            "Constantine", "Dmitry", "Elena", "Eric", "Frank", "George",
+            "Jack", "John", "Maria", "Lukas", "Nick", "Terry", "Todd" };
+
+    public static final String[] MUSIC_SAMPLES = {
+            "You've Lost That Lovin' Feelin'", "Mrs. Robinson",
+            "Wind Beneath My Wings", "Fiddler On The Roof", "La Bamba",
+            "Wizard Of Oz", "White Christmas", "Let It Be", "Yesterday",
+            "Singing In The Rain", "Sunday, Bloody Sunday", "Tears In Heaven",
+            "Beethoven: Symphony No. 9 in D minor",
+            "Bach: The Brandenburg Concerti", "Mozart: Symphony #41 (Jupiter)",
+            "What A Wonderful World", "I Love Rock And Roll",
+            "Albinoni: Adagio In G Minor", "Vivaldi: Four Seasons" };
+
+    /**
+     * Builds data set with all the users where each user rates 80% of all the
+     * songs. The user id is the index of the name in {@link #USERS}. User
+     * ratings created randomly with bias:
+     * <ul>
+     * <li>Users whose name starts from A to D will have ratings between 4 and
+     * 5.</li>
+     * <li>Users whose name starts from E to Z will have ratings between 1 and
+     * 3.</li>
+     * </ul>
+     *
+     * @return data set with one {@link MusicUser} per entry of {@link #USERS}.
+     */
+    public static BaseDataset createDataset() {
+        BaseDataset ds = new BaseDataset();
+
+        double percentOfAllSongs = 0.80;
+
+        /* Create items first */
+        MusicItem[] allItems = loadAllMusicItems();
+
+        for (int i = 0, n = USERS.length; i < n; i++) {
+            int userId = i;
+            String userName = USERS[i];
+            int lowRating;
+            int highRating;
+
+            if (userName.toLowerCase().charAt(0) <= 'd') {
+                // range of ratings for users whose name starts from A to D
+                lowRating = 4;
+                highRating = 5;
+            } else {
+                // range of ratings for users whose name starts from E to Z
+                lowRating = 1;
+                highRating = 3;
+            }
+            MusicItem[] items = pickRandomSongs(allItems, percentOfAllSongs);
+
+            RatingBuilder ratingBuilder = new RatingBuilder();
+            List<Rating> ratings = ratingBuilder.createBiasedRatings(userId,
+                    items, lowRating, highRating);
+
+            MusicUser mu = new MusicUser(userId, userName, ratings);
+
+            ds.add(mu);
+        }
+        return ds;
+    }
+
+    /**
+     * Creates the item for a predefined song. The name is matched ignoring
+     * case; the item id is the index of the song in {@link #MUSIC_SAMPLES}.
+     *
+     * @throws IllegalArgumentException
+     *             if the song is not listed in {@link #MUSIC_SAMPLES}.
+     */
+    private static MusicItem createItem(String song) {
+        int id = -1;
+        for (int i = 0, n = MUSIC_SAMPLES.length; i < n; i++) {
+            if (MUSIC_SAMPLES[i].equalsIgnoreCase(song)) {
+                id = i;
+                break;
+            }
+        }
+        if (id < 0) {
+            throw new IllegalArgumentException("Invalid song name: '" + song
+                    + "'. This song is not on the list of predefined songs.");
+        }
+
+        return new MusicItem(id, MUSIC_SAMPLES[id]);
+    }
+
+    /**
+     * Creates the user for a predefined name. The name is matched ignoring
+     * case; the user id is the index of the name in {@link #USERS}.
+     *
+     * @throws IllegalArgumentException
+     *             if the name is not listed in {@link #USERS}.
+     */
+    private static MusicUser createUser(String name) {
+        int id = -1;
+        for (int i = 0, n = USERS.length; i < n; i++) {
+            if (USERS[i].equalsIgnoreCase(name)) {
+                id = i;
+                break;
+            }
+        }
+        if (id < 0) {
+            throw new IllegalArgumentException("Invalid user name: '" + name
+                    + "'. Name is not on the list of predefined user names.");
+        }
+
+        return new MusicUser(id, name);
+    }
+
+    /**
+     * Returns array of new MusicItem instances for every songs listed in
+     * MUSIC_SAMPLES array.
+     */
+    private static MusicItem[] loadAllMusicItems() {
+        MusicItem[] allItems = new MusicItem[MusicData.MUSIC_SAMPLES.length];
+        for (int i = 0, n = allItems.length; i < n; i++) {
+            allItems[i] = new MusicItem(i, MusicData.MUSIC_SAMPLES[i]);
+        }
+        return allItems;
+    }
+
+    /**
+     * Builds a small, fixed example: three users (Frank, Constantine,
+     * Catherine) with seven hand-picked ratings each.
+     *
+     * @return the three users with their ratings set.
+     */
+    public static MusicUser[] loadExample() {
+        MusicUser[] mu = new MusicUser[3];
+
+        mu[0] = createUser("Frank");
+        mu[1] = createUser("Constantine");
+        mu[2] = createUser("Catherine");
+
+        MusicItem[] mi = new MusicItem[11];
+
+        mi[0] = createItem("Tears In Heaven");
+        mi[1] = createItem("La Bamba");
+        mi[2] = createItem("Mrs. Robinson");
+        mi[3] = createItem("Yesterday");
+        mi[4] = createItem("Wizard Of Oz");
+        mi[5] = createItem("Mozart: Symphony #41 (Jupiter)");
+        mi[6] = createItem("Beethoven: Symphony No. 9 in D minor");
+        mi[7] = createItem("Fiddler On The Roof");
+        mi[8] = createItem("What A Wonderful World");
+        mi[9] = createItem("Let It Be");
+        mi[10] = createItem("Sunday, Bloody Sunday");
+
+        List<Rating> mr0 = new ArrayList<Rating>();
+        List<Rating> mr1 = new ArrayList<Rating>();
+        List<Rating> mr2 = new ArrayList<Rating>();
+
+        /*
+         * Tears In Heaven <- 0
+         * La Bamba <- 1
+         * Mrs. Robinson <- 2
+         * Yesterday <- 3
+         * Wizard Of Oz <- 4
+         * Mozart: Symphony #41 (Jupiter) <- 5
+         * Beethoven: Symphony No. 9 in D <- 6
+         */
+        mr0.add(new MusicRating(mu[0].getId(), mi[0].getId(), 5));
+        mr0.add(new MusicRating(mu[0].getId(), mi[1].getId(), 4));
+        mr0.add(new MusicRating(mu[0].getId(), mi[2].getId(), 5));
+        mr0.add(new MusicRating(mu[0].getId(), mi[3].getId(), 4));
+        mr0.add(new MusicRating(mu[0].getId(), mi[4].getId(), 5));
+        mr0.add(new MusicRating(mu[0].getId(), mi[5].getId(), 4));
+        mr0.add(new MusicRating(mu[0].getId(), mi[6].getId(), 5));
+
+        /*
+         * Tears In Heaven <- 0
+         * Fiddler On The Roof <- 7
+         * Mrs. Robinson <- 2
+         * What A Wonderful World <- 8
+         * Wizard Of Oz <- 4
+         * Let It Be <- 9
+         * Mozart: Symphony #41 (Jupiter) <- 5
+         */
+        mr1.add(new MusicRating(mu[1].getId(), mi[0].getId(), 5));
+        mr1.add(new MusicRating(mu[1].getId(), mi[7].getId(), 5));
+        mr1.add(new MusicRating(mu[1].getId(), mi[2].getId(), 5));
+        mr1.add(new MusicRating(mu[1].getId(), mi[8].getId(), 4));
+        mr1.add(new MusicRating(mu[1].getId(), mi[4].getId(), 4));
+        mr1.add(new MusicRating(mu[1].getId(), mi[9].getId(), 5));
+        mr1.add(new MusicRating(mu[1].getId(), mi[5].getId(), 5));
+
+        /*
+         * Tears In Heaven <- 0
+         * Mrs. Robinson <- 2
+         * Yesterday <- 3
+         * Beethoven: Symphony No. 9 in D <- 6
+         * Sunday, Bloody Sunday <- 10
+         * Yesterday <- 3
+         * Let It Be <- 9
+         *
+         * NOTE(review): "Yesterday" is rated twice here (2, then 1). Left
+         * unchanged because the intended song cannot be told from this file;
+         * confirm whether the second entry should be a different song.
+         */
+        mr2.add(new MusicRating(mu[2].getId(), mi[0].getId(), 1));
+        mr2.add(new MusicRating(mu[2].getId(), mi[2].getId(), 2));
+        mr2.add(new MusicRating(mu[2].getId(), mi[3].getId(), 2));
+        mr2.add(new MusicRating(mu[2].getId(), mi[6].getId(), 3));
+        mr2.add(new MusicRating(mu[2].getId(), mi[10].getId(), 1));
+        mr2.add(new MusicRating(mu[2].getId(), mi[3].getId(), 1));
+        mr2.add(new MusicRating(mu[2].getId(), mi[9].getId(), 2));
+
+        mu[0].setRatings(mr0);
+        mu[1].setRatings(mr1);
+        mu[2].setRatings(mr2);
+
+        return mu;
+    }
+
+    /**
+     * Returns a random selection of songs without repetition (songs are
+     * distinguished by their id).
+     *
+     * @param songs
+     *            list of songs to pick from
+     * @param percentOfAllSongs
+     *            fraction between 0 and 1 that determines size of returned
+     *            selection (rounded to the nearest whole number of songs).
+     *
+     * @return array of songs.
+     * @throws IllegalArgumentException
+     *             if <code>percentOfAllSongs</code> is outside of [0, 1].
+     */
+    private static MusicItem[] pickRandomSongs(MusicItem[] songs,
+            double percentOfAllSongs) {
+
+        if (percentOfAllSongs < 0.0 || percentOfAllSongs > 1.0) {
+            throw new IllegalArgumentException(
+                    "Value for 'percentOfAllSongs' argument should be between 0 and 1.");
+        }
+        Random rand = new Random();
+        int sampleSize = (int) Math.round(percentOfAllSongs * songs.length);
+        Map<Integer, MusicItem> pickedItems = new HashMap<Integer, MusicItem>();
+        // keep drawing until enough distinct songs have been collected
+        while (pickedItems.size() < sampleSize) {
+            MusicItem song = songs[rand.nextInt(songs.length)];
+            if (!pickedItems.containsKey(song.getId())) {
+                pickedItems.put(song.getId(), song);
+            }
+        }
+
+        return pickedItems.values().toArray(new MusicItem[pickedItems.size()]);
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MusicItem.java b/src/org/yooreeka/algos/reco/collab/data/MusicItem.java
new file mode 100644
index 0000000..ab5e823
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MusicItem.java
@@ -0,0 +1,71 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Item for music dataset. Adds an optional artist name to {@link Item}.
+ *
+ * @author Babis Marmanis
+ */
+public class MusicItem extends Item {
+
+    /** Serialization version id. */
+    private static final long serialVersionUID = 3219691524340585231L;
+
+    /** Artist name; stays <code>null</code> until {@link #setArtist} is called. */
+    String artist;
+
+    /**
+     * Creates a song item that starts with an empty rating list.
+     *
+     * @param id
+     *            item id
+     * @param name
+     *            song name
+     */
+    public MusicItem(int id, String name) {
+        super(id, name, new ArrayList<Rating>(3));
+    }
+
+    /**
+     * @return the artist
+     */
+    public String getArtist() {
+        return artist;
+    }
+
+    /**
+     * @param artist
+     *            the artist to set
+     */
+    public void setArtist(String artist) {
+        this.artist = artist;
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MusicRating.java b/src/org/yooreeka/algos/reco/collab/data/MusicRating.java
new file mode 100644
index 0000000..9046889
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MusicRating.java
@@ -0,0 +1,52 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009).
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.data; + +import org.yooreeka.algos.reco.collab.model.Rating; + +/** + * Rating for music dataset. 
+ *
+ * This class declares no state or behaviour of its own; its constructor only
+ * forwards its three arguments to {@link Rating}.
+ *
+ * @author Babis Marmanis
+ */
+public class MusicRating extends Rating {
+
+    /** Serialization version id. */
+    private static final long serialVersionUID = 4015578066768031191L;
+
+    /**
+     * Creates the rating that a user gave to a song.
+     *
+     * @param userId
+     *            id of the user who rated
+     * @param songId
+     *            id of the rated song
+     * @param rating
+     *            rating value
+     */
+    public MusicRating(int userId, int songId, int rating) {
+        super(userId, songId, rating);
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/MusicUser.java b/src/org/yooreeka/algos/reco/collab/data/MusicUser.java
new file mode 100644
index 0000000..1c8e89a
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/MusicUser.java
@@ -0,0 +1,249 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.User;
+import org.yooreeka.util.gui.XyGui;
+
+/**
+ * User for music dataset. Adds rating-based similarity between two users and
+ * simple plots of a user's ratings.
+ *
+ * @author Babis Marmanis
+ */
+public class MusicUser extends User {
+
+    /** Serialization version id. */
+    private static final long serialVersionUID = 4866915806848833932L;
+
+    public MusicUser(int userId, String name) {
+        super(userId, name);
+    }
+
+    public MusicUser(int userId, String name, List<Rating> ratings) {
+        super(userId, name, ratings);
+    }
+
+    /**
+     * Computes how similar this user is to another user, based on the songs
+     * that both of them rated, and prints the result to standard output.
+     *
+     * @param u
+     *            the user to compare with
+     * @param simType
+     *            0 for the plain rating-based similarity; 1 for the same value
+     *            scaled by the share of common items. Any other value yields
+     *            0.
+     * @return the similarity; 0 when the users have no rated song in common.
+     */
+    public double getSimilarity(MusicUser u, int simType) {
+
+        double sim = 0.0d;
+
+        /**
+         * TODO: 3.1 -- Types of similarity (Book section 3.1.2)
+         *
+         * In the following switch, we include two types of similarity. You can
+         * extend the functionality of this method by adding more types. For
+         * example, the Jaccard similarity could be defined as the ratio of the
+         * intersection over the union of the items between two users. In other
+         * words,
+         *
+         *                              Number of songs in common
+         *   Jaccard Similarity = --------------------------------------------
+         *                        Number of all songs listened by either user
+         *
+         * Are more complicated similarity metrics more accurate?
+         */
+
+        switch (simType) {
+
+        case 0:
+            sim = getRatingBasedSimilarity(u, false);
+            break;
+
+        case 1:
+            sim = getRatingBasedSimilarity(u, true);
+            break;
+
+        default:
+            // unknown similarity type: the similarity stays 0
+            break;
+        }
+
+        // Let us know what it is
+        System.out.print("\n"); // Just for pretty printing in the Shell
+        System.out.print(" User Similarity between");
+        System.out.print(" " + this.getName());
+        System.out.print(" and " + u.getName());
+        System.out.println(" is equal to " + sim);
+        System.out.print("\n"); // Just for pretty printing in the Shell
+
+        return sim;
+    }
+
+    /**
+     * Shared computation behind both similarity types.
+     *
+     * @param u
+     *            the user to compare with
+     * @param weightByOverlap
+     *            when <code>true</code>, the result is multiplied by the ratio
+     *            of common items over the largest possible number of common
+     *            items
+     * @return 0 when there are no common items, otherwise a value in (0, 1].
+     */
+    private double getRatingBasedSimilarity(MusicUser u,
+            boolean weightByOverlap) {
+
+        double sumOfSquares = 0.0d;
+        int commonItems = 0;
+
+        for (Rating r : this.ratingsByItemId.values()) {
+            for (Rating r2 : u.ratingsByItemId.values()) {
+
+                // Find the same item
+                if (r.getItemId() == r2.getItemId()) {
+                    commonItems++;
+                    sumOfSquares += Math.pow((r.getRating() - r2.getRating()),
+                            2);
+                }
+            }
+        }
+
+        // If there are not common items, we cannot tell whether
+        // the users are similar or not. So, we let it return 0.
+        if (commonItems == 0) {
+            return 0.0d;
+        }
+
+        // This is the RMSE, which is more like the distance
+        double distance = Math.sqrt(sumOfSquares / commonItems);
+
+        // Similarity should be between 0 and 1
+        // For the value 0, the two users are as dissimilar as they come
+        // For the value 1, their preferences (based on the available
+        // data) are identical.
+        //
+        // Here is a function that accomplishes exactly that
+        double sim = 1.0d - Math.tanh(distance);
+
+        if (weightByOverlap) {
+            // The calculation above takes into account only the common items.
+            // It does not account for the number of items that the users
+            // could have in common. The maximum number of items that the two
+            // users can have in common is the size of the smaller rating set.
+            int maxCommonItems = Math.min(this.ratingsByItemId.size(),
+                    u.ratingsByItemId.size());
+
+            // Adjust the similarity through the ratio of the common items
+            // over the number of all possible common items
+            sim = sim * ((double) commonItems / (double) maxCommonItems);
+        }
+
+        return sim;
+    }
+
+    /**
+     * Plots this user's ratings: the x value is the position of the rating in
+     * the iteration order of the rating map, the y value is the rating.
+     */
+    public void plot() {
+
+        int n = this.ratingsByItemId.size();
+
+        double[] x = new double[n];
+        double[] y = new double[n];
+
+        int i = 0;
+        for (Integer itemId : this.ratingsByItemId.keySet()) {
+            x[i] = i;
+            y[i] = this.getItemRating(itemId).getRating();
+            // advance to the next point; the index used to stay at 0, so
+            // every rating overwrote the first slot
+            i++;
+        }
+
+        XyGui gui = new XyGui("", x, y);
+        gui.plot();
+    }
+
+    /**
+     * Plots the ratings of this user against the ratings of another user for
+     * the items that both of them rated, ordered by increasing difference of
+     * opinion.
+     *
+     * @param anotherUser
+     *            the user to compare with
+     */
+    public void plot(MusicUser anotherUser) {
+        // ratings for items rated by both users
+        List<Rating[]> sharedRatings = new ArrayList<Rating[]>();
+
+        // iterate through user ratings and check if another user rated the same
+        // items
+        for (Rating r : ratingsByItemId.values()) {
+            Rating anotherUserRating = anotherUser.getItemRating(r.getItemId());
+            if (anotherUserRating != null) {
+                // item was rated by both users. Add both ratings to the list
+                sharedRatings.add(new Rating[] { r, anotherUserRating });
+            }
+        }
+
+        // sort shared ratings based on the difference of opinions
+        Collections.sort(sharedRatings, new Comparator<Rating[]>() {
+            public int compare(Rating[] x, Rating[] y) {
+                double xDiff = Math.abs(x[0].getRating() - x[1].getRating());
+                double yDiff = Math.abs(y[0].getRating() - y[1].getRating());
+                return Double.compare(xDiff, yDiff);
+            }
+        });
+
+        double[] data1 = new double[sharedRatings.size()];
+        double[] data2 = new double[sharedRatings.size()];
+        String[] itemNames = new String[sharedRatings.size()];
+        for (int i = 0, n = itemNames.length; i < n; i++) {
+            Rating[] itemRatings = sharedRatings.get(i);
+            // Right now there is no way to get to Item from User or Rating.
+            // Only itemId is available from User or Rating instance.
+            // I'll change loading to include Item in Rating if we need to show
+            // song name on the chart.
+            itemNames[i] = String.valueOf(itemRatings[0].getItemId());
+            data1[i] = itemRatings[0].getRating();
+            data2[i] = itemRatings[1].getRating();
+        }
+
+        XyGui gui = new XyGui("User Similarity", this.getName(),
+                anotherUser.getName(), itemNames, data1, data2);
+
+        gui.plot();
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/NewsData.java b/src/org/yooreeka/algos/reco/collab/data/NewsData.java
new file mode 100644
index 0000000..4077e68
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/NewsData.java
@@ -0,0 +1,202 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Utility class that we use as the source for News data: a fixed list of
+ * user names and a fixed list of sample HTML documents, from which
+ * {@link #createDataset()} assembles a {@link BaseDataset}.
+ */
+public class NewsData {
+
+    public static final String[] USERS = { "Albert", "Alexandra", "Athena",
+            "Aurora", "Babis", "Bill", "Bob", "Carl", "Catherine", "Charlie",
+            "Constantine", "Dmitry", "Elena", "Eric", "Frank", "George",
+            "Jack", "John", "Maria", "Lukas", "Nick", "Terry", "Todd" };
+
+    public static final String[] DOC_SAMPLES = { "biz-01.html", "biz-02.html",
+            "biz-03.html", "biz-04.html", "biz-05.html", "biz-06.html",
+            "biz-07.html", "sport-01.html", "sport-02.html", "sport-03.html",
+            "usa-01.html", "usa-02.html", "usa-03.html", "usa-04.html",
+            "world-01.html", "world-02.html", "world-03.html", "world-04.html",
+            "world-05.html" };
+
+    /**
+     * Builds data set with all the users where each user is assigned 80% of all
+     * the eligible content, as defined below:
+     * <ul>
+     * <li>Users whose name starts from A to D will have 'business' and 'sport'
+     * content.</li>
+     * <li>Users whose name starts from E to Z will have 'usa' and 'world'
+     * content.</li>
+     * </ul>
+     * The documents given to each user are picked at random, so two calls do
+     * not produce the same assignment.
+     *
+     * @return dataset holding every document of {@link #DOC_SAMPLES} as an item
+     *         and one {@link NewsUser} per entry of {@link #USERS}; the user id
+     *         is the index of the name in {@link #USERS}.
+     */
+    public static BaseDataset createDataset() {
+        /*
+         * Percent of document items that will be selected from provided
+         * group of items.
+         */
+        final double percentOfDocs = 0.80;
+
+        BaseDataset ds = new BaseDataset();
+
+        /* Create items first */
+        ContentItem[] allItems = loadAllNewsItems();
+        for (ContentItem item : allItems) {
+            ds.addItem(item);
+        }
+
+        /*
+         * The two eligible groups depend only on the document names, not on
+         * the user, so they are selected once instead of once per user.
+         */
+        ContentItem[] bizAndSportDocs = selectEligibleDocs(allItems,
+                new String[] { "biz", "sport" });
+        ContentItem[] usaAndWorldDocs = selectEligibleDocs(allItems,
+                new String[] { "usa", "world" });
+
+        for (int userId = 0, n = USERS.length; userId < n; userId++) {
+            String userName = USERS[userId];
+            char initial = Character.toLowerCase(userName.charAt(0));
+            ContentItem[] eligibleDocs = (initial <= 'd') ? bizAndSportDocs
+                    : usaAndWorldDocs;
+
+            ContentItem[] docs = pickRandomDocs(eligibleDocs, percentOfDocs);
+
+            NewsUser u = new NewsUser(userId, userName);
+            for (ContentItem doc : docs) {
+                u.addUserContent(doc.getItemContent());
+            }
+
+            ds.add(u);
+        }
+
+        return ds;
+    }
+
+    /**
+     * Creates the item for a single document and attaches the content loaded
+     * for <code>docName</code>.
+     */
+    private static ContentItem createNewsItem(int docId, String docName) {
+        Content content = loadContent(docName);
+        return new ContentItem(docId, docName, content);
+    }
+
+    /**
+     * Returns array of new ContentItem instances for every document listed in
+     * DOC_SAMPLES array. The id of each item is the index of its document name
+     * in that array.
+     */
+    private static ContentItem[] loadAllNewsItems() {
+        ContentItem[] allItems = new ContentItem[NewsData.DOC_SAMPLES.length];
+        for (int i = 0, n = allItems.length; i < n; i++) {
+            allItems[i] = createNewsItem(i, NewsData.DOC_SAMPLES[i]);
+        }
+        return allItems;
+    }
+
+    /**
+     * Creates the content object for a document that is located under
+     * <code>/data/ch02/</code> of the Yooreeka home directory.
+     */
+    private static Content loadContent(String docName) {
+        return new HTMLContent(docName, YooreekaConfigurator.getHome()
+                + "/data/ch02/" + docName);
+    }
+
+    /**
+     * Returns a random selection of documents without repetition.
+     *
+     * @param newsItems
+     *            list of documents to pick from
+     * @param percentOfDocs
+     *            determines size of returned selection: the fraction (between
+     *            0 and 1) of <code>newsItems</code>, rounded to the nearest
+     *            whole number of documents.
+     *
+     * @return array of documents.
+     * @throws IllegalArgumentException
+     *             if <code>percentOfDocs</code> is below 0 or above 1.
+     */
+    private static ContentItem[] pickRandomDocs(ContentItem[] newsItems,
+            double percentOfDocs) {
+
+        if (percentOfDocs < 0.0 || percentOfDocs > 1.0) {
+            throw new IllegalArgumentException(
+                    "Value for 'percentOfDocs' argument should be "
+                            + "between 0 and 1.");
+        }
+
+        Random rand = new Random();
+        int sampleSize = (int) Math.round(percentOfDocs * newsItems.length);
+
+        /* Keyed by item id so that a document drawn twice is counted once. */
+        Map<Integer, Item> pickedItems = new HashMap<Integer, Item>();
+        while (pickedItems.size() < sampleSize) {
+            Item item = newsItems[rand.nextInt(newsItems.length)];
+            if (!pickedItems.containsKey(item.getId())) {
+                pickedItems.put(item.getId(), item);
+            }
+        }
+
+        return pickedItems.values()
+                .toArray(new ContentItem[pickedItems.size()]);
+    }
+
+    /**
+     * Returns the documents whose name starts with at least one of the given
+     * prefixes, in the order in which they appear in <code>docs</code>.
+     */
+    private static ContentItem[] selectEligibleDocs(ContentItem[] docs,
+            String[] prefixes) {
+        List<ContentItem> eligibleDocs = new ArrayList<ContentItem>();
+        for (ContentItem doc : docs) {
+            for (String prefix : prefixes) {
+                if (doc.getName().startsWith(prefix)) {
+                    eligibleDocs.add(doc);
+                    /* one matching prefix is enough; do not add the doc twice */
+                    break;
+                }
+            }
+        }
+        return eligibleDocs.toArray(new ContentItem[eligibleDocs.size()]);
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/NewsItem.java b/src/org/yooreeka/algos/reco/collab/data/NewsItem.java
new file mode 100644
index 0000000..7dedef4
--- /dev/null
+++
b/src/org/yooreeka/algos/reco/collab/data/NewsItem.java
@@ -0,0 +1,54 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+
+import org.yooreeka.algos.reco.collab.model.Content;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Item for news dataset: an {@link Item} that is created without ratings and with its content set.
+ */
+public class NewsItem extends Item {
+
+    /** SVUID (serialization version of this class). */
+    private static final long serialVersionUID = 6349342365379966975L;
+
+    /** Creates the item and stores <code>content</code> through <code>setItemContent</code>. */
+    public NewsItem(int id, String name, Content content) {
+        // Starts from an empty list; presumably this is the item's rating list (confirm against Item).
+        super(id, name, new ArrayList<Rating>(3));
+        setItemContent(content);
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/NewsUser.java b/src/org/yooreeka/algos/reco/collab/data/NewsUser.java
new file mode 100644
index 0000000..ae316b5
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/NewsUser.java
@@ -0,0 +1,82 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.io.Serializable;
+import java.util.List;
+
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.User;
+
+/**
+ * User of the news dataset. The class adds no state or behavior of its own:
+ * every constructor forwards its arguments to the {@link User} constructor
+ * with the same parameter list.
+ *
+ * @author Babis Marmanis
+ */
+public class NewsUser extends User implements Serializable {
+
+    /**
+     * SVUID
+     */
+    private static final long serialVersionUID = 3415187707158663184L;
+
+    /**
+     * @param id
+     *            identifier of the user
+     */
+    public NewsUser(int id) {
+        super(id);
+    }
+
+    /**
+     * @param id
+     *            identifier of the user
+     * @param ratings
+     *            ratings handed to the superclass constructor
+     */
+    public NewsUser(int id, List<Rating> ratings) {
+        super(id, ratings);
+    }
+
+    /**
+     * @param id
+     *            identifier of the user
+     * @param name
+     *            name of the user
+     */
+    public NewsUser(int id, String name) {
+        super(id, name);
+    }
+
+    /**
+     * @param id
+     *            identifier of the user
+     * @param name
+     *            name of the user
+     * @param ratings
+     *            ratings handed to the superclass constructor
+     */
+    public NewsUser(int id, String name, List<Rating> ratings) {
+        super(id, name, ratings);
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/data/RatingBuilder.java b/src/org/yooreeka/algos/reco/collab/data/RatingBuilder.java
new file mode 100644
index 0000000..0491ae2
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/data/RatingBuilder.java
@@ -0,0 +1,94 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009).
Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.data;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Utility class to generate random ratings.
+ */
+class RatingBuilder {
+
+    /** Source of randomness; it is not seeded, so results differ between runs. */
+    private final Random rand;
+
+    public RatingBuilder() {
+        rand = new java.util.Random();
+    }
+
+    /**
+     * Creates biased ratings for all items.
+     *
+     * @param userId
+     *            rating user.
+     * @param items
+     *            to create ratings for.
+     * @param lowerBias
+     *            low range for rating value (inclusive)
+     * @param upperBias
+     *            high range for rating value (inclusive)
+     * @return one rating per item, in the order of <code>items</code>; every
+     *         rating has its item set.
+     * @throws IllegalArgumentException
+     *             if <code>upperBias</code> is less than <code>lowerBias</code>
+     *             and <code>items</code> is not empty.
+     */
+    public List<Rating> createBiasedRatings(int userId, Item[] items,
+            int lowerBias, int upperBias) {
+        List<Rating> ratings = new ArrayList<Rating>(items.length);
+        for (Item item : items) {
+            int biasedRandomRating = getRandomRating(lowerBias, upperBias);
+            Rating rating = new Rating(userId, item.getId(), biasedRandomRating);
+            rating.setItem(item);
+            ratings.add(rating);
+        }
+        return ratings;
+    }
+
+    /**
+     * @return random rating between 1 and 5, both inclusive.
+     */
+    public int getRandomRating() {
+        // No bias
+        return getRandomRating(5);
+    }
+
+    /**
+     * @param upperBias
+     *            highest rating that can be returned
+     * @return random rating between 1 and <code>upperBias</code>, both
+     *         inclusive.
+     */
+    public int getRandomRating(int upperBias) {
+
+        // Lower bias is 1
+        return getRandomRating(1, upperBias);
+    }
+
+    /**
+     * @param lowerBias
+     *            lowest rating that can be returned
+     * @param upperBias
+     *            highest rating that can be returned
+     * @return random rating between <code>lowerBias</code> and
+     *         <code>upperBias</code>, both inclusive.
+     * @throws IllegalArgumentException
+     *             if <code>upperBias</code> is less than <code>lowerBias</code>,
+     *             or if the range holds more values than an <code>int</code>
+     *             can count.
+     */
+    public int getRandomRating(int lowerBias, int upperBias) {
+
+        if (upperBias < lowerBias) {
+            throw new IllegalArgumentException("upperBias (" + upperBias
+                    + ") must not be less than lowerBias (" + lowerBias + ").");
+        }
+
+        // We add 1 at the end because the nextInt(n) call excludes n.
+        // Computed as long so that a very wide range cannot wrap around.
+        long n = ((long) upperBias - lowerBias) + 1;
+        if (n > Integer.MAX_VALUE) {
+            throw new IllegalArgumentException("Range from " + lowerBias
+                    + " to " + upperBias + " is too wide.");
+        }
+        return (lowerBias + rand.nextInt((int) n));
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/evaluation/EvaluationDataProvider.java b/src/org/yooreeka/algos/reco/collab/evaluation/EvaluationDataProvider.java
new file mode 100644
index 0000000..c742f7a
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/evaluation/EvaluationDataProvider.java
@@ -0,0 +1,44 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.evaluation;
+
+import java.util.List;
+
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Interface to access previously generated evaluation data. A set of
+ * evaluation data is identified by the pair (testSize, testSequence).
+ */
+public interface EvaluationDataProvider {
+
+    /**
+     * Loads the ratings of the test set.
+     *
+     * @param testSize
+     *            number of ratings in the test set.
+     * @param testSequence
+     *            distinguishes multiple test sets with the same number of
+     *            ratings.
+     * @return ratings of the test set.
+     */
+    List<Rating> loadTestRatings(int testSize, int testSequence);
+
+    /**
+     * Loads the ratings of the training set that belongs to the test set
+     * identified by the same arguments.
+     *
+     * @param testSize
+     *            number of ratings in the test set.
+     * @param testSequence
+     *            distinguishes multiple test sets with the same number of
+     *            ratings.
+     * @return ratings of the training set.
+     */
+    List<Rating> loadTrainingRatings(int testSize, int testSequence);
+}
diff --git a/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensEvaluationDataProvider.java b/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensEvaluationDataProvider.java
new file mode 100644
index 0000000..70a45f4
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensEvaluationDataProvider.java
@@ -0,0 +1,283 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied.
See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.evaluation;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+
+import org.yooreeka.algos.reco.collab.data.MovieLensDataset;
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Rating;
+
+/**
+ * Creates, stores, loads and deletes pairs of test/training rating files that
+ * are derived from the ratings of a dataset. The directory and the two
+ * filename prefixes have to be set through their setters before any of the
+ * file operations is used.
+ */
+public class MovieLensEvaluationDataProvider implements EvaluationDataProvider {
+
+    /*
+     * Location for files with test and training data.
+     */
+    private String evaluationDataDir;
+
+    /*
+     * Provides data that will be used to produce training and test files.
+     */
+    private Dataset dataset;
+
+    /*
+     * Prefix that will be used in filename for files with test ratings.
+     */
+    private String testFilenamePrefix;
+
+    /*
+     * Prefix that will be used in filename for files with training ratings.
+     */
+    private String trainingFilenamePrefix;
+
+    public MovieLensEvaluationDataProvider(Dataset dataset) {
+        this.dataset = dataset;
+    }
+
+    /**
+     * Creates evaluation data for sequence 1.
+     *
+     * @param testSize
+     *            number of ratings in test set.
+     */
+    public void createData(int testSize) {
+        createData(testSize, 1);
+    }
+
+    /**
+     * Creates evaluation data by splitting original item rating set into two
+     * sets: training set and test sets. Test set is built by randomly selecting
+     * ratings from the original ratings set. Training set is built by selecting
+     * everything that is left from the original set.
+     *
+     * @param testSize
+     *            number of ratings in test set.
+     * @param sequence
+     *            allows to generate multiple test sets with the same number of
+     *            ratings.
+     * @throws IllegalArgumentException
+     *             if <code>testSize</code> is greater than the number of
+     *             ratings in the dataset.
+     */
+    public void createData(int testSize, int sequence) {
+
+        /* start with complete list of all available ratings */
+        List<Rating> allRatings = new ArrayList<Rating>(dataset.getRatings());
+
+        /* extract required number of ratings and use them as testing set */
+        List<Rating> testRatings = removeRatings(allRatings, testSize);
+        /* use the rest of the ratings as a training set */
+        List<Rating> trainingRatings = allRatings;
+
+        String testRatingsFilename = createFilename(testFilenamePrefix,
+                testSize, sequence);
+
+        String trainingRatingsFilename = createFilename(trainingFilenamePrefix,
+                testSize, sequence);
+
+        saveRatings(testRatingsFilename, testRatings);
+        saveRatings(trainingRatingsFilename, trainingRatings);
+    }
+
+    /**
+     * Builds unique filename for file that contains ratings for training or
+     * test.
+     *
+     * Example:
+     * <ul>
+     * <li>MovieLensRatingsTrainingN16000Rnd1.dat - first training file that was
+     * obtained by removing 16000 ratings from original ratings file.</li>
+     * <li>MovieLensRatingsTestN16000Rnd1.dat - first test file with 16000
+     * ratings that were removed from original ratings file.</li>
+     * <li>MovieLensRatingsTrainingN16000Rnd2.dat - second training file that
+     * was obtained by removing 16000 ratings from original ratings file.</li>
+     * <li>MovieLensRatingsTestN16000Rnd2.dat - second test file with 16000
+     * ratings that were removed from original ratings file.</li>
+     * </ul>
+     *
+     * @param namePrefix
+     *            identifies source of the data and the purpose (testing or
+     *            training) of the file.
+     * @param n
+     *            number of ratings that were randomly selected from the
+     *            original set of ratings and put in test file. Both training
+     *            and test files are identified by this number.
+     * @param sequence
+     *            random selection sequence. In some cases we need to generate
+     *            multiple test files with the same number of ratings but with
+     *            different selection every time. Callers that do not pass a
+     *            sequence use 1.
+     * @return <code>namePrefix</code> followed by "N", <code>n</code>, "Rnd",
+     *         <code>sequence</code> and ".dat".
+     */
+    public String createFilename(String namePrefix, int n, int sequence) {
+        return namePrefix + "N" + n + "Rnd" + sequence + ".dat";
+    }
+
+    public String getEvaluationDataDir() {
+        return evaluationDataDir;
+    }
+
+    public String getTestFilenamePrefix() {
+        return testFilenamePrefix;
+    }
+
+    public String getTrainingFilenamePrefix() {
+        return trainingFilenamePrefix;
+    }
+
+    @Override
+    public List<Rating> loadTestRatings(int testSize, int testSequence) {
+        String filename = createFilename(testFilenamePrefix, testSize,
+                testSequence);
+        File f = new File(evaluationDataDir, filename);
+
+        return MovieLensDataset.loadRatings(f);
+    }
+
+    @Override
+    public List<Rating> loadTrainingRatings(int testSize, int testSequence) {
+        String filename = createFilename(trainingFilenamePrefix, testSize,
+                testSequence);
+        File f = new File(evaluationDataDir, filename);
+        return MovieLensDataset.loadRatings(f);
+    }
+
+    /**
+     * Creates a set of training and test data, unless it already exists.
+     *
+     * @param testSize
+     *            number of ratings that will be used to create testing set.
+     *            Size of training set is defined as AllAvailableRatings -
+     *            testSize
+     */
+    public void prepareTestData(int testSize) {
+        prepareTestData(testSize, 1);
+    }
+
+    /**
+     * Creates multiple sets of training and test data. Should be used when we
+     * need to create multiple test files for the same tests. Nothing is created
+     * when both files for the given arguments already exist.
+     *
+     * @param testSize
+     *            number of test ratings.
+     * @param sequence
+     *            test sequence.
+     */
+    public void prepareTestData(int testSize, int sequence) {
+        if (!testDataExist(testSize, sequence)) {
+            createData(testSize, sequence);
+        }
+    }
+
+    /**
+     * Deletes the file with the given name from the evaluation data directory
+     * if it exists. Best effort: the outcome of the deletion is not checked.
+     */
+    private void removeFile(String filename) {
+        File f = new File(evaluationDataDir, filename);
+        if (f.exists()) {
+            f.delete();
+        }
+    }
+
+    /**
+     * Moves <code>n</code> randomly chosen ratings out of
+     * <code>allRatings</code>.
+     *
+     * @param allRatings
+     *            list to take the ratings from; the chosen ratings are removed
+     *            from this list.
+     * @param n
+     *            number of ratings to remove; nothing is removed when it is
+     *            zero or negative.
+     * @return the removed ratings, in the order in which they were chosen.
+     * @throws IllegalArgumentException
+     *             if <code>n</code> is greater than the size of
+     *             <code>allRatings</code>. The check is made before anything is
+     *             removed, so the list is left untouched in that case.
+     */
+    private List<Rating> removeRatings(List<Rating> allRatings, int n) {
+
+        if (n > allRatings.size()) {
+            throw new IllegalArgumentException("Cannot remove " + n
+                    + " ratings: only " + allRatings.size()
+                    + " ratings are available.");
+        }
+
+        List<Rating> removedRatings = new ArrayList<Rating>(Math.max(n, 0));
+        Random rnd = new Random();
+        while (removedRatings.size() < n) {
+            int randomIndex = rnd.nextInt(allRatings.size());
+            removedRatings.add(allRatings.remove(randomIndex));
+        }
+        return removedRatings;
+    }
+
+    /**
+     * Deletes test data. Defaults sequence to 1.
+     *
+     * @param testSize
+     *            number of ratings in the test set.
+     */
+    public void removeTestData(int testSize) {
+        removeTestData(testSize, 1);
+    }
+
+    /**
+     * Deletes test data: both the test file and the training file.
+     *
+     * @param testSize
+     *            number of ratings in the test set.
+     * @param sequence
+     *            test sequence.
+     */
+    public void removeTestData(int testSize, int sequence) {
+        String testFilename = createFilename(testFilenamePrefix, testSize,
+                sequence);
+        removeFile(testFilename);
+
+        String trainingFilename = createFilename(trainingFilenamePrefix,
+                testSize, sequence);
+        removeFile(trainingFilename);
+    }
+
+    /**
+     * Writes the ratings into a new ratings file with the given name inside the
+     * evaluation data directory.
+     */
+    private void saveRatings(String filename, Collection<Rating> ratings) {
+        File f = new File(evaluationDataDir, filename);
+        MovieLensDataset.createNewRatingsFile(f, ratings);
+    }
+
+    public void setEvaluationDataDir(String value) {
+        this.evaluationDataDir = value;
+    }
+
+    public void setTestFilenamePrefix(String testFilenamePrefix) {
+        this.testFilenamePrefix = testFilenamePrefix;
+    }
+
+    public void setTrainingFilenamePrefix(String trainingFilenamePrefix) {
+        this.trainingFilenamePrefix = trainingFilenamePrefix;
+    }
+
+    /**
+     * Checks if the test set for sequence 1 already exists.
+     *
+     * @param testSize
+     *            number of ratings in the test set.
+     * @return true if both the test file and the training file exist.
+     */
+    public boolean testDataExist(int testSize) {
+        return testDataExist(testSize, 1);
+    }
+
+    /**
+     * Checks if the test set already exists. As a side effect the evaluation
+     * data directory is created if it does not exist yet.
+     *
+     * @param testSize
+     *            number of ratings in the test set.
+     * @param sequence
+     *            test sequence.
+     * @return true if both the test file and the training file exist.
+     */
+    public boolean testDataExist(int testSize, int sequence) {
+        // create temporary directory if it doesn't exist yet.
+        File tmpDirFile = new File(evaluationDataDir);
+        if (!tmpDirFile.exists()) {
+            tmpDirFile.mkdirs();
+        }
+
+        String testFilename = createFilename(testFilenamePrefix, testSize,
+                sequence);
+        String trainingFilename = createFilename(trainingFilenamePrefix,
+                testSize, sequence);
+
+        return new File(evaluationDataDir, testFilename).exists()
+                && new File(evaluationDataDir, trainingFilename).exists();
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensRMSE.java b/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensRMSE.java
new file mode 100644
index 0000000..d33c7d7
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/evaluation/MovieLensRMSE.java
@@ -0,0 +1,104 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.evaluation;
+
+import java.io.File;
+import java.util.List;
+
+import org.yooreeka.algos.reco.collab.data.MovieLensDataset;
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.RecommendationType;
+import org.yooreeka.algos.reco.collab.recommender.Delphi;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Calculates the RMSE of an item-based {@link Delphi} recommender for five
+ * pairs of training/test rating files (<code>u1.base</code>/<code>u1.test</code>
+ * to <code>u5.base</code>/<code>u5.test</code>) that are read from the
+ * directory configured as <code>iweb2.movielens.data.dir</code>.
+ *
+ * @deprecated use RMSEEstimator instead.
+ */
+@Deprecated
+public class MovieLensRMSE {
+
+    public static void main(String[] args) {
+        MovieLensRMSE rmse = new MovieLensRMSE();
+        rmse.calculate();
+    }
+
+    public MovieLensRMSE() {
+    }
+
+    /**
+     * Runs the evaluation for the file pairs 1 to 5 and prints every value to
+     * standard output.
+     *
+     * @return the five RMSE values; the value for file pair <code>i</code> is
+     *         stored at index <code>i - 1</code>.
+     */
+    public double[] calculate() {
+
+        double similarityThreshold = 0.50;
+
+        int N = 5;
+
+        double[] rmse = new double[N];
+
+        RMSEEstimator rmseEstimator = new RMSEEstimator();
+
+        for (int i = 1; i <= N; i++) {
+
+            Dataset ds = createTrainingDataset(i);
+
+            Delphi delphi = new Delphi(ds, RecommendationType.ITEM_BASED);
+            delphi.setSimilarityThreshold(similarityThreshold);
+
+            List<Rating> testRatings = createTestRatings(i);
+
+            double rmseValue = rmseEstimator.calculateRMSE(delphi, testRatings);
+            System.out.println(i + ": rmse = " + rmseValue);
+
+            rmse[i - 1] = rmseValue;
+        }
+
+        return rmse;
+    }
+
+    /**
+     * Loads the test ratings from file <code>u&lt;n&gt;.test</code> of the
+     * configured data directory.
+     *
+     * @param n
+     *            number of the file pair.
+     * @return ratings loaded from the file.
+     */
+    public List<Rating> createTestRatings(int n) {
+        String dataDir = YooreekaConfigurator
+                .getProperty("iweb2.movielens.data.dir");
+
+        File ratings = new File(dataDir, "u" + n + ".test");
+
+        return MovieLensDataset.loadRatings(ratings);
+    }
+
+    /**
+     * Creates the training dataset from the users file, the items file and the
+     * ratings file <code>u&lt;n&gt;.base</code> of the configured data
+     * directory.
+     *
+     * @param n
+     *            number of the file pair.
+     * @return dataset built from the three files.
+     */
+    public MovieLensDataset createTrainingDataset(int n) {
+        String dataDir = YooreekaConfigurator
+                .getProperty("iweb2.movielens.data.dir");
+
+        File users = new File(dataDir, MovieLensDataset.USERS_FILENAME);
+        File items = new File(dataDir, MovieLensDataset.ITEMS_FILENAME);
+        File ratings = new File(dataDir, "u" + n + ".base");
+
+        return new MovieLensDataset(users, items, ratings);
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/evaluation/RMSEEstimator.java b/src/org/yooreeka/algos/reco/collab/evaluation/RMSEEstimator.java
new file mode 100644
index 0000000..34b053a
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/evaluation/RMSEEstimator.java
@@ -0,0 +1,173 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ *
________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.evaluation;
+
+import java.util.Collection;
+import java.util.logging.Logger;
+
+import org.yooreeka.algos.reco.collab.data.MovieLensDataset;
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.User;
+import org.yooreeka.algos.reco.collab.recommender.Recommender;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Calculates Root Mean Squared Error for the recommender.
+ */
+public class RMSEEstimator {
+
+    private static final Logger LOG = Logger.getLogger(RMSEEstimator.class.getName());
+
+    /** Predictions above this value are replaced by this value before the error is taken. */
+    private static final double MAX_PREDICTED_RATING = 5.0;
+
+    public RMSEEstimator() {
+        LOG.setLevel(YooreekaConfigurator.getLevel(RMSEEstimator.class.getName()));
+    }
+
+    /**
+     * Calculates Root Mean Squared Error for the recommender. Uses test rating
+     * values returned by recommender's dataset, which has to be a
+     * {@link MovieLensDataset}.
+     *
+     * @param delphi
+     *            recommender.
+     * @return root mean squared error value.
+     */
+    public double calculateRMSE(Recommender delphi) {
+
+        MovieLensDataset ds = (MovieLensDataset) delphi.getDataset();
+        Collection<Rating> testRatings = ds.getTestRatings();
+
+        return calculateRMSE(delphi, testRatings);
+    }
+
+    /**
+     * Calculates Root Mean Squared Error for the recommender.
+     *
+     * @param delphi
+     *            recommender to evaluate.
+     * @param testRatings
+     *            ratings that will be used to calculate the error.
+     * @return root mean squared error; NaN when <code>testRatings</code> is
+     *         empty, because the sum of errors is then divided by zero.
+     */
+    public double calculateRMSE(Recommender delphi,
+            Collection<Rating> testRatings) {
+
+        double sum = 0.0;
+
+        Dataset ds = delphi.getDataset();
+
+        int totalSamples = testRatings.size();
+
+        logSetSizes(ds, testRatings);
+
+        for (Rating r : testRatings) {
+            User user = ds.getUser(r.getUserId());
+            Item item = ds.getItem(r.getItemId());
+            double predictedItemRating = capAndLog(
+                    delphi.predictRating(user, item), r);
+
+            sum += Math.pow((predictedItemRating - r.getRating()), 2);
+        }
+        double rmse = Math.sqrt(sum / totalSamples);
+
+        LOG.fine("RMSE:" + rmse);
+
+        return rmse;
+    }
+
+    /**
+     * Prints the RMSE of the recommender next to the RMSE of the two
+     * average-based predictions. Uses test rating values returned by
+     * recommender's dataset, which has to be a {@link MovieLensDataset}.
+     *
+     * @param delphi
+     *            recommender to evaluate.
+     */
+    public void compareRMSEs(Recommender delphi) {
+
+        MovieLensDataset ds = (MovieLensDataset) delphi.getDataset();
+        Collection<Rating> testRatings = ds.getTestRatings();
+
+        compareRMSEs(delphi, testRatings);
+    }
+
+    /**
+     * Prints to standard output the RMSE of the recommender's prediction, of
+     * its prediction based on the item average and of its prediction based on
+     * the user average. Only the recommender's own prediction is capped.
+     *
+     * @param delphi
+     *            recommender to evaluate.
+     * @param testRatings
+     *            ratings that will be used to calculate the errors.
+     */
+    public void compareRMSEs(Recommender delphi, Collection<Rating> testRatings) {
+
+        double sum = 0.0;
+        double sumAvgItem = 0.0;
+        double sumAvgUser = 0.0;
+
+        Dataset ds = delphi.getDataset();
+
+        int totalSamples = testRatings.size();
+
+        logSetSizes(ds, testRatings);
+
+        for (Rating r : testRatings) {
+            User user = ds.getUser(r.getUserId());
+            Item item = ds.getItem(r.getItemId());
+            double rawPrediction = delphi.predictRating(user, item);
+            double predictedAvgItemRating = delphi
+                    .predictBasedOnItemAverage(item);
+            double predictedAvgUserRating = delphi
+                    .predictBasedOnUserAverage(user);
+
+            double predictedItemRating = capAndLog(rawPrediction, r);
+
+            sum += Math.pow((predictedItemRating - r.getRating()), 2);
+            sumAvgItem += Math.pow((predictedAvgItemRating - r.getRating()), 2);
+            sumAvgUser += Math.pow((predictedAvgUserRating - r.getRating()), 2);
+        }
+
+        double rmse = Math.sqrt(sum / totalSamples);
+        double rmseAvgItem = Math.sqrt(sumAvgItem / totalSamples);
+        double rmseAvgUser = Math.sqrt(sumAvgUser / totalSamples);
+
+        System.out.println("RMSE:" + rmse);
+        System.out.println("RMSE (based on avg. Item rating):" + rmseAvgItem);
+        System.out.println("RMSE (based on avg. User rating):" + rmseAvgUser);
+    }
+
+    /** Logs the number of training ratings and of test ratings before an evaluation. */
+    private void logSetSizes(Dataset ds, Collection<Rating> testRatings) {
+        LOG.fine("Calculating RMSE ...");
+        LOG.fine("Training ratings count: " + ds.getRatingsCount());
+        LOG.fine("Test ratings count: " + testRatings.size());
+    }
+
+    /** Caps a prediction at the maximum rating and logs it next to the actual rating. */
+    private double capAndLog(double predictedItemRating, Rating r) {
+        if (predictedItemRating > MAX_PREDICTED_RATING) {
+            predictedItemRating = MAX_PREDICTED_RATING;
+            LOG.finest("Predicted item rating: " + predictedItemRating);
+        }
+        LOG.finest(
+                "user: " + r.getUserId()
+                + ", item: " + r.getItemId()
+                + ", actual rating: " + r.getRating()
+                + ", predicted: " + String.valueOf(predictedItemRating));
+        return predictedItemRating;
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/evaluation/RMSEResult.java b/src/org/yooreeka/algos/reco/collab/evaluation/RMSEResult.java
new file mode 100644
index 0000000..298b6be
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/evaluation/RMSEResult.java
@@ -0,0 +1,85 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
/**
 * Value holder for one RMSE measurement: the kind of recommender that was
 * evaluated, the size of the test set, the similarity threshold in effect and
 * the resulting error.
 */
public class RMSEResult {

    private String type;
    private long testSize;
    private double similarityThreshold;
    private double error;

    /**
     * @param type
     *            label of the evaluated recommender type.
     * @param testSize
     *            number of test samples.
     * @param simThreshold
     *            similarity threshold used for the run.
     * @param error
     *            measured error value.
     */
    public RMSEResult(String type, long testSize, double simThreshold,
            double error) {
        this.error = error;
        this.similarityThreshold = simThreshold;
        this.testSize = testSize;
        this.type = type;
    }

    // ---- type ----

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }

    // ---- test size ----

    public long getTestSize() {
        return this.testSize;
    }

    public void setTestSize(long testSize) {
        this.testSize = testSize;
    }

    // ---- similarity threshold ----

    public double getSimilarityThreshold() {
        return this.similarityThreshold;
    }

    public void setSimilarityThreshold(double similarityThreshold) {
        this.similarityThreshold = similarityThreshold;
    }

    // ---- error ----

    public double getError() {
        return this.error;
    }

    public void setError(double error) {
        this.error = error;
    }

    /**
     * Renders the result on one line; values are read through the getters.
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder();
        out.append("RMSE (testSize=").append(getTestSize());
        out.append(", type=").append(getType());
        out.append(", similarityThreshold=").append(getSimilarityThreshold());
        out.append("): ").append(getError());
        return out.toString();
    }
}
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.reco.collab.model; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Logger; + +import org.yooreeka.algos.search.lucene.analyzer.TextDocumentTerms; +import org.yooreeka.config.YooreekaConfigurator; + +public class Content implements java.io.Serializable { + + /** + * SVUID + */ + private static final long serialVersionUID = 1098727290087922462L; + private static final Logger LOG = Logger.getLogger(Content.class.getName()); + + private String id; + private String text; + private String[] terms; + private int[] termFrequencies; + private Map tfMap; + + public Content(String id, String text) { + this(id, text, 10); + } + + public Content(String id, String text, int topNTerms) { + + LOG.setLevel(YooreekaConfigurator.getLevel(Content.class.getName())); + + this.id = id; + this.text = text; + + Map allTermFrequencyMap = (new TextDocumentTerms(text)) + .getTf(); + tfMap = getTopNTermFrequencies(allTermFrequencyMap, topNTerms); + + terms = new String[tfMap.size()]; + termFrequencies = new int[tfMap.size()]; + + int i = 0; + for (Map.Entry e : tfMap.entrySet()) { + terms[i] = e.getKey(); + termFrequencies[i] = e.getValue(); + i++; + } + } + + public String getId() { + return id; + } + + public int[] getTermFrequencies() { + return termFrequencies; + } + + public String[] getTerms() { + return terms; + } + + public double[] getTermVector(String[] terms) { + double[] termVector = new double[terms.length]; + for (int i = 0, n = terms.length; i < n; i++) { + if (tfMap.containsKey(terms[i])) { + termVector[i] = 1; + } else { + termVector[i] = 0; + } + } + return termVector; + } + + public String getText() { + return text; + } + + public Map getTFMap() { + return this.tfMap; + } + + // private Map buildTermFrequencyMap(String text) { + // + // CustomAnalyzer analyzer = new 
CustomAnalyzer(Version.LUCENE_40); + // TokenStream tokenStream = analyzer.tokenStream("content", new + // StringReader(text)); + // + // Map termFrequencyMap = new HashMap(); + // + // boolean hasTokens = true; + // try { + // while (hasTokens) { + // Token t = null;//tokenStream.next(); + // if (t == null) { + // hasTokens = false; + // } else { + // String term = new String(t.termBuffer(), 0, t.termLength()); + // Integer frequency = termFrequencyMap.get(term); + // if( frequency == null ) { + // termFrequencyMap.put(term, 1); + // } + // else { + // termFrequencyMap.put(term, frequency + 1); + // } + // } + // } + // } + // catch(IOException e) { + // throw new RuntimeException(e); + // } + // + // return termFrequencyMap; + // } + + private Map getTopNTermFrequencies( + Map termFrequencyMap, int topN) { + + List> terms = new ArrayList>( + termFrequencyMap.entrySet()); + + // Different terms can have the same frequency. + Collections.sort(terms, new Comparator>() { + public int compare(Map.Entry e1, + Map.Entry e2) { + int result = 0; + if (e1.getValue() < e2.getValue()) { + result = 1; // reverse order + } else if (e1.getValue() > e2.getValue()) { + result = -1; + } else { + result = 0; + } + return result; + } + }); + + Map topNTermsFrequencyMap = new HashMap(); + for (Map.Entry term : terms) { + topNTermsFrequencyMap.put(term.getKey(), term.getValue()); + if (topNTermsFrequencyMap.size() >= topN) { + break; + } + } + + return topNTermsFrequencyMap; + } + +} diff --git a/src/org/yooreeka/algos/reco/collab/model/Dataset.java b/src/org/yooreeka/algos/reco/collab/model/Dataset.java new file mode 100644 index 0000000..c1256df --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/model/Dataset.java @@ -0,0 +1,142 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * 
________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.model; + +import java.util.Collection; + +/** + * Defines service that provides access to all users, items, and ratings. + * Recommender and similarity implementations rely on this service to access + * data. + */ +public interface Dataset { + + /** + * For content-based dataset returns array of terms that represent document + * space. 
+ * + * @return + */ + public String[] getAllTerms(); + + /** + * Provides the average rating for this item + * + * @param itemId + * @return + */ + public double getAverageItemRating(int itemId); + + /** + * Provides the average rating for this user + * + * @param userId + * @return + */ + public double getAverageUserRating(int userId); + + /** + * Retrieves a specific item. + * + * @param itemId + * item id. + * @return item. + */ + public Item getItem(Integer itemId); + + /** + * Total number of all available items. + * + * @return number of items. + */ + public int getItemCount(); + + /** + * Retrieves all items. + * + * @return collection of all items. + */ + public Collection getItems(); + + /** + * Logical name for the dataset instance. + * + * @return name + */ + public String getName(); + + /** + * Provides access to all ratings. + * + * @return collection of ratings. + */ + public Collection getRatings(); + + /** + * Total number of all available item ratings. + * + * @return number of item ratings by users. + */ + public int getRatingsCount(); + + /** + * Retrieves a specific user. + * + * @param userId + * user id. + * @return user. + */ + public User getUser(Integer userId); + + /** + * Total number of all available users. + * + * @return number of users. + */ + public int getUserCount(); + + /** + * Retrieves all users. + * + * @return collection of users. + */ + public Collection getUsers(); + + /** + * Provides information about user and item ids returned by this dataset. + * + * @return true if ids aren't in sequence and can't be used as array + * indexes. false if user or items ids can be treated as sequences + * that start with 1. In this case index will be derived from id: + * index = id - 1. 
+ */ + public boolean isIdMappingRequired(); +} diff --git a/src/org/yooreeka/algos/reco/collab/model/Item.java b/src/org/yooreeka/algos/reco/collab/model/Item.java new file mode 100644 index 0000000..1c152ec --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/model/Item.java @@ -0,0 +1,171 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.reco.collab.model; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Generic representation of product or service that users can rate. + */ +public class Item implements java.io.Serializable { + + /** + * + */ + private static final long serialVersionUID = 6119040388138010186L; + + public static Integer[] getSharedUserIds(Item x, Item y) { + List sharedUsers = new ArrayList(); + for (Rating r : x.getAllRatings()) { + // same user rated the item + if (y.getUserRating(r.getUserId()) != null) { + sharedUsers.add(r.getUserId()); + } + } + return sharedUsers.toArray(new Integer[sharedUsers.size()]); + } + + /* + * Unique id in the dataset. + */ + private int id; + + /* + * Name. + */ + private String name; + + /* + * All ratings for this item. Supports only one rating per item for a user. + * Mapping: userId -> rating + */ + private Map ratingsByUserId; + + private Content itemContent; + + public Item(Integer id, List ratings) { + this(id, String.valueOf(id), ratings); + } + + public Item(Integer id, String name) { + this(id, name, new ArrayList(3)); + } + + public Item(Integer id, String name, List ratings) { + this.id = id; + this.name = name; + // load ratings into userId -> rating map. + ratingsByUserId = new HashMap(ratings.size()); + for (Rating r : ratings) { + ratingsByUserId.put(r.getUserId(), r); + } + } + + /** + * Updates existing user rating or adds a new user rating for this item. + * + * @param r + * rating to add. + */ + public void addUserRating(Rating r) { + ratingsByUserId.put(r.getUserId(), r); + } + + /** + * Returns all ratings that we have for this item. 
+ * + * @return + */ + public Collection getAllRatings() { + return ratingsByUserId.values(); + } + + public double getAverageRating() { + double allRatingsSum = 0.0; + Collection allItemRatings = ratingsByUserId.values(); + for (Rating rating : allItemRatings) { + allRatingsSum += rating.getRating(); + } + // use 2.5 if there are no ratings. + return allItemRatings.size() > 0 ? allRatingsSum + / allItemRatings.size() : 2.5; + } + + public int getId() { + return id; + } + + public Content getItemContent() { + return itemContent; + } + + public String getName() { + return name; + } + + /* + * Utility method to extract array of ratings based on array of user ids. + */ + public double[] getRatingsForItemList(Integer[] userIds) { + double[] ratings = new double[userIds.length]; + for (int i = 0, n = userIds.length; i < n; i++) { + Rating r = getUserRating(userIds[i]); + if (r == null) { + throw new IllegalArgumentException( + "Item doesn't have rating by specified user id (" + + "userId=" + userIds[i] + ", itemId=" + + getId()); + } + ratings[i] = r.getRating(); + } + return ratings; + } + + /** + * Returns rating that specified user gave to the item. + * + * @param userId + * user + * @return user rating or null if user hasn't rated this item. 
+ */ + public Rating getUserRating(Integer userId) { + return ratingsByUserId.get(userId); + } + + public void setItemContent(Content content) { + this.itemContent = content; + } + +} diff --git a/src/org/yooreeka/algos/reco/collab/model/Rating.java b/src/org/yooreeka/algos/reco/collab/model/Rating.java new file mode 100644 index 0000000..2a3f665 --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/model/Rating.java @@ -0,0 +1,127 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.model; + +/** + * Generic representation of a rating given by user to a product (item). + */ +public class Rating implements java.io.Serializable { + + /** + * SVUID + */ + private static final long serialVersionUID = 1438346522502387789L; + + protected Item item; + + private int userId; + private int itemId; + private int rating; + + public Rating(int userId, int bookId, int rating) { + this.userId = userId; + this.itemId = bookId; + this.rating = rating; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final Rating other = (Rating) obj; + if (itemId != other.itemId) + return false; + if (rating != other.rating) + return false; + if (userId != other.userId) + return false; + return true; + } + + /** + * @return the item + */ + public Item getItem() { + return item; + } + + public int getItemId() { + return itemId; + } + + public int getRating() { + return rating; + } + + public int getUserId() { + return userId; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + itemId; + result = prime * result + rating; + result = prime * result + userId; + return result; + } + + /** + * @param item + * the item to set + */ + public void setItem(Item item) { + this.item = item; + } + + public void setItemId(int bookId) { + this.itemId = bookId; + } + + public void setRating(int rating) { + this.rating = rating; + } + + public void setUserId(int userId) { + this.userId = userId; + } + + @Override + public String toString() { + return this.getClass().getSimpleName() + "[userId: " + userId + + ", itemId: " + itemId + ", rating: " + rating + "]"; + } +} diff --git a/src/org/yooreeka/algos/reco/collab/model/RecommendationType.java 
/**
 * Enumerates every supported similarity type.
 */
public enum RecommendationType {
    USER_BASED,
    ITEM_BASED,
    IMPROVED_USER_BASED,
    ITEM_PENALTY_BASED,
    USER_CONTENT_BASED,
    ITEM_CONTENT_BASED,
    USER_ITEM_CONTENT_BASED
}
See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.model; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +/** + * @author Babis Marmanis + * + */ +public class SimilarItem { + + public static SimilarItem[] getTopSimilarItems( + List similarItems, int topN) { + + // sort friends based on itemAgreement + SimilarItem.sort(similarItems); + + // select top N friends + List topItems = new ArrayList(); + for (SimilarItem f : similarItems) { + if (topItems.size() >= topN) { + // have enough friends. + break; + } + topItems.add(f); + } + + return topItems.toArray(new SimilarItem[topItems.size()]); + } + + public static void printItems(SimilarItem[] items, String header) { + System.out.println("\n" + header + "\n"); + for (SimilarItem f : items) { + System.out.printf("name: %-36s, similarity: %f\n", f.getItem() + .getName(), f.getSimilarity()); + } + } + + public static void sort(List similarItems) { + + Collections.sort(similarItems, new Comparator() { + + public int compare(SimilarItem f1, SimilarItem f2) { + + int result = 0; + if (f1.getSimilarity() < f2.getSimilarity()) { + result = 1; // reverse order + } else if (f1.getSimilarity() > f2.getSimilarity()) { + result = -1; + } else { + result = 0; + } + return result; + } + }); + } + + private Item item; + + /* + * Similarity + */ + private double similarity = -1; + + public SimilarItem(Item item, double sim) { + this.item = item; + similarity = sim; + } + + // ---------------------------------------------- + // GETTERS / SETTERS + // ---------------------------------------------- + + /** + * @return the item + */ + public Item getItem() { + return item; + } + + /** + * @return the similarity + */ + public double getSimilarity() { + return similarity; + } + + /** + * @param item + * the item to set + */ + public void setItem(Item item) { + this.item = 
item; + } + +} diff --git a/src/org/yooreeka/algos/reco/collab/model/SimilarUser.java b/src/org/yooreeka/algos/reco/collab/model/SimilarUser.java new file mode 100644 index 0000000..bda7f25 --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/model/SimilarUser.java @@ -0,0 +1,134 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.reco.collab.model; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +/** + * Utility class that acts as a holder for user and similarity value that was + * assigned to the user. + */ +public class SimilarUser { + + public static SimilarUser[] getTopNFriends(List similarUsers, + int topN) { + + // sort friends based on itemAgreement + SimilarUser.sort(similarUsers); + + // select top N friends + List topFriends = new ArrayList(); + for (SimilarUser f : similarUsers) { + if (topFriends.size() >= topN) { + // have enough friends. + break; + } + + // This is useful when we compose results from different + // recommenders + if (!topFriends.contains(f)) { + topFriends.add(f); + } + } + + return topFriends.toArray(new SimilarUser[topFriends.size()]); + } + + /** + * Prints a list of user names with their similarities. + * + * @param friends + * similar users + * @param header + * title that will be printed at the top of the list. + */ + public static void print(SimilarUser[] friends, String header) { + System.out.println("\n" + header + "\n"); + for (SimilarUser f : friends) { + System.out.printf("name: %-36s, similarity: %f\n", f.getName(), + f.getSimilarity()); + } + } + + public static void sort(List similarUsers) { + + Collections.sort(similarUsers, new Comparator() { + public int compare(SimilarUser f1, SimilarUser f2) { + int result = 0; + if (f1.getSimilarity() < f2.getSimilarity()) { + result = 1; // reverse order + } else if (f1.getSimilarity() > f2.getSimilarity()) { + result = -1; + } else { + result = 0; + } + return result; + } + }); + } + + /* + * The friend User . 
+ */ + private User friend; + + /* + * Similarity + */ + private double similarity = -1; + + public SimilarUser(User user, double similarity) { + friend = user; + this.similarity = similarity; + } + + public int getId() { + return friend.getId(); + } + + public String getName() { + return friend.getName(); + } + + /** + * @return the similarity + */ + public double getSimilarity() { + return similarity; + } + + public User getUser() { + return friend; + } +} diff --git a/src/org/yooreeka/algos/reco/collab/model/User.java b/src/org/yooreeka/algos/reco/collab/model/User.java new file mode 100644 index 0000000..3033672 --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/model/User.java @@ -0,0 +1,175 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.model; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Generic representation of user which rates items. + */ +public class User implements java.io.Serializable { + + /** + * Unique identifier for serialization + */ + private static final long serialVersionUID = -1884424246968533858L; + + /** + * Utility method to extract item ids that are shared between user A and + * user B. + */ + public static Integer[] getSharedItems(User x, User y) { + List sharedItems = new ArrayList(); + for (Rating r : x.getAllRatings()) { + if (y.getItemRating(r.getItemId()) != null) { + sharedItems.add(r.getItemId()); + } + } + return sharedItems.toArray(new Integer[sharedItems.size()]); + } + int id; + + String name; + + protected Map ratingsByItemId; + + private List userContent = new ArrayList(); + + public User(int id) { + this(id, String.valueOf(id), new ArrayList(3)); + } + + public User(int id, List ratings) { + this(id, String.valueOf(id), ratings); + } + + public User(int id, String name) { + this(id, name, new ArrayList(3)); + } + + public User(int id, String name, List ratings) { + this.id = id; + this.name = name; + ratingsByItemId = new HashMap(ratings.size()); + for (Rating r : ratings) { + ratingsByItemId.put(r.getItemId(), r); + } + } + + public void addRating(Rating rating) { + ratingsByItemId.put(rating.getItemId(), rating); + } + + public void addUserContent(Content content) { + userContent.add(content); + } + + 
public Collection getAllRatings() { + return ratingsByItemId.values(); + } + + public double getAverageRating() { + double allRatingsSum = 0.0; + Collection allUserRatings = getAllRatings(); + for (Rating rating : allUserRatings) { + allRatingsSum += rating.getRating(); + } + return allUserRatings.size() > 0 ? allRatingsSum + / allUserRatings.size() : 2.5; + } + + public int getId() { + return id; + } + + public Rating getItemRating(Integer itemId) { + return ratingsByItemId.get(itemId); + } + + public String getName() { + return name; + } + + /* + * Utility method to extract array of ratings based on array of item ids. + */ + public double[] getRatingsForItemList(Integer[] itemIds) { + double[] ratings = new double[itemIds.length]; + for (int i = 0, n = itemIds.length; i < n; i++) { + Rating r = getItemRating(itemIds[i]); + if (r == null) { + throw new IllegalArgumentException( + "User doesn't have specified item id (" + "userId=" + + getId() + ", itemId=" + itemIds[i]); + } + ratings[i] = r.getRating(); + } + return ratings; + } + + public List getUserContent() { + return userContent; + } + + public Content getUserContent(String contentId) { + Content matchedContent = null; + for (Content c : userContent) { + if (c.getId().equals(contentId)) { + matchedContent = c; + break; + } + } + return matchedContent; + } + + public void setRatings(List ratings) { + // Initialize or clean up + if (ratingsByItemId == null) { + ratingsByItemId = new HashMap(ratings.size()); + } else { + ratingsByItemId.clear(); + } + + // Load the ratings + for (Rating r : ratings) { + ratingsByItemId.put(r.getItemId(), r); + } + } + + public void setUserContent(List content) { + this.userContent = content; + } + +} diff --git a/src/org/yooreeka/algos/reco/collab/recommender/Delphi.java b/src/org/yooreeka/algos/reco/collab/recommender/Delphi.java new file mode 100644 index 0000000..86175c5 --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/recommender/Delphi.java @@ -0,0 +1,545 @@ +/* + * 
________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.reco.collab.recommender; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Logger; + +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.model.Item; +import org.yooreeka.algos.reco.collab.model.Rating; +import org.yooreeka.algos.reco.collab.model.RecommendationType; +import org.yooreeka.algos.reco.collab.model.SimilarItem; +import org.yooreeka.algos.reco.collab.model.SimilarUser; +import org.yooreeka.algos.reco.collab.model.User; +import org.yooreeka.algos.reco.collab.similarity.naive.SimilarityMatrix; +import org.yooreeka.algos.reco.collab.similarity.util.SimilarityMatrixRepository; +import org.yooreeka.config.YooreekaConfigurator; + +/** + * Recommender. Has to be initialized with similarity function and data. + * + * @author Babis Marmanis + * + */ +public class Delphi implements Recommender { + + private static final double DEFAULT_SIMILARITY_THRESHOLD = 0.50; + private static final double MAX_RATING = 5; + private static final Logger LOG = Logger.getLogger(Delphi.class.getName()); + + private RecommendationType type; + private SimilarityMatrix similarityMatrix; + private Dataset dataSet; + private boolean verbose = true; + private double similarityThreshold = DEFAULT_SIMILARITY_THRESHOLD; + private Map maxPredictedRating; + + public Delphi(Dataset dataSet, + RecommendationType type) { + + this(dataSet,type,false); + } + + public Delphi(Dataset dataSet, + RecommendationType type, + boolean useSimilarityCache) { + + this(dataSet,type,useSimilarityCache,null); + + SimilarityMatrixRepository smRepo = new SimilarityMatrixRepository(useSimilarityCache); + setSimilarityMatrix(smRepo.load(type, dataSet)); + } + + public Delphi(Dataset dataSet, + RecommendationType type, + boolean useSimilarityCache, + SimilarityMatrix similarityMatrix) { + + LOG.setLevel(YooreekaConfigurator.getLevel(Delphi.class.getName())); + 
+ this.type = type; + + this.dataSet = dataSet; + maxPredictedRating = new HashMap(dataSet.getUserCount() / 2); + + this.similarityMatrix = similarityMatrix; + } + + + // -------------------------------------------------------------------- + // USER BASED SIMILARITY + // -------------------------------------------------------------------- + + private double estimateItemBasedRating(User user, Item item) { + + double estimatedRating; + + if (item != null && user != null) { + + estimatedRating = item.getAverageRating(); + + } else { + if (item == null && user == null) { + throw new IllegalArgumentException( + "At least, one of the arguments must not be null!"); + } else { + return 3.0d; + } + } + + int itemId = item.getId(); + int userId = user.getId(); + double similaritySum = 0.0; + double weightedRatingSum = 0.0; + + // check if the user has already rated the item + Rating existingRatingByUser = user.getItemRating(item.getId()); + + if (existingRatingByUser != null) { + + estimatedRating = existingRatingByUser.getRating(); + + } else { + + double similarityBetweenItems = 0; + double weightedRating = 0; + + for (Item anotherItem : dataSet.getItems()) { + + // only consider items that were rated by the user + Rating anotherItemRating = anotherItem.getUserRating(userId); + + if (anotherItemRating != null) { + + similarityBetweenItems = similarityMatrix.getValue(itemId, + anotherItem.getId()); + + if (similarityBetweenItems > similarityThreshold) { + + weightedRating = similarityBetweenItems + * anotherItemRating.getRating(); + + weightedRatingSum += weightedRating; + similaritySum += similarityBetweenItems; + } + } + } + + if (similaritySum > 0.0) { + + estimatedRating = weightedRatingSum / similaritySum; + } + } + + return estimatedRating; + } + + // ----------------------------------------------------------- + // PRIVATE (AUXILIARY) METHODS + // ----------------------------------------------------------- + private double estimateUserBasedRating(User user, Item item) 
{ + + double estimatedRating = user.getAverageRating(); + + int itemId = item.getId(); + int userId = user.getId(); + + double similaritySum = 0.0; + double weightedRatingSum = 0.0; + + // check if user has already rated this item + Rating existingRatingByUser = user.getItemRating(item.getId()); + + if (existingRatingByUser != null) { + + estimatedRating = existingRatingByUser.getRating(); + + } else { + for (User anotherUser : dataSet.getUsers()) { + + Rating itemRating = anotherUser.getItemRating(itemId); + + // only consider users that rated this item + if (itemRating != null) { + + /** + * @todo describe how this generalizes to more accurate + * similarities + */ + double similarityBetweenUsers = similarityMatrix.getValue( + userId, anotherUser.getId()); + + double ratingByNeighbor = itemRating.getRating(); + + double weightedRating = similarityBetweenUsers + * ratingByNeighbor; + + weightedRatingSum += weightedRating; + similaritySum += similarityBetweenUsers; + } + } + + if (similaritySum > 0.0) { + estimatedRating = weightedRatingSum / similaritySum; + } + } + + return estimatedRating; + } + + private List findFriendsBasedOnUserSimilarity(User user) { + + List similarUsers = new ArrayList(); + + for (User friend : dataSet.getUsers()) { + + if (user.getId() != friend.getId()) { + + double similarity = similarityMatrix.getValue(user.getId(), + friend.getId()); + similarUsers.add(new SimilarUser(friend, similarity)); + } + } + + return similarUsers; + } + + // -------------------------------------------------------------------- + // ITEM BASED SIMILARITY + // -------------------------------------------------------------------- + + private List findItemsBasedOnItemSimilarity(Item item) { + + List similarItems = new ArrayList(); + + int itemId = item.getId(); + + for (Item sItem : dataSet.getItems()) { + + if (itemId != sItem.getId()) { + + double similarity = similarityMatrix.getValue(itemId, + sItem.getId()); + if (similarity > 0.0) { + similarItems.add(new 
SimilarItem(sItem, similarity)); + } + } + } + + return similarItems; + } + + public SimilarItem[] findSimilarItems(Item item) { + return findSimilarItems(item, 5); + } + + public SimilarItem[] findSimilarItems(Item item, int topN) { + + List similarItems = new ArrayList(); + + if (!isUserBased()) { + + similarItems = findItemsBasedOnItemSimilarity(item); + + } else { + + LOG.warning("Finding similar items based on User similarity is not supported!"); + } + + SimilarItem[] topSimilarItems = SimilarItem.getTopSimilarItems( + similarItems, topN); + + if (verbose) { + SimilarItem.printItems(topSimilarItems, + "Items like item " + item.getName() + ":"); + } + + return topSimilarItems; + } + + public SimilarUser[] findSimilarUsers(User user) { + SimilarUser[] topFriends = findSimilarUsers(user, 5); + + if (verbose) { + SimilarUser.print(topFriends, + "Top Friends for user " + user.getName() + ":"); + } + + return topFriends; + } + + public SimilarUser[] findSimilarUsers(User user, int topN) { + + List similarUsers = new ArrayList(); + + if (isUserBased()) { + + similarUsers = findFriendsBasedOnUserSimilarity(user); + + } else { + + /** + * TODO: 3.x: Create an algorithm that would allow you to find + * similar users based on item similarities. What kind of results do + * you get? Is it space efficient? How about execution time? + */ + LOG.warning("Finding friends based on Item similarity is not supported!"); + } + + return SimilarUser.getTopNFriends(similarUsers, topN); + } + + /** + * @return recommender's dataset. + */ + public Dataset getDataset() { + return this.dataSet; + } + + /** + * @return the maxPredictedRating of a particular user + */ + public double getMaxPredictedRating(Integer uID) { + Double maxPR = maxPredictedRating.get(uID); + + return (maxPR == null) ? 
5.0d : maxPR; + } + + // -------------------------------------------------------------------- + // RATING PREDICTIONS + // -------------------------------------------------------------------- + + public double getSimilarity(Item i1, Item i2) { + + double sim = similarityMatrix.getValue(i1.getId(), i2.getId()); + + if (verbose) { + System.out.print("Item similarity between"); + System.out.print(" ItemID: " + i1.getId()); + System.out.print(" and"); + System.out.print(" ItemID: " + i2.getId()); + System.out.println(" is equal to " + sim); + } + + return sim; + } + + public double getSimilarity(User u1, User u2) { + + double sim = similarityMatrix.getValue(u1.getId(), u2.getId()); + + if (verbose) { + System.out.print("User Similarity between"); + System.out.print(" UserID: " + u1.getId()); + System.out.print(" and"); + System.out.print(" UserID: " + u2.getId()); + System.out.println(" is equal to " + sim); + } + + return sim; + } + + // -------------------------------------------------------------------- + // AUXILIARY METHODS + // -------------------------------------------------------------------- + + public SimilarityMatrix getSimilarityMatrix() { + return similarityMatrix; + } + + public double getSimilarityThreshold() { + return similarityThreshold; + } + + public RecommendationType getType() { + return type; + } + + public double getUserItemSimilarity(User user, Item item) { + + if (!isUserItemBased()) { + throw new IllegalStateException( + "Not valid for current similarity type:" + type); + } + + double sim = similarityMatrix.getValue(user.getId(), item.getId()); + + if (verbose) { + System.out.print("User Item Similarity between"); + System.out.print(" UserID: " + user.getId()); + System.out.print(" and"); + System.out.print(" ItemID: " + item.getId()); + System.out.println(" is equal to " + sim); + } + + return sim; + } + + private boolean isContentBased() { + return type.toString().indexOf("CONTENT") >= 0; + } + + private boolean isUserBased() { + return 
type.toString().indexOf("USER") >= 0 + && type.toString().indexOf("USER_ITEM") < 0; + } + + private boolean isUserItemBased() { + return type.toString().indexOf("USER_ITEM") >= 0; + } + + public boolean isVerbose() { + return verbose; + } + + @Override + public double predictBasedOnItemAverage(Item item) { + return item.getAverageRating(); + } + + @Override + public double predictBasedOnUserAverage(User user) { + return user.getAverageRating(); + } + + public double predictRating(int userId, int itemId) { + return predictRating(dataSet.getUser(userId), dataSet.getItem(itemId)); + } + + public double predictRating(User user, Item item) { + switch (type) { + case USER_BASED: + return estimateUserBasedRating(user, item); + case IMPROVED_USER_BASED: + return estimateUserBasedRating(user, item); + case ITEM_BASED: + return estimateItemBasedRating(user, item); + case ITEM_PENALTY_BASED: + return estimateItemBasedRating(user, item); + case USER_CONTENT_BASED: + throw new IllegalStateException( + "Not valid for current similarity type:" + type); + case ITEM_CONTENT_BASED: + throw new IllegalStateException( + "Not valid for current similarity type:" + type); + case USER_ITEM_CONTENT_BASED: + // Using similarity between User and Item + return MAX_RATING + * similarityMatrix.getValue(user.getId(), item.getId()); + } + + throw new RuntimeException("Unknown recommendation type:" + type); + } + + public List recommend(Integer userId) { + return recommend(dataSet.getUser(userId)); + } + + // -------------------------------------------------------------------- + // RECOMMENDATIONS + // -------------------------------------------------------------------- + public List recommend(User user) { + List recommendedItems = recommend(user, 5); + return recommendedItems; + } + + public List recommend(User user, int topN) { + + List recommendations = new ArrayList(); + + double maxRating = -1.0d; + + for (Item item : dataSet.getItems()) { + + // only consider items that user hasn't rated yet 
or doesn't own the + // content + if (!skipItem(user, item)) { + double predictedRating = predictRating(user, item); + + if (maxRating < predictedRating) { + maxRating = predictedRating; + } + + if (!Double.isNaN(predictedRating)) { + recommendations.add(new PredictedItemRating(user.getId(), + item.getId(), predictedRating)); + } + } else { + if (verbose) { + System.out.println("Skipping item:" + item.getName()); + } + } + } + + this.maxPredictedRating.put(user.getId(), maxRating); + + List topNRecommendations = PredictedItemRating + .getTopNRecommendations(recommendations, topN); + + if (verbose) { + PredictedItemRating.printUserRecommendations(user, dataSet, + topNRecommendations); + } + + return topNRecommendations; + } + + public void setSimilarityMatrix(SimilarityMatrix similarityMatrix) { + this.similarityMatrix = similarityMatrix; + } + + public void setSimilarityThreshold(double similarityThreshold) { + this.similarityThreshold = similarityThreshold; + } + + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + + private boolean skipItem(User user, Item item) { + boolean skipItem = true; + if (isContentBased()) { + if (user.getUserContent(item.getItemContent().getId()) == null) { + skipItem = false; + } + } else { + if (user.getItemRating(item.getId()) == null) { + skipItem = false; + } + } + return skipItem; + } + +} diff --git a/src/org/yooreeka/algos/reco/collab/recommender/DiggDelphi.java b/src/org/yooreeka/algos/reco/collab/recommender/DiggDelphi.java new file mode 100644 index 0000000..5b3c7f2 --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/recommender/DiggDelphi.java @@ -0,0 +1,282 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started 
with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.reco.collab.recommender; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.yooreeka.algos.reco.collab.data.BaseDataset; +import org.yooreeka.algos.reco.collab.data.DiggData; +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.model.RecommendationType; +import org.yooreeka.algos.reco.collab.model.SimilarUser; +import org.yooreeka.algos.reco.collab.model.User; +import org.yooreeka.config.YooreekaConfigurator; + +public class DiggDelphi { + + public static void main(String[] args) { + BaseDataset ds = DiggData.loadData(YooreekaConfigurator.getHome() + "/data/ch03/digg_stories.csv"); + User user = ds.getUser(1); + DiggDelphi delphi = new DiggDelphi(ds); + delphi.recommend(user); + } + + private Dataset ds; + private Delphi delphiUC; + private Delphi delphiUIC; + private Delphi delphiUR; + + private Delphi delphiIR; + + private boolean verbose = true; + + public DiggDelphi(Dataset ds) { + this.ds = ds; + + delphiUC = new Delphi(ds, RecommendationType.USER_CONTENT_BASED); + + delphiUIC = new Delphi(ds, RecommendationType.USER_ITEM_CONTENT_BASED); + + delphiUR = new Delphi(ds, RecommendationType.USER_BASED); + + delphiIR = new Delphi(ds, RecommendationType.ITEM_BASED); + + if (verbose) { + System.out + .println("Initialized " + this.getClass().getSimpleName()); + } + } + + public SimilarUser[] findSimilarUsers(User user) { + SimilarUser[] topFriends = findSimilarUsers(user, 5); + + if (verbose) { + SimilarUser.print(topFriends, + "Top Friends for user " + user.getName() + ":"); + } + + return topFriends; + } + + public SimilarUser[] findSimilarUsers(User user, int topN) { + List similarUsers = new ArrayList(); + + SimilarUser[] simU = delphiUC.findSimilarUsers(user, topN); + similarUsers.addAll(Arrays.asList(simU)); + + simU = delphiUR.findSimilarUsers(user, topN); + 
similarUsers.addAll(Arrays.asList(simU)); + // SimilarUser.print(simU, "Top Friends for user " + user.getName() + + // ":"); + + return SimilarUser.getTopNFriends(similarUsers, topN); + } + + public List naiveRecommend(User user, int topN) { + List recommendations = new ArrayList(); + + recommendations.addAll(delphiUIC.recommend(user, topN)); + recommendations.addAll(delphiUR.recommend(user, topN)); + recommendations.addAll(delphiIR.recommend(user, topN)); + + return PredictedItemRating + .getTopNRecommendations(recommendations, topN); + } + + public List recommend(User user) { + List recommendedItems = recommend(user, 5); + if (verbose) { + PredictedItemRating.printUserRecommendations(user, ds, + recommendedItems); + } + return recommendedItems; + } + + public List recommend(User user, int topN) { + List recommendations = new ArrayList(); + + // Establish a relative scaling factor + double maxR = -1.0d; + + // Get the maximum predicted ratings from each recommender + double maxRatingDelphiUIC = delphiUIC.getMaxPredictedRating(user + .getId()); + double maxRatingDelphiUR = delphiUR.getMaxPredictedRating(user.getId()); + double maxRatingDelphiIR = delphiIR.getMaxPredictedRating(user.getId()); + + // Find the maximum predicted rating across all recommendations + double[] sortedMaxR = { maxRatingDelphiUIC, maxRatingDelphiUR, + maxRatingDelphiIR }; + + Arrays.sort(sortedMaxR); + + maxR = sortedMaxR[2]; // This is the maximum predicted rating + + // auxiliary variable + double scaledRating = 1.0d; + + // Recommender 1 -- User-to-Item content based + double scaling = maxR / maxRatingDelphiUIC; + + // Set an ad hoc threshold and scale it + double scaledThreshold = 0.5 * scaling; + + List uicList = new ArrayList( + topN); + uicList = delphiUIC.recommend(user, topN); + + for (PredictedItemRating pR : uicList) { + + scaledRating = pR.getRating(6) * scaling; + + if (scaledRating < scaledThreshold) { + uicList.remove(pR); + } else { + pR.setRating(scaledRating); + } + } + + // 
Recommender 2 -- User based collaborative filtering + scaling = maxR / maxRatingDelphiUR; + scaledThreshold = 0.5 * scaling; + + List urList = new ArrayList( + topN); + urList = delphiUR.recommend(user, topN); + + for (PredictedItemRating pR : urList) { + + scaledRating = pR.getRating(6) * scaling; + + if (scaledRating < scaledThreshold) { + urList.remove(pR); + } else { + pR.setRating(scaledRating); + } + } + + // Recommender 3 -- Item based collaborative filtering + scaling = maxR / maxRatingDelphiIR; + scaledThreshold = 0.5 * scaling; + + List irList = new ArrayList( + topN); + irList = delphiIR.recommend(user, topN); + + for (PredictedItemRating pR : irList) { + + scaledRating = pR.getRating(6) * scaling; + + if (scaledRating < scaledThreshold) { + irList.remove(pR); + } else { + pR.setRating(scaledRating); + } + } + + /* + * At this point, uicList, urList, and irList contain ratings that are + * scaled and exceed the threshold value. + */ + double uicRating = 0; + double urRating = 0; + double irRating = 0; + double vote = 0; + + // build a set of items produced by all recommenders + Set allRecommendedItems = new HashSet(); + for (PredictedItemRating pir : urList) { + allRecommendedItems.add(pir.getItemId()); + } + for (PredictedItemRating pir : irList) { + allRecommendedItems.add(pir.getItemId()); + } + for (PredictedItemRating pir : uicList) { + allRecommendedItems.add(pir.getItemId()); + } + + for (Integer itemId : allRecommendedItems) { + // Initialize + uicRating = 0; + urRating = 0; + irRating = 0; + vote = 0; + + for (PredictedItemRating uic : urList) { + if (itemId == uic.getItemId()) { + uicRating = uic.getRating(6); + } + } + + for (PredictedItemRating ur : urList) { + if (itemId == ur.getItemId()) { + urRating = ur.getRating(6); + } + } + + for (PredictedItemRating ir : irList) { + if (itemId == ir.getItemId()) { + irRating = ir.getRating(6); + } + } + + vote = (uicRating + urRating + irRating) / 3.0d; + + recommendations.add(new 
PredictedItemRating(user.getId(), itemId, + vote)); + } + + rescale(recommendations, maxR); + + return PredictedItemRating + .getTopNRecommendations(recommendations, topN); + } + + private void rescale(List recommendations, + double scaleRange) { + int n = recommendations.size(); + double[] ratings = new double[n]; + int i = 0; + for (PredictedItemRating pir : recommendations) { + ratings[i] = pir.getRating(6); + i++; + } + Arrays.sort(ratings); + for (PredictedItemRating pir : recommendations) { + pir.setRating(pir.getRating(6) * (scaleRange / ratings[n - 1])); + } + } + +} diff --git a/src/org/yooreeka/algos/reco/collab/recommender/MovieLensDelphi.java b/src/org/yooreeka/algos/reco/collab/recommender/MovieLensDelphi.java new file mode 100644 index 0000000..c34da2d --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/recommender/MovieLensDelphi.java @@ -0,0 +1,324 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.recommender; + +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.model.Item; +import org.yooreeka.algos.reco.collab.model.Rating; +import org.yooreeka.algos.reco.collab.model.SimilarItem; +import org.yooreeka.algos.reco.collab.model.SimilarUser; +import org.yooreeka.algos.reco.collab.model.User; +import org.yooreeka.algos.reco.collab.similarity.movielens.MovieLensItemSimilarity; + +/** + * Recommender. Has to be initialized with similarity function and dataset. 
+ */ +public class MovieLensDelphi implements Recommender { + + private static final double DEFAULT_SIMILARITY_THRESHOLD = 0.35; + + private Dataset dataSet; + private boolean verbose = true; + private double similarityThreshold = DEFAULT_SIMILARITY_THRESHOLD; + + private MovieLensItemSimilarity itemSimilarityMatrix; + + public MovieLensDelphi(Dataset ds) { + System.out.println("Entering MovieLensDelphi(Dataset) constructor ..."); + + this.dataSet = ds; + + // ------------------------------------------------------------------------ + System.out.println("Calculating item based similarities..."); + long start = System.currentTimeMillis(); + + itemSimilarityMatrix = new MovieLensItemSimilarity(ds); + + System.out.println("Item based similarities calculated in " + + (System.currentTimeMillis() - start) / 1000 + "(sec)."); + System.out.println("Similarities ready."); + // ------------------------------------------------------------------------ + + System.out.println("Leaving MovieLensDelpi(Dataset) constructor ..."); + } + + // -------------------------------------------------------------------- + // USER BASED SIMILARITY + // -------------------------------------------------------------------- + + // public SimilarUser[] findSimilarUsers(User user) { + // SimilarUser[] topFriends = findSimilarUsers(user, 5); + // + // if( verbose ) { + // SimilarUser.print(topFriends, "Top Friends for user " + user.getName() + + // ":"); + // } + // + // return topFriends; + // } + // + // public SimilarUser[] findSimilarUsers(User user, int topN) { + // + // List similarUsers = new ArrayList(); + // + // similarUsers = findFriendsBasedOnUserSimilarity(user); + // + // System.out.println("Finding friends based on Item similarity is not supported!"); + // + // return SimilarUser.getTopNFriends(similarUsers, topN); + // } + // + // + // private List findFriendsBasedOnUserSimilarity(User user) { + // + // List similarUsers = new ArrayList(); + // + // for(User friend : dataSet.getUsers()) 
{ + // + // if( user.getId() != friend.getId() ) { + // + // double similarity = + // userSimilarityMatrix.getValue(user.getId(), friend.getId()); + // similarUsers.add(new SimilarUser(friend, similarity)); + // } + // } + // + // return similarUsers; + // } + + // -------------------------------------------------------------------- + // ITEM BASED SIMILARITY + // -------------------------------------------------------------------- + + // ----------------------------------------------------------- + // PRIVATE (AUXILIARY) METHODS + // ----------------------------------------------------------- + private double estimateItemBasedRating(User user, Item item) { + + double itemRating = item.getAverageRating(); + + int itemId = item.getId(); + int userId = user.getId(); + + double itemAvgRating = item.getAverageRating(); + + double weightedDeltaSum = 0.0; + int sumN = 0; + + // check if the user has already rated the item + Rating existingRatingByUser = user.getItemRating(item.getId()); + + if (existingRatingByUser != null) { + + itemRating = existingRatingByUser.getRating(); + + } else { + + double similarityBetweenItems = 0; + double weightedDelta = 0; + double delta = 0; + + for (Item anotherItem : dataSet.getItems()) { + + // only consider items that were rated by the user + Rating anotherItemRating = anotherItem.getUserRating(userId); + + if (anotherItemRating != null) { + + delta = itemAvgRating - anotherItemRating.getRating(); + + similarityBetweenItems = itemSimilarityMatrix.getValue( + itemId, anotherItem.getId()); + + if (Math.abs(similarityBetweenItems) > similarityThreshold) { + + weightedDelta = similarityBetweenItems * delta; + + weightedDeltaSum += weightedDelta; + + sumN++; + } + } + } + + if (sumN > 0) { + itemRating = itemAvgRating - (weightedDeltaSum / sumN); + } + } + + return itemRating; + } + + private List findItemsBasedOnItemSimilarity(Item item) { + + List similarItems = new ArrayList(); + + int itemId = item.getId(); + + for (Item sItem : 
dataSet.getItems()) { + + if (itemId != sItem.getId()) { + + double similarity = itemSimilarityMatrix.getValue(itemId, + sItem.getId()); + if (similarity > 0.5) { + similarItems.add(new SimilarItem(sItem, similarity)); + } + } + } + + return similarItems; + } + + public SimilarItem[] findSimilarItems(Item item) { + SimilarItem[] topFriends = findSimilarItems(item, 5); + + if (verbose) { + SimilarItem.printItems(topFriends, + "Items like item " + item.getName() + ":"); + } + return topFriends; + } + + public SimilarItem[] findSimilarItems(Item item, int topN) { + + List similarItems = new ArrayList(); + + similarItems = findItemsBasedOnItemSimilarity(item); + + return SimilarItem.getTopSimilarItems(similarItems, topN); + } + + public SimilarUser[] findSimilarUsers(User user) { + throw new UnsupportedOperationException("Not supported."); + } + + public SimilarUser[] findSimilarUsers(User user, int topN) { + throw new UnsupportedOperationException("Not supported."); + } + + public Dataset getDataset() { + return dataSet; + } + + // -------------------------------------------------------------------- + // AUXILIARY METHODS + // -------------------------------------------------------------------- + public double getSimilarityThreshold() { + return similarityThreshold; + } + + public List getTopNRecommendations( + List recommendations, int topN) { + + PredictedItemRating.sort(recommendations); + + double maxR = recommendations.get(0).getRating(); + double scaledR; + + List topRecommendations = new ArrayList(); + for (PredictedItemRating r : recommendations) { + if (topRecommendations.size() >= topN) { + // have enough recommendations. 
+ break; + } + if (maxR > 5) { + scaledR = r.getRating() * (5 / maxR); + r.setRating(scaledR); + } + + topRecommendations.add(r); + } + + return topRecommendations; + } + + public boolean isVerbose() { + return verbose; + } + + public double predictBasedOnItemAverage(Item item) { + return item.getAverageRating(); + } + + public double predictBasedOnUserAverage(User user) { + return user.getAverageRating(); + } + + public double predictRating(User user, Item item) { + return estimateItemBasedRating(user, item); + } + + // -------------------------------------------------------------------- + // RECOMMENDATIONS + // -------------------------------------------------------------------- + public List recommend(User user) { + List recommendedItems = recommend(user, 5); + if (verbose) { + PredictedItemRating.printUserRecommendations(user, dataSet, + recommendedItems); + } + return recommendedItems; + } + + public List recommend(User user, int topN) { + + List recommendations = new ArrayList(); + + for (Item item : dataSet.getItems()) { + + // only consider items that user hasn't rated yet + if (user.getItemRating(item.getId()) == null) { + + double predictedRating = estimateItemBasedRating(user, item); + + if (!Double.isNaN(predictedRating)) { + recommendations.add(new PredictedItemRating(user.getId(), + item.getId(), predictedRating)); + } + } + } + + return getTopNRecommendations(recommendations, topN); + } + + public void setSimilarityThreshold(double similarityThreshold) { + this.similarityThreshold = similarityThreshold; + } + + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + +} diff --git a/src/org/yooreeka/algos/reco/collab/recommender/PredictedItemRating.java b/src/org/yooreeka/algos/reco/collab/recommender/PredictedItemRating.java new file mode 100644 index 0000000..463e310 --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/recommender/PredictedItemRating.java @@ -0,0 +1,152 @@ +/* + * 
________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.recommender;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.User;
+
+/**
+ * Represents predicted user rating of an item. Used to return recommendations
+ * for the user.
+ */
+public class PredictedItemRating {
+
+    /**
+     * Orders ratings from highest to lowest. NaN ratings are placed after
+     * every real rating, which keeps the ordering a consistent total order
+     * (a comparator that answers "equal" for every NaN comparison is not
+     * transitive and may make the sort fail or misorder).
+     */
+    private static final Comparator<PredictedItemRating> HIGHEST_RATING_FIRST =
+            new Comparator<PredictedItemRating>() {
+
+                public int compare(PredictedItemRating f1, PredictedItemRating f2) {
+                    double r1 = f1.getRating();
+                    double r2 = f2.getRating();
+                    boolean nan1 = Double.isNaN(r1);
+                    boolean nan2 = Double.isNaN(r2);
+                    if (nan1 || nan2) {
+                        // NaN goes last; two NaNs tie.
+                        return nan1 == nan2 ? 0 : (nan1 ? 1 : -1);
+                    }
+                    // arguments swapped on purpose: descending order
+                    return Double.compare(r2, r1);
+                }
+            };
+
+    /**
+     * Sorts the given list in place in descending order of rating and returns
+     * a new list holding its first topN elements.
+     *
+     * @param recommendations
+     *            list to rank; it is reordered by this call.
+     * @param topN
+     *            maximum number of elements to return; zero or a negative
+     *            value yields an empty list.
+     * @return new list with at most topN highest rated elements.
+     */
+    public static List<PredictedItemRating> getTopNRecommendations(
+            List<PredictedItemRating> recommendations, int topN) {
+
+        PredictedItemRating.sort(recommendations);
+
+        int count = Math.max(0, Math.min(topN, recommendations.size()));
+        return new ArrayList<PredictedItemRating>(
+                recommendations.subList(0, count));
+    }
+
+    /**
+     * Prints the recommendations for a user to standard output, one item per
+     * line, with the predicted rating rounded to four decimal places.
+     *
+     * @param user
+     *            user whose name is printed in the heading.
+     * @param ds
+     *            dataset used to look up the name of every recommended item.
+     * @param recommendedItems
+     *            recommendations to print, in the order given.
+     */
+    public static void printUserRecommendations(User user, Dataset ds,
+            List<PredictedItemRating> recommendedItems) {
+        System.out.println("\nRecommendations for user " + user.getName()
+                + ":\n");
+        for (PredictedItemRating r : recommendedItems) {
+            System.out.printf("Item: %-36s, predicted rating: %f\n", ds
+                    .getItem(r.getItemId()).getName(), r.getRating(4));
+        }
+    }
+
+    /**
+     * Sorts list by rating value in descending order. Items with higher ratings
+     * will be in the head of the list; NaN ratings end up at the tail.
+     *
+     * @param values
+     *            list to sort.
+     */
+    public static void sort(List<PredictedItemRating> values) {
+        Collections.sort(values, HIGHEST_RATING_FIRST);
+    }
+
+    private final int userId;
+
+    private final int itemId;
+
+    private double rating;
+
+    public PredictedItemRating(int userId, int itemId, double rating) {
+        this.userId = userId;
+        this.itemId = itemId;
+        this.rating = rating;
+    }
+
+    public int getItemId() {
+        return itemId;
+    }
+
+    public double getRating() {
+        return rating;
+    }
+
+    /**
+     * Returns rounded rating value with number of digits after decimal point
+     * specified by scale parameter.
+     *
+     * @param scale
+     *            number of digits to keep after decimal point.
+     * @return rounded value; a NaN or infinite rating is returned unchanged.
+     */
+    public double getRating(int scale) {
+        // new BigDecimal(double) rejects NaN and infinities with a
+        // NumberFormatException, so there is nothing to round for them.
+        if (Double.isNaN(rating) || Double.isInfinite(rating)) {
+            return rating;
+        }
+        // The double constructor is kept deliberately so that finite values
+        // round exactly as they did before.
+        BigDecimal bd = new BigDecimal(rating);
+        return bd.setScale(scale, RoundingMode.HALF_UP).doubleValue();
+    }
+
+    public int getUserId() {
+        return userId;
+    }
+
+    public void setRating(double val) {
+        this.rating = val;
+    }
+
+    @Override
+    public String toString() {
+        return this.getClass().getSimpleName() + "[userId: " + userId
+                + ", itemId: " + itemId + ", rating: " + rating + "]";
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/recommender/Recommender.java b/src/org/yooreeka/algos/reco/collab/recommender/Recommender.java
new file mode 100644
index 0000000..7b96e42
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/recommender/Recommender.java
@@ -0,0 +1,88 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.recommender;
+
+import java.util.List;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.SimilarItem;
+import org.yooreeka.algos.reco.collab.model.SimilarUser;
+import org.yooreeka.algos.reco.collab.model.User;
+
+/**
+ * Contract of a recommender: similarity look-ups, rating predictions and
+ * top-N recommendations computed over a {@link Dataset}.
+ *
+ * @author Babis Marmanis
+ *
+ */
+public interface Recommender {
+
+    /**
+     * Returns the items most similar to the given item, using the
+     * implementation's default number of results.
+     *
+     * @param item
+     *            reference item.
+     * @return similar items.
+     */
+    public SimilarItem[] findSimilarItems(Item item);
+
+    /**
+     * Returns up to topN items most similar to the given item.
+     *
+     * @param item
+     *            reference item.
+     * @param topN
+     *            number of similar items to return.
+     * @return similar items.
+     */
+    public SimilarItem[] findSimilarItems(Item item, int topN);
+
+    // Similarities
+
+    /**
+     * Returns the users most similar to the given user, using the
+     * implementation's default number of results. An implementation that does
+     * not model user similarity may reject the call (MovieLensDelphi throws
+     * UnsupportedOperationException).
+     *
+     * @param user
+     *            reference user.
+     * @return similar users.
+     */
+    public SimilarUser[] findSimilarUsers(User user);
+
+    /**
+     * Returns up to topN users most similar to the given user. An
+     * implementation that does not model user similarity may reject the call
+     * (MovieLensDelphi throws UnsupportedOperationException).
+     *
+     * @param user
+     *            reference user.
+     * @param topN
+     *            number of similar users to return.
+     * @return similar users.
+     */
+    public SimilarUser[] findSimilarUsers(User user, int topN);
+
+    // Auxiliary
+
+    /**
+     * @return dataset this recommender works on.
+     */
+    public Dataset getDataset();
+
+    /**
+     * @return similarity threshold currently in effect.
+     */
+    public double getSimilarityThreshold();
+
+    /**
+     * Baseline prediction derived from the item alone.
+     *
+     * @param item
+     *            item to predict a rating for.
+     * @return predicted rating.
+     */
+    public double predictBasedOnItemAverage(Item item);
+
+    /**
+     * Baseline prediction derived from the user alone.
+     *
+     * @param user
+     *            user to predict a rating for.
+     * @return predicted rating.
+     */
+    public double predictBasedOnUserAverage(User user);
+
+    // Predictions
+
+    /**
+     * Predicts the rating the user would give to the item.
+     *
+     * @param user
+     *            user the prediction is made for.
+     * @param item
+     *            item to be rated.
+     * @return predicted rating.
+     */
+    public double predictRating(User user, Item item);
+
+    /**
+     * Returns top 5 recommendations for the user.
+     *
+     * @param user
+     *            user to recommend items to.
+     * @return recommended items with predicted ratings.
+     */
+    public List<PredictedItemRating> recommend(User user);
+
+    /**
+     * Returns top N recommendations for the user.
+     *
+     * @param user
+     *            user to recommend items to.
+     * @param topN
+     *            number of top recommendations to return.
+     * @return recommended items with predicted ratings.
+     */
+    public List<PredictedItemRating> recommend(User user, int topN);
+
+    /**
+     * @param similarityThreshold
+     *            new similarity threshold.
+     */
+    public void setSimilarityThreshold(double similarityThreshold);
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/movielens/MovieLensItemSimilarity.java b/src/org/yooreeka/algos/reco/collab/similarity/movielens/MovieLensItemSimilarity.java
new file mode 100644
index 0000000..31e0e68
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/movielens/MovieLensItemSimilarity.java
@@ -0,0 +1,92 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. 
See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.movielens;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.similarity.naive.SimilarityMatrixImpl;
+import org.yooreeka.algos.reco.collab.similarity.util.PearsonCorrelation;
+
+/**
+ * Item-to-item similarity matrix whose cells hold the value produced by
+ * PearsonCorrelation for each pair of items of the dataset.
+ */
+public class MovieLensItemSimilarity extends SimilarityMatrixImpl {
+
+    /** Serialization version identifier. */
+    private static final long serialVersionUID = 2571216412528879244L;
+
+    /**
+     * Builds the matrix, using the simple class name as its id.
+     *
+     * @param ds
+     *            dataset providing the items.
+     */
+    public MovieLensItemSimilarity(Dataset ds) {
+        this(MovieLensItemSimilarity.class.getSimpleName(), ds);
+    }
+
+    /**
+     * Builds the matrix under the given id. The values are calculated
+     * immediately.
+     *
+     * @param id
+     *            identifier of this matrix.
+     * @param ds
+     *            dataset providing the items.
+     */
+    public MovieLensItemSimilarity(String id, Dataset ds) {
+        this.id = id;
+        this.useObjIdToIndexMapping = ds.isIdMappingRequired();
+        calculate(ds);
+    }
+
+    /**
+     * Fills the diagonal with 1.0 and every cell above it with the
+     * correlation of the corresponding pair of items. Cells below the
+     * diagonal are left untouched.
+     */
+    @Override
+    protected void calculate(Dataset dataSet) {
+
+        final int size = dataSet.getItemCount();
+        similarityValues = new double[size][size];
+
+        if (useObjIdToIndexMapping) {
+            // Ask for the index of every item id once, which generates an
+            // index for each of them.
+            for (Item each : dataSet.getItems()) {
+                idMapping.getIndex(String.valueOf(each.getId()));
+            }
+        }
+
+        for (int row = 0; row < size; row++) {
+
+            Item rowItem = dataSet.getItem(getObjIdFromIndex(row));
+
+            // Only the part above the main diagonal has to be computed.
+            for (int col = row + 1; col < size; col++) {
+                Item colItem = dataSet.getItem(getObjIdFromIndex(col));
+                similarityValues[row][col] = new PearsonCorrelation(dataSet,
+                        rowItem, colItem).calculate();
+            }
+
+            // An item compared with itself gets 1.
+            similarityValues[row][row] = 1.0;
+        }
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/movielens/MovieLensUserSimilarity.java b/src/org/yooreeka/algos/reco/collab/similarity/movielens/MovieLensUserSimilarity.java
new file mode 100644
index 0000000..585da30
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/movielens/MovieLensUserSimilarity.java
@@ -0,0 +1,107 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.movielens;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.User;
+import org.yooreeka.algos.reco.collab.similarity.naive.SimilarityMatrixImpl;
+import org.yooreeka.algos.reco.collab.similarity.util.PearsonCorrelation;
+
+/**
+ * User-to-user similarity matrix: for every pair of users the ratings of the
+ * items they share are centered on each user's average rating and passed to
+ * PearsonCorrelation.
+ */
+public class MovieLensUserSimilarity extends SimilarityMatrixImpl {
+
+    /** Serialization version identifier. */
+    private static final long serialVersionUID = 8510536889333771002L;
+
+    /**
+     * Builds the matrix, using the simple class name as its id.
+     *
+     * @param ds
+     *            dataset providing the users.
+     */
+    public MovieLensUserSimilarity(Dataset ds) {
+        this(MovieLensUserSimilarity.class.getSimpleName(), ds);
+    }
+
+    /**
+     * Builds the matrix under the given id. The values are calculated
+     * immediately.
+     *
+     * @param id
+     *            identifier of this matrix.
+     * @param ds
+     *            dataset providing the users.
+     */
+    public MovieLensUserSimilarity(String id, Dataset ds) {
+        this.id = id;
+        this.useObjIdToIndexMapping = ds.isIdMappingRequired();
+        calculate(ds);
+    }
+
+    /**
+     * Fills the diagonal and the cells above it. Unlike the item matrices,
+     * the diagonal is computed like any other cell rather than set to 1.0.
+     * A pair of users without shared items gets 0.0.
+     */
+    @Override
+    protected void calculate(Dataset dataSet) {
+
+        final int size = dataSet.getUserCount();
+        similarityValues = new double[size][size];
+
+        if (useObjIdToIndexMapping) {
+            // Ask for the index of every user id once, which generates an
+            // index for each of them.
+            for (User each : dataSet.getUsers()) {
+                idMapping.getIndex(String.valueOf(each.getId()));
+            }
+        }
+
+        for (int row = 0; row < size; row++) {
+
+            User rowUser = dataSet.getUser(getObjIdFromIndex(row));
+
+            // The inner loop starts on the diagonal itself.
+            for (int col = row; col < size; col++) {
+
+                User colUser = dataSet.getUser(getObjIdFromIndex(col));
+
+                Integer[] commonItemIds = User.getSharedItems(rowUser, colUser);
+
+                if (commonItemIds.length == 0) {
+                    // nothing in common
+                    similarityValues[row][col] = 0.0;
+                    continue;
+                }
+
+                double[] rowRatings = rowUser
+                        .getRatingsForItemList(commonItemIds);
+                double[] colRatings = colUser
+                        .getRatingsForItemList(commonItemIds);
+
+                // Center both rating vectors on the users' averages.
+                double rowMean = rowUser.getAverageRating();
+                double colMean = colUser.getAverageRating();
+                for (int k = 0; k < commonItemIds.length; k++) {
+                    rowRatings[k] -= rowMean;
+                    colRatings[k] -= colMean;
+                }
+
+                similarityValues[row][col] = new PearsonCorrelation(
+                        rowRatings, colRatings).calculate();
+            }
+        }
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/naive/ImprovedItemBasedSimilarity.java b/src/org/yooreeka/algos/reco/collab/similarity/naive/ImprovedItemBasedSimilarity.java
new file mode 100644
index 0000000..d74ced3
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/naive/ImprovedItemBasedSimilarity.java
@@ -0,0 +1,120 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.naive;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.similarity.util.RatingCountMatrix;
+
+/**
+ * Item-to-item similarity that weights every disagreement between two items'
+ * ratings by its band number in the RatingCountMatrix, instead of counting
+ * exact agreements only.
+ */
+public class ImprovedItemBasedSimilarity extends SimilarityMatrixImpl {
+
+    /** Serialization version identifier. */
+    private static final long serialVersionUID = -8364129617679022295L;
+
+    /**
+     * Builds the matrix under the given id. The values are calculated
+     * immediately.
+     *
+     * @param id
+     *            identifier of this matrix.
+     * @param dataSet
+     *            dataset providing the items.
+     * @param keepRatingCountMatrix
+     *            when true, the RatingCountMatrix of every item pair is
+     *            retained next to the similarity value.
+     */
+    public ImprovedItemBasedSimilarity(String id, Dataset dataSet,
+            boolean keepRatingCountMatrix) {
+        this.id = id;
+        this.keepRatingCountMatrix = keepRatingCountMatrix;
+        this.useObjIdToIndexMapping = dataSet.isIdMappingRequired();
+        calculate(dataSet);
+    }
+
+    /**
+     * Fills the diagonal with 1.0 and every cell above it with the weighted
+     * similarity of the corresponding pair of items; a pair without a single
+     * agreement gets 0.0.
+     */
+    @Override
+    protected void calculate(Dataset dataSet) {
+
+        final int size = dataSet.getItemCount();
+        final int ratingLevels = 5;
+
+        similarityValues = new double[size][size];
+        if (keepRatingCountMatrix) {
+            ratingCountMatrix = new RatingCountMatrix[size][size];
+        }
+
+        if (useObjIdToIndexMapping) {
+            // Ask for the index of every item id once, which generates an
+            // index for each of them.
+            for (Item each : dataSet.getItems()) {
+                idMapping.getIndex(String.valueOf(each.getId()));
+            }
+        }
+
+        for (int row = 0; row < size; row++) {
+
+            Item rowItem = dataSet.getItem(getObjIdFromIndex(row));
+
+            // Only the part above the main diagonal has to be computed.
+            for (int col = row + 1; col < size; col++) {
+
+                Item colItem = dataSet.getItem(getObjIdFromIndex(col));
+                RatingCountMatrix counts = new RatingCountMatrix(rowItem,
+                        colItem, ratingLevels);
+
+                int total = counts.getTotalCount();
+                int agreements = counts.getAgreementCount();
+
+                double cell = 0.0;
+                if (agreements > 0) {
+                    int lastBand = counts.getMatrix().length - 1;
+
+                    // Each band contributes its count multiplied by its
+                    // band number.
+                    double penalty = 0.0;
+                    for (int band = 1; band <= lastBand; band++) {
+                        double weight = band;
+                        penalty += weight * counts.getBandCount(band);
+                    }
+
+                    double raw = 1.0 - (penalty / total);
+
+                    // normalizing to [0..1]
+                    cell = (raw - 1.0 + lastBand) / lastBand;
+                }
+                similarityValues[row][col] = cell;
+
+                // For large datasets
+                if (keepRatingCountMatrix) {
+                    ratingCountMatrix[row][col] = counts;
+                }
+            }
+
+            // An item compared with itself gets 1; no RatingCountMatrix
+            // is created for that case.
+            similarityValues[row][row] = 1.0;
+        }
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/naive/ImprovedUserBasedSimilarity.java b/src/org/yooreeka/algos/reco/collab/similarity/naive/ImprovedUserBasedSimilarity.java
new file mode 100644
index 0000000..1bfad22
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/naive/ImprovedUserBasedSimilarity.java
@@ -0,0 +1,129 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.naive;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.User;
+import org.yooreeka.algos.reco.collab.similarity.util.RatingCountMatrix;
+
+/**
+ * User-to-user similarity that weights every disagreement between two users'
+ * ratings by its band number in the RatingCountMatrix, instead of counting
+ * exact agreements only.
+ */
+public class ImprovedUserBasedSimilarity extends SimilarityMatrixImpl {
+
+    /** Serialization version identifier. */
+    private static final long serialVersionUID = -4225607333671670946L;
+
+    /**
+     * Builds the matrix, using the simple class name as its id and keeping
+     * the RatingCountMatrix of every user pair.
+     *
+     * @param dataSet
+     *            dataset providing the users.
+     */
+    public ImprovedUserBasedSimilarity(Dataset dataSet) {
+
+        this(ImprovedUserBasedSimilarity.class.getSimpleName(), dataSet, true);
+    }
+
+    /**
+     * Builds the matrix under the given id. The values are calculated
+     * immediately.
+     *
+     * @param id
+     *            identifier of this matrix.
+     * @param dataSet
+     *            dataset providing the users.
+     * @param keepRatingCountMatrix
+     *            when true, the RatingCountMatrix of every user pair is
+     *            retained next to the similarity value.
+     */
+    public ImprovedUserBasedSimilarity(String id, Dataset dataSet,
+            boolean keepRatingCountMatrix) {
+        this.id = id;
+        this.keepRatingCountMatrix = keepRatingCountMatrix;
+        this.useObjIdToIndexMapping = dataSet.isIdMappingRequired();
+        calculate(dataSet);
+    }
+
+    /**
+     * Fills the diagonal with 1.0 and every cell above it with the weighted
+     * similarity of the corresponding pair of users; a pair without a single
+     * agreement gets 0.0.
+     *
+     * When the dataset asks for id mapping, every user id is registered with
+     * idMapping first. Otherwise the original author's assumption applies:
+     * ids are integers that start with 1 and have no gaps in the sequence.
+     */
+    @Override
+    protected void calculate(Dataset dataSet) {
+
+        final int size = dataSet.getUserCount();
+        final int ratingLevels = 5;
+
+        similarityValues = new double[size][size];
+        if (keepRatingCountMatrix) {
+            ratingCountMatrix = new RatingCountMatrix[size][size];
+        }
+
+        if (useObjIdToIndexMapping) {
+            // Ask for the index of every user id once, which generates an
+            // index for each of them.
+            for (User each : dataSet.getUsers()) {
+                idMapping.getIndex(String.valueOf(each.getId()));
+            }
+        }
+
+        for (int row = 0; row < size; row++) {
+
+            User rowUser = dataSet.getUser(getObjIdFromIndex(row));
+
+            // Only the upper triangular part has to be considered.
+            for (int col = row + 1; col < size; col++) {
+
+                User colUser = dataSet.getUser(getObjIdFromIndex(col));
+                RatingCountMatrix counts = new RatingCountMatrix(rowUser,
+                        colUser, ratingLevels);
+
+                int total = counts.getTotalCount();
+                int agreements = counts.getAgreementCount();
+
+                double cell = 0.0;
+                if (agreements > 0) {
+                    int lastBand = counts.getMatrix().length - 1;
+
+                    // Each band contributes its count multiplied by its
+                    // band number.
+                    double penalty = 0.0;
+                    for (int band = 1; band <= lastBand; band++) {
+                        double weight = band;
+                        penalty += weight * counts.getBandCount(band);
+                    }
+
+                    double raw = 1.0 - (penalty / total);
+
+                    // normalizing to [0..1]
+                    cell = (raw - 1.0 + lastBand) / lastBand;
+                }
+                similarityValues[row][col] = cell;
+
+                // For large datasets
+                if (keepRatingCountMatrix) {
+                    ratingCountMatrix[row][col] = counts;
+                }
+            }
+
+            // A user compared with itself gets 1; no RatingCountMatrix
+            // is created for that case.
+            similarityValues[row][row] = 1.0;
+        }
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/naive/ItemBasedSimilarity.java b/src/org/yooreeka/algos/reco/collab/similarity/naive/ItemBasedSimilarity.java
new file mode 100644
index 0000000..9017099
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/naive/ItemBasedSimilarity.java
@@ -0,0 +1,110 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.naive;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.similarity.util.RatingCountMatrix;
+
+/**
+ * Item-to-item similarity defined as the agreement count of the pair's
+ * RatingCountMatrix divided by its total count.
+ */
+public class ItemBasedSimilarity extends SimilarityMatrixImpl {
+
+    /** Serialization version identifier. */
+    private static final long serialVersionUID = 3062035062791168163L;
+
+    /**
+     * Builds the matrix under the given id. The values are calculated
+     * immediately.
+     *
+     * @param id
+     *            identifier of this matrix.
+     * @param dataSet
+     *            dataset providing the items.
+     * @param keepRatingCountMatrix
+     *            when true, the RatingCountMatrix of every item pair is
+     *            retained next to the similarity value.
+     */
+    public ItemBasedSimilarity(String id, Dataset dataSet,
+            boolean keepRatingCountMatrix) {
+        this.id = id;
+        this.keepRatingCountMatrix = keepRatingCountMatrix;
+        this.useObjIdToIndexMapping = dataSet.isIdMappingRequired();
+        calculate(dataSet);
+    }
+
+    /**
+     * Fills the diagonal with 1.0 and every cell above it with the share of
+     * agreements of the corresponding pair of items; a pair without a single
+     * agreement gets 0.0.
+     */
+    @Override
+    protected void calculate(Dataset dataSet) {
+
+        final int size = dataSet.getItemCount();
+        final int ratingLevels = 5;
+
+        similarityValues = new double[size][size];
+        if (keepRatingCountMatrix) {
+            ratingCountMatrix = new RatingCountMatrix[size][size];
+        }
+
+        if (useObjIdToIndexMapping) {
+            // Ask for the index of every item id once, which generates an
+            // index for each of them.
+            for (Item each : dataSet.getItems()) {
+                idMapping.getIndex(String.valueOf(each.getId()));
+            }
+        }
+
+        for (int row = 0; row < size; row++) {
+
+            Item rowItem = dataSet.getItem(getObjIdFromIndex(row));
+
+            // Only the part above the main diagonal has to be computed.
+            for (int col = row + 1; col < size; col++) {
+
+                Item colItem = dataSet.getItem(getObjIdFromIndex(col));
+                RatingCountMatrix counts = new RatingCountMatrix(rowItem,
+                        colItem, ratingLevels);
+
+                int total = counts.getTotalCount();
+                int agreements = counts.getAgreementCount();
+
+                // The guard also keeps 0/0 out of the matrix.
+                similarityValues[row][col] = (agreements > 0)
+                        ? (double) agreements / (double) total
+                        : 0.0;
+
+                if (keepRatingCountMatrix) {
+                    ratingCountMatrix[row][col] = counts;
+                }
+            }
+
+            // An item compared with itself gets 1.
+            similarityValues[row][row] = 1.0;
+        }
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/naive/ItemContentBasedSimilarity.java b/src/org/yooreeka/algos/reco/collab/similarity/naive/ItemContentBasedSimilarity.java
new file mode 100644
index 0000000..7570b52
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/naive/ItemContentBasedSimilarity.java
@@ -0,0 +1,92 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.naive;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.util.metrics.CosineSimilarityMeasure;
+
+/**
+ * Item-to-item similarity derived from the content attached to the items:
+ * the term vectors of two items, taken over all terms of the dataset, are
+ * compared with CosineSimilarityMeasure.
+ */
+public class ItemContentBasedSimilarity extends SimilarityMatrixImpl {
+
+    /** Serialization version identifier. */
+    private static final long serialVersionUID = -2807190886025734879L;
+
+    /**
+     * Builds the matrix under the given id. The values are calculated
+     * immediately.
+     *
+     * @param id
+     *            identifier of this matrix.
+     * @param ds
+     *            dataset providing the items and the list of all terms.
+     */
+    public ItemContentBasedSimilarity(String id, Dataset ds) {
+        this.id = id;
+        this.useObjIdToIndexMapping = ds.isIdMappingRequired();
+        calculate(ds);
+    }
+
+    /**
+     * Fills the diagonal with 1.0 and every cell above it with the value the
+     * cosine measure returns for the term vectors of the corresponding pair
+     * of items.
+     */
+    @Override
+    protected void calculate(Dataset dataSet) {
+
+        final int size = dataSet.getItemCount();
+        similarityValues = new double[size][size];
+
+        if (useObjIdToIndexMapping) {
+            // Ask for the index of every item id once, which generates an
+            // index for each of them.
+            for (Item each : dataSet.getItems()) {
+                idMapping.getIndex(String.valueOf(each.getId()));
+            }
+        }
+
+        CosineSimilarityMeasure cosine = new CosineSimilarityMeasure();
+        String[] vocabulary = dataSet.getAllTerms();
+
+        for (int row = 0; row < size; row++) {
+
+            Item rowItem = dataSet.getItem(getObjIdFromIndex(row));
+
+            // Only the part above the main diagonal has to be computed.
+            for (int col = row + 1; col < size; col++) {
+
+                Item colItem = dataSet.getItem(getObjIdFromIndex(col));
+
+                similarityValues[row][col] = cosine.calculate(
+                        rowItem.getItemContent().getTermVector(vocabulary),
+                        colItem.getItemContent().getTermVector(vocabulary));
+            }
+
+            // An item compared with itself gets 1.
+            similarityValues[row][row] = 1.0;
+        }
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/naive/ItemPenaltyBasedSimilarity.java b/src/org/yooreeka/algos/reco/collab/similarity/naive/ItemPenaltyBasedSimilarity.java
new file mode 100644
index 0000000..a89a70c
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/naive/ItemPenaltyBasedSimilarity.java
@@ -0,0 +1,161 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.similarity.naive; + +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.model.Item; +import org.yooreeka.algos.reco.collab.similarity.util.RatingCountMatrix; + +public class ItemPenaltyBasedSimilarity extends SimilarityMatrixImpl { + + /** + * Unique identifier for serialization + */ + private static final long serialVersionUID = -6137735175034641281L; + + public ItemPenaltyBasedSimilarity(Dataset dataSet) { + + this(ItemPenaltyBasedSimilarity.class.getSimpleName(), dataSet, true); + } + + public ItemPenaltyBasedSimilarity(String id, Dataset dataSet, + boolean keepRatingCountMatrix) { + this.id = id; + this.keepRatingCountMatrix = keepRatingCountMatrix; + this.useObjIdToIndexMapping = dataSet.isIdMappingRequired(); + calculate(dataSet); + } + + @Override + protected void calculate(Dataset dataSet) { + + int nItems = dataSet.getItemCount(); + int nRatingValues = 5; + + /* + * The penalties distort the scale that we use for similarities + * maxBoundWeight is an auxiliary variable for scaling back to [0,1] + */ + double scaleFactor = 0.0; + + similarityValues = new double[nItems][nItems]; + + if (keepRatingCountMatrix) { + ratingCountMatrix = new RatingCountMatrix[nItems][nItems]; + } + + // if we want to use mapping from itemId to index then generate + // index for every itemId + if (useObjIdToIndexMapping) { + for (Item item : dataSet.getItems()) { + idMapping.getIndex(String.valueOf(item.getId())); + } + } + + // By using these variables we reduce the number of 
method calls + // inside the double loop. + int totalCount = 0; + int agreementCount = 0; + + for (int u = 0; u < nItems; u++) { + + int itemAId = getObjIdFromIndex(u); + Item itemA = dataSet.getItem(itemAId); + + // we only need to calculate elements above the main diagonal. + for (int v = u + 1; v < nItems; v++) { + + int itemBId = getObjIdFromIndex(v); + + Item itemB = dataSet.getItem(itemBId); + + RatingCountMatrix rcm = new RatingCountMatrix(itemA, itemB, + nRatingValues); + + totalCount = rcm.getTotalCount(); + agreementCount = rcm.getAgreementCount(); + + if (agreementCount > 0) { + + /* + * See ImprovedUserBasedSimilarity class for detailed + * explanation. + */ + double weightedDisagreements = 0.0; + + int maxBandId = rcm.getMatrix().length - 1; + + for (int matrixBandId = 1; matrixBandId <= maxBandId; matrixBandId++) { + + /* + * The following is a heuristic. Can you figure out what + * characteristics are captured in such an expression? + * The numbers 1.8 and 0.4 are arbitrary, however, we + * could define them by solving an optimization problem. + * How would you formulate the problem? How would you + * solve it? 
+ */ + double bandWeight = 1.8 - Math.exp(1 - matrixBandId); + bandWeight = Math.pow(bandWeight, 0.4); + + if (bandWeight > scaleFactor) { + scaleFactor = bandWeight; + } + + weightedDisagreements += bandWeight + * rcm.getBandCount(matrixBandId); + } + + double similarityValue = 1.0 - (weightedDisagreements / totalCount); + + // w is the upper (negative) bound of the weighted + // similarity scale + double w = scaleFactor * (totalCount - agreementCount); + + similarityValues[u][v] = (w + similarityValue) / (w + 1); + + } else { + similarityValues[u][v] = 0.0; + } + + if (keepRatingCountMatrix) { + ratingCountMatrix[u][v] = rcm; + } + } + + // for u == v assign 1 + // ratingCountMatrix wasn't created for this case + similarityValues[u][u] = 1.0; + + } + } + +} diff --git a/src/org/yooreeka/algos/reco/collab/similarity/naive/SimilarityMatrix.java b/src/org/yooreeka/algos/reco/collab/similarity/naive/SimilarityMatrix.java new file mode 100644 index 0000000..681742f --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/similarity/naive/SimilarityMatrix.java @@ -0,0 +1,74 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. 
See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.similarity.naive; + +import org.yooreeka.algos.reco.collab.similarity.util.RatingCountMatrix; + +/** + * Defines similarity matrix. For user-oriented methods it represents + * similarities between users and for item-oriented methods this matrix + * represents similarities between items. + * + */ +public interface SimilarityMatrix extends java.io.Serializable { + + /** + * Similarity matrix id. + * + * @return + */ + public abstract String getId(); + + public abstract RatingCountMatrix getRatingCountMatrix(Integer idX, + Integer idY); + + /** + * Returns matrix of similarities. For user-oriented methods it represents + * similarities between users and for item-oriented methods the matrix + * represents similarities between items. + * + * @return similarity matrix + */ + public abstract double[][] getSimilarityMatrix(); + + /** + * Returns similarity value between two objects identified by their IDs. 
+ * + * @param idX + * @param idY + * @return + */ + public abstract double getValue(Integer idX, Integer idY); + + public abstract boolean isRatingCountMatrixAvailable(); + + public void print(); +} \ No newline at end of file diff --git a/src/org/yooreeka/algos/reco/collab/similarity/naive/SimilarityMatrixImpl.java b/src/org/yooreeka/algos/reco/collab/similarity/naive/SimilarityMatrixImpl.java new file mode 100644 index 0000000..fb4bfbd --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/similarity/naive/SimilarityMatrixImpl.java @@ -0,0 +1,148 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.similarity.naive; + +import java.util.Arrays; + +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.similarity.util.RatingCountMatrix; +import org.yooreeka.util.internet.crawling.util.ValueToIndexMapping; + +public abstract class SimilarityMatrixImpl implements SimilarityMatrix { + + private static final long serialVersionUID = -8119322978934551969L; + + protected String id; + protected double similarityValues[][] = null; + protected RatingCountMatrix ratingCountMatrix[][] = null; + protected boolean keepRatingCountMatrix = false; + + protected boolean useObjIdToIndexMapping = true; + protected ValueToIndexMapping idMapping = new ValueToIndexMapping(); + + protected SimilarityMatrixImpl() { + } + + protected abstract void calculate(Dataset dataSet); + + public String getId() { + return this.id; + } + + /** + * + * @param objId + * user or item id. + * @return index that can be used to access the object in the matrix. 
+ */ + protected int getIndexFromObjId(Integer objId) { + int index = 0; + if (useObjIdToIndexMapping) { + index = idMapping.getIndex(String.valueOf(objId)); + } else { + index = objId - 1; + } + return index; + } + + protected Integer getObjIdFromIndex(int index) { + Integer objId; + if (useObjIdToIndexMapping) { + objId = Integer.parseInt(idMapping.getValue(index)); + } else { + objId = index + 1; + } + return objId; + } + + public RatingCountMatrix getRatingCountMatrix(Integer idX, Integer idY) { + int x = getIndexFromObjId(idX); + int y = getIndexFromObjId(idY); + + return ratingCountMatrix[x][y]; + } + + public double[][] getSimilarityMatrix() { + return similarityValues; + } + + public boolean getUseObjIdToIndexMapping() { + return useObjIdToIndexMapping; + } + + public double getValue(Integer idX, Integer idY) { + if (similarityValues == null) { + throw new IllegalStateException( + "You have to calculate similarities first."); + } + + int x = getIndexFromObjId(idX); + int y = getIndexFromObjId(idY); + + int i, j; + if (x <= y) { + i = x; + j = y; + } else { + i = y; + j = x; + } + return similarityValues[i][j]; + } + + public boolean isRatingCountMatrixAvailable() { + return keepRatingCountMatrix; + } + + public void print() { + if (similarityValues != null) { + for (double[] row : this.similarityValues) { + System.out.println(Arrays.toString(row)); + } + } + } + + public void print(int nRows) { + int count = 0; + if (similarityValues != null) { + for (double[] row : this.similarityValues) { + if (count < nRows) { + System.out.println(Arrays.toString(row)); + } + count++; + } + } + } + + public void setUseObjIdToIndexMapping(boolean value) { + this.useObjIdToIndexMapping = value; + } +} diff --git a/src/org/yooreeka/algos/reco/collab/similarity/naive/UserBasedSimilarity.java b/src/org/yooreeka/algos/reco/collab/similarity/naive/UserBasedSimilarity.java new file mode 100644 index 0000000..8c631de --- /dev/null +++ 
b/src/org/yooreeka/algos/reco/collab/similarity/naive/UserBasedSimilarity.java @@ -0,0 +1,117 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.reco.collab.similarity.naive; + +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.model.User; +import org.yooreeka.algos.reco.collab.similarity.util.RatingCountMatrix; + +public class UserBasedSimilarity extends SimilarityMatrixImpl { + + /** + * Unique identifier for serialization + */ + private static final long serialVersionUID = 5741616253320567238L; + + public UserBasedSimilarity(Dataset dataSet) { + + this(UserBasedSimilarity.class.getSimpleName(), dataSet, true); + } + + public UserBasedSimilarity(String id, Dataset dataSet, + boolean keepRatingCountMatrix) { + this.id = id; + this.keepRatingCountMatrix = keepRatingCountMatrix; + this.useObjIdToIndexMapping = dataSet.isIdMappingRequired(); + calculate(dataSet); + } + + // here we assume that userId and bookId are: + // - integers, + // - start with 1 + // - have no gaps in sequence. + // Otherwise we would have to have a mapping from userId/bookId into index + @Override + protected void calculate(Dataset dataSet) { + + int nUsers = dataSet.getUserCount(); + int nRatingValues = 5; + + similarityValues = new double[nUsers][nUsers]; + + if (keepRatingCountMatrix) { + ratingCountMatrix = new RatingCountMatrix[nUsers][nUsers]; + } + + // if we want to use mapping from userId to index then generate + // index for every userId + if (useObjIdToIndexMapping) { + for (User u : dataSet.getUsers()) { + idMapping.getIndex(String.valueOf(u.getId())); + } + } + + for (int u = 0; u < nUsers; u++) { + + int userAId = getObjIdFromIndex(u); + User userA = dataSet.getUser(userAId); + + for (int v = u + 1; v < nUsers; v++) { + + int userBId = getObjIdFromIndex(v); + User userB = dataSet.getUser(userBId); + + RatingCountMatrix rcm = new RatingCountMatrix(userA, userB, + nRatingValues); + + int totalCount = rcm.getTotalCount(); + int agreementCount = rcm.getAgreementCount(); + + if (agreementCount > 0) { + + similarityValues[u][v] = (double) 
agreementCount + / (double) totalCount; + } else { + similarityValues[u][v] = 0.0; + } + + // For large datasets + if (keepRatingCountMatrix) { + ratingCountMatrix[u][v] = rcm; + } + } + + // for u == v assign 1. + // RatingCountMatrix wasn't created for this case + similarityValues[u][u] = 1.0; + } + } +} diff --git a/src/org/yooreeka/algos/reco/collab/similarity/naive/UserContentBasedSimilarity.java b/src/org/yooreeka/algos/reco/collab/similarity/naive/UserContentBasedSimilarity.java new file mode 100644 index 0000000..101be1e --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/similarity/naive/UserContentBasedSimilarity.java @@ -0,0 +1,107 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.similarity.naive; + +import org.yooreeka.algos.reco.collab.model.Content; +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.model.User; +import org.yooreeka.util.metrics.CosineSimilarityMeasure; + +/** + * Similarity between users based on the content associated with users. + */ +public class UserContentBasedSimilarity extends SimilarityMatrixImpl { + + /** + * SVUID + */ + private static final long serialVersionUID = 5809078434246172835L; + + public UserContentBasedSimilarity(String id, Dataset ds) { + this.id = id; + this.useObjIdToIndexMapping = ds.isIdMappingRequired(); + calculate(ds); + } + + @Override + protected void calculate(Dataset dataSet) { + + int nUsers = dataSet.getUserCount(); + + similarityValues = new double[nUsers][nUsers]; + + // if we want to use mapping from userId to index then generate + // index for every userId + if (useObjIdToIndexMapping) { + for (User u : dataSet.getUsers()) { + idMapping.getIndex(String.valueOf(u.getId())); + } + } + + CosineSimilarityMeasure cosineMeasure = new CosineSimilarityMeasure(); + String[] allTerms = dataSet.getAllTerms(); + + for (int u = 0; u < nUsers; u++) { + int userAId = getObjIdFromIndex(u); + User userA = dataSet.getUser(userAId); + + for (int v = u + 1; v < nUsers; v++) { + + int userBId = getObjIdFromIndex(v); + User userB = dataSet.getUser(userBId); + + double similarity = 0.0; + + for (Content userAContent : userA.getUserContent()) { + + double bestCosineSimValue = 0.0; + + for (Content 
userBContent : userB.getUserContent()) { + double cosineSimValue = cosineMeasure.calculate( + userAContent.getTermVector(allTerms), + userBContent.getTermVector(allTerms)); + bestCosineSimValue = Math.max(bestCosineSimValue, + cosineSimValue); + } + + similarity += bestCosineSimValue; + } + // System.out.println("Similarity user[" + u + "][" + v + "]=" + + // similarity); + similarityValues[u][v] = similarity + / userA.getUserContent().size(); + } + + // for u == v assign 1. + similarityValues[u][u] = 1.0; + } + } +} diff --git a/src/org/yooreeka/algos/reco/collab/similarity/naive/UserItemContentBasedSimilarity.java b/src/org/yooreeka/algos/reco/collab/similarity/naive/UserItemContentBasedSimilarity.java new file mode 100644 index 0000000..e41f437 --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/similarity/naive/UserItemContentBasedSimilarity.java @@ -0,0 +1,184 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.reco.collab.similarity.naive; + +import org.yooreeka.algos.reco.collab.model.Content; +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.model.Item; +import org.yooreeka.algos.reco.collab.model.User; +import org.yooreeka.util.internet.crawling.util.ValueToIndexMapping; +import org.yooreeka.util.metrics.CosineSimilarityMeasure; + +/** + * Similarity between users based on the content associated with users. 
+ */ +public class UserItemContentBasedSimilarity extends SimilarityMatrixImpl { + + /** + * SVUID + */ + private static final long serialVersionUID = -372816966539384847L; + + private ValueToIndexMapping idMappingForUser = new ValueToIndexMapping(); + private ValueToIndexMapping idMappingForItem = new ValueToIndexMapping(); + + public UserItemContentBasedSimilarity(String id, Dataset ds) { + this.id = id; + this.useObjIdToIndexMapping = ds.isIdMappingRequired(); + calculate(ds); + } + + @Override + protected void calculate(Dataset dataSet) { + + int nUsers = dataSet.getUserCount(); + int nItems = dataSet.getItemCount(); + + similarityValues = new double[nUsers][nItems]; + + // if we want to use mapping from userId/itemId to matrix index + // then we need to generate index for every userId and itemId + if (useObjIdToIndexMapping) { + for (User u : dataSet.getUsers()) { + idMappingForUser.getIndex(String.valueOf(u.getId())); + } + + for (Item i : dataSet.getItems()) { + idMappingForItem.getIndex(String.valueOf(i.getId())); + } + } + + CosineSimilarityMeasure cosineMeasure = new CosineSimilarityMeasure(); + String[] allTerms = dataSet.getAllTerms(); + + for (int u = 0; u < nUsers; u++) { + int userId = getUserIdForIndex(u); + User user = dataSet.getUser(userId); + + for (int v = 0; v < nItems; v++) { + + int itemId = getItemIdFromIndex(v); + Item item = dataSet.getItem(itemId); + + double simValue = 0.0; + double bestCosineSimValue = 0.0; + + for (Content userContent : user.getUserContent()) { + + simValue = cosineMeasure.calculate(userContent + .getTermVector(allTerms), item.getItemContent() + .getTermVector(allTerms)); + bestCosineSimValue = Math.max(bestCosineSimValue, simValue); + } + + similarityValues[u][v] = bestCosineSimValue; + } + } + } + + /* + * Utility method to convert itemId into matrix index + */ + private int getIndexForItemId(Integer itemId) { + int index = 0; + if (useObjIdToIndexMapping) { + index = 
idMappingForItem.getIndex(String.valueOf(itemId)); + } else { + index = itemId - 1; + } + return index; + } + + /* + * Utility method to convert userId into matrix index. + */ + private int getIndexForUserId(Integer userId) { + int index = 0; + if (useObjIdToIndexMapping) { + index = idMappingForUser.getIndex(String.valueOf(userId)); + } else { + index = userId - 1; + } + return index; + } + + @Override + protected int getIndexFromObjId(Integer objId) { + throw new UnsupportedOperationException( + "Should not be used. Use user or item specific method istead."); + } + + /* + * Utility method to convert matrix index into itemId. + */ + private Integer getItemIdFromIndex(int index) { + Integer objId; + if (useObjIdToIndexMapping) { + objId = Integer.parseInt(idMappingForItem.getValue(index)); + } else { + objId = index + 1; + } + return objId; + } + + @Override + protected Integer getObjIdFromIndex(int index) { + throw new UnsupportedOperationException( + "Should not be used. Use user or item specific method istead."); + } + + /* + * Utility method to convert matrix index into userId + */ + private Integer getUserIdForIndex(int index) { + Integer objId; + if (useObjIdToIndexMapping) { + objId = Integer.parseInt(idMappingForUser.getValue(index)); + } else { + objId = index + 1; + } + return objId; + } + + @Override + public double getValue(Integer userId, Integer itemId) { + if (similarityValues == null) { + throw new IllegalStateException( + "You have to calculate similarities first."); + } + + int x = getIndexForUserId(userId); + int y = getIndexForItemId(itemId); + + return similarityValues[x][y]; + } + +} diff --git a/src/org/yooreeka/algos/reco/collab/similarity/triangular/UpperTriangularSimilarityMatrix.java b/src/org/yooreeka/algos/reco/collab/similarity/triangular/UpperTriangularSimilarityMatrix.java new file mode 100644 index 0000000..55e19cc --- /dev/null +++ b/src/org/yooreeka/algos/reco/collab/similarity/triangular/UpperTriangularSimilarityMatrix.java @@ 
-0,0 +1,82 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.triangular;
+
+import java.util.Hashtable;
+
+import org.yooreeka.algos.reco.collab.similarity.util.RatingCountMatrix;
+
+/**
+ * Similarity matrix that keeps only its upper triangular part. The
+ * {@code Hashtable} returned by {@link #getSimilarityMatrix()} holds that
+ * part.
+ * <p>
+ * This representation is appropriate only for symmetric similarities. With
+ * user-oriented methods, for example, the matrix may need to reflect the
+ * asymmetry between the tastes of individuals: person A may consider himself
+ * similar to person B while person B does not feel the same way. Such a
+ * matrix cannot be stored here.
+ */
+public interface UpperTriangularSimilarityMatrix extends java.io.Serializable {
+
+    /**
+     * Returns the id of this similarity matrix.
+     *
+     * @return similarity matrix id
+     */
+    String getId();
+
+    /** Returns the rating count matrix for the given pair of ids. */
+    RatingCountMatrix getRatingCountMatrix(Integer idX, Integer idY);
+
+    /**
+     * Returns an upper triangular matrix of similarities: between users for
+     * user-oriented methods, between items for item-oriented methods.
+     *
+     * @return similarity matrix
+     */
+    Hashtable getSimilarityMatrix();
+
+    /**
+     * Returns the similarity value between two objects identified by their ids.
+     *
+     * @param idX id of the first object
+     * @param idY id of the second object
+     * @return similarity value for the pair
+     */
+    double getValue(Integer idX, Integer idY);
+
+    /** Tells whether rating count matrices are available. */
+    boolean isRatingCountMatrixAvailable();
+
+    /** Prints the matrix; the output destination is up to the implementation. */
+    void print();
+}
\ No newline at end of file
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/triangular/UpperTriangularSimilarityMatrixImpl.java b/src/org/yooreeka/algos/reco/collab/similarity/triangular/UpperTriangularSimilarityMatrixImpl.java
new file mode 100644
index 0000000..192cbc3
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/triangular/UpperTriangularSimilarityMatrixImpl.java
@@ -0,0 +1,150 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.triangular;
+
+import java.util.Arrays;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.similarity.naive.SimilarityMatrix;
+import org.yooreeka.algos.reco.collab.similarity.util.RatingCountMatrix;
+import org.yooreeka.util.internet.crawling.util.ValueToIndexMapping;
+
+/**
+ * Base class for similarity matrices whose values are read from the upper
+ * triangle of a two-dimensional array: {@link #getValue(Integer, Integer)}
+ * always reads {@code similarityValues[i][j]} with {@code i <= j}, whatever
+ * the order of its arguments.
+ * <p>
+ * Object ids are translated to array indices either through
+ * {@link #idMapping} (the default) or, when the mapping is switched off, by
+ * treating ids as 1-based indices.
+ * <p>
+ * NOTE(review): this class never assigns {@code similarityValues} or
+ * {@code ratingCountMatrix} itself; presumably subclasses fill them in
+ * {@link #calculate(Dataset)} -- confirm against the subclasses.
+ */
+public abstract class UpperTriangularSimilarityMatrixImpl implements
+        SimilarityMatrix {
+
+    private static final long serialVersionUID = -6083265402166050924L;
+
+    protected String id;
+    protected double[][] similarityValues = null;
+    protected RatingCountMatrix[][] ratingCountMatrix = null;
+    protected boolean keepRatingCountMatrix = false;
+
+    protected boolean useObjIdToIndexMapping = true;
+    protected ValueToIndexMapping idMapping = new ValueToIndexMapping();
+
+    protected UpperTriangularSimilarityMatrixImpl() {
+    }
+
+    /**
+     * Computes the similarities for the given data set.
+     *
+     * @param dataSet
+     *            data to derive the similarities from
+     */
+    protected abstract void calculate(Dataset dataSet);
+
+    /**
+     * @return the id of this similarity matrix
+     */
+    public String getId() {
+        return this.id;
+    }
+
+    /**
+     * Translates an object id into an array index. With the id mapping
+     * enabled the index is looked up in {@link #idMapping} under the string
+     * form of the id; otherwise the index is simply {@code objId - 1}.
+     *
+     * @param objId
+     *            user or item id.
+     * @return index that can be used to access the object in the matrix.
+     */
+    protected int getIndexFromObjId(Integer objId) {
+        if (useObjIdToIndexMapping) {
+            return idMapping.getIndex(String.valueOf(objId));
+        }
+        return objId - 1;
+    }
+
+    /**
+     * Inverse of {@link #getIndexFromObjId(Integer)}.
+     *
+     * @param index
+     *            array index
+     * @return the object id that corresponds to the index
+     */
+    protected Integer getObjIdFromIndex(int index) {
+        if (useObjIdToIndexMapping) {
+            return Integer.parseInt(idMapping.getValue(index));
+        }
+        return index + 1;
+    }
+
+    /**
+     * Returns the rating count matrix stored for the pair of ids.
+     * <p>
+     * The indices are used in the order given; unlike
+     * {@link #getValue(Integer, Integer)} they are not reordered so that the
+     * smaller one comes first. NOTE(review): if subclasses only populate the
+     * upper triangle, callers must pass the ids in that order -- confirm.
+     *
+     * @param idX
+     *            id of the first object
+     * @param idY
+     *            id of the second object
+     * @return the stored rating count matrix (may be {@code null} for a cell
+     *         that was never populated)
+     * @throws IllegalStateException
+     *             if no rating count matrices have been stored
+     */
+    public RatingCountMatrix getRatingCountMatrix(Integer idX, Integer idY) {
+        // Fail with an explicit message, as getValue() does, instead of a
+        // bare NullPointerException on the array access below.
+        if (ratingCountMatrix == null) {
+            throw new IllegalStateException(
+                    "Rating count matrices are not available.");
+        }
+
+        int x = getIndexFromObjId(idX);
+        int y = getIndexFromObjId(idY);
+
+        return ratingCountMatrix[x][y];
+    }
+
+    /**
+     * @return the backing array itself (not a copy); {@code null} until
+     *         similarities have been stored
+     */
+    public double[][] getSimilarityMatrix() {
+        return similarityValues;
+    }
+
+    public boolean getUseObjIdToIndexMapping() {
+        return useObjIdToIndexMapping;
+    }
+
+    /**
+     * Returns the similarity of two objects. The argument order does not
+     * matter: the smaller index is always used as the row.
+     *
+     * @param idX
+     *            id of the first object
+     * @param idY
+     *            id of the second object
+     * @return similarity value for the pair
+     * @throws IllegalStateException
+     *             if the similarities have not been calculated yet
+     */
+    public double getValue(Integer idX, Integer idY) {
+        if (similarityValues == null) {
+            throw new IllegalStateException(
+                    "You have to calculate similarities first.");
+        }
+
+        int x = getIndexFromObjId(idX);
+        int y = getIndexFromObjId(idY);
+
+        // Only the upper triangle is read: row index <= column index.
+        return similarityValues[Math.min(x, y)][Math.max(x, y)];
+    }
+
+    public boolean isRatingCountMatrixAvailable() {
+        return keepRatingCountMatrix;
+    }
+
+    /**
+     * Prints every row of the similarity array to standard output; does
+     * nothing when there are no values yet.
+     */
+    public void print() {
+        if (similarityValues != null) {
+            for (double[] row : this.similarityValues) {
+                System.out.println(Arrays.toString(row));
+            }
+        }
+    }
+
+    /**
+     * Prints at most the first {@code nRows} rows of the similarity array to
+     * standard output; does nothing when there are no values yet or when
+     * {@code nRows} is not positive.
+     *
+     * @param nRows
+     *            maximum number of rows to print
+     */
+    public void print(int nRows) {
+        if (similarityValues == null) {
+            return;
+        }
+        // Stop at the limit instead of walking the remaining rows.
+        int limit = Math.min(nRows, similarityValues.length);
+        for (int r = 0; r < limit; r++) {
+            System.out.println(Arrays.toString(similarityValues[r]));
+        }
+    }
+
+    public void setUseObjIdToIndexMapping(boolean value) {
+        this.useObjIdToIndexMapping = value;
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/util/PearsonCorrelation.java b/src/org/yooreeka/algos/reco/collab/similarity/util/PearsonCorrelation.java
new file mode 100644
index 0000000..e33e90d
--- /dev/null
+++
b/src/org/yooreeka/algos/reco/collab/similarity/util/PearsonCorrelation.java @@ -0,0 +1,170 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.util;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.User;
+
+/**
+ * Pearson correlation of two samples of equal length.
+ * <p>
+ * The correlation is computed as the mean product of the deviations from the
+ * sample means, divided by the product of the population standard deviations
+ * (both divide by {@code n}).
+ *
+ * @author Babis Marmanis
+ *
+ */
+public class PearsonCorrelation {
+
+    private static final double ZERO = 0.0d;
+
+    /** Number of paired observations. */
+    int n;
+
+    double[] x;
+    double[] y;
+
+    /**
+     * Builds the two samples from the ratings of two items. For every user
+     * id returned by {@code Item.getSharedUserIds(iA, iB)} the sample values
+     * are that user's rating of each item minus the item's average rating.
+     *
+     * @param ds
+     *            data set used to look the users up
+     * @param iA
+     *            first item
+     * @param iB
+     *            second item
+     */
+    public PearsonCorrelation(Dataset ds, Item iA, Item iB) {
+
+        double aAvgR = iA.getAverageRating();
+        double bAvgR = iB.getAverageRating();
+
+        Integer[] uid = Item.getSharedUserIds(iA, iB);
+        n = uid.length;
+
+        x = new double[n];
+        y = new double[n];
+
+        for (int i = 0; i < n; i++) {
+
+            User u = ds.getUser(uid[i]);
+            double urA = u.getItemRating(iA.getId()).getRating();
+            double urB = u.getItemRating(iB.getId()).getRating();
+
+            x[i] = urA - aAvgR;
+            y[i] = urB - bAvgR;
+        }
+    }
+
+    /**
+     * Uses the two arrays directly as the samples (they are not copied).
+     *
+     * @param x
+     *            first sample
+     * @param y
+     *            second sample
+     * @throws java.lang.IllegalArgumentException
+     *             if the arrays differ in length
+     */
+    public PearsonCorrelation(double[] x, double[] y)
+            throws java.lang.IllegalArgumentException {
+
+        if (x.length != y.length) {
+            throw new IllegalArgumentException(
+                    "Arrays x and y should have the same length!");
+        }
+
+        n = x.length;
+
+        this.x = x;
+        this.y = y;
+    }
+
+    /**
+     * Computes the correlation of the two samples.
+     * <ul>
+     * <li>No observations: returns {@code 0.0}.</li>
+     * <li>Neither sample varies (both standard deviations are zero): this is
+     * a degenerate case and {@code 1.0} is returned.</li>
+     * <li>Exactly one sample does not vary: the zero standard deviation is
+     * replaced by the other one so that the division is defined; the
+     * covariance term is zero in that case, so the result is {@code 0}.</li>
+     * </ul>
+     *
+     * @return the correlation coefficient
+     */
+    public double calculate() {
+
+        if (n == 0) {
+            return 0.0;
+        }
+
+        double avgX = getAverage(x);
+        double avgY = getAverage(y);
+
+        double sX = getStdDev(avgX, x);
+        double sY = getStdDev(avgY, y);
+
+        // No variation at all -- all points refer to the same value.
+        // The standard deviations are tested directly: summing the signed
+        // differences (x[0] - x[i]) gives zero whenever x[0] happens to be
+        // the sample mean, which wrongly classified varying samples such as
+        // {2, 1, 3} as constant.
+        if (sX == ZERO && sY == ZERO) {
+            return 1.0;
+        }
+
+        // Exactly one of the samples does not vary: borrow the other
+        // standard deviation to avoid dividing by zero.
+        if (sX == ZERO) {
+            sX = sY;
+        } else if (sY == ZERO) {
+            sY = sX;
+        }
+
+        double xy = 0;
+
+        for (int i = 0; i < n; i++) {
+            xy += (x[i] - avgX) * (y[i] - avgY);
+        }
+
+        return xy / (n * (sX * sY));
+    }
+
+    /** Arithmetic mean of the values. */
+    private double getAverage(double[] v) {
+        double sum = 0;
+
+        for (double xi : v) {
+            sum += xi;
+        }
+
+        return sum / v.length;
+    }
+
+    /**
+     * Population standard deviation (divides by the number of values) of the
+     * values around the given mean.
+     */
+    private double getStdDev(double m, double[] v) {
+        double sumOfSquares = 0;
+
+        for (double xi : v) {
+            sumOfSquares += (xi - m) * (xi - m);
+        }
+
+        return Math.sqrt(sumOfSquares / v.length);
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/util/RatingCountMatrix.java b/src/org/yooreeka/algos/reco/collab/similarity/util/RatingCountMatrix.java
new file mode 100644
index 0000000..1ca5ae8
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/util/RatingCountMatrix.java
@@ -0,0 +1,131 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.util;
+
+import java.io.Serializable;
+
+import org.yooreeka.algos.reco.collab.model.Item;
+import org.yooreeka.algos.reco.collab.model.Rating;
+import org.yooreeka.algos.reco.collab.model.User;
+
+/**
+ * Square matrix of counts that compares two sets of ratings. Cell
+ * {@code [i][j]} holds the number of cases in which the first item/user has
+ * the rating {@code i + 1} and the second one has the rating {@code j + 1}.
+ * <p>
+ * Rating values are used directly as (1-based) indices, so every rating must
+ * lie in {@code 1..nRatingValues}; anything else ends in an
+ * {@code ArrayIndexOutOfBoundsException} while the matrix is populated.
+ */
+public class RatingCountMatrix implements Serializable {
+
+    /**
+     * Unique identifier for serialization
+     */
+    private static final long serialVersionUID = -8216800040843757769L;
+
+    private int[][] matrix = null;
+
+    /**
+     * Counts the rating pairs of two items over the users that rated both.
+     *
+     * @param itemA
+     *            first item (row index)
+     * @param itemB
+     *            second item (column index)
+     * @param nRatingValues
+     *            number of distinct rating values, i.e. the matrix size
+     */
+    public RatingCountMatrix(Item itemA, Item itemB, int nRatingValues) {
+        init(nRatingValues);
+        calculate(itemA, itemB);
+    }
+
+    /**
+     * Counts the rating pairs of two users over the items that both rated.
+     *
+     * @param userA
+     *            first user (row index)
+     * @param userB
+     *            second user (column index)
+     * @param nRatingValues
+     *            number of distinct rating values, i.e. the matrix size
+     */
+    public RatingCountMatrix(User userA, User userB, int nRatingValues) {
+        init(nRatingValues);
+        calculate(userA, userB);
+    }
+
+    /*
+     * Populates matrix using user ratings for provided items. We only consider
+     * users that rated both items.
+     */
+    private void calculate(Item itemA, Item itemB) {
+        for (Rating ratingForA : itemA.getAllRatings()) {
+            // check if the same user rated itemB
+            Rating ratingForB = itemB.getUserRating(ratingForA.getUserId());
+            if (ratingForB != null) {
+                // element in the matrix is determined by the rating values.
+                int i = ratingForA.getRating() - 1;
+                int j = ratingForB.getRating() - 1;
+                matrix[i][j]++;
+            }
+        }
+    }
+
+    /*
+     * Populates matrix using ratings for items that the two users share.
+     */
+    private void calculate(User userA, User userB) {
+
+        for (Rating ratingByA : userA.getAllRatings()) {
+
+            Rating ratingByB = userB.getItemRating(ratingByA.getItemId());
+
+            if (ratingByB != null) {
+
+                int i = ratingByA.getRating() - 1;
+                int j = ratingByB.getRating() - 1;
+                matrix[i][j]++;
+            }
+        }
+    }
+
+    /**
+     * @return the sum of the main diagonal, i.e. the number of cases in which
+     *         both ratings were identical
+     */
+    public int getAgreementCount() {
+        int ratingCount = 0;
+        for (int i = 0, n = matrix.length; i < n; i++) {
+            ratingCount += matrix[i][i];
+        }
+        return ratingCount;
+    }
+
+    /**
+     * Returns the number of cases in which the two ratings differ by exactly
+     * {@code bandId}: the sum of the diagonal {@code bandId} places above the
+     * main diagonal and of the one {@code bandId} places below it.
+     * <p>
+     * Band {@code 0} is the main diagonal itself and equals
+     * {@link #getAgreementCount()}. Negative band ids are not supported.
+     *
+     * @param bandId
+     *            distance from the main diagonal, {@code >= 0}
+     * @return the count for the band; {@code 0} when the band lies outside the
+     *         matrix
+     */
+    public int getBandCount(int bandId) {
+        // For band 0 the "upper" and "lower" diagonals are the same cells;
+        // the loop below would add each of them twice.
+        if (bandId == 0) {
+            return getAgreementCount();
+        }
+
+        int bandCount = 0;
+        for (int i = 0, n = matrix.length; (i + bandId) < n; i++) {
+            bandCount += matrix[i][i + bandId];
+            bandCount += matrix[i + bandId][i];
+        }
+        return bandCount;
+    }
+
+    /**
+     * @return the backing array itself (not a copy)
+     */
+    public int[][] getMatrix() {
+        return matrix;
+    }
+
+    /**
+     * @return the sum of all cells, i.e. the number of rating pairs counted
+     */
+    public int getTotalCount() {
+
+        int ratingCount = 0;
+
+        for (int[] row : matrix) {
+            for (int count : row) {
+                ratingCount += count;
+            }
+        }
+        return ratingCount;
+    }
+
+    private void init(int nSize) {
+        // starting point - all elements are zero
+        matrix = new int[nSize][nSize];
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixCache.java b/src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixCache.java
new file mode 100644
index 0000000..ac56519
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixCache.java
@@ -0,0 +1,71 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.util;
+
+import java.io.File;
+
+import org.yooreeka.algos.reco.collab.cache.FileStore;
+import org.yooreeka.algos.reco.collab.cache.Store;
+import org.yooreeka.algos.reco.collab.similarity.naive.SimilarityMatrix;
+
+/**
+ * Keeps similarity matrices in a {@link FileStore}, keyed by their id.
+ */
+public class SimilarityMatrixCache {
+
+    private Store store;
+    private String location;
+
+    /**
+     * @param location
+     *            file system location handed to the underlying
+     *            {@link FileStore}
+     */
+    public SimilarityMatrixCache(File location) {
+        this.store = new FileStore(location);
+        this.location = location.getAbsolutePath();
+    }
+
+    /**
+     * Looks a matrix up by id.
+     *
+     * @param id
+     *            id of the similarity matrix
+     * @return the stored matrix, or {@code null} when the store has no entry
+     *         for the id
+     */
+    public SimilarityMatrix get(String id) {
+        if (!store.exists(id)) {
+            return null;
+        }
+        return (SimilarityMatrix) store.get(id);
+    }
+
+    /**
+     * @return absolute path of the location this cache was created with
+     */
+    public String getLocation() {
+        return location;
+    }
+
+    /**
+     * Stores a matrix under the given id; an entry that already exists under
+     * that id is removed first.
+     *
+     * @param id
+     *            id to store the matrix under
+     * @param similarityMatrix
+     *            matrix to store
+     */
+    public void put(String id, SimilarityMatrix similarityMatrix) {
+        boolean alreadyStored = store.exists(id);
+        if (alreadyStored) {
+            store.remove(id);
+        }
+        store.put(id, similarityMatrix);
+    }
+
+    /**
+     * Removes the entry stored under the given id.
+     *
+     * @param id
+     *            id of the entry to remove
+     */
+    public void remove(String id) {
+        store.remove(id);
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixRepository.java b/src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixRepository.java
new file mode 100644
index 0000000..48eb933
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixRepository.java
@@ -0,0 +1,173 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+package org.yooreeka.algos.reco.collab.similarity.util;
+
+import java.io.File;
+
+import org.yooreeka.algos.reco.collab.model.Dataset;
+import org.yooreeka.algos.reco.collab.model.RecommendationType;
+import org.yooreeka.algos.reco.collab.similarity.naive.ImprovedUserBasedSimilarity;
+import org.yooreeka.algos.reco.collab.similarity.naive.ItemBasedSimilarity;
+import org.yooreeka.algos.reco.collab.similarity.naive.ItemContentBasedSimilarity;
+import org.yooreeka.algos.reco.collab.similarity.naive.ItemPenaltyBasedSimilarity;
+import org.yooreeka.algos.reco.collab.similarity.naive.SimilarityMatrix;
+import org.yooreeka.algos.reco.collab.similarity.naive.UserBasedSimilarity;
+import org.yooreeka.algos.reco.collab.similarity.naive.UserContentBasedSimilarity;
+import org.yooreeka.algos.reco.collab.similarity.naive.UserItemContentBasedSimilarity;
+import org.yooreeka.config.YooreekaConfigurator;
+
+/**
+ * Hands out similarity matrices for a recommendation type and a data set,
+ * creating them on demand and, when a cache is configured, reusing and
+ * storing instances through it.
+ */
+public class SimilarityMatrixRepository {
+
+    /**
+     * Builds the id of a similarity matrix: the simple name of the
+     * similarity class that belongs to the type, a dash, and the data set
+     * name.
+     *
+     * @param type
+     *            recommendation type
+     * @param datasetName
+     *            name of the data set
+     * @return id of the form {@code <SimilarityClass>-<datasetName>}
+     * @throws IllegalArgumentException
+     *             for a type without a matching similarity class
+     */
+    public static String getId(RecommendationType type, String datasetName) {
+        final Class<?> similarityClass;
+        switch (type) {
+        case ITEM_BASED:
+            similarityClass = ItemBasedSimilarity.class;
+            break;
+        case ITEM_PENALTY_BASED:
+            similarityClass = ItemPenaltyBasedSimilarity.class;
+            break;
+        case USER_BASED:
+            similarityClass = UserBasedSimilarity.class;
+            break;
+        case IMPROVED_USER_BASED:
+            similarityClass = ImprovedUserBasedSimilarity.class;
+            break;
+        case USER_CONTENT_BASED:
+            similarityClass = UserContentBasedSimilarity.class;
+            break;
+        case ITEM_CONTENT_BASED:
+            similarityClass = ItemContentBasedSimilarity.class;
+            break;
+        case USER_ITEM_CONTENT_BASED:
+            similarityClass = UserItemContentBasedSimilarity.class;
+            break;
+        default:
+            throw new IllegalArgumentException("Unknown type: " + type);
+        }
+        return similarityClass.getSimpleName() + "-" + datasetName;
+    }
+
+    SimilarityMatrixCache cache;
+
+    /**
+     * @param useCache
+     *            {@code true} to cache matrices in the directory
+     *            {@code ch3/collaborative/SimilarityCache} below the
+     *            configured temp directory, {@code false} to work without a
+     *            cache
+     */
+    public SimilarityMatrixRepository(boolean useCache) {
+        if (!useCache) {
+            cache = null;
+            return;
+        }
+        String appTempDir = YooreekaConfigurator
+                .getProperty(YooreekaConfigurator.TEMP_DIR);
+        File cacheDir = new File(appTempDir,
+                "ch3/collaborative/SimilarityCache");
+        cache = new SimilarityMatrixCache(cacheDir);
+    }
+
+    /**
+     * @param cache
+     *            cache to use; {@code null} disables caching
+     */
+    public SimilarityMatrixRepository(SimilarityMatrixCache cache) {
+        this.cache = cache;
+    }
+
+    /**
+     * Same as {@link #load(RecommendationType, Dataset, boolean)} with rating
+     * count matrices kept.
+     */
+    public SimilarityMatrix load(RecommendationType type, Dataset data) {
+        return load(type, data, true);
+    }
+
+    /**
+     * Returns the similarity matrix for the type and data set. With a cache
+     * the matrix is looked up there first (the outcome is reported on
+     * standard output); a matrix that is not found is created and, with a
+     * cache, stored in it.
+     *
+     * @param type
+     *            recommendation type
+     * @param data
+     *            data set to compute the similarities from
+     * @param keepRatingCountMatrix
+     *            passed on to the similarity classes that take it
+     * @return cached or newly created similarity matrix
+     * @throws IllegalArgumentException
+     *             for an unsupported recommendation type
+     */
+    public SimilarityMatrix load(RecommendationType type, Dataset data,
+            boolean keepRatingCountMatrix) {
+        String id = getId(type, data.getName());
+        SimilarityMatrix matrix = null;
+
+        // if cache is available then try to load from cache first
+        if (cache != null) {
+            matrix = cache.get(id);
+            String outcome = (matrix == null)
+                    ? "similarity matrix instance doesn't exist in cache: "
+                    : "similarity matrix instance was loaded from cache: ";
+            System.out.println(outcome + "id: " + id + ", cache: '"
+                    + cache.getLocation() + "'.");
+        }
+
+        if (matrix != null) {
+            return matrix;
+        }
+
+        matrix = create(type, id, data, keepRatingCountMatrix);
+
+        // store new instance in cache
+        if (cache != null) {
+            cache.put(id, matrix);
+        }
+        return matrix;
+    }
+
+    /* Creates a new similarity matrix of the class that belongs to the type. */
+    private static SimilarityMatrix create(RecommendationType type, String id,
+            Dataset data, boolean keepRatingCountMatrix) {
+        switch (type) {
+        case ITEM_BASED:
+            return new ItemBasedSimilarity(id, data, keepRatingCountMatrix);
+        case ITEM_PENALTY_BASED:
+            return new ItemPenaltyBasedSimilarity(id, data,
+                    keepRatingCountMatrix);
+        case USER_BASED:
+            return new UserBasedSimilarity(id, data, keepRatingCountMatrix);
+        case IMPROVED_USER_BASED:
+            return new ImprovedUserBasedSimilarity(id, data,
+                    keepRatingCountMatrix);
+        case USER_CONTENT_BASED:
+            return new UserContentBasedSimilarity(id, data);
+        case ITEM_CONTENT_BASED:
+            return new ItemContentBasedSimilarity(id, data);
+        case USER_ITEM_CONTENT_BASED:
+            return new UserItemContentBasedSimilarity(id, data);
+        default:
+            throw new IllegalArgumentException(
+                    "Unsupported recommendation type: " + type.toString());
+        }
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/content/digg/DiggCategory.java b/src/org/yooreeka/algos/reco/content/digg/DiggCategory.java
new file mode 100644
index 0000000..367fc68
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/content/digg/DiggCategory.java
@@ -0,0 +1,83 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+package org.yooreeka.algos.reco.content.digg;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.thesuntoucher.jigg.data.Container;
+
+/**
+ * The Digg categories known to this library, as a fixed set of constants.
+ * Every constant registers itself in the list returned by
+ * {@link #getAllCategories()} when it is created.
+ */
+public class DiggCategory extends Container {
+    // Must stay above the constants: static initializers run in textual
+    // order and the constructor adds every constant to this list.
+    private static final List<DiggCategory> allCategories = new ArrayList<DiggCategory>();
+
+    public static final DiggCategory TECHNOLOGY = new DiggCategory(
+            "Technology", "technology");
+    public static final DiggCategory WORLD_AND_BUSINESS = new DiggCategory(
+            "World&Business", "world_business");
+    public static final DiggCategory SPORTS = new DiggCategory("Sports",
+            "sports");
+    public static final DiggCategory SCIENCE = new DiggCategory("Science",
+            "science");
+    public static final DiggCategory GAMING = new DiggCategory("Gaming",
+            "gaming");
+    public static final DiggCategory ENTERTAINMENT = new DiggCategory(
+            "Entertainment", "entertainment");
+    public static final DiggCategory VIDEOS = new DiggCategory("Videos",
+            "videos");
+
+    /**
+     * @return all categories in declaration order. This is the internal list
+     *         itself, not a copy; callers should treat it as read-only.
+     */
+    public static List<DiggCategory> getAllCategories() {
+        return DiggCategory.allCategories;
+    }
+
+    /**
+     * Finds a category by its name, ignoring case.
+     *
+     * @param name
+     *            category name, e.g. {@code "Technology"}
+     * @return the matching category, or {@code null} if there is none
+     */
+    public static DiggCategory valueOf(String name) {
+        for (DiggCategory c : allCategories) {
+            if (c.getName().equalsIgnoreCase(name)) {
+                return c;
+            }
+        }
+        return null;
+    }
+
+    private DiggCategory(String name, String shortName) {
+        super(name, shortName);
+        allCategories.add(this);
+    }
+
+    // Note that default Container.toString() implementation in jigg library
+    // won't work with digg api call.
+    @Override
+    public String toString() {
+        return getShortName();
+    }
+
+}
diff --git a/src/org/yooreeka/algos/reco/content/digg/DiggService.java b/src/org/yooreeka/algos/reco/content/digg/DiggService.java
new file mode 100644
index 0000000..5c7d5b1
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/content/digg/DiggService.java
@@ -0,0 +1,253 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.reco.content.digg;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.headzoo.net.services.digg.Rooster;
+import com.headzoo.net.services.digg.exceptions.DiggRequestException;
+
+import de.thesuntoucher.jigg.Jigg;
+import de.thesuntoucher.jigg.args.StoriesArguments;
+import de.thesuntoucher.jigg.data.Container;
+import de.thesuntoucher.jigg.data.Story;
+import de.thesuntoucher.jigg.data.User;
+
+/**
+ * Retrieves Digg stories and converts them to {@link DiggStoryItem}s. Most
+ * methods go through the jigg client; {@link #fetchPopular(String)} uses the
+ * headzoo {@code Rooster} client instead.
+ */
+public class DiggService {
+
+    private static final int MAX_ITEM_COUNT_PER_REQUEST = 100;
+    private static final int DEFAULT_ITEM_COUNT_PER_CATEGORY = 20;
+    private int itemCountPerCategory = DEFAULT_ITEM_COUNT_PER_CATEGORY;
+    private String API_KEY = "http://www.manning.com"; // "http://code.google.com/p/jigg";
+    private Jigg jigg;
+
+    public DiggService() {
+        jigg = new Jigg(API_KEY);
+        setItemCountPerCategory(DEFAULT_ITEM_COUNT_PER_CATEGORY);
+    }
+
+    /**
+     * Fetches stories through the Rooster client.
+     * <p>
+     * NOTE: the {@code container} argument is currently ignored. The call
+     * that would restrict the result to a container is disabled and all
+     * stories are fetched instead.
+     *
+     * @param container
+     *            container alias (see {@link #getDiggContainer(String)});
+     *            not used at present
+     * @return the fetched stories
+     * @throws IOException
+     * @throws DiggRequestException
+     */
+    public List<DiggStoryItem> fetchPopular(String container)
+            throws DiggRequestException, IOException {
+        /*
+         * The first thing you need to do is create an instance of the Rooster
+         * class. You will need to pass your application key as a constructor
+         * parameter.
+         *
+         * @link http://apidoc.digg.com/ApplicationKeys
+         */
+        Rooster rooster = new Rooster("http://www.manning.com/marmanis");
+
+        // Container filtering is switched off; to enable it resolve the alias
+        // with getDiggContainer(container) and call
+        // fetchPopularInContainer(c) instead of fetchAll().
+        com.headzoo.net.services.digg.types.collections.StoryList stories = rooster
+                .stories().fetchAll();
+
+        List<DiggStoryItem> storiesList = new ArrayList<DiggStoryItem>(
+                stories.size());
+        for (com.headzoo.net.services.digg.types.Story s : stories) {
+            DiggStoryItem dsi = new DiggStoryItem((int) s.getId(),
+                    s.getTitle(), s.getDescription());
+            if (s.getUser() != null) {
+                dsi.setUsername(s.getUser().getName());
+            }
+            if (s.getLink() != null) {
+                dsi.setLink(s.getLink().toExternalForm());
+            }
+            storiesList.add(dsi);
+        }
+        return storiesList;
+    }
+
+    /**
+     * Utility method to retrieve a set of stories from each category.
+     *
+     * @return list of stories.
+     */
+    public List<DiggStoryItem> getAllStories() {
+        List<DiggStoryItem> newsItems = new ArrayList<DiggStoryItem>();
+        for (DiggCategory c : DiggCategory.getAllCategories()) {
+            newsItems.addAll(getStories(c));
+        }
+        return newsItems;
+    }
+
+    /**
+     * Resolves a short alias to a Rooster container. The alias is matched
+     * ignoring case; {@code world} and {@code biz} both map to the
+     * "World &amp; Business" container.
+     *
+     * @param val
+     *            one of {@code tech, world, biz, sci, game, life, fun, sport,
+     *            offb}
+     * @return the container for the alias
+     * @throws IllegalArgumentException
+     *             if the alias is {@code null} or not known; the message
+     *             lists the supported aliases
+     */
+    public com.headzoo.net.services.digg.types.collections.Container getDiggContainer(
+            String val) {
+        // Literal-first comparisons so that a null alias ends in the
+        // IllegalArgumentException below rather than a NullPointerException.
+        if ("tech".equalsIgnoreCase(val)) {
+            return newContainer("Technology", "technology");
+        }
+        if ("world".equalsIgnoreCase(val) || "biz".equalsIgnoreCase(val)) {
+            return newContainer("World & Business", "world_business");
+        }
+        if ("sci".equalsIgnoreCase(val)) {
+            return newContainer("Science", "science");
+        }
+        if ("game".equalsIgnoreCase(val)) {
+            return newContainer("Gaming", "gaming");
+        }
+        if ("life".equalsIgnoreCase(val)) {
+            return newContainer("Lifestyle", "lifestyle");
+        }
+        if ("fun".equalsIgnoreCase(val)) {
+            return newContainer("Entertainment", "entertainment");
+        }
+        if ("sport".equalsIgnoreCase(val)) {
+            return newContainer("Sports", "sports");
+        }
+        if ("offb".equalsIgnoreCase(val)) {
+            return newContainer("Offbeat", "offbeat");
+        }
+
+        StringBuilder str = new StringBuilder("Not known Container alias: '");
+        str.append(val).append("'.\n");
+        str.append("Try one of the following: \n");
+        str.append(" tech --> Container(\"Technology\", \"technology\")\n");
+        str.append(" world --> Container(\"World & Business\", \"world_business\")\n");
+        str.append(" biz --> Container(\"World & Business\", \"world_business\")\n");
+        str.append(" sci --> Container(\"Science\", \"science\")\n");
+        str.append(" game --> Container(\"Gaming\", \"gaming\")\n");
+        str.append(" life --> Container(\"Lifestyle\", \"lifestyle\")\n");
+        str.append(" fun --> Container(\"Entertainment\", \"entertainment\")\n");
+        str.append(" sport --> Container(\"Sports\", \"sports\")\n");
+        str.append(" offb --> Container(\"Offbeat\", \"offbeat\")");
+
+        // The help text was previously built but never passed on.
+        throw new IllegalArgumentException(str.toString());
+    }
+
+    /* Creates a Rooster container with the given name and short name. */
+    private static com.headzoo.net.services.digg.types.collections.Container newContainer(
+            String name, String shortName) {
+        return new com.headzoo.net.services.digg.types.collections.Container(
+                name, shortName);
+    }
+
+    public int getItemCountPerCategory() {
+        return this.itemCountPerCategory;
+    }
+
+    /**
+     * Retrieves the popular stories of a category, at most
+     * {@link #getItemCountPerCategory()} of them. Every item is printed via
+     * {@code DiggStoryItem.print()} right after it is created, i.e. before
+     * its link, topic and username are set.
+     * <p>
+     * NOTE(review): the story's topic is dereferenced without a null check
+     * (the user is checked) -- confirm that jigg always supplies a topic.
+     *
+     * @param category
+     *            category to query
+     * @return the stories converted to items
+     */
+    public List<DiggStoryItem> getStories(DiggCategory category) {
+
+        StoriesArguments storiesArgs = new StoriesArguments();
+        storiesArgs.setCount(itemCountPerCategory);
+
+        List<Story> stories = jigg.getPopularStories(category, storiesArgs);
+
+        List<DiggStoryItem> items = new ArrayList<DiggStoryItem>();
+
+        for (Story story : stories) {
+
+            int itemId = story.getId();
+            String itemName = story.getTitle();
+            String description = story.getDescription();
+
+            DiggStoryItem item = new DiggStoryItem(itemId, itemName,
+                    description);
+            item.print();
+
+            // additional fields
+            item.setLink(story.getLink());
+            item.setTopic(story.getTopic().getName());
+            if (story.getUser() != null) {
+                item.setUsername(story.getUser().getName());
+            }
+
+            items.add(item);
+        }
+        return items;
+    }
+
+    /**
+     * Retrieves a set of stories submitted by user.
+     * <p>
+     * NOTE(review): topic and container of a story are dereferenced without
+     * a null check -- confirm that jigg always supplies them.
+     *
+     * @param userId
+     *            Digg username
+     * @param maxStories
+     *            max number of stories to retrieve
+     * @return list of stories or empty list if the user doesn't have any.
+     */
+    public List<DiggStoryItem> getUserStories(String userId, int maxStories) {
+        User user = new User(userId);
+        StoriesArguments args = new StoriesArguments();
+        args.setCount(maxStories);
+        List<Story> stories = jigg.getStories(user, args);
+        List<DiggStoryItem> items = new ArrayList<DiggStoryItem>();
+        for (Story story : stories) {
+            DiggStoryItem item = new DiggStoryItem(story.getId(),
+                    story.getTitle(), story.getDescription());
+            item.setLink(story.getLink());
+            item.setTopic(story.getTopic().getName());
+            Container container = story.getContainer();
+            String categoryName = container.getName();
+            item.setCategory(categoryName);
+            if (story.getUser() != null) {
+                item.setUsername(story.getUser().getName());
+            }
+            items.add(item);
+        }
+        return items;
+    }
+
+    /**
+     * Sets the number of stories requested per category, capped at
+     * {@code MAX_ITEM_COUNT_PER_REQUEST} (100). Smaller values, including
+     * zero and negative ones, are stored unchanged.
+     *
+     * @param count
+     *            requested number of stories per category
+     */
+    public void setItemCountPerCategory(int count) {
+        this.itemCountPerCategory = Math.min(MAX_ITEM_COUNT_PER_REQUEST, count);
+    }
+}
diff --git a/src/org/yooreeka/algos/reco/content/digg/DiggStoryItem.java b/src/org/yooreeka/algos/reco/content/digg/DiggStoryItem.java
new file mode 100644
index 0000000..0f4a6c8
--- /dev/null
+++ b/src/org/yooreeka/algos/reco/content/digg/DiggStoryItem.java
@@ -0,0 +1,109 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started
with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.reco.content.digg; + +import org.yooreeka.algos.reco.collab.model.Content; +import org.yooreeka.algos.reco.collab.model.Item; + +public class DiggStoryItem extends Item { + + /** + * SVUID + */ + private static final long serialVersionUID = 1924555535749825404L; + + private String link; + private String description; + private String topic; + private String username; + private String category; + + public DiggStoryItem(int storyId, String title, String description) { + super(storyId, title); + this.description = description; + String text = title + " " + description; + Content content = new Content(String.valueOf(storyId), text); + setItemContent(content); + } + + public String getCategory() { + return category; + } + + public String getDescription() { + return description; + } + + public String getLink() { + return link; + } + + public String getTitle() { + return getName(); + } + + public String getTopic() { + return topic; + } + + public String getUsername() { + return username; + } + + public void print() { + System.out + .println("---------------------------------------------------------------------"); + System.out.println("Category: " + this.getCategory() + + " -- NewsCategory: " + this.getTopic()); + System.out.println("Title: " + this.getTitle()); + System.out + .println("_____________________________________________________________________"); + System.out.println("Description:\n" + this.getDescription()); + System.out + .println("_____________________________________________________________________"); + } + + public void setCategory(String category) { + this.category = category; + } + + public void setLink(String link) { + this.link = link; + } + + public void setTopic(String topic) { + this.topic = topic; + } + + public void setUsername(String username) { + this.username = username; + } +} diff --git a/src/org/yooreeka/algos/reco/content/digg/DiggUser.java b/src/org/yooreeka/algos/reco/content/digg/DiggUser.java new file 
mode 100644 index 0000000..1563869 --- /dev/null +++ b/src/org/yooreeka/algos/reco/content/digg/DiggUser.java @@ -0,0 +1,45 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.reco.content.digg; + +import org.yooreeka.algos.reco.collab.model.User; + +public class DiggUser extends User { + + /** + * SVUID + */ + private static final long serialVersionUID = 5334812189997430446L; + + public DiggUser(int id, String name) { + super(id, name); + } +} diff --git a/src/org/yooreeka/algos/search/data/SearchResult.java b/src/org/yooreeka/algos/search/data/SearchResult.java new file mode 100644 index 0000000..13fd76e --- /dev/null +++ b/src/org/yooreeka/algos/search/data/SearchResult.java @@ -0,0 +1,180 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.search.data; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +/** + * Custom wrapper for the search results. + * + * @author Babis Marmanis + * + */ +public class SearchResult { + + /** + * Sorts list in descending order of score value. + */ + public static void sortByScore(List values) { + Collections.sort(values, new Comparator() { + public int compare(SearchResult r1, SearchResult r2) { + int result = 0; + // sort based on score value + if (r1.getScore() < r2.getScore()) { + result = 1; // sorting in descending order + } else if (r1.getScore() > r2.getScore()) { + result = -1; + } else { + result = 0; + } + return result; + } + }); + } + /** + * Sorts array in descending order of score value. 
+ */ + public static void sortByScore(SearchResult[] values) { + Arrays.sort(values, new Comparator() { + public int compare(SearchResult r1, SearchResult r2) { + int result = 0; + // sort based on score value + if (r1.getScore() < r2.getScore()) { + result = 1; // sorting in descending order + } else if (r1.getScore() > r2.getScore()) { + result = -1; + } else { + result = 0; + } + return result; + } + }); + } + private String docId; + private String docType; + + private String title; + + private String url; + + private double score; + + public SearchResult(String docId, String docType, String title, String url, + double score) { + + this.docId = docId; + this.docType = docType; + this.title = title; + this.url = url; + this.score = score; + } + + /** + * @return the docId + */ + public String getDocId() { + return docId; + } + + public String getDocType() { + return docType; + } + + /** + * @return the score + */ + public double getScore() { + return score; + } + + /** + * @return document title if available + */ + public String getTitle() { + return title; + } + + /** + * @return the url + */ + public String getUrl() { + return url; + } + + public String print() { + StringBuilder strB = new StringBuilder(); + // strB.append("Document ID : ").append(docId).append("\n"); + strB.append("Document Type: ").append(docType).append("\n"); + strB.append("Document Title : ").append(title).append("\n"); + strB.append("Document URL: ").append(url).append(" --> "); + strB.append("Relevance Score: ").append(score).append("\n"); + return strB.toString(); + } + + /** + * @param docId + * the docId to set + */ + public void setDocId(String docId) { + this.docId = docId; + } + + public void setDocType(String docType) { + this.docType = docType; + } + + /** + * @param score + * the score to set + */ + public void setScore(double score) { + this.score = score; + } + + /** + * @param title + * document title + */ + public void setTitle(String title) { + this.title = title; + } + + 
/** + * @param url + * the url to set + */ + public void setUrl(String url) { + this.url = url; + } +} diff --git a/src/org/yooreeka/algos/search/lucene/LuceneIndexBuilder.java b/src/org/yooreeka/algos/search/lucene/LuceneIndexBuilder.java new file mode 100644 index 0000000..df86992 --- /dev/null +++ b/src/org/yooreeka/algos/search/lucene/LuceneIndexBuilder.java @@ -0,0 +1,152 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.search.lucene; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; +import org.yooreeka.util.internet.crawling.core.CrawlData; +import org.yooreeka.util.internet.crawling.core.CrawlDataProcessor; +import org.yooreeka.util.internet.crawling.db.ProcessedDocsDB; +import org.yooreeka.util.parsing.common.ProcessedDocument; + +public class LuceneIndexBuilder implements CrawlDataProcessor { + + public static final String INDEX_FIELD_DOC_ID = "docid"; + public static final String INDEX_FIELD_DOC_TYPE = "doctype"; + public static final String INDEX_FIELD_CONTENT = "content"; + public static final String INDEX_FIELD_TITLE = "title"; + public static final String INDEX_FIELD_URL = "url"; + + private IndexWriter indexWriter; + private CrawlData crawlData; + private int RamBufferSizeMB = 128; + + public LuceneIndexBuilder(File indexFile, CrawlData crawlData) + throws IOException { + + this.crawlData = crawlData; + + try { + + indexWriter = getIndexWriter(indexFile); + + } catch (IOException ioX) { + throw new RuntimeException("Error while creating lucene index: ", + ioX); + } + } + + /* PRIVATE METHODS */ + private void buildLuceneIndex(String groupId, + ProcessedDocsDB parsedDocsService) { + + try { + + List docIdList = parsedDocsService.getDocumentIds(groupId); + + for (String docId : docIdList) { + indexDocument(indexWriter, + parsedDocsService.loadDocument(docId)); + } + + indexWriter.close(); + + } catch (IOException ioX) { + throw new 
RuntimeException("Error while creating lucene index: ", + ioX); + } + } + + private IndexWriter getIndexWriter(File file) throws IOException { + FSDirectory dir = FSDirectory.open(file); + IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, + new StandardAnalyzer(Version.LUCENE_40)); + config.setOpenMode(OpenMode.CREATE_OR_APPEND); + config.setRAMBufferSizeMB(RamBufferSizeMB); + return new IndexWriter(dir, config); + } + + private void indexDocument(IndexWriter iw, ProcessedDocument parsedDoc) + throws IOException { + + org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(); + + FieldType customType = new FieldType(TextField.TYPE_STORED); + customType.setStoreTermVectors(true); + customType.setStoreTermVectorPositions(true); + customType.setStoreTermVectorOffsets(false); + + doc.add(new Field(INDEX_FIELD_CONTENT, parsedDoc.getText(), customType)); + + doc.add(new StringField(INDEX_FIELD_URL, parsedDoc.getDocumentURL(), + Field.Store.YES)); + + doc.add(new StringField(INDEX_FIELD_DOC_ID, parsedDoc.getDocumentId(), + Field.Store.YES)); + + doc.add(new TextField(INDEX_FIELD_TITLE, parsedDoc.getDocumentTitle(), + Field.Store.YES)); + + doc.add(new StringField(INDEX_FIELD_DOC_TYPE, parsedDoc + .getDocumentType(), Field.Store.YES)); + + /** + * TODO: 2.2 -- The effect of boosting (Book Section 2.1.2) + * + * Uncomment the lines below to demonstrate the effect of boosting + */ + // if ( parsedDoc.getDocumentId().equals("g1-d13")) { + // doc.setBoost(2); + // } + + iw.addDocument(doc); + } + + public void run() { + List allGroups = crawlData.getProcessedDocsDB() + .getAllGroupIds(); + for (String groupId : allGroups) { + buildLuceneIndex(groupId, crawlData.getProcessedDocsDB()); + } + } +} diff --git a/src/org/yooreeka/algos/search/lucene/analyzer/CustomAnalyzer.java b/src/org/yooreeka/algos/search/lucene/analyzer/CustomAnalyzer.java new file mode 100644 index 0000000..2bcbb08 --- /dev/null +++ 
b/src/org/yooreeka/algos/search/lucene/analyzer/CustomAnalyzer.java @@ -0,0 +1,113 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.search.lucene.analyzer; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.StopwordAnalyzerBase; +import org.apache.lucene.util.Version; + +/** + * + * @author Babis Marmanis + * + */ +public class CustomAnalyzer extends StopwordAnalyzerBase { + + /** Default maximum allowed token length */ + public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; + + private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; + + /** + * An unmodifiable set containing some common English words that are usually + * not useful for searching. + */ + public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; + + private static final String[] ADDITIONAL_STOP_WORDS = { "should", "would", + "from", "up", "i", "s", "it", "his", "has", "he", "she", "her", + "said", "been", "being", "final", "now", "hour", "minute", + "second", "stop", "start", "first", "third", "fast", "slow", + "large", "small" }; + + private static CharArraySet MERGED_STOP_WORDS; + + static { + MERGED_STOP_WORDS = new CharArraySet(Version.LUCENE_40, + STOP_WORDS_SET.size() + ADDITIONAL_STOP_WORDS.length, true); + } + + public CustomAnalyzer(Version matchVersion) { + this(matchVersion, MERGED_STOP_WORDS); + } + + /** + * Builds an analyzer with the given stop words. 
+ * + * @param matchVersion + * Lucene version to match See + * {@link above} + * @param stopWords + * stop words + */ + public CustomAnalyzer(Version matchVersion, CharArraySet stopWords) { + + super(matchVersion, stopWords); + } + + @Override + protected TokenStreamComponents createComponents(final String fieldName, + final Reader reader) { + + final StandardTokenizer src = new StandardTokenizer(matchVersion, + reader); + src.setMaxTokenLength(maxTokenLength); + TokenStream tok = new StandardFilter(matchVersion, src); + tok = new LowerCaseFilter(matchVersion, tok); + tok = new StopFilter(matchVersion, tok, stopwords); + return new TokenStreamComponents(src, tok) { + @Override + protected void setReader(final Reader reader) throws IOException { + src.setMaxTokenLength(CustomAnalyzer.this.maxTokenLength); + super.setReader(reader); + } + }; + } +} diff --git a/src/org/yooreeka/algos/search/lucene/analyzer/TextDocumentTerms.java b/src/org/yooreeka/algos/search/lucene/analyzer/TextDocumentTerms.java new file mode 100644 index 0000000..8a3dd07 --- /dev/null +++ b/src/org/yooreeka/algos/search/lucene/analyzer/TextDocumentTerms.java @@ -0,0 +1,78 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
/**
 * Term-frequency table of a text: the text is split on whitespace and the
 * number of occurrences of every resulting term is counted. Terms are used
 * exactly as they appear (no case folding, no punctuation removal).
 *
 * @author Babis Marmanis
 *
 */
public class TextDocumentTerms {

    /** Maps each term to the number of times it occurs in the text. */
    HashMap<String, Integer> tf;

    /**
     * Counts the whitespace-separated terms of the given text.
     *
     * @param text
     *            text to analyse; <code>null</code> is treated like an empty
     *            text and yields an empty table
     */
    public TextDocumentTerms(String text) {

        // Split on runs of whitespace so that consecutive blanks, tabs or
        // line breaks do not produce empty "terms".
        String[] terms = (text == null) ? new String[0] : text.split("\\s+");

        tf = new HashMap<String, Integer>(terms.length);

        for (String s : terms) {

            if (s.length() == 0) {
                // Produced by leading whitespace or by an empty text; not a
                // term.
                continue;
            }

            Integer f = tf.get(s);

            if (f == null) {
                // This string has not been added yet
                tf.put(s, Integer.valueOf(1));
            } else {
                tf.put(s, Integer.valueOf(f.intValue() + 1));
            }
        }
    }

    /**
     * @return the distinct terms of the text, in no particular order
     */
    public String[] getTerms() {
        return tf.keySet().toArray(new String[tf.size()]);
    }

    /**
     * @return the live term-frequency table (not a copy); changes made to it
     *         are visible to this object
     */
    public HashMap<String, Integer> getTf() {
        return tf;
    }
}
for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.search.ranking; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.store.SimpleFSDirectory; +import org.yooreeka.algos.search.lucene.analyzer.TextDocumentTerms; +import org.yooreeka.util.internet.crawling.core.CrawlDataProcessor; +import org.yooreeka.util.parsing.common.ProcessedDocument; + +public class DocRankMatrixBuilder implements CrawlDataProcessor { + + // private final int TERMS_TO_KEEP = 3; + + private int termsToKeep = 0; + + private String indexDir; + private PageRankMatrixH matrixH; + + public DocRankMatrixBuilder(String indexDir) { + this.indexDir = indexDir; + } + + private PageRankMatrixH buildMatrixH(IndexReader idxR) throws IOException { + + // only consider URLs that with fetched and parsed content + List allDocs = getProcessedDocs(idxR); + + PageRankMatrixH docMatrix = new PageRankMatrixH(allDocs.size()); + + for (int i = 0, n = allDocs.size(); i < n; i++) { + + for (int j = 0, k = allDocs.size(); j < k; j++) { + + double similarity = 0.0d; + + Document docX = idxR.document(i); + String xURL = docX.get("url"); + + if (i == j) { + + // Avoid shameless self-promotion ;-) + docMatrix.addLink(xURL, xURL, similarity); + + } else { + + TextDocumentTerms xDocumentTerms = new TextDocumentTerms( + docX.get("content")); + + Document docY = idxR.document(j); + TextDocumentTerms yDocumentTerms = new TextDocumentTerms( + docY.get("content")); + + similarity = getImportance(xDocumentTerms, yDocumentTerms); + + // add link from docX to docY + String yURL = docY.get("url"); + + docMatrix.addLink(xURL, yURL, similarity); + } + } + } + + docMatrix.calculate(); + + return docMatrix; + } + + /* + * Checks if the index entry belongs to the category that we want to use + * DocRank on. 
+ */ + private boolean eligibleForDocRank(String doctype) { + return ProcessedDocument.TYPE_MSWORD.equalsIgnoreCase(doctype); + } + + public PageRankMatrixH getH() { + return matrixH; + } + + /* + * Calculates importance of document Y in the context of document X + */ + private double getImportance(TextDocumentTerms xTerms, + TextDocumentTerms yTerms) { + + // sharedTerms is the intersection of the two sets + Set sharedTerms = xTerms.getTf().keySet(); + sharedTerms.retainAll(yTerms.getTf().keySet()); + + double sharedTermsSum = 0.0; + + // Notice that this way of assigning importance is not symmetric. + // That is, if you swap X with Y then you get a different value; + // unless the frequencies are equal, of course! + + double xF, yF; + for (String term : sharedTerms) { + + xF = xTerms.getTf().get(term).doubleValue(); + yF = yTerms.getTf().get(term).doubleValue(); + + sharedTermsSum += Math.round(Math.tanh(yF / xF)); + } + + return sharedTermsSum; + } + + /* + * Collects doc ids from the index for documents with matching doc type. + */ + private List getProcessedDocs(IndexReader idxR) throws IOException { + List docs = new ArrayList(); + for (int i = 0, n = idxR.maxDoc(); i < n; i++) { + if (idxR.hasDeletions() == false) { + Document doc = idxR.document(i); + if (eligibleForDocRank(doc.get("doctype"))) { + docs.add(i); + } + } + } + return docs; + + } + + // private Map buildFreqMap(String[] terms, int[] freq) { + // + // int topNTermsToKeep = (termsToKeep == 0)? 
TERMS_TO_KEEP: termsToKeep; + // + // Map freqMap = + // TermFreqMapUtils.getTopNTermFreqMap(terms, freq, topNTermsToKeep); + // + // return freqMap; + // } + + /** + * @return the termsToKeep + */ + public int getTermsToKeep() { + return termsToKeep; + } + + public void run() { + try { + DirectoryReader idxR = DirectoryReader.open(new SimpleFSDirectory( + new File(indexDir))); + matrixH = buildMatrixH(idxR); + } catch (Exception e) { + throw new RuntimeException("Error while building matrix: ", e); + } + } + + /** + * @param termsToKeep + * the termsToKeep to set + */ + public void setTermsToKeep(int termsToKeep) { + this.termsToKeep = termsToKeep; + } + +} diff --git a/src/org/yooreeka/algos/search/ranking/PageRankMatrixBuilder.java b/src/org/yooreeka/algos/search/ranking/PageRankMatrixBuilder.java new file mode 100644 index 0000000..0466e35 --- /dev/null +++ b/src/org/yooreeka/algos/search/ranking/PageRankMatrixBuilder.java @@ -0,0 +1,98 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.search.ranking; + +import java.util.List; +import java.util.Set; + +import org.yooreeka.util.internet.crawling.core.CrawlData; +import org.yooreeka.util.internet.crawling.core.CrawlDataProcessor; +import org.yooreeka.util.internet.crawling.db.KnownUrlDB; +import org.yooreeka.util.internet.crawling.db.PageLinkDB; +import org.yooreeka.util.internet.crawling.model.KnownUrlEntry; + +public class PageRankMatrixBuilder implements CrawlDataProcessor { + + // private static final Logger logger = + // Logger.getLogger(PageRankMatrixBuilder.class); + + private PageRankMatrixH matrixH; + private CrawlData crawlData; + + public PageRankMatrixBuilder(CrawlData crawlData) { + this.crawlData = crawlData; + } + + private PageRankMatrixH buildMatrixH(KnownUrlDB knownUrlDB, + PageLinkDB pageLinkDB) { + + // logger.info("starting calculation of matrix H..."); + + // only consider URLs that with fetched and parsed content + List allProcessedUrls = knownUrlDB + .findProcessedUrls(KnownUrlEntry.STATUS_PROCESSED_SUCCESS); + + PageRankMatrixH pageMatrix = new PageRankMatrixH( + allProcessedUrls.size()); + + for (String url : allProcessedUrls) { + + // register url here in case it has no outlinks. 
+ pageMatrix.addLink(url); + + Set pageOutlinks = pageLinkDB.getOutlinks(url); + + for (String outlink : pageOutlinks) { + + // only consider URLs with parsed content + if (knownUrlDB.isSuccessfullyProcessed(outlink)) { + pageMatrix.addLink(url, outlink); + } + } + } + + pageMatrix.calculate(); + + // logger.info("matrix H is ready. Matrix size: " + + // pageMatrix.getMatrix().length); + + return pageMatrix; + } + + public PageRankMatrixH getH() { + return matrixH; + } + + public void run() { + this.matrixH = buildMatrixH(crawlData.getKnownUrlsDB(), + crawlData.getPageLinkDB()); + } +} diff --git a/src/org/yooreeka/algos/search/ranking/PageRankMatrixH.java b/src/org/yooreeka/algos/search/ranking/PageRankMatrixH.java new file mode 100644 index 0000000..ed0e248 --- /dev/null +++ b/src/org/yooreeka/algos/search/ranking/PageRankMatrixH.java @@ -0,0 +1,184 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.search.ranking; + +import org.yooreeka.util.internet.crawling.util.ValueToIndexMapping; + +// Sub-stochastic matrix - some rows will have all zeros +public class PageRankMatrixH { + + private ValueToIndexMapping indexMapping = new ValueToIndexMapping(); + + double[][] matrix; + + private int numberOfPagesWithNoLinks = 0; + + public PageRankMatrixH(int nPages) { + matrix = new double[nPages][nPages]; + } + + /** + * Just associate page url with an index. Used for pages that have no + * outlinks. 
+ */ + public void addLink(String pageUrl) { + indexMapping.getIndex(pageUrl); + } + + public void addLink(String fromPageUrl, String toPageUrl) { + addLink(fromPageUrl, toPageUrl, 1); + } + + public void addLink(String fromPageUrl, String toPageUrl, double weight) { + int i = indexMapping.getIndex(fromPageUrl); + int j = indexMapping.getIndex(toPageUrl); + + try { + + matrix[i][j] = weight; + + } catch (ArrayIndexOutOfBoundsException e) { + System.out.println("fromPageUrl:" + fromPageUrl + ", toPageUrl: " + + toPageUrl); + } + } + + public void calculate() { + + for (int i = 0, n = matrix.length; i < n; i++) { + + double rowSum = 0; + + for (int j = 0, k = matrix.length; j < k; j++) { + + rowSum += matrix[i][j]; + } + + if (rowSum > 0) { + + for (int j = 0, k = matrix.length; j < k; j++) { + + if (matrix[i][j] > 0) { + + matrix[i][j] = matrix[i][j] / rowSum; + } + } + + } else { + + numberOfPagesWithNoLinks++; + } + } + } + + /** + * A dangling node corresponds to a web page that has no outlinks. + * These nodes result in a H row that has all its values equal to 0. 
+ */ + public int[] getDangling() { + + int n = getSize(); + int[] d = new int[n]; + + boolean foundOne = false; + + for (int i = 0; i < n; i++) { + + for (int j = 0; j < n; j++) { + + if (matrix[i][j] > 0) { + foundOne = true; + break; + } + } + + if (foundOne) { + d[i] = 0; + } else { + d[i] = 1; + } + + foundOne = false; + } + + return d; + } + + /** + * @return the indexMapping + */ + public ValueToIndexMapping getIndexMapping() { + return indexMapping; + } + + public double[][] getMatrix() { + return matrix; + } + + public int getNumberOfPagesWithNoLinks() { + return this.numberOfPagesWithNoLinks; + } + + public int getSize() { + return matrix.length; + } + + public void print() { + + StringBuilder txt = new StringBuilder("H Matrix\n\n"); + + for (int i = 0, n = matrix.length; i < n; i++) { + txt.append("Index: ").append(i); + txt.append(" --> "); + txt.append("Page ID: ").append(indexMapping.getValue(i)); + txt.append("\n"); + } + + txt.append("\n").append("\n"); + + for (int i = 0, n = matrix.length; i < n; i++) { + + for (int j = 0, k = matrix.length; j < k; j++) { + + txt.append(" "); + txt.append(matrix[i][j]); + + if (j < k - 1) { + txt.append(", "); + } else { + txt.append("\n"); + } + } + } + + System.out.println(txt.toString()); + } +} diff --git a/src/org/yooreeka/algos/search/ranking/Rank.java b/src/org/yooreeka/algos/search/ranking/Rank.java new file mode 100644 index 0000000..1f19d3b --- /dev/null +++ b/src/org/yooreeka/algos/search/ranking/Rank.java @@ -0,0 +1,294 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.search.ranking; + +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.yooreeka.config.YooreekaConfigurator; + +/** + * + * @author Babis Marmanis + * + */ +public abstract class Rank { + + private static final Logger LOG = Logger.getLogger(Rank.class.getName()); + + public static final double DEFAULT_ALPHA = 0.8; + public static final double DEFAULT_EPSILON = 0.001; + + /** + * This is the percentage of time that a random surfer follows the structure + * of the web. 
+ */ + private double alpha = DEFAULT_ALPHA; + + /** This is the error tolerance for convergence */ + private double epsilon = DEFAULT_EPSILON; + + double[] pR; + + public Rank() { + LOG.setLevel(YooreekaConfigurator.getLevel(Rank.class.getName())); + } + + public void build() throws Exception { + + // check the results + // getH().print(); + + findPageRank(alpha, epsilon); + } + + public void findPageRank(double alpha, double epsilon) { + + // auxiliary variable + PageRankMatrixH matrixH = getH(); + + // The H matrix has size nxn and the PageRank vector has size n + int n = matrixH.getSize(); + + // auxiliary variable + double inv_n = (double) 1 / n; + + // This is the actual nxn matrix of double values + double[][] H = matrixH.getMatrix(); + + // A dummy variable that holds our error -- + // arbitrarily set to an initial value of 1 + double error = 1; + + // This holds the values of the PageRank vector + pR = new double[n]; + + // This is a copy of the PageRank vector from the previous iteration + double[] tmpPR = new double[n]; + + // Set the initial values (ad hoc) + for (int i = 0; i < n; i++) { + pR[i] = inv_n; + } + + /* + * Book Section 2.3 -- Altering the H matrix: Dangling nodes + */ + double[][] dNodes = getDanglingNodeMatrix(); + + /** + * TODO: 2.5 -- Altering the G matrix: Teleportation (Book Section 2.3) + * + * The following code defines the contribution of the dangling nodes, + * i.e. jumping randomly on a page that is not connected with the one + * that our surfer is currently viewing + * + * Notice that it is the same for all pages. An interesting variation of + * the algorithm would introduce a "teleportation" contribution that + * relates the probability of an arbitrary transition to the degree of + * interest that a user has for the content of a page. + * + * Exercise: Could that be done? If so, how? What problems can you see + * with that variation? 
+ */ + double tNodes = (1 - alpha) * inv_n; + + // Replace the H matrix with the G matrix + for (int i = 0; i < n; i++) { + + for (int j = 0; j < n; j++) { + + H[i][j] = alpha * H[i][j] + dNodes[i][j] + tNodes; + } + } + + // Iterate until convergence. + + // A counter for our iterations + int k = 0; + + // We have found the PageRank values if our error is smaller than + // epsilon + while (error >= epsilon) { + + // Make a copy of the PageRank vector before we update it + for (int i = 0; i < n; i++) { + tmpPR[i] = pR[i]; + } + + double dummy = 0; + // Now we get the next point in the iteration + for (int i = 0; i < n; i++) { + + dummy = 0; + + for (int j = 0; j < n; j++) { + + dummy += tmpPR[j] * H[j][i]; + } + + pR[i] = dummy; + } + + // Get the error, so that we can check convergence + error = norm(pR, tmpPR); + + // DEBUG ONLY: Display the progress + if (LOG.getLevel() == Level.FINE) { + LOG.fine("\n Iteration: " + k + + ", PageRank convergence error: " + error); + for (int i = 0; i < n; i++) { + LOG.fine("Index: " + i + " --> PageRank: " + pR[i]); + } + } + // increase the value of the counter by one + k++; + } + + // Report the final values + + List allRankings = new ArrayList(); + for (int i = 0; i < n; i++) { + String url = matrixH.getIndexMapping().getValue(i); + RelevanceScore r = new RelevanceScore(url, pR[i]); + allRankings.add(r); + } + RelevanceScore.sort(allRankings); + LOG.info("\n______________ Calculation Results _______________\n"); + LOG.info("\nIterations: " + k); + LOG.info("\n____________________________________________________\n"); + for (RelevanceScore r : allRankings) { + LOG.info(MessageFormat.format( + "Page URL: %-42s --> Rank: %.15f\n", r.getId(), + r.getScore())); + } + LOG.info("\n____________________________________________________\n"); + } + + /** + * @return the alpha + */ + public double getAlpha() { + return alpha; + } + + /** + * TODO: 2.4 -- Altering the G matrix: Dangling nodes (Book Section 2.3) + * + * The following code 
defines the contribution of the dangling nodes, i.e. + * nodes that do not link to any other node. + * + * Notice that the 1/n contribution is arbitrary. Given that we have no + * other information about the random surfer's habits or preferences, the + * 1/n value is fair. However, an interesting variation would take into + * account some statistics related to the number of visits a page gets. + * + * Exercise: Change the algorithm, so that a dangling node's contribution + * depends on some page visit statistic. You can practice with a small set + * of pages and examine the effect on the ranking of the pages. + */ + private double[][] getDanglingNodeMatrix() { + + PageRankMatrixH matrixH = getH(); + + int n = matrixH.getSize(); + + double inv_n = (double) 1 / n; + + // The dangling node vector + int[] dangling = matrixH.getDangling(); + + double[][] dNodes = new double[n][n]; + + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + + if (dangling[i] == 0) { + dNodes[i][j] = 0; + } else { + dNodes[i][j] = alpha * inv_n; + } + } + } + + return dNodes; + } + + /** + * @return the epsilon + */ + public double getEpsilon() { + return epsilon; + } + + public abstract PageRankMatrixH getH(); + + /** + * @return the pR + */ + public double getPageRank(String url) { + + int i = getH().getIndexMapping().getIndex(url); + + return pR[i]; + } + + private double norm(double[] a, double[] b) { + + double norm = 0; + + int n = a.length; + + for (int i = 0; i < n; i++) { + norm += Math.abs(a[i] - b[i]); + } + return norm; + } + + /** + * @param alpha + * the alpha to set + */ + public void setAlpha(double alpha) { + this.alpha = alpha; + } + + /** + * @param epsilon + * the epsilon to set + */ + public void setEpsilon(double epsilon) { + this.epsilon = epsilon; + } +} diff --git a/src/org/yooreeka/algos/search/ranking/RelevanceScore.java b/src/org/yooreeka/algos/search/ranking/RelevanceScore.java new file mode 100644 index 0000000..2531d78 --- /dev/null +++ 
b/src/org/yooreeka/algos/search/ranking/RelevanceScore.java @@ -0,0 +1,78 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.search.ranking; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +/** + * Utility class that acts as a holder for double value and id of the object + * that this value corresponds. 
/**
 * Immutable holder that pairs the id of an object with the double value
 * (score) that corresponds to it.
 */
public class RelevanceScore {

    /**
     * Sorts list in descending order of score value.
     *
     * The ordering is delegated to Double.compare, which stays consistent
     * even if a score is NaN.
     *
     * @param values
     *            the list to sort in place
     */
    public static void sort(List<RelevanceScore> values) {
        Collections.sort(values, new Comparator<RelevanceScore>() {
            public int compare(RelevanceScore r1, RelevanceScore r2) {
                // arguments swapped on purpose: descending order
                return Double.compare(r2.getScore(), r1.getScore());
            }
        });
    }

    private final String id;

    private final double score;

    /**
     * @param id
     *            id of the object that the score belongs to
     * @param rank
     *            the score value
     */
    public RelevanceScore(String id, double rank) {
        this.id = id;
        this.score = rank;
    }

    public String getId() {
        return id;
    }

    public double getScore() {
        return score;
    }

}
/**
 * Static helpers for working with term --> frequency maps.
 */
public class TermFreqMapUtils {

    /**
     * Pairs up <code>keys[i]</code> with <code>values[i]</code>.
     *
     * @param keys
     *            the terms
     * @param values
     *            the frequencies; must have at least as many elements as
     *            <code>keys</code>
     * @return a new map with one entry per distinct key (for a repeated
     *         key the last value wins)
     */
    public static Map<String, Integer> buildTermFreqMap(String[] keys,
            int[] values) {
        int n = keys.length;
        Map<String, Integer> map = new HashMap<String, Integer>(n);

        for (int i = 0; i < n; i++) {
            map.put(keys[i], values[i]);
        }

        return map;
    }

    /**
     * Keeps only the <code>topNTerms</code> most frequent terms.
     *
     * @param terms
     *            the terms
     * @param frequencies
     *            the frequency of each term, in the same order
     * @param topNTerms
     *            maximum number of entries to keep; a value of zero or less
     *            yields an empty map
     * @return a new map with the most frequent terms and their frequencies
     */
    public static Map<String, Integer> getTopNTermFreqMap(String[] terms,
            int[] frequencies, int topNTerms) {

        Map<String, Integer> tfMap = TermFreqMapUtils.buildTermFreqMap(terms,
                frequencies);
        boolean descending = true;
        String[] sortedTerms = TermFreqMapUtils.sortTermsByFrequencies(tfMap,
                descending);
        int n = Math.min(sortedTerms.length, topNTerms);
        Map<String, Integer> topNTermFreqMap = new HashMap<String, Integer>();
        for (int i = 0; i < n; i++) {
            String key = sortedTerms[i];
            topNTermFreqMap.put(key, tfMap.get(key));
        }

        return topNTermFreqMap;

    }

    /**
     * Returns the keys of the map ordered by their frequency.
     *
     * @param tfMap
     *            term --> frequency map; values must not be null
     * @param descending
     *            true for highest frequency first, false for lowest first
     * @return a new array with all the terms of the map
     */
    public static String[] sortTermsByFrequencies(
            final Map<String, Integer> tfMap, final boolean descending) {

        String[] sortedTerms = tfMap.keySet().toArray(new String[tfMap.size()]);

        Arrays.sort(sortedTerms, new Comparator<String>() {

            public int compare(String key1, String key2) {
                Integer v1 = tfMap.get(key1);
                Integer v2 = tfMap.get(key2);
                // compareTo instead of subtraction: a difference of two
                // ints can overflow and report the wrong sign.
                return descending ? v2.compareTo(v1) : v1.compareTo(v2);
            }

        });

        return sortedTerms;
    }
}
+ * + */ +package org.yooreeka.algos.taxis.bayesian; + +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.logging.Logger; + +import org.yooreeka.algos.taxis.core.AttributeValue; +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.algos.taxis.core.intf.Attribute; +import org.yooreeka.algos.taxis.core.intf.Classifier; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; +import org.yooreeka.config.YooreekaConfigurator; + +/** + * A basic implementation of the Naive Bayes algorithm. + * + * The emphasis is on teaching the algorithm, not optimizing its performance. + * + * @author Babis Marmanis + */ +public class NaiveBayes implements Classifier { + + private static final Logger LOG = Logger.getLogger(NaiveBayes.class.getName()); + + /** + * You can use the NaiveBayes classifier in many occasions So, let's give it + * a name to identify the instance of the Classifier. + */ + private String name; + + /** + * Every classifier needs a training set. Notice that both the name of the + * classifier and its training set are intentionally set during the + * Construction phase. + * + * Once you created an instance of the NaiveBayes classifier you cannot set + * its TrainingSet but you can always get the reference to it and add + * instances. + */ + protected TrainingSet tSet; + + /** + * These are the probabilities for each concept + */ + protected Map conceptPriors; + + /** + * This structure contains the fundamental calculation elements of the Naive + * Bayes method, i.e. the conditional probabilities. + */ + protected Map> p; + + /** + * These are the attribute indices that we should consider for training + */ + protected ArrayList attributeList; + + /** An auxiliary variable */ + protected boolean verbose = false; + + /** + * The only constructor for this classifier takes a name and a training set + * as arguments. 
+ * + * @param name + * the name of the classifier + * @param set + * the training set for this classifier + */ + public NaiveBayes(String name, TrainingSet set) { + + LOG.setLevel(YooreekaConfigurator.getLevel(NaiveBayes.class.getName())); + + this.name = name; + tSet = set; + + conceptPriors = new HashMap(tSet.getNumberOfConcepts()); + verbose = false; + } + + /** + * Strictly speaking these are not the prior probabilities but just the + * counts. However, we want to reuse these counts and the priors can be + * obtained by a simple division. + */ + private void calculateConceptPriors() { + + for (Concept c : tSet.getConceptSet()) { + + // Calculate the priors for the concepts + int totalConceptCount = 0; + + for (Instance i : tSet.getInstances().values()) { + + if (i.getConcept().equals(c)) { + totalConceptCount++; + } + } + + conceptPriors.put(c, new Double(totalConceptCount)); + } + } + + protected void calculateConditionalProbabilities() { + + p = new HashMap>(); + + for (Instance i : tSet.getInstances().values()) { + + for (Attribute a : i.getAtrributes()) { + + if (a != null && attributeList.contains(a.getName())) { + + if (p.get(i.getConcept()) == null) { + + p.put(i.getConcept(), + new HashMap()); + + } + + Map aMap = p.get(i.getConcept()); + AttributeValue aV = aMap.get(a); + if (aV == null) { + + aV = new AttributeValue(a.getValue()); + aMap.put(a, aV); + + } else { + aV.count(); + } + } + } + } + } + + public Concept classify(Instance instance) { + + Concept bestConcept = null; + double bestP = 0.0; + + if (tSet == null || tSet.getConceptSet().size() == 0) { + throw new IllegalStateException("You have to train classifier first."); + } + + LOG.finest("\n*** Classifying instance: " + instance.toString() + "\n"); + + for (Concept c : tSet.getConceptSet()) { + + double p = getProbability(c, instance); + + LOG.fine(MessageFormat.format("P(%s|%s) = %.15f\n", c.getName(), instance.toString(), p)); + + if (p >= bestP) { + bestConcept = c; + bestP = p; + } + } + 
return bestConcept; + } + + /** + * @return the name + */ + public String getName() { + return name; + } + + public double getProbability(Concept c) { + Double trInstanceCount = conceptPriors.get(c); + if (trInstanceCount == null) { + trInstanceCount = 0.0; + } + return trInstanceCount / tSet.getSize(); + } + + /** + * This method calculates the posterior probability that we deal with + * concept c provided that we observed instance i. + * This is the application of Bayes theorem. + * + * @param c + * is a probable concept for instance i + * @param i + * is the observed instance + * @return posterior probability of c given instance + * i + */ + public double getProbability(Concept c, Instance i) { + + double cP = 0; + + if (tSet.getConceptSet().contains(c)) { + + cP = (getProbability(i, c) * getProbability(c)) / getProbability(i); + + } else { + // We have never seen this concept before + // assign to it a "reasonable" value + cP = 1 / (tSet.getNumberOfConcepts() + 1.0); + } + + return cP; + } + + /** + * This method calculates the denumerator of Bayes theorem + * + * @param Instance i + * @return the probability of observing Instance i + */ + public double getProbability(Instance i) { + + double cP = 0; + + for (Concept c : getTset().getConceptSet()) { + + cP += getProbability(i, c) * getProbability(c); + } + return (cP == 0) ? (double) 1 / tSet.getSize() : cP; + } + + public double getProbability(Instance i, Concept c) { + + double cP = 1; + + for (Attribute a : i.getAtrributes()) { + + if (a != null && attributeList.contains(a.getName())) { + + Map aMap = p.get(c); + AttributeValue aV = aMap.get(a); + if (aV == null) { + // the specific attribute value is not present for the + // current concept. + // Can you justify the following estimate? + // Can you think of a better choice? + cP *= ((double) 1 / (tSet.getSize() + 1)); + } else { + cP *= (aV.getCount() / conceptPriors.get(c)); + } + } + } + + return (cP == 1) ? 
(double) 1 / tSet.getNumberOfConcepts() : cP; + } + + /** + * @return the tSet + */ + public TrainingSet getTset() { + return tSet; + } + + /** + * Training simply sets the probability for each concept + * + */ + public boolean train() { + + long t0 = System.currentTimeMillis(); + + boolean hasTrained = false; + + if (attributeList == null || attributeList.size() == 0) { + + String msg = "Can't train the classifier without specifying the attributes"+ + " for training!\n"+ + "Use the method --> trainOnAttribute(Attribute a)"; + throw new IllegalStateException(msg); + + } else { + + calculateConceptPriors(); + + calculateConditionalProbabilities(); + + hasTrained = true; + } + + LOG.fine(" Naive Bayes training completed in "); + LOG.fine((System.currentTimeMillis() - t0) + " (ms)"); + + return hasTrained; + } + + public void trainOnAttribute(String aName) { + + if (attributeList == null) { + attributeList = new ArrayList(); + } + + attributeList.add(aName); + } +} diff --git a/src/org/yooreeka/algos/taxis/boosting/BoostingARCX4Classifier.java b/src/org/yooreeka/algos/taxis/boosting/BoostingARCX4Classifier.java new file mode 100644 index 0000000..e19cc93 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/boosting/BoostingARCX4Classifier.java @@ -0,0 +1,190 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.boosting; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Map; + +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.algos.taxis.core.intf.Classifier; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; +import org.yooreeka.algos.taxis.ensemble.ClassifierEnsemble; +import org.yooreeka.algos.taxis.ensemble.ConceptMajorityVoter; + +public abstract class BoostingARCX4Classifier extends ClassifierEnsemble { + + private TrainingSet originalTSet; + + private int classifierPopulation = 2; + + public BoostingARCX4Classifier(String name, TrainingSet tSet) { + super(name); + this.originalTSet = tSet; + } + + public TrainingSet buildTSet(TrainingSet tSet, double[] w) { + + WeightBasedRandom wRnd = new WeightBasedRandom(w); + + int n = w.length; + + Instance[] sample = new Instance[n]; + + Map instances = tSet.getInstances(); + + for (int i = 0; i < n; i++) { + int instanceIndex = wRnd.nextInt(); + sample[i] = instances.get(instanceIndex); + } + + return new TrainingSet(sample); + } + + @Override + public Concept classify(Instance instance) { + + ConceptMajorityVoter voter = new ConceptMajorityVoter(instance); + + for (Classifier baseClassifier : baseClassifiers) { + + Concept c = baseClassifier.classify(instance); + + voter.addVote(c); + } + + if (verbose) { + voter.print(); + } + + return voter.getWinner(); + } + + public abstract Classifier getClassifierForTraining(TrainingSet set); + + /** + * @return the classifierPopulation + */ + public int getClassifierPopulation() { + return classifierPopulation; + } + + public boolean isVerbose() { + return verbose; + } + + /** + * @param classifierPopulation + * the classifierPopulation to set + */ + public void setClassifierPopulation(int classifierPopulation) { + this.classifierPopulation = classifierPopulation; + } + + @Override + public void setVerbose(boolean verbose) { + this.verbose = 
verbose; + } + + @Override + public boolean train() { + + baseClassifiers = new ArrayList(); + + int size = originalTSet.getSize(); + + /* + * Weights that define sample selection + */ + double[] w = new double[size]; + + /* + * Number of times instance was misclassified by classifiers that are + * currently in ensemble. + */ + int[] m = new int[size]; + + double w0 = 1.0 / size; + + Arrays.fill(w, w0); + Arrays.fill(m, 0); + + for (int i = 0; i < classifierPopulation; i++) { + if (verbose) { + System.out.println("Instance weights: " + Arrays.toString(w)); + System.out.println("Instance misclassifications: " + + Arrays.toString(m)); + } + + TrainingSet tSet = buildTSet(originalTSet, w); + + Classifier baseClassifier = getClassifierForTraining(tSet); + + baseClassifier.train(); + + updateWeights(originalTSet, w, m, baseClassifier); + + baseClassifiers.add(baseClassifier); + } + + return true; + } + + public void updateWeights(TrainingSet tSet, double[] w, int[] m, + Classifier baseClassifier) { + + int n = w.length; + + // update misclassification counts with results from latest classifier + for (int i = 0; i < n; i++) { + Instance instance = tSet.getInstance(i); + Concept actualConcept = baseClassifier.classify(instance); + Concept expectedConcept = instance.getConcept(); + if (actualConcept == null + || !(actualConcept.getName().equals(expectedConcept + .getName()))) { + m[i]++; + } + } + + // update weights + double sum = 0.0; + for (int i = 0; i < n; i++) { + sum += (1.0 + Math.pow(m[i], 4)); + } + + for (int i = 0; i < n; i++) { + w[i] = (1.0 + Math.pow(m[i], 4)) / sum; + } + + } +} diff --git a/src/org/yooreeka/algos/taxis/boosting/WeightBasedRandom.java b/src/org/yooreeka/algos/taxis/boosting/WeightBasedRandom.java new file mode 100644 index 0000000..c047577 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/boosting/WeightBasedRandom.java @@ -0,0 +1,80 @@ +/* + * ________________________________________________________________________________________ + * + * Y 
/**
 * Pseudorandom generator of array indexes, where index {@code i} is drawn
 * with probability proportional to {@code w[i]}.
 */
public class WeightBasedRandom {

    /** Selection weights; the array is referenced, not copied. */
    private final double[] w;

    private final Random rnd;

    /**
     * Creates a new pseudorandom number generator. Distribution and range of
     * numbers is defined by array of weights.
     *
     * @param w
     *            weights that define distribution. All weights should be
     *            non-negative and add up to 1.
     */
    public WeightBasedRandom(double[] w) {
        this(w, new Random());
    }

    /**
     * Same as {@link #WeightBasedRandom(double[])} but with a caller-supplied
     * source of randomness, e.g. a seeded {@link Random} for reproducible
     * sampling.
     *
     * @param w
     *            weights that define distribution
     * @param rnd
     *            source of uniformly distributed doubles
     * @throws NullPointerException
     *             if either argument is {@code null}
     */
    public WeightBasedRandom(double[] w, Random rnd) {
        if (w == null) {
            throw new NullPointerException("w");
        }
        if (rnd == null) {
            throw new NullPointerException("rnd");
        }
        this.w = w;
        this.rnd = rnd;
    }

    /**
     * Returns the next pseudorandom index in {@code [0, w.length)},
     * distributed according to the weights.
     *
     * Indexes whose weight is not positive are never selected as long as at
     * least one weight is positive. If no weight is positive (including an
     * empty array) the result is {@code 0}.
     *
     * @return the selected index
     */
    public int nextInt() {

        // Uniformly distributed in [0.0, 1.0).
        double x = rnd.nextDouble();

        double cdf = 0.0;
        int selected = 0;

        for (int i = 0, n = w.length; i < n; i++) {
            if (w[i] > 0.0) {
                cdf += w[i];
                // Remember the last usable index: if rounding leaves the
                // total slightly below 1, x may exceed the final cdf.
                selected = i;
                // Strict comparison so that x == 0.0 cannot pick an index
                // before any probability mass has been accumulated.
                if (cdf > x) {
                    break;
                }
            }
        }

        return selected;
    }

}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.core; + +/** + * @author Babis Marmanis + * + */ +public class AttributeValue { + + private Object value; + + private int count; + + public AttributeValue(Object value) { + this.value = value; + count = 1; + } + + public void count() { + count++; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + + final AttributeValue other = (AttributeValue) obj; + + if (obj == null) { + return false; + } + + if (getClass() != obj.getClass()) { + return false; + } + + if (this == obj) { + return true; + } + + if (value == null) { + + if (other.value != null) { + return false; + } + + } else if (!value.equals(other.value)) { + + return false; + } + + return true; + } + + /** + * @return the count + */ + public int getCount() { + return count; + } + + // OVERRIDEN METHODS + + /** + * @return the value + */ + public Object getValue() { + return value; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return "Attribute value: " + value + " was found " + count + "times"; + } +} diff --git a/src/org/yooreeka/algos/taxis/core/BaseConcept.java b/src/org/yooreeka/algos/taxis/core/BaseConcept.java new file mode 100644 index 0000000..5b1e263 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/core/BaseConcept.java @@ -0,0 +1,124 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, 
machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.core; + +import java.util.ArrayList; + +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; + +/** + * @author Babis Marmanis + * + */ +public class BaseConcept implements Concept { + + private String name; + private BaseConcept parent; + + private ArrayList instances = new ArrayList(); + + public BaseConcept(String name) { + this.name = name; + } + + public BaseConcept(String name, BaseConcept parent) { + this.name = name; + this.parent = parent; + } + + public synchronized void addInstance(Instance i) { + instances.add(i); + } + + @Override + public boolean equals(Object obj) { + + final BaseConcept other = (BaseConcept) obj; + + if (this == obj) { + return true; + } + + if (!(obj instanceof BaseConcept)) { + return false; + } + + if (name == null) { + if (other.name != null) { + return false; + } + } else if (!name.equals(other.name)) { + return false; + } + + if (parent == null) { + if (other.parent != null) { + return false; + } + } else if (!parent.equals(other.parent)) { + return false; + } + + return true; + } + + public Instance[] getInstances() { + return instances.toArray(new Instance[instances.size()]); + } + + public String getName() { + return name; + } + + public Concept getParent() { + return parent; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((name == null) ? 0 : name.hashCode()); + result = prime * result + ((parent == null) ? 
0 : parent.hashCode()); + return result; + } + + public void setParent(BaseConcept parent) { + this.parent = parent; + } + + @Override + public String toString() { + return name; + } + +} diff --git a/src/org/yooreeka/algos/taxis/core/BaseInstance.java b/src/org/yooreeka/algos/taxis/core/BaseInstance.java new file mode 100644 index 0000000..796b2fc --- /dev/null +++ b/src/org/yooreeka/algos/taxis/core/BaseInstance.java @@ -0,0 +1,239 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.core; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; + +import org.yooreeka.algos.taxis.core.intf.Attribute; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; + +/** + * @author Babis Marmanis + * + */ +public class BaseInstance implements Instance { + + public static BaseInstance createInstance(String conceptName, + String[] attrNames, String[] attrValues) { + int n = attrNames.length; + StringAttribute[] attributes = new StringAttribute[n]; + for (int i = 0; i < n; i++) { + attributes[i] = new StringAttribute(attrNames[i], attrValues[i]); + } + + Concept concept = new BaseConcept(conceptName); + return new BaseInstance(concept, attributes); + } + protected Concept concept; + + protected StringAttribute[] attributes; + + public BaseInstance() { + // DO NOTHING + } + + /** + * @param concept + * @param attributes + */ + public BaseInstance(Concept concept, StringAttribute[] attributes) { + this.concept = concept; + this.attributes = attributes; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + + final BaseInstance other = (BaseInstance) obj; + + // Check the basics first + if (this == obj) { + return true; + } + + if ((getClass() != obj.getClass()) || obj == null) { + return false; + } + + // Check the concept + if (concept == null) { + if (other.concept != null) { + return false; + } + } else { + if (!concept.equals(other.concept)) { + return false; + } + } + + // Finally check all the attributes + for (int i = 0; i < attributes.length; i++) { + if (attributes[i] == null) { + if (other.attributes[i] != null) { + return false; + } + } else { + if (!attributes[i].getName().equals( + 
other.attributes[i].getName())) { + return false; + } else { + if (!attributes[i].getValue().equals( + other.attributes[i].getValue())) { + return false; + } + } + } + } + return true; + } + + public Attribute[] getAtrributes() { + return attributes; + } + + public StringAttribute getAttribute(int i) { + return attributes[i]; + } + + public Attribute getAttributeByName(String attrName) { + Attribute matchedAttribute = null; + + if (attributes != null) { + for (Attribute a : attributes) { + if (attrName.equalsIgnoreCase(a.getName())) { + matchedAttribute = a; + break; + } + } + } + + return matchedAttribute; + } + + public Concept getConcept() { + return concept; + } + + public BaseInstance[] load(BufferedReader bR) throws IOException { + + ArrayList baseInstances = new ArrayList(); + + String line; + boolean hasMoreLines = true; + + while (hasMoreLines) { + + line = bR.readLine(); + + if (line == null) { + + hasMoreLines = false; + + } else { + + String[] data = line.split(","); + + int n = data.length; + + StringAttribute[] attributes = new StringAttribute[n - 1]; + + for (int i = 0; i < n - 1; i++) { + attributes[i] = new StringAttribute("a-" + i, data[i]); + } + + // The last value is assumed to be the class/concept + + baseInstances.add(new BaseInstance( + new BaseConcept(data[n - 1]), attributes)); + } + } + + return baseInstances.toArray(new BaseInstance[baseInstances.size()]); + } + + /** + * This method loads the training instances for the user clicks. 
+ * + * @param fileName + * the name of the file that contains the user clicks + * @throws IOException + */ + public BaseInstance[] load(String fileName) throws IOException { + + File file = new File(fileName); + FileReader fReader = new FileReader(file); + BufferedReader bR = new BufferedReader(fReader); + + return load(bR); + } + + /** + * Pretty print the information for this Instance + */ + public void print() { + + if (attributes != null) { + for (Attribute a : attributes) { + + if (a == null || a.getName() == null) { + System.out.print(" - "); + } else { + if (a.getValue() == null) { + System.out.print(" - "); + } else { + System.out.print(" - " + a.getName() + " = " + + a.getValue()); + } + } + } + } + + System.out.println(" --> " + getConcept().getName()); + } + + /** + * @param concept + * the concept to set + */ + public void setConcept(Concept concept) { + this.concept = concept; + } +} diff --git a/src/org/yooreeka/algos/taxis/core/DoubleAttribute.java b/src/org/yooreeka/algos/taxis/core/DoubleAttribute.java new file mode 100644 index 0000000..6ce39d0 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/core/DoubleAttribute.java @@ -0,0 +1,96 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.core; + +import org.yooreeka.algos.taxis.core.intf.Attribute; + +public class DoubleAttribute implements Attribute { + + public static final Double DEFAULT_VALUE = 0.0; + + String name; + Double value; + + public DoubleAttribute(String name, Double value) { + this.name = name; + this.value = value; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final DoubleAttribute other = (DoubleAttribute) obj; + if (name == null) { + if (other.name != null) + return false; + } else if (!name.equals(other.name)) + return false; + if (value == null) { + if (other.value != null) + return false; + } else if (!value.equals(other.value)) + return false; + return true; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch2.data.Attribute#getName() + */ + public String getName() { + return name; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch2.data.Attribute#getValue() + */ + public Object getValue() { + return value; + } + + @Override + public int hashCode() { + final int 
prime = 31; + int result = 1; + result = prime * result + ((name == null) ? 0 : name.hashCode()); + result = prime * result + ((value == null) ? 0 : value.hashCode()); + return result; + } + +} diff --git a/src/org/yooreeka/algos/taxis/core/StringAttribute.java b/src/org/yooreeka/algos/taxis/core/StringAttribute.java new file mode 100644 index 0000000..2750ef0 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/core/StringAttribute.java @@ -0,0 +1,108 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.core; + +import org.yooreeka.algos.taxis.core.intf.Attribute; + +/** + * @author Babis Marmanis + * + */ +public class StringAttribute implements Attribute { + + public static final String DEFAULT_VALUE = "*"; + + String name; + String value; + + public StringAttribute(String name, String value) { + this.name = name; + this.value = value; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final StringAttribute other = (StringAttribute) obj; + if (name == null) { + if (other.name != null) + return false; + } else if (!name.equals(other.name)) + return false; + if (value == null) { + if (other.value != null) + return false; + } else if (!value.equals(other.value)) + return false; + return true; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch2.data.Attribute#getName() + */ + public String getName() { + return name; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch2.data.Attribute#getValue() + */ + public Object getValue() { + return value; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((name == null) ? 0 : name.hashCode()); + result = prime * result + ((value == null) ? 
0 : value.hashCode()); + return result; + } + + /** + * @param value + * the value to set + */ + public void setValue(String value) { + this.value = value; + } + +} diff --git a/src/org/yooreeka/algos/taxis/core/TrainingSet.java b/src/org/yooreeka/algos/taxis/core/TrainingSet.java new file mode 100644 index 0000000..b678ff8 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/core/TrainingSet.java @@ -0,0 +1,173 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.core; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.HashSet; + +import org.yooreeka.algos.taxis.core.intf.Attribute; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; + +public class TrainingSet implements Serializable { + + /** + * A unique ID, just in case that we want to serialize our training + * instanceSet. + */ + private static final long serialVersionUID = 4754213130190809633L; + + /** + * @return the serialVersionUID + */ + public static long getSerialVersionUID() { + return serialVersionUID; + } + + private boolean verbose = false; + /** + * TODO: 5.x -- Training set management (Book Section 2.4.1 and 5.7) + * + * For large training sets, it may be beneficial to serialize them and store + * them because loading a large training instanceSet is computationally + * expensive. + * + * How would you go about merging two training sets? What problems do you + * foresee? 
+ */ + private HashMap instanceSet; + private HashSet conceptSet; + + private HashSet attributeNameSet; + + public TrainingSet() { + + instanceSet = new HashMap(); + } + + public TrainingSet(Instance[] instances) { + + int instanceId = 0; + + instanceSet = new HashMap(); + conceptSet = new HashSet(); + attributeNameSet = new HashSet(); + + Concept c; + for (Instance i : instances) { + + // System.out.println("Instance Added: "); + // i.print(); + + instanceSet.put(instanceId, i); + + c = i.getConcept(); + if (!conceptSet.contains(c)) { + + conceptSet.add(c); + } + + for (Attribute a : i.getAtrributes()) { + if (a != null) { + attributeNameSet.add(a.getName()); + } + } + + instanceId++; + } + + if (verbose) { + System.out + .println("-------------------------------------------------------------"); + System.out.print("Loaded " + getSize() + + " instances that belong into "); + System.out.println(this.getNumberOfConcepts() + " concepts"); + System.out + .println("-------------------------------------------------------------"); + } + } + + public HashSet getAttributeNameSet() { + return attributeNameSet; + } + + /** + * @return the conceptSet + */ + public HashSet getConceptSet() { + return conceptSet; + } + + public Instance getInstance(int index) { + return instanceSet.get(index); + } + + /** + * @return the instanceSet + */ + public HashMap getInstances() { + return instanceSet; + } + + public int getNumberOfConcepts() { + return conceptSet.size(); + } + + /** + * @return the size of the instanceSet + */ + public int getSize() { + return instanceSet.size(); + } + + /** + * @return the verbose + */ + public boolean isVerbose() { + return verbose; + } + + public void print() { + + for (Instance i : instanceSet.values()) { + i.print(); + } + } + + /** + * @param verbose + * the verbose to set + */ + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } +} diff --git a/src/org/yooreeka/algos/taxis/core/intf/Attribute.java 
b/src/org/yooreeka/algos/taxis/core/intf/Attribute.java new file mode 100644 index 0000000..d77d5f4 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/core/intf/Attribute.java @@ -0,0 +1,42 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.core.intf; + +/** + * @author Babis Marmanis + * + */ +public interface Attribute { + + public String getName(); + + public Object getValue(); +} diff --git a/src/org/yooreeka/algos/taxis/core/intf/Classifier.java b/src/org/yooreeka/algos/taxis/core/intf/Classifier.java new file mode 100644 index 0000000..238ffad --- /dev/null +++ b/src/org/yooreeka/algos/taxis/core/intf/Classifier.java @@ -0,0 +1,51 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.core.intf; + +/** + * Every classifier must be: + *
    + *
  • able to load a TrainingSet, and
  • + *
  • able to classify an Instance
  • + *
+ * + * This interface reflects these two elementary methods. + * + * @author Babis Marmanis + */ +public interface Classifier { + + public Concept classify(Instance instance); + + public String getName(); + + public boolean train(); +} diff --git a/src/org/yooreeka/algos/taxis/core/intf/Concept.java b/src/org/yooreeka/algos/taxis/core/intf/Concept.java new file mode 100644 index 0000000..6db245b --- /dev/null +++ b/src/org/yooreeka/algos/taxis/core/intf/Concept.java @@ -0,0 +1,44 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.core.intf; + +/** + * @author Babis Marmanis + * + */ +public interface Concept { + + public Instance[] getInstances(); + + public String getName(); + + public Concept getParent(); +} diff --git a/src/org/yooreeka/algos/taxis/core/intf/Instance.java b/src/org/yooreeka/algos/taxis/core/intf/Instance.java new file mode 100644 index 0000000..4917912 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/core/intf/Instance.java @@ -0,0 +1,46 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.core.intf; + +/** + * @author Babis Marmanis + * + */ +public interface Instance { + + public Attribute[] getAtrributes(); + + public Attribute getAttributeByName(String attrName); + + public Concept getConcept(); + + public void print(); +} diff --git a/src/org/yooreeka/algos/taxis/ensemble/ClassifierEnsemble.java b/src/org/yooreeka/algos/taxis/ensemble/ClassifierEnsemble.java new file mode 100644 index 0000000..3fdc717 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/ensemble/ClassifierEnsemble.java @@ -0,0 +1,106 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.ensemble; + +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.algos.taxis.core.intf.Classifier; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; + +/** + * Base implementation for bagging classifier. + */ +public abstract class ClassifierEnsemble implements Classifier { + + public enum ClassifierMemberType { + NEURAL_NETWORK, DECISION_TREE, NAIVE_BAYES + } + + protected String name; + + protected boolean verbose = false; + + protected List baseClassifiers = new ArrayList(); + + public ClassifierEnsemble(String name) { + this.name = name; + } + + public void addMember(Classifier baseClassifier) { + baseClassifiers.add(baseClassifier); + } + + public Concept classify(Instance instance) { + + ConceptMajorityVoter voter = new ConceptMajorityVoter(instance); + + for (Classifier baseClassifier : baseClassifiers) { + + Concept c = baseClassifier.classify(instance); + + voter.addVote(c); + } + + if (verbose) { + voter.print(); + } + + return voter.getWinner(); + } + + public int getEnsemblePopulation() { + return baseClassifiers.size(); + } + + public String getName() { + return name; + } + + public void removeMember(Classifier c) { + baseClassifiers.remove(c); + } + + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + + public boolean 
train() { + + for (Classifier c : baseClassifiers) { + // training base classifier + c.train(); + } + + return true; + } +} diff --git a/src/org/yooreeka/algos/taxis/ensemble/ConceptMajorityVoter.java b/src/org/yooreeka/algos/taxis/ensemble/ConceptMajorityVoter.java new file mode 100644 index 0000000..3cc3bcf --- /dev/null +++ b/src/org/yooreeka/algos/taxis/ensemble/ConceptMajorityVoter.java @@ -0,0 +1,87 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.ensemble; + +import java.util.HashMap; +import java.util.Map; + +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; + +public class ConceptMajorityVoter { + + private Map votes = new HashMap(); + + private Instance i; + + public ConceptMajorityVoter(Instance i) { + this.i = i; + } + + public void addVote(Concept c) { + + Integer conceptVoteCount = votes.get(c); + + if (conceptVoteCount == null) { + conceptVoteCount = new Integer(1); + } else { + conceptVoteCount = conceptVoteCount + 1; + + } + votes.put(c, conceptVoteCount); + } + + public Concept getWinner() { + + int winnerVoteCount = 0; + Concept winnerConcept = null; + + for (Map.Entry e : votes.entrySet()) { + if (e.getValue() > winnerVoteCount) { + winnerConcept = e.getKey(); + winnerVoteCount = e.getValue(); + } + } + + return winnerConcept; + } + + public int getWinnerVoteCount() { + Concept winner = getWinner(); + return votes.get(winner); + } + + public void print() { + System.out.println("Votes for instace [" + i + "] : " + votes); + System.out.println("Winner concept: " + getWinner()); + } + +} diff --git a/src/org/yooreeka/algos/taxis/evaluation/ClassifierResults.java b/src/org/yooreeka/algos/taxis/evaluation/ClassifierResults.java new file mode 100644 index 0000000..cf4053b --- /dev/null +++ b/src/org/yooreeka/algos/taxis/evaluation/ClassifierResults.java @@ -0,0 +1,70 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Per-instance correctness record for one classifier: a boolean per test
 * instance plus a running count of the instances marked correct.
 */
public class ClassifierResults {
    private final String classifierId;
    private final boolean[] results;
    private int nCorrect;

    /**
     * @param classifierId identifier of the classifier these results belong to
     * @param n number of test instances; every result starts as
     *        {@code false}
     */
    public ClassifierResults(String classifierId, int n) {
        this.classifierId = classifierId;
        this.results = new boolean[n];
        this.nCorrect = 0;
    }

    /**
     * @return the fraction of instances marked correct; {@code NaN} when the
     *         record holds zero instances (0 / 0 in floating point)
     */
    public double getAccuracy() {
        return (double) nCorrect / (double) results.length;
    }

    public String getClassifierId() {
        return classifierId;
    }

    /**
     * @return the number of test instances
     */
    public int getN() {
        return results.length;
    }

    /**
     * @return the number of instances currently marked correct
     */
    public int getNCorrect() {
        return nCorrect;
    }

    /**
     * @param i index of the test instance
     * @return whether instance {@code i} is marked as correctly classified
     */
    public boolean getResult(int i) {
        return results[i];
    }

    /**
     * Records the outcome for instance {@code i} and keeps the correct-count
     * in sync with the stored flags.
     *
     * @param i index of the test instance
     * @param value {@code true} when the instance was classified correctly
     */
    public void setResult(int i, boolean value) {
        boolean previous = results[i];
        results[i] = value;

        // Only a change of state moves the counter, so writing the same index
        // twice or flipping it back to false cannot skew the accuracy.
        if (value && !previous) {
            nCorrect++;
        } else if (!value && previous) {
            nCorrect--;
        }
    }
}
See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.evaluation; + +public class CochransQTest extends Test { + + private double q = 0.0; + + private ClassifierResults c1; + private ClassifierResults c2; + private ClassifierResults c3; + + private double L = 3.0; + + public CochransQTest(ClassifierResults c1, ClassifierResults c2, + ClassifierResults c3) { + this.c1 = c1; + this.c2 = c2; + this.c3 = c3; + + setStatisticSymbol("Q"); + + // Confidence interval: 0.05 + // Null hypothesis: classifiers are the same + // Degrees of freedom L - 1 = 2 + // Rejected if q > 5.991 + setThreshold(5.991); + + calculate(); + } + + @Override + protected void calculate() { + int n = c1.getN(); + + /* + * Total number of correct classifications among all classifiers. 
+ */ + double T = calculateT(); + + double T2 = 0.0; + + for (int i = 0; i < n; i++) { + double x = 0.0; + + if (c1.getResult(i)) { + x++; + } + if (c2.getResult(i)) { + x++; + } + if (c3.getResult(i)) { + x++; + } + + T2 += (x * x); + } + + double sum = 0.0; + sum = (double) c1.getNCorrect() * c1.getNCorrect() + + (double) c2.getNCorrect() * c2.getNCorrect() + + (double) c3.getNCorrect() * c3.getNCorrect(); + + double a = L * sum; + + q = (L - 1) * (a - T * T) / (L * T - T2); + + setStatisticValue(q); + } + + /* + * Calculates total number of correct classifications among all classifiers. + */ + private int calculateT() { + return c1.getNCorrect() + c2.getNCorrect() + c3.getNCorrect(); + } + + // public boolean different() { + // return isDifferent(q,getThreshold()); + // } + // + @Override + public void evaluate() { + print("_____________________________________________________"); + + print("Evaluating classifiers " + c1.getClassifierId() + ", " + + c2.getClassifierId() + ", " + c3.getClassifierId() + ":"); + print("_____________________________________________________"); + print(c1.getClassifierId() + " accuracy: " + c1.getAccuracy()); + print(c2.getClassifierId() + " accuracy: " + c2.getAccuracy()); + print(c3.getClassifierId() + " accuracy: " + c3.getAccuracy()); + print("_____________________________________________________"); + + print("Confidence Interval : 0.05"); + print("Degrees of Freedom : 2"); + print("Statistic threshold (chi-square): 5.991"); + + // printResult("Q",q,different()); + } +} diff --git a/src/org/yooreeka/algos/taxis/evaluation/Diff2PropTest.java b/src/org/yooreeka/algos/taxis/evaluation/Diff2PropTest.java new file mode 100644 index 0000000..91bedd7 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/evaluation/Diff2PropTest.java @@ -0,0 +1,84 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and 
mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.evaluation; + +public class Diff2PropTest extends Test { + + double z = 0.0; + + private ClassifierResults c1; + private ClassifierResults c2; + + public Diff2PropTest(ClassifierResults c1, ClassifierResults c2) { + this.c1 = c1; + this.c2 = c2; + + setStatisticSymbol("|z|"); + + /* + * Confidence interval: 0.05 Null hypothesis - classifiers are the same + * Null hypothesis is rejected if |z| > 1.96 + */ + setThreshold(1.96); + + calculate(); + } + + @Override + protected void calculate() { + + double n = c1.getN(); + double p = 0.5 * (c1.getAccuracy() + c2.getAccuracy()); + double a = c1.getAccuracy() - c2.getAccuracy(); + double b = (2.0 * p * (1 - p)) / n; + z = a / Math.sqrt(b); + + setStatisticValue(Math.abs(z)); + } + + @Override + public void evaluate() { + + print("_____________________________________________________"); + print("Evaluating classifiers " + c1.getClassifierId() + " and " + + c2.getClassifierId() + ":"); + + print("_____________________________________________________"); + print(c1.getClassifierId() + " accuracy: " + c1.getAccuracy()); + print(c2.getClassifierId() + " accuracy: " + c2.getAccuracy()); + print("_____________________________________________________"); + + print("Confidence Interval : 0.05"); + print("Statistic threshold (Std Normal): 1.96"); + + printResult(); + } +} diff --git a/src/org/yooreeka/algos/taxis/evaluation/FTest.java b/src/org/yooreeka/algos/taxis/evaluation/FTest.java new file mode 100644 index 0000000..5714e8c --- /dev/null +++ b/src/org/yooreeka/algos/taxis/evaluation/FTest.java @@ -0,0 +1,182 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms 
of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.evaluation; + +public class FTest extends Test { + + private double F = 0.0; + + private ClassifierResults c1; + private ClassifierResults c2; + private ClassifierResults c3; + + private double L = 3.0; + + public FTest(ClassifierResults c1, ClassifierResults c2, + ClassifierResults c3) { + this.c1 = c1; + this.c2 = c2; + this.c3 = c3; + + setStatisticSymbol("F"); + + // For test size: 500 + // Confidence interval: 0.05 + // Null hypothesis: classifiers are the same + // Degrees of freedom: L - 1 = 2, 2 * (N - 1) = 2 * 499 = 998 + // F Distribution + // Rejected if F > 3.08 + // + // Tabulated values can be found at: + // http://www.itl.nist.gov/div898/handbook/eda/section3/eda3673.htm + + setThreshold(3.08); + + calculate(); + } + + @Override + protected void calculate() { + + /* + * Classifier accuracies: / N + */ + double p1 = c1.getAccuracy(); + double p2 = c2.getAccuracy(); + double p3 = c3.getAccuracy(); + + /* + * Combined accuracy across all classifiers: T / (L * N) + */ + double p = calculateCombinedAccuracy(); + + /* + * Number of test samples. + */ + double N = c1.getN(); + + /* + * SSA + */ + double SSA = N * (p1 * p1 + p2 * p2 + p3 * p3 - L * p * p); + + /* + * SSB + */ + double sumOfjL2 = calculateSumOfjL2(); + double SSB = sumOfjL2 / L - L * N * p; + + /* + * SST + */ + double SST = N * L * p * (1 - p); + + /* + * SSAB + */ + double SSAB = SST - SSA - SSB; + + /* + * MSA + */ + double MSA = SSA / (L - 1); + /* + * MSAB + */ + double MSAB = SSAB / ((L - 1) * (N - 1)); + + /* + * F + */ + F = MSA / MSAB; + + setStatisticValue(F); + } + + /* + * Accuracy based on combined results from all classifiers. + */ + private double calculateCombinedAccuracy() { + double nCorrect = c1.getNCorrect() + c2.getNCorrect() + + c3.getNCorrect(); + double nAll = c1.getN() + c2.getN() + c3.getN(); + return nCorrect / nAll; + } + + /* + * Calculates sum of jL squares. 
Where jL is the number of classifiers that + * correctly classified instance j. + */ + private double calculateSumOfjL2() { + int n = c1.getN(); + + double sumjL2 = 0.0; + + for (int j = 0; j < n; j++) { + double jL = 0.0; + + if (c1.getResult(j)) { + jL++; + } + if (c2.getResult(j)) { + jL++; + } + if (c3.getResult(j)) { + jL++; + } + + sumjL2 += (jL * jL); + } + + return sumjL2; + } + + @Override + public void evaluate() { + print("_____________________________________________________"); + print("Evaluating classifiers " + c1.getClassifierId() + ", " + + c2.getClassifierId() + ", " + c3.getClassifierId() + ":"); + + print("_____________________________________________________"); + print(c1.getClassifierId() + " accuracy: " + c1.getAccuracy()); + print(c2.getClassifierId() + " accuracy: " + c2.getAccuracy()); + print(c3.getClassifierId() + " accuracy: " + c3.getAccuracy()); + print("_____________________________________________________"); + + print("Confidence Interval : 0.05"); + print("Degrees of Freedom (1st): 2"); + print("Degrees of Freedom (2nd): 39998"); + print("Statistic threshold : 3.08"); + + printResult(); + } + +} diff --git a/src/org/yooreeka/algos/taxis/evaluation/McNemarTest.java b/src/org/yooreeka/algos/taxis/evaluation/McNemarTest.java new file mode 100644 index 0000000..23d6d39 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/evaluation/McNemarTest.java @@ -0,0 +1,118 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.evaluation; + +public class McNemarTest extends Test { + + private double chi2 = 0.0; + + private ClassifierResults c1; + private ClassifierResults c2; + + /* + * Using 'n??' notation. First '?' represents result for first classifier. + * Second '?' represents result for the second classifier. 0 - + * misclassification, 1 - correct classification. 
+ */ + + private int n11 = 0; // both classifiers were correct + private int n10 = 0; // first is correct, second incorrect + private int n01 = 0; // first incorrect, second correct + private int n00 = 0; // both incorrect + + public McNemarTest(ClassifierResults c1, ClassifierResults c2) { + this.c1 = c1; + this.c2 = c2; + + setStatisticSymbol("Chi^2"); + + // using level of significance 0.05, 1 degree of freedom: + // reject null hypothesis if chi2 > 3.841 + setThreshold(3.841); + + calculate(); + } + + @Override + protected void calculate() { + int n = c1.getN(); + + for (int i = 0; i < n; i++) { + if (c1.getResult(i) && c2.getResult(i)) { + n11++; + } else if (c1.getResult(i) && !c2.getResult(i)) { + n10++; + } else if (!c1.getResult(i) && c2.getResult(i)) { + n01++; + } else { + n00++; + } + } + + double a = Math.abs(n01 - n10) - 1; + chi2 = a * a / (n01 + n10); + + setStatisticValue(chi2); + } + + @Override + public void evaluate() { + + print("_____________________________________________________"); + print("Evaluating classifiers " + c1.getClassifierId() + " and " + + c2.getClassifierId() + ":"); + + print("_____________________________________________________"); + print(c1.getClassifierId() + " accuracy: " + c1.getAccuracy()); + print(c2.getClassifierId() + " accuracy: " + c2.getAccuracy()); + print("N = " + c1.getN() + ", n00=" + n00 + ", n10=" + n10 + ", n01=" + + n01 + ", n11=" + n11); + print("_____________________________________________________"); + + print("Confidence Interval : 0.05"); + print("Degrees of Freedom : 1"); + print("Statistic threshold (Chi-square): 3.841"); + + printResult(); + } + + public int getN00() { + return n00; + } + + public int getN10() { + return n10; + } + + public int getN11() { + return n11; + } +} diff --git a/src/org/yooreeka/algos/taxis/evaluation/Test.java b/src/org/yooreeka/algos/taxis/evaluation/Test.java new file mode 100644 index 0000000..fb43549 --- /dev/null +++ 
b/src/org/yooreeka/algos/taxis/evaluation/Test.java @@ -0,0 +1,105 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Base class for statistical tests that compare two classifiers.
 *
 * A subclass computes a statistic in {@link #calculate()}, stores it with
 * {@link #setStatisticValue(double)}, and reports it in {@link #evaluate()}.
 * The classifiers are considered different when the statistic is strictly
 * greater than the configured threshold.
 */
public abstract class Test {

    private String statisticSymbol;

    protected double statisticValue;
    private double threshold;

    public Test() {
    }

    /** Computes the test statistic. */
    protected abstract void calculate();

    /** Reports the outcome of the test. */
    public abstract void evaluate();

    /** @return the label used for the statistic in printed output */
    public String getStatisticSymbol() {
        return statisticSymbol;
    }

    /** @return the most recently stored value of the statistic */
    public double getStatisticValue() {
        return statisticValue;
    }

    /** @return the critical value the statistic is compared against */
    public double getThreshold() {
        return threshold;
    }

    /**
     * Decides whether a statistic indicates a difference.
     *
     * @param statistic the computed statistic
     * @param threshold the critical value
     * @return <code>true</code> only when <code>statistic</code> is strictly
     *         greater than <code>threshold</code>
     */
    protected boolean isDifferent(double statistic, double threshold) {
        return statistic > threshold;
    }

    /**
     * Writes one line to standard output, prefixed with a single space.
     *
     * @param val the text to print
     */
    protected void print(String val) {
        System.out.print(" ");
        System.out.println(val);
    }

    /**
     * Prints the statistic, how it relates to the threshold, and the verdict.
     */
    protected void printResult() {

        boolean different = isDifferent(statisticValue, threshold);
        String relation = different ? ">" : "<=";

        print("________________________________________________________");

        // The words and numbers are separated by single spaces; the previous
        // message ran the value into "which" and doubled the space after "is".
        print(statisticSymbol + " value is " + statisticValue + " which is "
                + relation + " " + threshold);

        print("The two classifiers are different: "
                + (different ? "TRUE" : "FALSE"));
    }

    protected void setStatisticSymbol(String statisticSymbol) {
        this.statisticSymbol = statisticSymbol;
    }

    protected void setStatisticValue(double statisticValue) {
        this.statisticValue = statisticValue;
    }

    protected void setThreshold(double threshold) {
        this.threshold = threshold;
    }

}
analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.networks.neural; + +import java.util.Arrays; + +import org.yooreeka.algos.taxis.networks.neural.core.BaseNN; +import org.yooreeka.algos.taxis.networks.neural.core.intf.Layer; + +public class XORNetwork extends BaseNN { + + private static final long serialVersionUID = -511246579251846775L; + + private static final double TINY_NUMBER = 0.00001d; + + public static void main(String[] args) { + XORNetwork nn = new XORNetwork("XOR Test"); + + nn.create(); + + System.out.println("Classification using untrained network:"); + + double[] x = { 0, 0 }; + double[] y = nn.classify(x); + + // Results before training + + System.out.println(Arrays.toString(x) + " -> " + Arrays.toString(y)); + + x = new double[] { 0, 1 }; + y = nn.classify(x); + System.out.println(Arrays.toString(x) + " -> " + Arrays.toString(y)); + + x = new double[] { 1, 0 }; + y = nn.classify(x); + System.out.println(Arrays.toString(x) + " -> " + Arrays.toString(y)); + + x = new double[] { 1, 1 }; + y = nn.classify(x); + System.out.println(Arrays.toString(x) + " -> " + Arrays.toString(y)); + + System.out.println("Training..."); + + double nearZero = 0; + for (int i = 0; i < 16 * 1024; i++) { + + nn.train(new double[] { nearZero, nearZero }, new double[] { 0.0 }); + nn.train(new double[] { 1 + nearZero, 1 + nearZero }, + new double[] { 0.0 }); + nn.train(new double[] { 1 + nearZero, nearZero }, + new double[] { 1.0 }); + nn.train(new double[] { nearZero, 1 + nearZero }, + new double[] { 1.0 }); + + if (Math.random() < 0.5) { + nearZero = 0.0d + Math.random() * TINY_NUMBER; + } else { + nearZero = -(1.0d - Math.random() * TINY_NUMBER); + } + + // nn.printWeights(); + } + + System.out.println("Trained"); + + // After training + + System.out.println("Classification using trained network:"); + + x = new double[] { 0, 0 }; + y = nn.classify(x); + + System.out.println(Arrays.toString(x) + " -> " + Arrays.toString(y)); + + x = new double[] { 0, 1 }; + y = nn.classify(x); + 
System.out.println(Arrays.toString(x) + " -> " + Arrays.toString(y)); + + x = new double[] { 1, 0 }; + y = nn.classify(x); + System.out.println(Arrays.toString(x) + " -> " + Arrays.toString(y)); + + x = new double[] { 1, 1 }; + y = nn.classify(x); + System.out.println(Arrays.toString(x) + " -> " + Arrays.toString(y)); + + } + + public XORNetwork(String name) { + super(name); + } + + /* + * Creates: 2 -> 3 -> 1 network. + */ + public void create() { + + // 1. Define Layers, Nodes and Node Biases + Layer inputLayer = createInputLayer(0, // layer id + 2 // number of nodes + ); + + Layer hiddenLayer = createHiddenLayer(1, // layer id + 3, // number of nodes + new double[] { 1, 1, 1 } // node biases + ); + + Layer outputLayer = createOutputLayer(2, // layer id + 1, // number of nodes + new double[] { 2.25 } // node biases + ); + + setInputLayer(inputLayer); + setOutputLayer(outputLayer); + addHiddenLayer(hiddenLayer); + + // 2. Define links and weights between nodes + // Id format: + setLink("0:0", "1:0", 0.25); + setLink("0:0", "1:1", 0.5); + setLink("0:0", "1:2", 0.25); + + setLink("0:1", "1:0", 0.25); + setLink("0:1", "1:1", 0.5); + setLink("0:1", "1:2", 0.25); + + setLink("1:0", "2:0", 0.8); + setLink("1:1", "2:0", 0.4); + setLink("1:2", "2:0", 0.8); + + System.out.println("NN created"); + + } + +} diff --git a/src/org/yooreeka/algos/taxis/networks/neural/core/BaseLayer.java b/src/org/yooreeka/algos/taxis/networks/neural/core/BaseLayer.java new file mode 100644 index 0000000..699ad9a --- /dev/null +++ b/src/org/yooreeka/algos/taxis/networks/neural/core/BaseLayer.java @@ -0,0 +1,139 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the 
Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.networks.neural.core; + +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.algos.taxis.networks.neural.core.intf.Layer; +import org.yooreeka.algos.taxis.networks.neural.core.intf.Link; +import org.yooreeka.algos.taxis.networks.neural.core.intf.Node; + +public class BaseLayer implements Layer { + + private static final long serialVersionUID = -1482668413756729940L; + + private int layerId; + private List nodes; + + public BaseLayer(int layerId) { + this.layerId = layerId; + this.nodes = new ArrayList(); + } + + public void addNode(Node n) { + nodes.add(n); + } + + public void calculate() { + for (Node node : nodes) { + node.calculate(); + } + } + + public void calculateWeightAdjustments() { + for (Node node : nodes) { + node.calculateWeightAdjustments(); + } + } + + public int getId() { + return layerId; + } + + public List getNodes() { + return nodes; + } + + public String getType() { + return ""; + } + + public double[] getValues() { + + double[] y = new double[nodes.size()]; + + for (int i = 0, n = y.length; i < n; i++) { + y[i] = nodes.get(i).getOutput(); + } + + return y; + } + + public void printWeights() { + for (Node n : nodes) { + for (Link link : n.getInlinks()) { + System.out.println(link.getFromNode().getNodeId() + "->" + + n.getNodeId() + ":" + link.getWeight()); + } + } + } + + public void propagate() { + for (Node node : nodes) { + node.propagate(); + } + } + + public void setExpectedOutputValues(double[] d) { + if (nodes.size() != d.length) { + throw new RuntimeException("Invalid layer configuration. " + + "Layer id: " + layerId + ", Expected number of nodes: " + + d.length + ", Actual number of nodes: " + nodes.size()); + } + + for (int i = 0, n = d.length; i < n; i++) { + Node node = nodes.get(i); + node.setExpectedOutput(d[i]); + } + } + + public void setInputValues(double[] x) { + if (nodes.size() != x.length) { + throw new RuntimeException("Invalid layer configuration. 
" + + "Layer id: " + layerId + ", Expected number of nodes: " + + x.length + ", Actual number of nodes: " + nodes.size()); + } + + for (int i = 0, n = x.length; i < n; i++) { + Node node = nodes.get(i); + Link inlink = node.getInlinks().get(0); + inlink.setValue(x[i]); + } + } + + public void updateWeights() { + for (Node node : nodes) { + node.updateWeights(); + } + } + +} diff --git a/src/org/yooreeka/algos/taxis/networks/neural/core/BaseLink.java b/src/org/yooreeka/algos/taxis/networks/neural/core/BaseLink.java new file mode 100644 index 0000000..0b9bf1d --- /dev/null +++ b/src/org/yooreeka/algos/taxis/networks/neural/core/BaseLink.java @@ -0,0 +1,85 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.networks.neural.core; + +import org.yooreeka.algos.taxis.networks.neural.core.intf.Link; +import org.yooreeka.algos.taxis.networks.neural.core.intf.Node; + +public class BaseLink implements Link { + + private static final long serialVersionUID = 6462508677299269035L; + + private Node fromNode; + private Node toNode; + private double value; + private double weight; + private double weightDelta; + + public Node getFromNode() { + return fromNode; + } + + public Node getToNode() { + return toNode; + } + + public double getValue() { + return value; + } + + public double getWeight() { + return weight; + } + + public double getWeightDelta() { + return weightDelta; + } + + public void setFromNode(Node unit) { + this.fromNode = unit; + } + + public void setToNode(Node unit) { + this.toNode = unit; + } + + public void setValue(double x) { + this.value = x; + } + + public void setWeight(double w) { + this.weight = w; + } + + public void setWeightDelta(double dw) { + weightDelta = dw; + } +} diff --git a/src/org/yooreeka/algos/taxis/networks/neural/core/BaseNN.java b/src/org/yooreeka/algos/taxis/networks/neural/core/BaseNN.java new file mode 100644 index 0000000..a824ade --- /dev/null +++ b/src/org/yooreeka/algos/taxis/networks/neural/core/BaseNN.java @@ -0,0 +1,429 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * 
________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.networks.neural.core; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.yooreeka.algos.taxis.networks.neural.core.intf.Layer; +import org.yooreeka.algos.taxis.networks.neural.core.intf.Link; +import org.yooreeka.algos.taxis.networks.neural.core.intf.NeuralNetwork; +import org.yooreeka.algos.taxis.networks.neural.core.intf.Node; + +public class BaseNN implements NeuralNetwork, java.io.Serializable { + + private static final long serialVersionUID = -7859066535923217638L; + + private static final double ERROR_THRESHOLD = 0.001d; + private static final double CONVERGENCE_THRESHOLD = 1E-10; + private static final double LEARNING_RATE = 0.25; + + private boolean verbose = false; + + /* + * Network name + */ + private String name; + + /* + * Contains nodes that belong to input layer. + */ + private Layer inputLayer; + + /* + * Contains nodes that belong to output layer. + */ + private Layer outputLayer; + + /* + * 0..* hidden layers. + */ + private List hiddenLayers; + + private double learningRate = LEARNING_RATE; + + /* + * + */ + private Map allNodes; + + public BaseNN(String name) { + this.name = name; + this.hiddenLayers = new ArrayList(); + this.allNodes = new HashMap(); + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#addHiddenLayer(iweb2.ch5. 
+ * classification.nn.intf.Layer) + */ + public void addHiddenLayer(Layer hiddenLayer) { + hiddenLayers.add(hiddenLayer); + for (Node node : hiddenLayer.getNodes()) { + addNode(node); + } + } + + private void addNode(Node node) { + String nodeId = node.getNodeId(); + if (allNodes.containsKey(nodeId)) { + throw new RuntimeException("Duplicate nodeId: " + nodeId); + } + allNodes.put(nodeId, node); + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#classify(double[]) + */ + public double[] classify(double[] x) { + + inputLayer.setInputValues(x); + inputLayer.calculate(); + inputLayer.propagate(); + + for (Layer hLayer : hiddenLayers) { + hLayer.calculate(); + hLayer.propagate(); + } + + outputLayer.calculate(); + double[] y = outputLayer.getValues(); + return y; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#createHiddenLayer(int, + * int, double[]) + */ + public Layer createHiddenLayer(int layerId, int nNodes, double[] bias) { + if (bias.length != nNodes) { + throw new RuntimeException("Each node should have bias defined."); + } + BaseLayer baseLayer = new BaseLayer(layerId); + for (int i = 0; i < nNodes; i++) { + Node node = createHiddenNode(layerId + ":" + i); + node.setBias(bias[i]); + baseLayer.addNode(node); + } + return baseLayer; + } + + protected Node createHiddenNode(String nodeId) { + Node node = new SigmoidNode(nodeId); + node.setLearningRate(learningRate); + return node; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#createInputLayer(int, int) + */ + public Layer createInputLayer(int layerId, int nNodes) { + + BaseLayer baseLayer = new BaseLayer(layerId); + for (int i = 0; i < nNodes; i++) { + Node node = createInputNode(layerId + ":" + i); + Link inlink = new BaseLink(); + inlink.setFromNode(node); + inlink.setWeight(1.0); + node.addInlink(inlink); + baseLayer.addNode(node); + } + + return baseLayer; + } + + protected Node createInputNode(String 
nodeId) { + Node node = new LinearNode(nodeId); + node.setLearningRate(learningRate); + return node; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#createOutputLayer(int, + * int, double[]) + */ + public Layer createOutputLayer(int layerId, int nNodes, double[] bias) { + if (bias.length != nNodes) { + throw new RuntimeException("Each node should have bias defined."); + } + + BaseLayer baseLayer = new BaseLayer(layerId); + for (int i = 0; i < nNodes; i++) { + Node node = createOutputNode(layerId + ":" + i); + node.setBias(bias[i]); + baseLayer.addNode(node); + } + return baseLayer; + } + + protected Node createOutputNode(String nodeId) { + Node node = new LinearNode(nodeId); + node.setLearningRate(learningRate); + return node; + } + + private double error(double[] expectedY, double[] actualY) { + + double sum = 0.0; + + for (int i = 0, n = expectedY.length; i < n; i++) { + sum += Math.pow(actualY[i] - expectedY[i], 2.0); + } + + return sum / 2; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#getInputNodeCount() + */ + public int getInputNodeCount() { + return getNodeCount(this.inputLayer); + } + + public double getLearningRate() { + return learningRate; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#getName() + */ + public String getName() { + return name; + } + + private int getNodeCount(Layer layer) { + int nodeCount = 0; + + if (layer != null) { + nodeCount = layer.getNodes().size(); + } + + return nodeCount; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#getOutputNodeCount() + */ + public int getOutputNodeCount() { + return getNodeCount(this.outputLayer); + } + + /** + * @return the verbose + */ + public boolean isVerbose() { + return verbose; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#printWeights() + */ + public void printWeights() { + for (Layer layer : hiddenLayers) { + 
System.out.println(String.valueOf(layer.getId()) + ":"); + layer.printWeights(); + } + System.out.println(String.valueOf(outputLayer.getId()) + ":"); + outputLayer.printWeights(); + } + + public void removeAllNodesAndLayers() { + this.allNodes.clear(); + this.hiddenLayers.clear(); + this.inputLayer = null; + this.outputLayer = null; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#setInputLayer(iweb2.ch5. + * classification.nn.intf.Layer) + */ + public void setInputLayer(Layer inputLayer) { + this.inputLayer = inputLayer; + for (Node node : this.inputLayer.getNodes()) { + addNode(node); + } + } + + public void setLearningRate(double learningRate) { + this.learningRate = learningRate; + } + + public void setLink(String fromNodeId, String toNodeId, double w) { + Link link = new BaseLink(); + Node fromNode = allNodes.get(fromNodeId); + if (fromNode == null) { + throw new RuntimeException("Unknown node id: " + fromNodeId); + } + Node toNode = allNodes.get(toNodeId); + if (toNode == null) { + throw new RuntimeException("Unknown node id: " + toNodeId); + } + + link.setFromNode(fromNode); + link.setToNode(toNode); + link.setWeight(w); + + fromNode.addOutlink(link); + toNode.addInlink(link); + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#setName(java.lang.String) + */ + public void setName(String name) { + this.name = name; + } + + public void setOutputLayer(Layer outputLayer) { + this.outputLayer = outputLayer; + for (Node node : this.outputLayer.getNodes()) { + addNode(node); + } + } + + /** + * @param verbose + * the verbose to set + */ + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + + // trains NN with one training sample at a time + /* + * (non-Javadoc) + * + * @see iweb2.ch5.classification.nn.NeuralNetwork#train(double[], double[]) + */ + public void train(double[] tX, double[] tY) { + + double lastError = 0.0; + int i = 0; + while (true) { + i++; + // Evaluate sample 
+ double[] y = classify(tX); + + double err = error(tY, y); + + if (Double.isInfinite(err) || Double.isNaN(err)) { + // Couldn't even evaluate the error. Stop. + throw new RuntimeException( + "Training failed. Couldn't evaluate the error: " + + err + + ". Try some other NN configuration, parameters."); + } + + double convergence = Math.abs(err - lastError); + + if (err <= ERROR_THRESHOLD) { + // Good enough. No need to adjust weights for this sample. + lastError = err; + if (verbose) { + System.out.print("Error Threshold: " + ERROR_THRESHOLD); + System.out.print(" | Error Achieved: " + err); + System.out.print(" | Number of Iterations: " + i); + System.out.println(" | Absolute convergence: " + + convergence); + } + break; + } + + if (convergence <= CONVERGENCE_THRESHOLD) { // If we made almost no + // progress stop. + // No change. Stop. + if (verbose) { + System.out.print("Error Threshold: " + ERROR_THRESHOLD); + System.out.print(" | Error Achieved: " + err); + System.out.print(" | Number of Iterations: " + i); + System.out.println(" | Absolute convergence: " + + convergence); + } + break; + } + + lastError = err; + + // Set expected values so that we can determine the error + outputLayer.setExpectedOutputValues(tY); + + /* + * Calculate weight adjustments in the whole network + */ + + outputLayer.calculateWeightAdjustments(); + + for (Layer hLayer : hiddenLayers) { + // layer order doesn't matter because we will update weights + // later + hLayer.calculateWeightAdjustments(); // WeightIncrements + } + + /* + * Update Weights + */ + + outputLayer.updateWeights(); + + for (Layer hLayer : hiddenLayers) { + // layer order doesn't matter. 
+ hLayer.updateWeights(); + } + } + // System.out.println("i = " + i + ", err = " + lastError); + } + +} diff --git a/src/org/yooreeka/algos/taxis/networks/neural/core/BaseNode.java b/src/org/yooreeka/algos/taxis/networks/neural/core/BaseNode.java new file mode 100644 index 0000000..26a26a7 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/networks/neural/core/BaseNode.java @@ -0,0 +1,216 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.networks.neural.core; + +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.algos.taxis.networks.neural.core.intf.Link; +import org.yooreeka.algos.taxis.networks.neural.core.intf.Node; + +abstract class BaseNode implements Node { + + private static final long serialVersionUID = 9035029651203728480L; + + protected String nodeId; + protected double x; // input value + protected double y; // output value + protected double bias; + protected double biasDelta; + protected List inlinks; + protected List outlinks; + + protected double learningRate; + + /* + * Used in training mode. + */ + private double expectedOutput; + + public BaseNode(String nodeId) { + this.nodeId = nodeId; + this.inlinks = new ArrayList(); + this.outlinks = new ArrayList(); + } + + public void addInlink(Link inlink) { + inlinks.add(inlink); + } + + public void addOutlink(Link outlink) { + outlinks.add(outlink); + } + + public void calculate() { + this.x = calculateActivation(); + this.y = fireNeuron(); + } + + public double calculateActivation() { + double result = bias; + for (Link inL : inlinks) { + result += inL.getWeight() * inL.getValue(); + } + x = result; + return x; + } + + public void calculateWeightAdjustments() { + double err = getNodeError(); + + for (Link link : getInlinks()) { + double y = link.getValue(); + double dW = learningRate * y * err; + link.setWeightDelta(link.getWeightDelta() + dW); + } + + // Bias adjustments + setBiasDelta(getBiasDelta() + learningRate * 1 * err); + } + + public abstract double fireNeuron(); + + public abstract double fireNeuronDerivative(); + + public double getBias() { + return bias; + } + + public double getBiasDelta() { + return biasDelta; + } + + public List getInlinks() { + return inlinks; + } + + public double getLastInput() { + return x; + } + + public double getLastOutput() { + return y; + } + + public double getLearningRate() { + return this.learningRate; + } + + // + public 
double getNodeError() { + // For output node + if (outlinks == null || outlinks.size() == 0) { + double d = expectedOutput; + /* + * Assuming E = 1/2 * ( d - y )^2 + */ + // return (d - y) * (1 - y) * y; + return (d - y) * fireNeuronDerivative(); + + } else { // for hidden node + double s = 0.0; + + for (Link outlink : outlinks) { + Node node = outlink.getToNode(); + s += node.getNodeError() * outlink.getWeight(); + } + + return fireNeuronDerivative() * s; + } + } + + public String getNodeId() { + return nodeId; + } + + public List getOutlinks() { + return outlinks; + } + + public double getOutput() { + return y; + } + + public double getOutputValue() { + return y; + } + + // Should it be at the link level? + public double inputF(List inputs) { + if (inputs == null || inputs.size() == 0) { + return y; + } else { + double result = bias; + for (Link inL : inputs) { + result += inL.getWeight() * inL.getValue(); + } + return result; + } + } + + public void propagate() { + for (Link outL : outlinks) { + outL.setValue(y); + } + } + + public void setBias(double b) { + this.bias = b; + } + + public void setBiasDelta(double db) { + this.biasDelta = db; + } + + public void setExpectedOutput(double d) { + this.expectedOutput = d; + } + + public void setLearningRate(double learningRate) { + this.learningRate = learningRate; + } + + public void setOutput(double y) { + this.y = y; + } + + public void updateWeights() { + + for (Link link : getInlinks()) { + link.setWeight(link.getWeight() + link.getWeightDelta()); + link.setWeightDelta(0.0); + } + + // Bias adjustments + setBias(getBias() + getBiasDelta()); + setBiasDelta(0.0); + } +} diff --git a/src/org/yooreeka/algos/taxis/networks/neural/core/LinearNode.java b/src/org/yooreeka/algos/taxis/networks/neural/core/LinearNode.java new file mode 100644 index 0000000..6a285e1 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/networks/neural/core/LinearNode.java @@ -0,0 +1,59 @@ +/* + * 
________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.networks.neural.core; + +public class LinearNode extends BaseNode { + + private static final long serialVersionUID = -6052548906001921511L; + + private double a = 0.0; + private double b = 0.0; + + public LinearNode(String nodeId) { + this(nodeId, 1.0, 0.0); + } + + public LinearNode(String nodeId, double a, double b) { + super(nodeId); + this.a = a; + this.b = b; + } + + @Override + public double fireNeuron() { + return a * x + b; + } + + @Override + public double fireNeuronDerivative() { + return a; + } +} diff --git a/src/org/yooreeka/algos/taxis/networks/neural/core/SigmoidNode.java b/src/org/yooreeka/algos/taxis/networks/neural/core/SigmoidNode.java new file mode 100644 index 0000000..74a1f8d --- /dev/null +++ b/src/org/yooreeka/algos/taxis/networks/neural/core/SigmoidNode.java @@ -0,0 +1,52 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.networks.neural.core; + +public class SigmoidNode extends BaseNode { + + private static final long serialVersionUID = 5289776407864851871L; + + public SigmoidNode(String nodeId) { + super(nodeId); + } + + @Override + public double fireNeuron() { + // Sigmoid + y = Math.tanh(x); + return y; + } + + @Override + public double fireNeuronDerivative() { + return (1 - y * y); + } +} diff --git a/src/org/yooreeka/algos/taxis/networks/neural/core/intf/Layer.java b/src/org/yooreeka/algos/taxis/networks/neural/core/intf/Layer.java new file mode 100644 index 0000000..5455825 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/networks/neural/core/intf/Layer.java @@ -0,0 +1,57 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * A layer of nodes in the neural network.
 */
public interface Layer extends java.io.Serializable {

    // --- identification -------------------------------------------------

    int getId();

    /** @return the layer type: input, output, hidden */
    String getType();

    // NOTE(review): element type is not visible here; presumably the nodes of
    // this layer - confirm against the implementations.
    List getNodes();

    // --- values ---------------------------------------------------------

    double[] getValues();

    void setInputValues(double[] x);

    void setExpectedOutputValues(double[] x);

    // --- forward pass ---------------------------------------------------

    void calculate();

    void propagate();

    // --- training -------------------------------------------------------

    void calculateWeightAdjustments();

    void updateWeights();

    // --- diagnostics ----------------------------------------------------

    void printWeights();
}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.networks.neural.core.intf; + +public interface Link extends java.io.Serializable { + Node getFromNode(); + + Node getToNode(); + + double getValue(); + + double getWeight(); + + double getWeightDelta(); + + void setFromNode(Node fromNode); + + void setToNode(Node toNode); + + void setValue(double x); + + void setWeight(double w); + + void setWeightDelta(double dw); +} diff --git a/src/org/yooreeka/algos/taxis/networks/neural/core/intf/NeuralNetwork.java b/src/org/yooreeka/algos/taxis/networks/neural/core/intf/NeuralNetwork.java new file mode 100644 index 0000000..6f1430b --- /dev/null +++ b/src/org/yooreeka/algos/taxis/networks/neural/core/intf/NeuralNetwork.java @@ -0,0 +1,69 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.networks.neural.core.intf; + +public interface NeuralNetwork { + + public abstract void addHiddenLayer(Layer hiddenLayer); + + public abstract double[] classify(double[] x); + + public abstract Layer createHiddenLayer(int layerId, int nNodes, + double[] bias); + + public abstract Layer createInputLayer(int layerId, int nNodes); + + public abstract Layer createOutputLayer(int layerId, int nNodes, + double[] bias); + + public abstract int getInputNodeCount(); + + /** + * @return the name + */ + public abstract String getName(); + + public abstract int getOutputNodeCount(); + + public abstract void printWeights(); + + public abstract void setInputLayer(Layer inputLayer); + + /** + * @param name + * the name to set + */ + public abstract void setName(String name); + + // trains NN with one training sample at a time + public abstract void train(double[] tX, double[] tY); + +} \ No newline at end of file diff --git a/src/org/yooreeka/algos/taxis/networks/neural/core/intf/Node.java b/src/org/yooreeka/algos/taxis/networks/neural/core/intf/Node.java new file mode 100644 index 
0000000..c9e6b16 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/networks/neural/core/intf/Node.java @@ -0,0 +1,98 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.networks.neural.core.intf; + +import java.util.List; + +public interface Node extends java.io.Serializable { + void addInlink(Link inlink); + + void addOutlink(Link outlink); + + public void calculate(); + + /** + * Net Activation + * + * @return + */ + double calculateActivation(); + + void calculateWeightAdjustments(); + + /** + * Activation function + * + * @return + */ + double fireNeuron(); + + /** + * Activation function derivative + * + * @return + */ + double fireNeuronDerivative(); + + double getBias(); + + double getBiasDelta(); + + List getInlinks(); + + /* + * Learning rate that will be used in training + */ + double getLearningRate(); + + double getNodeError(); + + String getNodeId(); + + List getOutlinks(); + + double getOutput(); + + public void propagate(); + + void setBias(double b); + + void setBiasDelta(double bd); + + /* + * For backpropagation + */ + void setExpectedOutput(double d); + + void setLearningRate(double learningRate); + + void updateWeights(); +} diff --git a/src/org/yooreeka/algos/taxis/tree/AttributeDefinition.java b/src/org/yooreeka/algos/taxis/tree/AttributeDefinition.java new file mode 100644 index 0000000..c549568 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/tree/AttributeDefinition.java @@ -0,0 +1,100 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
/**
 * This class allows us to associate type with the attribute.
 */
public class AttributeDefinition implements java.io.Serializable {

    private static final long serialVersionUID = -8446442452030956318L;

    /**
     * Creates a copy of attribute definitions. The copy is a new list that
     * shares the same definition objects with the original.
     *
     * @param attrs
     *            original list of attributes.
     * @return new list.
     */
    public static List<AttributeDefinition> copyAttributeDefs(
            List<AttributeDefinition> attrs) {
        return new ArrayList<AttributeDefinition>(attrs);
    }

    /**
     * Removes attribute definition with specified name from the list. The
     * name comparison ignores case and only the first match is removed.
     * Nothing happens when either argument is {@code null}.
     *
     * @param attrName
     *            attribute name to remove.
     * @param attributes
     *            list to remove from.
     */
    public static void removeAttributeDef(String attrName,
            List<AttributeDefinition> attributes) {

        if (attrName == null || attributes == null) {
            return;
        }

        for (int i = 0, n = attributes.size(); i < n; i++) {
            AttributeDefinition a = attributes.get(i);
            if (attrName.equalsIgnoreCase(a.getName())) {
                attributes.remove(i);
                // stop right away: the cached size is stale after the removal
                break;
            }
        }
    }

    /*
     * Attribute name
     */
    private final String name;

    /*
     * Attribute can be described as continuous (has numeric values) or discrete
     * (has nominal/categorical values).
     */
    private final boolean isDiscrete;

    public AttributeDefinition(String name, boolean isDiscrete) {
        this.name = name;
        this.isDiscrete = isDiscrete;
    }

    public String getName() {
        return name;
    }

    public boolean isDiscrete() {
        return isDiscrete;
    }
}
See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.algos.taxis.tree; + +import java.util.List; + +import org.yooreeka.algos.taxis.core.intf.Attribute; +import org.yooreeka.algos.taxis.core.intf.Instance; + +public class AttributeSelector implements java.io.Serializable { + + private static final long serialVersionUID = 1722498208605607524L; + + public AttributeSelector() { + + } + + /** + * Evaluates all candidate attributes and chooses one that provides the best + * split of the data. + * + * @param data + * data that will be used to evaluate split quality. + * @param candidateAttributes + * attributes to chose from. + * + * @return information about selected attribute along with the data for + * every branch produced by this split. + */ + public SplittingCriterion apply(List data, + List candidateAttributes) { + + int n = candidateAttributes.size(); + + double bestGainRatio = Double.MIN_VALUE; + + SplittingCriterion splitCriterion = new SplittingCriterion(); + + /* Calculate Gain Ratio for every available attribute. 
*/ + for (int i = 0; i < n; i++) { + AttributeDefinition attrDef = candidateAttributes.get(i); + String attrName = attrDef.getName(); + Double splitPoint = null; + + BranchGroup branches = null; + + if (attrDef.isDiscrete()) { + /* + * For discrete attribute we split all data into subsets based + * on attribute values. + */ + branches = BranchGroup.createBranchesFromDiscreteAttr(data, + attrName); + } else { + /* + * For continuous attribute we pick a value that is in the + * middle of min and max attribute values that are present in + * the data. + */ + splitPoint = pickSplitPoint(data, attrName); + + /* + * All data will be split into two groups: group with values x + * <= splitPoint and group with values x > splitPoint + */ + branches = BranchGroup.createBranchesFromContiniuousAttr(data, + attrName, splitPoint); + } + + // Only consider attributes that split the data into more than one + // branch + if (branches.getBranches().size() > 1) { + Double gainRatio = calculateGainRatio(data, branches); + + if (gainRatio > bestGainRatio) { + bestGainRatio = gainRatio; + splitCriterion.setSplitAttributeName(attrName); + splitCriterion.setSplitPoint(splitPoint); + splitCriterion.setSplitData(branches); + } + } + } + + return splitCriterion; + } + + private Double calculateGainRatio(List allData, + BranchGroup branches) { + + List> dataByBranch = branches.getData(); + + InfoGain infoGain = new InfoGain(); + + return infoGain.gainRatio(allData, dataByBranch); + } + + /* + * Calculates a value to split on for continuous valued attributes. 
+ */ + private Double pickSplitPoint(List data, String attrName) { + Double minValue = Double.MAX_VALUE; + Double maxValue = Double.MIN_VALUE; + + for (Instance i : data) { + Attribute a = i.getAttributeByName(attrName); + Double value = AttributeUtils.toDouble(a.getValue()); + if (value != null && value < minValue) { + minValue = value; + } + if (value != null && value > maxValue) { + maxValue = value; + } + } + + return (maxValue - minValue) / 2.0; + } +} diff --git a/src/org/yooreeka/algos/taxis/tree/AttributeUtils.java b/src/org/yooreeka/algos/taxis/tree/AttributeUtils.java new file mode 100644 index 0000000..3b6def7 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/tree/AttributeUtils.java @@ -0,0 +1,61 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Conversion helpers for attribute values, which are handled as plain
 * {@code Object}s.
 */
public class AttributeUtils {

    /**
     * Converts an attribute value to a {@code Double}.
     *
     * @param o
     *            a {@code Number} (e.g. Double, Integer) or a {@code String}
     *            holding a decimal number.
     * @return the numeric value, or {@code null} when the argument is
     *         {@code null} or of an unsupported type.
     * @throws NumberFormatException
     *             if a {@code String} argument cannot be parsed.
     */
    public static Double toDouble(Object o) {
        if (o instanceof Double) {
            return (Double) o;
        }
        if (o instanceof Number) {
            // covers Integer as before, plus the remaining numeric wrappers
            return Double.valueOf(((Number) o).doubleValue());
        }
        if (o instanceof String) {
            return Double.valueOf(Double.parseDouble((String) o));
        }
        return null;
    }

    /**
     * Converts an attribute value to its {@code String} form.
     *
     * @param o
     *            a {@code String} or a {@code Number}.
     * @return the string form, or {@code null} when the argument is
     *         {@code null} or of an unsupported type.
     */
    public static String toString(Object o) {
        if (o instanceof String) {
            return (String) o;
        }
        if (o instanceof Number) {
            return String.valueOf(o);
        }
        return null;
    }

}
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.algos.taxis.tree; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.yooreeka.algos.taxis.core.intf.Instance; + +public class Branch { + public static void addInstance(Map branches, + String branchName, Instance i) { + + Branch branch = branches.get(branchName); + if (branch == null) { + branch = new Branch(branchName); + branches.put(branchName, branch); + } + + branch.add(i); + } + private String branchName; + + private List data; + + public Branch() { + init(null); + } + + public Branch(String name) { + init(name); + } + + public void add(Instance instance) { + this.data.add(instance); + } + + public void add(List multipleInstances) { + this.data.addAll(multipleInstances); + } + + public List getData() { + return data; + } + + public String getName() { + return branchName; + } + + private void init(String name) { + branchName = name; + data = new ArrayList(); + } + + public void setData(List data) { + this.data = data; + } + + public void setName(String name) { + this.branchName = name; + } + +} diff --git a/src/org/yooreeka/algos/taxis/tree/BranchGroup.java b/src/org/yooreeka/algos/taxis/tree/BranchGroup.java new file mode 100644 index 0000000..534d506 --- /dev/null +++ b/src/org/yooreeka/algos/taxis/tree/BranchGroup.java @@ -0,0 +1,130 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.taxis.tree;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.yooreeka.algos.taxis.core.intf.Attribute;
+import org.yooreeka.algos.taxis.core.intf.Instance;
+
+/**
+ * The set of branches produced by splitting a list of instances on one
+ * attribute. Branches are keyed by branch name.
+ */
+public class BranchGroup {
+    /**
+     * Values that are used to identify the data subsets when the split is
+     * done on a continuous value.
+     */
+    public static class BinaryBranchNames {
+        public static final String TRUE_BRANCH = "true";
+        public static final String FALSE_BRANCH = "false";
+
+        private BinaryBranchNames() {
+        }
+    }
+
+    /**
+     * Splits the data on a continuous attribute. Each instance is placed in
+     * the branch whose name
+     * {@link SplittingCriterion#getBranchName(Double, Double)} returns for
+     * its attribute value and the split point.
+     *
+     * @param data
+     *            instances to split.
+     * @param attrName
+     *            name of the attribute to split on.
+     * @param splitPoint
+     *            value the attribute is compared against.
+     *
+     * @return the populated branch group, named after the attribute.
+     */
+    public static BranchGroup createBranchesFromContiniuousAttr(
+            List<Instance> data, String attrName, Double splitPoint) {
+
+        BranchGroup branches = new BranchGroup(attrName);
+
+        for (Instance i : data) {
+            Attribute a = i.getAttributeByName(attrName);
+            Double value = AttributeUtils.toDouble(a.getValue());
+            String branchName = SplittingCriterion.getBranchName(value,
+                    splitPoint);
+
+            branches.add(branchName, i);
+        }
+
+        return branches;
+    }
+
+    /**
+     * Splits the data on a discrete attribute, with a separate branch for
+     * each attribute value that occurs in the data.
+     *
+     * @param data
+     *            instances to split.
+     * @param attrName
+     *            name of the attribute to split on.
+     *
+     * @return the populated branch group, named after the attribute.
+     */
+    public static BranchGroup createBranchesFromDiscreteAttr(
+            List<Instance> data, String attrName) {
+
+        // Separate branch for each attribute value
+        BranchGroup branches = new BranchGroup(attrName);
+
+        for (Instance i : data) {
+            Attribute a = i.getAttributeByName(attrName);
+            String attrValue = AttributeUtils.toString(a.getValue());
+            String branchName = SplittingCriterion.getBranchName(attrValue);
+
+            branches.add(branchName, i);
+        }
+
+        return branches;
+    }
+
+    /** Name of the group; the factory methods use the attribute name. */
+    private String name;
+
+    /** Branches keyed by branch name. */
+    private Map<String, Branch> branches;
+
+    /**
+     * Creates an empty group.
+     *
+     * @param name
+     *            name of the group.
+     */
+    public BranchGroup(String name) {
+        this.name = name;
+        branches = new HashMap<String, Branch>();
+    }
+
+    /**
+     * Adds an instance to the named branch, creating the branch on first use.
+     *
+     * @param branchName
+     *            name of the branch that should receive the instance.
+     * @param i
+     *            instance to add.
+     */
+    public void add(String branchName, Instance i) {
+
+        Branch branch = branches.get(branchName);
+        if (branch == null) {
+            branch = new Branch(branchName);
+            branches.put(branchName, branch);
+        }
+
+        branch.add(i);
+    }
+
+    /**
+     * @param branchName
+     *            name of the branch to look up.
+     *
+     * @return the branch, or <code>null</code> when no branch has that name.
+     */
+    public Branch getBranch(String branchName) {
+        return branches.get(branchName);
+    }
+
+    /**
+     * @return a new list that contains every branch of this group.
+     */
+    public List<Branch> getBranches() {
+        return new ArrayList<Branch>(branches.values());
+    }
+
+    /**
+     * @return a new list with the instance list of every branch, one element
+     *         per branch.
+     */
+    public List<List<Instance>> getData() {
+        List<List<Instance>> allData = new ArrayList<List<Instance>>();
+
+        for (Branch b : branches.values()) {
+            List<Instance> branchData = b.getData();
+            allData.add(branchData);
+        }
+
+        return allData;
+    }
+
+    /**
+     * @return the name of the group.
+     */
+    public String getName() {
+        return name;
+    }
+
+}
diff --git a/src/org/yooreeka/algos/taxis/tree/ConceptUtils.java b/src/org/yooreeka/algos/taxis/tree/ConceptUtils.java
new
file mode 100644 index 0000000..4d1139a --- /dev/null +++ b/src/org/yooreeka/algos/taxis/tree/ConceptUtils.java @@ -0,0 +1,85 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+package org.yooreeka.algos.taxis.tree;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.yooreeka.algos.taxis.core.intf.Instance;
+
+/**
+ * Static helpers that summarise the concept names of a list of instances.
+ */
+public class ConceptUtils {
+
+    /**
+     * Counts how many instances carry each concept name.
+     *
+     * @param instances
+     *            instances to count.
+     *
+     * @return number of instances keyed by concept name; empty when there
+     *         are no instances.
+     */
+    public static Map<String, Integer> countConcepts(List<Instance> instances) {
+
+        Map<String, Integer> conceptCounts = new HashMap<String, Integer>();
+
+        for (Instance i : instances) {
+            String conceptName = i.getConcept().getName();
+            Integer count = conceptCounts.get(conceptName);
+            conceptCounts.put(conceptName, count == null ? 1 : count + 1);
+        }
+
+        return conceptCounts;
+    }
+
+    /**
+     * Finds the concept name that occurs most often.
+     *
+     * @param instances
+     *            instances to inspect.
+     *
+     * @return the most frequent concept name, or <code>null</code> when there
+     *         are no instances. When several concepts share the highest
+     *         count, the one met first while iterating the count map is
+     *         returned.
+     */
+    public static String findMostFrequentConcept(List<Instance> instances) {
+
+        Map<String, Integer> conceptCounts = countConcepts(instances);
+
+        String mostFrequentConceptLabel = null;
+
+        int n = 0;
+        for (Map.Entry<String, Integer> e : conceptCounts.entrySet()) {
+            // strict comparison: a later entry with an equal count does not
+            // replace the current choice
+            if (e.getValue() > n) {
+                n = e.getValue();
+                mostFrequentConceptLabel = e.getKey();
+            }
+        }
+
+        return mostFrequentConceptLabel;
+    }
+
+    /**
+     * Collects the distinct concept names.
+     *
+     * @param instances
+     *            instances to inspect.
+     *
+     * @return the distinct concept names; the array order is the iteration
+     *         order of a hash set.
+     */
+    public static String[] getUniqueConcepts(List<Instance> instances) {
+        Set<String> concepts = new HashSet<String>();
+        for (Instance i : instances) {
+            concepts.add(i.getConcept().getName());
+        }
+        return concepts.toArray(new String[concepts.size()]);
+    }
+}
diff --git a/src/org/yooreeka/algos/taxis/tree/DecisionTreeClassifier.java b/src/org/yooreeka/algos/taxis/tree/DecisionTreeClassifier.java
new file mode 100644
index 0000000..c966eec
--- /dev/null
+++ b/src/org/yooreeka/algos/taxis/tree/DecisionTreeClassifier.java
@@ -0,0 +1,248 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009).
Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.taxis.tree;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.yooreeka.algos.taxis.core.BaseConcept;
+import org.yooreeka.algos.taxis.core.TrainingSet;
+import org.yooreeka.algos.taxis.core.intf.Classifier;
+import org.yooreeka.algos.taxis.core.intf.Concept;
+import org.yooreeka.algos.taxis.core.intf.Instance;
+
+/**
+ * Classifier that builds a decision tree from a training set and classifies
+ * instances by walking that tree. Attributes must be registered with
+ * {@link #trainOnAttribute(String, boolean)} before {@link #train()} is
+ * called; only registered attributes are considered for splits.
+ */
+public class DecisionTreeClassifier implements Classifier, java.io.Serializable {
+
+    private static final long serialVersionUID = -3360341002492465102L;
+
+    private String name;
+
+    protected boolean verbose = false;
+
+    /*
+     * Selects best attribute to split on.
+     */
+    private AttributeSelector attributeSelector;
+
+    /*
+     * Root of the trained tree; null until train() has run.
+     */
+    protected Node rootTreeNode;
+
+    /*
+     * No need to keep training data for serialization.
+     */
+    private transient TrainingSet trainingData;
+
+    /*
+     * Attributes that should be considered for tree training.
+     */
+    private List<AttributeDefinition> availableAttributes;
+
+    /**
+     * @param name
+     *            name of this classifier.
+     * @param trainingData
+     *            instances the tree will be trained on.
+     */
+    public DecisionTreeClassifier(String name, TrainingSet trainingData) {
+        this.name = name;
+        rootTreeNode = null;
+        attributeSelector = new AttributeSelector();
+        this.trainingData = trainingData;
+        this.availableAttributes = new ArrayList<AttributeDefinition>();
+    }
+
+    /**
+     * Creates a classifier named after the simple name of this class.
+     *
+     * @param trainingData
+     *            instances the tree will be trained on.
+     */
+    public DecisionTreeClassifier(TrainingSet trainingData) {
+        this(DecisionTreeClassifier.class.getSimpleName(), trainingData);
+    }
+
+    /*
+     * Turns the node into a leaf that predicts the given concept.
+     */
+    private static Node asLeaf(Node node, String conceptName) {
+        node.setLeaf(true);
+        node.setConceptName(conceptName);
+        node.setAttributeName(null);
+        return node;
+    }
+
+    /**
+     * Builds subtree using provided data and attributes.
+     *
+     * @param data
+     *            training instances that should be considered for subtree.
+     * @param candidateAttributes
+     *            available attributes.
+     *
+     * @return root node of the subtree.
+     */
+    private Node buildTree(List<Instance> data,
+            List<AttributeDefinition> candidateAttributes) {
+
+        /*
+         * Node that will represent the subtree.
+         */
+        Node node = new Node();
+
+        String[] concepts = ConceptUtils.getUniqueConcepts(data);
+        String mostFrequentConcept = ConceptUtils.findMostFrequentConcept(data);
+        node.setMostFrequentConceptName(mostFrequentConcept);
+        node.setNodeTrainingData(data);
+
+        /*
+         * No need to split if there is only one concept left.
+         */
+        if (concepts.length == 1) {
+            return asLeaf(node, concepts[0]);
+        }
+
+        /*
+         * We've run out of attributes to split on. Just use the most frequent
+         * concept.
+         */
+        if (candidateAttributes == null || candidateAttributes.isEmpty()) {
+            return asLeaf(node, mostFrequentConcept);
+        }
+
+        /*
+         * Determines the best attribute to split on.
+         */
+        SplittingCriterion bestSplitCriterion = attributeSelector.apply(data,
+                candidateAttributes);
+
+        /*
+         * No usable split was found. Use the most frequent concept rather
+         * than an arbitrary element of the unique-concept array, whose order
+         * comes from a hash set and which is empty when there is no data.
+         */
+        if (bestSplitCriterion == null
+                || bestSplitCriterion.getSplitAttributeName() == null) {
+            return asLeaf(node, mostFrequentConcept);
+        }
+
+        /*
+         * For non-leaf nodes we don't have the class label.
+         */
+        String splitAttributeName = bestSplitCriterion.getSplitAttributeName();
+        node.setConceptName(null);
+        node.setAttributeName(splitAttributeName);
+        node.setSplitValue(bestSplitCriterion.getSplitPoint());
+
+        boolean discreteSplit = bestSplitCriterion.isDiscreteValueSplit();
+
+        BranchGroup branches = bestSplitCriterion.getSplitData();
+        for (Branch branch : branches.getBranches()) {
+
+            // build a list of attributes for child node
+            List<AttributeDefinition> childNodeAttrs = AttributeDefinition
+                    .copyAttributeDefs(candidateAttributes);
+
+            if (discreteSplit) {
+                // a discrete attribute is removed from consideration once
+                // it has been used; a continuous-valued attribute stays
+                // available to the child nodes
+                AttributeDefinition.removeAttributeDef(splitAttributeName,
+                        childNodeAttrs);
+            }
+
+            Node childNode = buildTree(branch.getData(), childNodeAttrs);
+            node.addChild(branch.getName(), childNode);
+        }
+
+        return node;
+    }
+
+    /**
+     * Classifies the instance with the trained tree.
+     *
+     * @param i
+     *            instance to classify.
+     *
+     * @return concept created from the category the tree predicts.
+     *
+     * @throws IllegalStateException
+     *             if the tree has not been trained yet.
+     */
+    public Concept classify(Instance i) {
+
+        String category = trainedRoot().classify(i);
+        return createConcept(category);
+
+    }
+
+    /*
+     * Allows subclasses to provide specific implementation of the concept.
+     */
+    protected Concept createConcept(String category) {
+        return new BaseConcept(category);
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    /**
+     * @return the verbose
+     */
+    public boolean isVerbose() {
+        return verbose;
+    }
+
+    /**
+     * Prints information about tree.
+     *
+     * @throws IllegalStateException
+     *             if the tree has not been trained yet.
+     */
+    public void printTree() {
+        Node root = trainedRoot();
+        System.out.println("--- Tree ---");
+        root.print(0);
+        System.out.println("------------");
+    }
+
+    /**
+     * Prunes the trained tree in place.
+     *
+     * @throws IllegalStateException
+     *             if the tree has not been trained yet.
+     */
+    public void pruneTree() {
+        trainedRoot().prune();
+    }
+
+    /**
+     * @param verbose
+     *            the verbose to set
+     */
+    public void setVerbose(boolean verbose) {
+        this.verbose = verbose;
+    }
+
+    /**
+     * Builds the tree from all instances of the training set, using the
+     * attributes registered so far. The elapsed time is printed when verbose
+     * output is enabled.
+     *
+     * @return always <code>true</code>.
+     */
+    public boolean train() {
+
+        long t0 = System.currentTimeMillis();
+
+        List<Instance> trainingInstances = new ArrayList<Instance>(
+                trainingData.getInstances().values());
+
+        rootTreeNode = buildTree(trainingInstances, availableAttributes);
+
+        if (verbose) {
+            System.out.print(" Decision tree training completed in ");
+            System.out.println((System.currentTimeMillis() - t0) + " (ms)");
+        }
+
+        return true;
+    }
+
+    /**
+     * Registers an attribute that the tree may split on.
+     *
+     * @param name
+     *            attribute name.
+     * @param isDiscrete
+     *            flag passed on to the attribute definition.
+     */
+    public void trainOnAttribute(String name, boolean isDiscrete) {
+        AttributeDefinition attrDef = new AttributeDefinition(name, isDiscrete);
+        availableAttributes.add(attrDef);
+    }
+
+    /*
+     * Returns the root node, failing with a descriptive message instead of a
+     * bare NullPointerException when train() has not been called.
+     */
+    private Node trainedRoot() {
+        if (rootTreeNode == null) {
+            throw new IllegalStateException(
+                    "Decision tree has not been trained yet; call train() first");
+        }
+        return rootTreeNode;
+    }
+
+}
diff --git a/src/org/yooreeka/algos/taxis/tree/InfoGain.java b/src/org/yooreeka/algos/taxis/tree/InfoGain.java
new file mode 100644
index 0000000..353e6ab
--- /dev/null
+++ b/src/org/yooreeka/algos/taxis/tree/InfoGain.java
@@ -0,0 +1,151 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009).
Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.taxis.tree;
+
+import java.util.List;
+import java.util.Map;
+
+import org.yooreeka.algos.taxis.core.intf.Instance;
+
+/**
+ * Entropy-based measures for evaluating a split of a list of instances into
+ * subsets. All logarithms are base 2.
+ */
+public class InfoGain {
+
+    public InfoGain() {
+    }
+
+    /**
+     * Entropy of the dataset.
+     *
+     * @param data
+     *            instances whose concept distribution is measured.
+     *
+     * @return entropy of the concept distribution.
+     */
+    public Double entropy(List<Instance> data) {
+
+        /*
+         * How many times each class (category) occurs in the data.
+         */
+        Map<String, Integer> instanceCountByClassMap = ConceptUtils
+                .countConcepts(data);
+
+        int n = data.size();
+
+        double sum = 0.0;
+
+        for (Integer count : instanceCountByClassMap.values()) {
+
+            double p = (double) count / (double) n;
+
+            sum += p * log2(p);
+
+        }
+
+        return -sum;
+
+    }
+
+    /**
+     * Entropy of the subsets, each weighted by its share of the initial set.
+     *
+     * @param allData
+     *            initial set of instances.
+     * @param allDataSubsets
+     *            initial set split into subsets.
+     *
+     * @return weighted sum of the subset entropies.
+     */
+    public Double expectedInformation(List<Instance> allData,
+            List<List<Instance>> allDataSubsets) {
+
+        double sum = 0.0;
+
+        int n = allData.size();
+
+        for (List<Instance> dataSubset : allDataSubsets) {
+
+            if (dataSubset.isEmpty()) {
+                // an empty subset has zero weight
+                continue;
+            }
+
+            sum += (double) dataSubset.size() / (double) n
+                    * entropy(dataSubset);
+
+        }
+
+        return sum;
+
+    }
+
+    /**
+     * Information gain for a given split.
+     *
+     * @param allData
+     *            initial set of instances.
+     * @param allDataSubsets
+     *            initial set split into subsets.
+     *
+     * @return information gain.
+     */
+    public Double gain(List<Instance> allData,
+            List<List<Instance>> allDataSubsets) {
+
+        return entropy(allData) - expectedInformation(allData, allDataSubsets);
+
+    }
+
+    /**
+     * Gain ratio.
+     *
+     * @param allData
+     *            initial set of instances.
+     * @param allDataSubsets
+     *            initial set split into subsets.
+     *
+     * @return gain ratio, or 0.0 when the split information is zero (for
+     *         example when all instances fall into a single subset), where
+     *         the ratio would otherwise be a division by zero.
+     */
+    public Double gainRatio(List<Instance> allData,
+            List<List<Instance>> allDataSubsets) {
+
+        double splitInfo = splitInfo(allData, allDataSubsets);
+        if (splitInfo == 0.0) {
+            return 0.0;
+        }
+
+        return gain(allData, allDataSubsets) / splitInfo;
+
+    }
+
+    private double log2(double d) {
+
+        return Math.log(d) / Math.log(2.0);
+
+    }
+
+    /**
+     * Split information: entropy of the subset sizes themselves.
+     *
+     * @param allData
+     *            initial set of instances.
+     * @param allDataSubsets
+     *            initial set split into subsets.
+     *
+     * @return split information.
+     */
+    public Double splitInfo(List<Instance> allData,
+            List<List<Instance>> allDataSubsets) {
+
+        double sum = 0.0;
+
+        int n = allData.size();
+
+        for (List<Instance> dataSubset : allDataSubsets) {
+
+            if (dataSubset.isEmpty()) {
+                // 0 * log2(0) evaluates to NaN in floating point; an empty
+                // subset contributes nothing
+                continue;
+            }
+
+            double ratio = (double) dataSubset.size() / (double) n;
+
+            sum += ratio * log2(ratio);
+
+        }
+
+        return -sum;
+
+    }
+
+}
\ No newline at end of file
diff --git a/src/org/yooreeka/algos/taxis/tree/Node.java b/src/org/yooreeka/algos/taxis/tree/Node.java
new file mode 100644
index 0000000..78d9ac6
--- /dev/null
+++ b/src/org/yooreeka/algos/taxis/tree/Node.java
@@ -0,0 +1,403 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.taxis.tree;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.yooreeka.algos.taxis.core.intf.Attribute;
+import org.yooreeka.algos.taxis.core.intf.Instance;
+
+/**
+ * Decision tree node.
+ */
+class Node implements java.io.Serializable {
+
+    private static final long serialVersionUID = -4282027910521283908L;
+
+    /**
+     * Auxiliary variable for printing
+     */
+    private boolean isVerbose = false;
+
+    /*
+     * Instance attribute name that this node will use to choose the branch.
+     */
+    private String attributeName;
+
+    /*
+     * Map of child nodes keyed by branch name. Branch name depends on type of
+     * attribute. For discrete attribute actual attribute value will be used and
+     * for continuous-valued attributes we only have two branches that split all
+     * values into two subsets.
+     */
+    private Map<String, Node> childNodesByBranchName;
+
+    /*
+     * Indicates that this node is a leaf node.
+     */
+    private boolean isLeaf;
+
+    /*
+     * Predicted class (concept) name.
+     */
+    private String conceptName;
+
+    /*
+     * Split value that will be used to determine child node when evaluating
+     * continuous-valued attribute.
+     */
+    private Double splitValue;
+
+    /*
+     * This value will be used as a result of classification whenever non-leaf
+     * node can not choose next tree node. This can happen when there are no
+     * available attributes anymore but the node is not a leaf node.
+     */
+    private String mostFrequentConceptName;
+
+    /*
+     * Training instances that reached this node. Will only be used by pruning
+     * during the training phase. The field is transient, so it is null on a
+     * node that was restored by deserialization.
+     */
+    private transient List<Instance> nodeTrainingData;
+
+    public Node() {
+        childNodesByBranchName = new HashMap<String, Node>();
+    }
+
+    /**
+     * Registers a child node under the given branch name.
+     *
+     * @param value
+     *            branch name.
+     * @param node
+     *            child node (subtree) for that branch.
+     */
+    public void addChild(String value, Node node) {
+        this.childNodesByBranchName.put(value, node);
+    }
+
+    /**
+     * Walks the tree from this node until a leaf is reached or no child fits
+     * the instance.
+     *
+     * @param i
+     *            instance to classify.
+     *
+     * @return concept name of the leaf that was reached, or the most frequent
+     *         concept name of the last node visited when the walk stopped
+     *         before reaching a leaf.
+     */
+    public String classify(Instance i) {
+        Node subtree = this;
+
+        while (!subtree.isLeaf()) {
+            Node childNode = subtree.selectChild(i);
+
+            if (childNode == null) {
+                // Decision tree couldn't choose next child
+                break;
+            }
+
+            subtree = childNode;
+        }
+
+        if (subtree.isLeaf()) {
+            return subtree.getConceptName();
+        }
+
+        return subtree.getMostFrequentConceptName();
+    }
+
+    private double estimateErrorRate(int n, int e) {
+        TrueErrorRateEstimator ter = new TrueErrorRateEstimator();
+        /* Using default confidence range: 25% (z = 0.69) */
+        return ter.errorRate(n, e);
+    }
+
+    private double estimateLeafErrorRate() {
+
+        /*
+         * Calculate observed error rate (error rate based on our training data)
+         * if we use most frequent category as classification result of this
+         * node.
+         */
+        int n = nodeTrainingData.size();
+        int e = 0;
+        for (Instance i : nodeTrainingData) {
+            if (!mostFrequentConceptName.equalsIgnoreCase(i.getConcept()
+                    .getName())) {
+                e++;
+            }
+        }
+
+        return estimateErrorRate(n, e);
+    }
+
+    private double estimateTreeErrorRate(Node subtree, List<Instance> data) {
+
+        /*
+         * Calculate observed error rate (error rate based on our training data)
+         * if we use the given subtree to classify the data.
+         */
+        int n = data.size();
+        int e = 0;
+        for (Instance i : data) {
+            String category = subtree.classify(i);
+            // a subtree that could not produce a category counts as an error
+            if (category == null
+                    || !category.equals(i.getConcept().getName())) {
+                e++;
+            }
+        }
+
+        return estimateErrorRate(n, e);
+    }
+
+    public String getAttributeName() {
+        return attributeName;
+    }
+
+    public String getConceptName() {
+        return conceptName;
+    }
+
+    public String getMostFrequentConceptName() {
+        return mostFrequentConceptName;
+    }
+
+    public List<Instance> getNodeTrainingData() {
+        return nodeTrainingData;
+    }
+
+    public Double getSplitValue() {
+        return splitValue;
+    }
+
+    public boolean isLeaf() {
+        return isLeaf;
+    }
+
+    /**
+     * @return the isVerbose
+     */
+    public boolean isVerbose() {
+        return isVerbose;
+    }
+
+    /**
+     * Prints this node and, recursively, all of its children to standard
+     * output.
+     *
+     * @param level
+     *            depth of this node; each level indents the output by five
+     *            spaces.
+     */
+    public void print(int level) {
+
+        String padding = StringUtils.leftPad("", level * 5);
+
+        String nodeInfo = "Node:" + "attrName=" + this.attributeName
+                + ",isLeaf=" + this.isLeaf + ",concept=" + this.conceptName;
+
+        System.out.println(padding + nodeInfo);
+        for (Map.Entry<String, Node> e : childNodesByBranchName.entrySet()) {
+            if (splitValue == null) {
+                System.out.println(padding + "-> Branch: [" + attributeName
+                        + "=" + e.getKey() + "]");
+            } else {
+                String condition;
+                if (BranchGroup.BinaryBranchNames.TRUE_BRANCH
+                        .equalsIgnoreCase(e.getKey())) {
+                    condition = "<=";
+                } else {
+                    condition = ">";
+                }
+                System.out.println(padding + "-> Branch: " + e.getKey() + " ["
+                        + attributeName + condition + this.splitValue + "]");
+
+            }
+            e.getValue().print(level + 1);
+        }
+    }
+
+    /**
+     * Prunes the subtree rooted at this node, children first. The node is
+     * replaced by a leaf or by its most popular child subtree when the error
+     * rate estimated for that replacement on the node's training data is not
+     * worse than the one estimated for the current subtree. A node without
+     * training data is left unchanged.
+     */
+    public void prune() {
+
+        if (isLeaf) {
+            return;
+        }
+
+        /*
+         * First prune all child nodes (child subtrees).
+         */
+        for (Node childNode : childNodesByBranchName.values()) {
+            childNode.prune();
+        }
+
+        /*
+         * Error rates are estimated from the training data of the node.
+         * Without it there is nothing to base a pruning decision on.
+         */
+        if (nodeTrainingData == null || nodeTrainingData.isEmpty()) {
+            return;
+        }
+
+        // find most popular subtree
+        Node mostPopularSubtree = selectMostFrequentSubtree();
+
+        /*
+         * Evaluate current node (subtree)
+         */
+
+        /*
+         * Estimate error rate for the case when we use the most frequent
+         * concept from the node training set.
+         */
+        double leafErrorRate = estimateLeafErrorRate();
+
+        /*
+         * Estimate error rate using current tree
+         */
+        double nodeErrorRate = estimateTreeErrorRate(this, nodeTrainingData);
+
+        /*
+         * Estimate error rate for most popular subtree. When no child holds
+         * training data there is no candidate; an infinite error rate keeps
+         * that option from ever being chosen.
+         */
+        double mostPopularSubtreeErrorRate = Double.POSITIVE_INFINITY;
+        if (mostPopularSubtree != null) {
+            mostPopularSubtreeErrorRate = estimateTreeErrorRate(
+                    mostPopularSubtree, nodeTrainingData);
+        }
+
+        if (isVerbose) {
+            // attribute name is passed as an argument so that a '%' in the
+            // name cannot be read as a format specifier
+            System.out.printf("Pruning: %s"
+                    + ", tree error rate: %.5f" + ", subtree error rate: %.5f"
+                    + ", leaf error rate: %.5f\n", this.attributeName,
+                    nodeErrorRate, mostPopularSubtreeErrorRate, leafErrorRate);
+        }
+
+        if (nodeErrorRate >= leafErrorRate
+                || nodeErrorRate >= mostPopularSubtreeErrorRate) {
+
+            // Pruning does not make the estimated error rate worse
+
+            if (leafErrorRate <= mostPopularSubtreeErrorRate) {
+
+                if (isVerbose) {
+                    System.out.println("Replacing current node with leaf node");
+                }
+
+                // replace current node with leaf node.
+                this.setLeaf(true);
+                this.childNodesByBranchName.clear();
+                this.conceptName = this.mostFrequentConceptName;
+                this.splitValue = null;
+
+            } else {
+
+                if (isVerbose) {
+                    System.out.println("Replacing current node with subtree");
+                }
+
+                // replace current node with subtree
+                this.childNodesByBranchName.clear();
+                this.attributeName = mostPopularSubtree.getAttributeName();
+                this.isLeaf = mostPopularSubtree.isLeaf();
+                this.childNodesByBranchName = mostPopularSubtree.childNodesByBranchName;
+                this.conceptName = mostPopularSubtree.conceptName;
+                this.splitValue = mostPopularSubtree.splitValue;
+                // Note: we are keeping current training data of the node and
+                // most frequent concept name that is based on training data.
+            }
+        }
+
+    }
+
+    /**
+     * Returns next node from the tree that fits provided instance.
+     *
+     * @param t
+     *            instance that we are trying to classify.
+     *
+     * @return next tree node or null.
+     */
+    public Node selectChild(Instance t) {
+
+        Node child = null;
+
+        Attribute a = t.getAttributeByName(attributeName);
+
+        if (a != null) {
+
+            String branchName = null;
+
+            if (splitValue != null) {
+                Double attrValue = AttributeUtils.toDouble(a.getValue());
+                branchName = SplittingCriterion.getBranchName(attrValue,
+                        splitValue);
+            } else {
+                String attrValue = AttributeUtils.toString(a.getValue());
+                branchName = SplittingCriterion.getBranchName(attrValue);
+            }
+            child = childNodesByBranchName.get(branchName);
+        }
+
+        // can be null if instance attribute is missing or has value that we
+        // haven't seen during training (for discrete attributes)
+
+        return child;
+    }
+
+    /**
+     * Selects child node (subtree) that is most frequent outcome of the current
+     * node (has the most training samples).
+     *
+     * @return the child with the most training samples, or null when no child
+     *         holds any training data.
+     */
+    private Node selectMostFrequentSubtree() {
+        Node selectedNode = null;
+        int maxTrainingSamples = 0;
+        for (Node childNode : childNodesByBranchName.values()) {
+            if (childNode.getNodeTrainingData() != null) {
+                int n = childNode.getNodeTrainingData().size();
+                if (n > maxTrainingSamples) {
+                    selectedNode = childNode;
+                    maxTrainingSamples = n;
+                }
+            }
+        }
+
+        return selectedNode;
+    }
+
+    public void setAttributeName(String attributeName) {
+        this.attributeName = attributeName;
+    }
+
+    public void setConceptName(String conceptName) {
+        this.conceptName = conceptName;
+    }
+
+    public void setLeaf(boolean isLeaf) {
+        this.isLeaf = isLeaf;
+    }
+
+    public void setMostFrequentConceptName(String mostFrequentConceptName) {
+        this.mostFrequentConceptName = mostFrequentConceptName;
+    }
+
+    public void setNodeTrainingData(List<Instance> nodeTrainingData) {
+        this.nodeTrainingData = nodeTrainingData;
+    }
+
+    public void setSplitValue(Double splitValue) {
+        this.splitValue = splitValue;
+    }
+
+    /**
+     * @param isVerbose
+     *            the isVerbose to set
+     */
+    public void setVerbose(boolean isVerbose) {
+        this.isVerbose = isVerbose;
+    }
+
+}
diff --git a/src/org/yooreeka/algos/taxis/tree/SplittingCriterion.java b/src/org/yooreeka/algos/taxis/tree/SplittingCriterion.java
new file mode 100644
index 0000000..a853183
--- /dev/null
+++ b/src/org/yooreeka/algos/taxis/tree/SplittingCriterion.java
@@ -0,0 +1,124 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.taxis.tree;
+
+/**
+ * Describes one split: the attribute it is made on, the split point (for
+ * continuous attributes only) and the data of every resulting branch.
+ */
+public class SplittingCriterion {
+
+    /**
+     * Maps a continuous attribute value to one of the two binary branches.
+     *
+     * @param attrValue
+     *            attribute value that should be evaluated.
+     * @param splitValue
+     *            split point for continuous attributes.
+     *
+     * @return {@link BranchGroup.BinaryBranchNames#TRUE_BRANCH} when the value
+     *         is less than or equal to the split point, otherwise
+     *         {@link BranchGroup.BinaryBranchNames#FALSE_BRANCH}.
+     */
+    public static String getBranchName(Double attrValue, Double splitValue) {
+        return (attrValue <= splitValue)
+                ? BranchGroup.BinaryBranchNames.TRUE_BRANCH
+                : BranchGroup.BinaryBranchNames.FALSE_BRANCH;
+    }
+
+    /**
+     * Maps a discrete attribute value to its branch. Every discrete value
+     * currently gets a branch of its own, so the value itself serves as the
+     * branch name.
+     *
+     * @param attrValue
+     *            attribute value that should be evaluated.
+     *
+     * @return name of the branch.
+     */
+    public static String getBranchName(String attrValue) {
+        return attrValue;
+    }
+
+    /*
+     * Name of the attribute the split is made on.
+     */
+    private String splitAttributeName;
+
+    /*
+     * Value that decides between the true and the false branch. Set for
+     * continuous attributes only; null marks a discrete split.
+     */
+    private Double splitPoint;
+
+    /*
+     * Instances of the node, grouped by branch. Kept here so that the
+     * per-branch subsets do not have to be computed again.
+     */
+    private BranchGroup splitData;
+
+    public String getSplitAttributeName() {
+        return this.splitAttributeName;
+    }
+
+    public BranchGroup getSplitData() {
+        return this.splitData;
+    }
+
+    public Double getSplitPoint() {
+        return this.splitPoint;
+    }
+
+    /**
+     * @return true when a split point is set.
+     */
+    public boolean isContinuousValueSplit() {
+        return null != splitPoint;
+    }
+
+    /**
+     * @return true when no split point is set.
+     */
+    public boolean isDiscreteValueSplit() {
+        return null == splitPoint;
+    }
+
+    public void setSplitAttributeName(String splitAttributeName) {
+        this.splitAttributeName = splitAttributeName;
+    }
+
+    public void setSplitData(BranchGroup splitData) {
+        this.splitData = splitData;
+    }
+
+    public void setSplitPoint(Double splitPoint) {
+        this.splitPoint = splitPoint;
+    }
+
+}
diff --git a/src/org/yooreeka/algos/taxis/tree/TrueErrorRateEstimator.java b/src/org/yooreeka/algos/taxis/tree/TrueErrorRateEstimator.java
new file mode 100644
index 0000000..5174e39
--- /dev/null
+++ b/src/org/yooreeka/algos/taxis/tree/TrueErrorRateEstimator.java
@@ -0,0 +1,71 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009).
Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.algos.taxis.tree;
+
+/**
+ * Estimates true error rate for tree pruning. Based on
+ * heuristic for C4.5.
+ */
+public class TrueErrorRateEstimator {
+
+    /**
+     * Default value: 0.69, for confidence 0.25 or 25%.
+     */
+    private static final double DEFAULT_Z = 0.69;
+
+    /*
+     * Multiplier of the standard error in the confidence limit.
+     */
+    private final double z;
+
+    /**
+     * Creates an estimator that uses the default value of z (0.69).
+     */
+    public TrueErrorRateEstimator() {
+        this(DEFAULT_Z);
+    }
+
+    /**
+     * Creates an estimator with a custom value of z.
+     *
+     * @param z
+     *            multiplier that corresponds to the desired confidence.
+     */
+    public TrueErrorRateEstimator(double z) {
+        this.z = z;
+    }
+
+    /**
+     * Calculates true error rate for a node using error observed on training
+     * data. C4.5 uses upper confidence limit for error rate to represent true
+     * error rate.
+     *
+     * @param n
+     *            total number of training samples at the node
+     * @param e
+     *            number of misclassified samples at the node
+     * @return upper confidence limit of the error rate; 1.0 when
+     *         <code>n</code> is zero.
+     */
+    public double errorRate(double n, double e) {
+        /*
+         * Without samples the observed rate e / n is 0 / 0 and the formula
+         * below evaluates to NaN. For non-zero z the formula tends to 1.0 as
+         * n approaches zero, so that value is returned instead.
+         */
+        if (n == 0.0) {
+            return 1.0;
+        }
+
+        /*
+         * Observed error rate based on our training data.
+         */
+        double oe = e / n;
+
+        /*
+         * Calculating upper confidence limit to use as an estimate of the
+         * error rate
+         */
+        double tmp1 = oe / n - (oe * oe) / n + (z * z) / (4 * n * n);
+        double numerator = oe + (z * z) / (2 * n) + z * Math.sqrt(tmp1);
+        double denominator = 1 + (z * z) / n;
+
+        return numerator / denominator;
+    }
+}
diff --git a/src/org/yooreeka/config/YooreekaConfigurator.java b/src/org/yooreeka/config/YooreekaConfigurator.java
new file mode 100644
index 0000000..bce24ee
--- /dev/null
+++ b/src/org/yooreeka/config/YooreekaConfigurator.java
@@ -0,0 +1,220 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.config; + +import java.io.InputStream; +import java.util.Properties; +import java.util.logging.Level; + +import org.yooreeka.util.P; + +/** + * Central place to access to application properties. + * + * @author Babis Marmanis + */ +public class YooreekaConfigurator { + + public static final String DATA_DIR = "iweb2.data.dir"; + public static final String CRAWL_DATA_DIR="iweb2.crawl.dir"; + public static final String TEMP_DIR = "iweb2.temp.dir"; + public static final String MOVIELENS_DATA_DIR = "iweb2.movielens.data.dir"; + public static final String MOVIELENSTEST_DATA_DIR = "iweb2.ch3.movielenstest.data.dir"; + + public static final String LOG_LEVEL_SEVERE = "SEVERE"; + public static final String LOG_LEVEL_WARNING = "WARNING"; + public static final String LOG_LEVEL_INFO = "INFO"; + public static final String LOG_LEVEL_CONFIG = "CONFIG"; + public static final String LOG_LEVEL_FINE = "FINE"; + public static final String LOG_LEVEL_FINER = "FINER"; + public static final String LOG_LEVEL_FINEST = "FINEST"; + + /* + * System property name that can be used to override default properties + * file. + */ + private static String systemPropertyName = "iweb2.configuration"; + + /* + * Default resource name that will be used to load properties. 
+ */ + private static String defaultResourceName = "/iweb2.properties"; + + private static Properties props = new Properties(); + + private static Properties logProps = new Properties(); + static { + // logger.debug("Initializing application properties..."); + String resourceName = System.getProperty(systemPropertyName); + if (resourceName == null) { + resourceName = defaultResourceName; + // logger.debug("System property '" + systemPropertyName + + // "' not found. Loading configuration from default resource: '" + + // defaultResourceName + "'."); + } else { + System.out + .println("Loading configuration from resource defined through system property: " + + systemPropertyName + "=" + resourceName); + } + + props = readProperties(resourceName); + } + + public static String getHome() { + + return props.getProperty("iweb2.home"); + } + + public static Level getLevel(String cName) { + + String logLevel = getLogProperty("log.level." + cName); + + if (logLevel == null) + logLevel = LOG_LEVEL_WARNING; + + Level l = null; + + switch (logLevel) { + case LOG_LEVEL_SEVERE: + l = Level.SEVERE; + break; + case LOG_LEVEL_WARNING: + l = Level.WARNING; + break; + case LOG_LEVEL_INFO: + l = Level.INFO; + break; + case LOG_LEVEL_CONFIG: + l = Level.CONFIG; + break; + case LOG_LEVEL_FINE: + l = Level.FINE; + break; + case LOG_LEVEL_FINER: + l = Level.FINER; + break; + case LOG_LEVEL_FINEST: + l = Level.FINEST; + break; + default: + l = Level.WARNING; + break; + } + return l; + } + + public static String getLogProperty(String key) { + return logProps.getProperty(key); + } + + /** + * First checks if there is a system property with the same key. Then + * attempts to load property from the configuration file. + * + * @return null if property not found. + */ + public static String getProperty(String key) { + // allow to override property using -D= + return System.getProperty(key, props.getProperty(key)); + } + + /** + * First checks if there is a system property with the same key. 
Then + * attempts to load property from the configuration file. + * + * @param key + * identifies property. + * @param defaultValue + * default value that will be used if property is not found. + * @return property value or default value. + */ + public static String getProperty(String key, String defaultValue) { + // allow to override property using -D= + return System.getProperty(key, props.getProperty(key, defaultValue)); + } + + public static Properties readProperties(String resourceName) { + + Properties props = new Properties(); + + try { + + InputStream inStream = YooreekaConfigurator.class.getResourceAsStream(resourceName); + + if (inStream != null) { + props.load(inStream); + } else { + printNoPropertiesFound(); + setStaticProperties(); + } + } catch (Exception e) { + String message = "Failed to load properties from resource: '" + + resourceName + "'."; + System.out.println("ERROR:\n" + message + "\n" + e.getMessage()); + throw new RuntimeException(message, e); + } + return props; + } + + /** + * Set the following values if iweb2.properties cannot be found: + *
+	 *   iweb2.home=C:/iWeb2
+	 *   iweb2.data.dir=C:/iWeb2/data
+	 *   iweb2.crawl.dir=C:/iWeb2/data/crawls
+	 *   iweb2.temp.dir=C:/iWeb2/deploy/temp
+	 *   iweb2.movielens.data.dir=C:/iWeb2/data/ch03/MovieLens
+	 * 
+ * + * NOTE: This shouldn't happen but rather than having people getting stuck with setting up properties + * we can provide a default set of values (which is what they would get from the "Download" distro by + * default anyway) ... + * + * Obviously, this will only work on MS Windows ... + */ + public static void setStaticProperties() { + props.put("iweb2.home", "C:/iWeb2"); + props.put("iweb2.data.dir", "C:/iWeb2/data"); + props.put("iweb2.crawl.dir", "C:/iWeb2/data/crawls"); + props.put("iweb2.temp.dir", "C:/iWeb2/deploy/temp"); + props.put("iweb2.movielens.data.dir", "C:/iWeb2/data/ch03/MovieLens"); + } + + private static void printNoPropertiesFound() { + P.hline(); + P.println(" Oops!"); + P.println(" The file __ iweb2.properties __ was not found!"); + P.println(" Did you set up the system properly?"); + P.hline(); + P.println(" WARNING: Loading DEFAULT property values ..."); + P.hline(); + } +} diff --git a/src/org/yooreeka/examples/credit/BaggingCreditClassifier.java b/src/org/yooreeka/examples/credit/BaggingCreditClassifier.java new file mode 100644 index 0000000..9ffccdb --- /dev/null +++ b/src/org/yooreeka/examples/credit/BaggingCreditClassifier.java @@ -0,0 +1,79 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit; + +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.ensemble.ClassifierEnsemble; +import org.yooreeka.examples.credit.data.UserDataset; +import org.yooreeka.examples.credit.data.users.User; +import org.yooreeka.examples.credit.util.BootstrapTrainingSetBuilder; +import org.yooreeka.examples.credit.util.UserInstanceBuilder; + +public class BaggingCreditClassifier extends ClassifierEnsemble { + + private UserInstanceBuilder instanceBuilder; + private BootstrapTrainingSetBuilder bootstrapTSetBuilder; + + public BaggingCreditClassifier(UserDataset ds) { + + super(BaggingCreditClassifier.class.getSimpleName()); + + /* Creating instance builder for this classifier */ + instanceBuilder = new UserInstanceBuilder(false); + + /* + * Creating original training set that will be used to generate + * bootstrap sets + */ + TrainingSet originalTSet = instanceBuilder.createTrainingSet(ds); + + bootstrapTSetBuilder = new BootstrapTrainingSetBuilder(originalTSet); + } + + public Concept classify(User user) { + + 
if (verbose) { + System.out.println("User:\n >> " + user.toString()); + } + + return classify(instanceBuilder.createInstance(user)); + } + + public TrainingSet getBootstrapSet() { + return bootstrapTSetBuilder.buildBootstrapSet(); + } + + public UserInstanceBuilder getInstanceBuilder() { + return instanceBuilder; + } + +} diff --git a/src/org/yooreeka/examples/credit/BoostingCreditClassifier.java b/src/org/yooreeka/examples/credit/BoostingCreditClassifier.java new file mode 100644 index 0000000..d77ec31 --- /dev/null +++ b/src/org/yooreeka/examples/credit/BoostingCreditClassifier.java @@ -0,0 +1,131 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit; + +import org.yooreeka.algos.taxis.boosting.BoostingARCX4Classifier; +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.algos.taxis.core.intf.Classifier; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.examples.credit.data.UserDataset; +import org.yooreeka.examples.credit.data.users.User; +import org.yooreeka.examples.credit.util.UserInstanceBuilder; + +public class BoostingCreditClassifier extends BoostingARCX4Classifier { + + private UserInstanceBuilder instanceBuilder; + + private ClassifierMemberType classifierType; + + public BoostingCreditClassifier(String name, UserDataset ds, + UserInstanceBuilder instanceBuilder) { + this(name, instanceBuilder, instanceBuilder.createTrainingSet(ds)); + } + + public BoostingCreditClassifier(String name, + UserInstanceBuilder instanceBuilder, TrainingSet tSet) { + + super(name, tSet); + + this.instanceBuilder = instanceBuilder; + } + + public BoostingCreditClassifier(UserDataset ds) { + + this(BoostingCreditClassifier.class.getSimpleName(), ds, + new UserInstanceBuilder(false)); + + } + + public Concept classify(User user) { + + if (verbose) { + System.out.println("User:\n >> " + user.toString()); + } + + return classify(instanceBuilder.createInstance(user)); + } + + @Override + public Classifier getClassifierForTraining(TrainingSet set) { + + Classifier baseClassifier = null; + + switch (classifierType) { + case NEURAL_NETWORK: + NNCreditClassifier nnClassifier = new NNCreditClassifier(set); + 
nnClassifier.setLearningRate(0.01); + nnClassifier.useDefaultAttributes(); + baseClassifier = nnClassifier; + break; + case DECISION_TREE: + DTCreditClassifier dtClassifier = new DTCreditClassifier(set); + dtClassifier.useDefaultAttributes(); + dtClassifier.setPruneAfterTraining(true); + baseClassifier = dtClassifier; + break; + case NAIVE_BAYES: + NBCreditClassifier nbClassifier = new NBCreditClassifier(set); + nbClassifier.useDefaultAttributes(); + baseClassifier = nbClassifier; + break; + default: + throw new RuntimeException("Invalid classifier member type!"); + } + + return baseClassifier; + } + + /** + * @return the classifierType + */ + public ClassifierMemberType getClassifierType() { + return classifierType; + } + + public UserInstanceBuilder getInstanceBuilder() { + return instanceBuilder; + } + + /** + * @param classifierType + * the classifierType to set + */ + public void setClassifierType(String type) { + + if (type.equalsIgnoreCase("decision tree")) { + this.classifierType = ClassifierMemberType.DECISION_TREE; + } else if (type.equalsIgnoreCase("neural network")) { + this.classifierType = ClassifierMemberType.NEURAL_NETWORK; + } else if (type.equalsIgnoreCase("naive bayes")) { + this.classifierType = ClassifierMemberType.NAIVE_BAYES; + } + } +} diff --git a/src/org/yooreeka/examples/credit/CreditConcept.java b/src/org/yooreeka/examples/credit/CreditConcept.java new file mode 100644 index 0000000..5f897a9 --- /dev/null +++ b/src/org/yooreeka/examples/credit/CreditConcept.java @@ -0,0 +1,92 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.credit; + +import org.yooreeka.algos.taxis.core.BaseConcept; +import org.yooreeka.algos.taxis.core.intf.Instance; +import org.yooreeka.examples.credit.data.users.UserType; + +public class CreditConcept extends BaseConcept { + + public static final String CONCEPT_LABEL_EX = UserType.EXCELLENT; + public static final String CONCEPT_LABEL_VG = UserType.VERY_GOOD; + public static final String CONCEPT_LABEL_GD = UserType.GOOD; + public static final String CONCEPT_LABEL_BD = UserType.BAD; + public static final String CONCEPT_LABEL_DN = UserType.DANGEROUS; + + public static int getIndex(String val) { + int index = -1; + if (val.equals(CONCEPT_LABEL_EX)) { + index = 0; + } else if (val.equals(CONCEPT_LABEL_VG)) { + index = 1; + } else if (val.equals(CONCEPT_LABEL_GD)) { + index = 2; + } else if (val.equals(CONCEPT_LABEL_BD)) { + index = 3; + } else if (val.equals(CONCEPT_LABEL_DN)) { + index = 4; + } else { + throw new IllegalArgumentException("Unknown CreditConcept name!"); + } + return index; + } + + public static String getLabel(int val) { + + String label = null; + + if (val == 0) { + label = CONCEPT_LABEL_EX; + } else if (val == 1) { + label = CONCEPT_LABEL_VG; + } else if (val == 2) { + label = CONCEPT_LABEL_GD; + } else if (val == 3) { + label = CONCEPT_LABEL_BD; + } else if (val == 4) { + label = CONCEPT_LABEL_DN; + } else { + throw new IllegalArgumentException( + "Unknown CreditConcept index for label!"); + } + return label; + } + + public CreditConcept(String name) { + super(name); + } + + @Override + public Instance[] getInstances() { + throw new UnsupportedOperationException("not implemented."); + } +} diff --git a/src/org/yooreeka/examples/credit/CreditInstance.java b/src/org/yooreeka/examples/credit/CreditInstance.java new file mode 100644 index 0000000..77f176b --- /dev/null +++ b/src/org/yooreeka/examples/credit/CreditInstance.java @@ -0,0 +1,121 @@ +/* + * 
________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.credit; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.List; + +import org.yooreeka.algos.taxis.core.intf.Attribute; +import org.yooreeka.algos.taxis.core.intf.Instance; + +public class CreditInstance implements Instance { + + public static final String ATTR_NAME_USERID = "userid"; + public static final String ATTR_NAME_JOB_CLASS = "jobClass"; + public static final String ATTR_NAME_INCOME_TYPE = "incomeType"; + public static final String ATTR_NAME_CAR_OWNERSHIP = "carOwnership"; + public static final String ATTR_NAME_MOTOR_BICYCLE_OWNERSHIP = "motorBicycleOwnership"; + public static final String ATTR_NAME_OTHER_PROPERTY_OWNERSHIP = "otherPropertyOwnership"; + public static final String ATTR_NAME_RETIREMENT_ACCOUNT = "retirementAccount"; + public static final String ATTR_NAME_CREDIT_SCORE = "creditScore"; + public static final String ATTR_NAME_AGE = "age"; + public static final String ATTR_NAME_MORTGAGE_DOWN_PAYMENT = "mortgageDownPayment"; + public static final String ATTR_NAME_BANKRUPTCY = "priorDeclaredBankruptcy"; + public static final String ATTR_NAME_CRIMINAL_RECORD = "priorCriminalRecord"; + + protected CreditConcept concept; + protected Attribute[] attributes; + + public CreditInstance(CreditConcept c, Attribute[] attrs) { + this.concept = c; + this.attributes = attrs; + } + + public CreditInstance(CreditConcept c, List attrs) { + this(c, attrs.toArray(new Attribute[attrs.size()])); + } + + public Attribute[] getAtrributes() { + return attributes; + } + + public Attribute getAttributeByName(String attrName) { + Attribute matchedAttribute = null; + + if (attributes != null) { + for (Attribute a : attributes) { + if (attrName.equalsIgnoreCase(a.getName())) { + matchedAttribute = a; + break; + } + } + } + + return matchedAttribute; + } + + public CreditConcept getConcept() { + return concept; + } + + public void print() { + print(new PrintWriter(System.out)); + } + + public void 
print(PrintWriter writer) { + if (attributes != null) { + for (Attribute a : attributes) { + + if (a == null || a.getName() == null) { + writer.print(" - "); + } else { + if (a.getValue() == null) { + writer.print(" - "); + } else { + writer.print(" - " + a.getName() + " = " + + a.getValue()); + } + } + } + } + + writer.println(" --> " + getConcept().getName()); + } + + @Override + public String toString() { + StringWriter sw = new StringWriter(); + print(new PrintWriter(sw)); + return sw.toString(); + } + +} diff --git a/src/org/yooreeka/examples/credit/DTCreditClassifier.java b/src/org/yooreeka/examples/credit/DTCreditClassifier.java new file mode 100644 index 0000000..959ff75 --- /dev/null +++ b/src/org/yooreeka/examples/credit/DTCreditClassifier.java @@ -0,0 +1,194 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; + +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.tree.DecisionTreeClassifier; +import org.yooreeka.examples.credit.data.UserDataset; +import org.yooreeka.examples.credit.data.users.User; +import org.yooreeka.examples.credit.util.UserInstanceBuilder; + +public class DTCreditClassifier extends DecisionTreeClassifier { + + private static final long serialVersionUID = 5491106283513021975L; + + private static String createDefaultClassifierName() { + return DTCreditClassifier.class.getSimpleName(); + } + + private static UserInstanceBuilder createDefaultInstanceBuilder() { + // using Instance Builder configured to produce instances with String + // attributes + return new UserInstanceBuilder(false); + } + + public static DTCreditClassifier loadClassifier(String filename) { + + Object o = null; + File f = new File(filename); + if (f.exists()) { + try { + FileInputStream fInStream = new FileInputStream(f); + BufferedInputStream bufInStream = new BufferedInputStream( + fInStream); + ObjectInputStream objInStream = new ObjectInputStream( + bufInStream); + o = objInStream.readObject(); + objInStream.close(); + } catch (Exception e) { + throw new RuntimeException( + 
"Error while loading data from file: '" + filename + + "'", e); + } + } else { + throw new IllegalArgumentException("File doesn't exist: '" + + filename + "'."); + } + + System.out.println("loaded classifier from file: " + filename); + + return (DTCreditClassifier) o; + + } + + public static void saveClassifier(String filename, DTCreditClassifier o) { + + try { + File f = new File(filename); + FileOutputStream foutStream = new FileOutputStream(f); + BufferedOutputStream boutStream = new BufferedOutputStream( + foutStream); + ObjectOutputStream objOutputStream = new ObjectOutputStream( + boutStream); + objOutputStream.writeObject(o); + objOutputStream.flush(); + boutStream.close(); + } catch (IOException e) { + throw new RuntimeException("Error while saving data into file: '" + + filename + "'", e); + } + + System.out.println("saved classifier in file: " + filename); + } + + private UserInstanceBuilder instanceBuilder; + + private boolean pruneAfterTraining; + + public DTCreditClassifier(String name, TrainingSet ts, + UserInstanceBuilder instanceBuilder) { + + super(name, ts); + + this.instanceBuilder = instanceBuilder; + this.pruneAfterTraining = true; + + } + + public DTCreditClassifier(String name, UserDataset ds) { + this(name, ds, createDefaultInstanceBuilder()); + + } + + public DTCreditClassifier(String name, UserDataset ds, + UserInstanceBuilder instanceBuilder) { + + this(name, instanceBuilder.createTrainingSet(ds), instanceBuilder); + } + + public DTCreditClassifier(TrainingSet ts) { + this(createDefaultClassifierName(), ts, createDefaultInstanceBuilder()); + } + + public DTCreditClassifier(UserDataset ds) { + this(createDefaultClassifierName(), ds); + } + + public Concept classify(User u) { + return classify(instanceBuilder.createInstance(u)); + } + + public Concept classify(User u, boolean print) { + Concept c = classify(u); + if (print) { + System.out.println("Actual ---> " + u.getCategory() + + "\nAssigned -> " + c.getName()); + } + return c; + } + + 
public UserInstanceBuilder getInstanceBuilder() { + return this.instanceBuilder; + } + + public boolean isPruneAfterTraining() { + return pruneAfterTraining; + } + + public void setPruneAfterTraining(boolean pruneAfterTraining) { + this.pruneAfterTraining = pruneAfterTraining; + } + + @Override + public boolean train() { + boolean result = super.train(); + if (result && pruneAfterTraining) { + this.pruneTree(); + } + return result; + } + + public void useDefaultAttributes() { + trainOnAttribute(CreditInstance.ATTR_NAME_JOB_CLASS, true); + trainOnAttribute(CreditInstance.ATTR_NAME_INCOME_TYPE, true); + trainOnAttribute(CreditInstance.ATTR_NAME_AGE, true); + trainOnAttribute(CreditInstance.ATTR_NAME_CAR_OWNERSHIP, true); + trainOnAttribute(CreditInstance.ATTR_NAME_CREDIT_SCORE, true); + trainOnAttribute(CreditInstance.ATTR_NAME_MORTGAGE_DOWN_PAYMENT, true); + trainOnAttribute(CreditInstance.ATTR_NAME_MOTOR_BICYCLE_OWNERSHIP, true); + trainOnAttribute(CreditInstance.ATTR_NAME_OTHER_PROPERTY_OWNERSHIP, + true); + trainOnAttribute(CreditInstance.ATTR_NAME_CRIMINAL_RECORD, true); + trainOnAttribute(CreditInstance.ATTR_NAME_BANKRUPTCY, true); + trainOnAttribute(CreditInstance.ATTR_NAME_RETIREMENT_ACCOUNT, true); + } + +} diff --git a/src/org/yooreeka/examples/credit/NBCreditClassifier.java b/src/org/yooreeka/examples/credit/NBCreditClassifier.java new file mode 100644 index 0000000..1d57be5 --- /dev/null +++ b/src/org/yooreeka/examples/credit/NBCreditClassifier.java @@ -0,0 +1,121 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.credit; + +import org.yooreeka.algos.taxis.bayesian.NaiveBayes; +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; +import org.yooreeka.examples.credit.data.UserDataset; +import org.yooreeka.examples.credit.data.users.User; +import org.yooreeka.examples.credit.util.UserInstanceBuilder; + +public class NBCreditClassifier extends NaiveBayes { + + private static String createDefaultClassifierName() { + return NBCreditClassifier.class.getSimpleName(); + } + + private static UserInstanceBuilder createDefaultInstanceBuilder() { + // using Instance Builder configured to produce instances with String + // attributes + return new UserInstanceBuilder(false); + } + + private UserInstanceBuilder instanceBuilder; + + public NBCreditClassifier(String name, TrainingSet ts, + UserInstanceBuilder instanceBuilder) { + + super(name, ts); + + this.instanceBuilder = instanceBuilder; + } + + public NBCreditClassifier(String name, UserDataset ds) { + this(name, ds, createDefaultInstanceBuilder()); + } + + public NBCreditClassifier(String name, UserDataset ds, + UserInstanceBuilder instanceBuilder) { + + this(name, instanceBuilder.createTrainingSet(ds), instanceBuilder); + + } + + public NBCreditClassifier(TrainingSet ts) { + + super(createDefaultClassifierName(), ts); + + this.instanceBuilder = createDefaultInstanceBuilder(); + } + + public NBCreditClassifier(UserDataset ds) { + this(createDefaultClassifierName(), ds); + } + + @Override + public Concept classify(Instance instance) { + return super.classify(instance); + } + + public Concept classify(User user) { + return classify(instanceBuilder.createInstance(user)); + } + + public Concept classify(User u, boolean print) { + Concept c = classify(u); + if (print) { + System.out.println("Actual ---> " + u.getCategory() + + "\nAssigned -> " + c.getName()); + } + return c; + } + + public 
UserInstanceBuilder getInstanceBuilder() { + return this.instanceBuilder; + } + + public void useDefaultAttributes() { + trainOnAttribute(CreditInstance.ATTR_NAME_JOB_CLASS); + trainOnAttribute(CreditInstance.ATTR_NAME_INCOME_TYPE); + trainOnAttribute(CreditInstance.ATTR_NAME_AGE); + trainOnAttribute(CreditInstance.ATTR_NAME_CAR_OWNERSHIP); + trainOnAttribute(CreditInstance.ATTR_NAME_CREDIT_SCORE); + trainOnAttribute(CreditInstance.ATTR_NAME_MORTGAGE_DOWN_PAYMENT); + trainOnAttribute(CreditInstance.ATTR_NAME_MOTOR_BICYCLE_OWNERSHIP); + trainOnAttribute(CreditInstance.ATTR_NAME_OTHER_PROPERTY_OWNERSHIP); + trainOnAttribute(CreditInstance.ATTR_NAME_CRIMINAL_RECORD); + trainOnAttribute(CreditInstance.ATTR_NAME_BANKRUPTCY); + trainOnAttribute(CreditInstance.ATTR_NAME_RETIREMENT_ACCOUNT); + } + +} diff --git a/src/org/yooreeka/examples/credit/NNCreditClassifier.java b/src/org/yooreeka/examples/credit/NNCreditClassifier.java new file mode 100644 index 0000000..d9d6c3a --- /dev/null +++ b/src/org/yooreeka/examples/credit/NNCreditClassifier.java @@ -0,0 +1,406 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. 
See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */
package org.yooreeka.examples.credit;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.List;

import org.yooreeka.algos.taxis.core.DoubleAttribute;
import org.yooreeka.algos.taxis.core.TrainingSet;
import org.yooreeka.algos.taxis.core.intf.Attribute;
import org.yooreeka.algos.taxis.core.intf.Classifier;
import org.yooreeka.algos.taxis.core.intf.Concept;
import org.yooreeka.algos.taxis.core.intf.Instance;
import org.yooreeka.config.YooreekaConfigurator;
import org.yooreeka.examples.credit.data.UserDataset;
import org.yooreeka.examples.credit.data.users.User;
import org.yooreeka.examples.credit.util.UserInstanceBuilder;

/**
 * Neural-network based classifier for the credit example.
 *
 * <p>Feeds the selected instance attributes (which must be
 * {@link DoubleAttribute}s after conversion by the instance builder) into a
 * {@link UserCreditNN} and maps the strongest output node to one of the five
 * credit concepts in {@link #categories}.
 *
 * <p>The training set is {@code transient}: a classifier restored through
 * {@link #load(String)} can classify but cannot be re-trained.
 */
public class NNCreditClassifier implements Classifier, java.io.Serializable {

    private static final long serialVersionUID = 8584476885427513654L;

    /*
     * Directory used by save()/load(). Built with File.separator so the same
     * code works on every platform (the path was previously spelled with
     * Windows-only backslashes).
     */
    private static final String SERIALIZATION_PATH = YooreekaConfigurator
            .getHome()
            + File.separator + "data" + File.separator + "ch06"
            + File.separator;

    private static final int DEFAULT_TRAINING_ITERATIONS = 10;

    private static final double DEFAULT_LEARNING_RATE = 0.025;

    private static String createDefaultClassifierName() {
        return NNCreditClassifier.class.getSimpleName();
    }

    private static UserInstanceBuilder createDefaultInstanceBuilder() {
        // using Instance Builder configured to produce instances with Double
        // attributes
        return new UserInstanceBuilder(true);
    }

    /** Closes a stream, ignoring any failure; used only on cleanup paths. */
    private static void closeQuietly(Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // best-effort cleanup: the caller has already produced its result
            // or is already propagating the primary exception
        }
    }

    /**
     * Restores a classifier that was written by {@link #save()}.
     *
     * <p>NOTE(review): this uses Java native deserialization; only load files
     * this application wrote itself.
     *
     * @param filename file name relative to the serialization directory
     * @return the deserialized classifier
     * @throws IllegalArgumentException if the file does not exist
     * @throws RuntimeException if the file cannot be read or deserialized
     */
    public static NNCreditClassifier load(String filename) {

        File f = new File(SERIALIZATION_PATH + filename);
        if (!f.exists()) {
            throw new IllegalArgumentException("File doesn't exist: '"
                    + filename + "'.");
        }

        Object o = null;
        FileInputStream fInStream = null;
        try {
            fInStream = new FileInputStream(f);
            ObjectInputStream objInStream = new ObjectInputStream(
                    new BufferedInputStream(fInStream));
            o = objInStream.readObject();
        } catch (Exception e) {
            throw new RuntimeException(
                    "Error while loading data from file: '" + filename
                            + "'", e);
        } finally {
            // Releases the file handle on every path, including a failed
            // ObjectInputStream constructor or readObject().
            closeQuietly(fInStream);
        }

        System.out.println("loaded classifier from file: " + filename);

        return (NNCreditClassifier) o;
    }

    private boolean verbose = false;

    private String name;

    /*
     * Neural Network that will be used by this classifier.
     */
    private UserCreditNN nn;

    /*
     * Number of times to feed training instances into the network during
     * training.
     */
    private int nTrainingIterations = DEFAULT_TRAINING_ITERATIONS;

    /*
     * Learning rate that will be used in NN training.
     */
    private double learningRate = DEFAULT_LEARNING_RATE;

    private transient TrainingSet ts;

    private UserInstanceBuilder instanceBuilder;

    /*
     * Attribute names that should be used as Neural Network inputs.
     * Position i in this list feeds input node i.
     */
    private List<String> availableAttributeNames;

    /* Concept labels; index k corresponds to output node k of the network. */
    String[] categories = new String[] { CreditConcept.CONCEPT_LABEL_EX,
            CreditConcept.CONCEPT_LABEL_VG, CreditConcept.CONCEPT_LABEL_GD,
            CreditConcept.CONCEPT_LABEL_BD, CreditConcept.CONCEPT_LABEL_DN };

    /**
     * Creates a classifier over an existing training set. No input attributes
     * are selected yet; call {@link #trainOnAttribute(String)} or
     * {@link #useDefaultAttributes()} before training.
     */
    public NNCreditClassifier(String name, TrainingSet ts,
            UserInstanceBuilder instanceBuilder) {

        this.name = name;
        this.ts = ts;
        this.instanceBuilder = instanceBuilder;
        this.availableAttributeNames = new ArrayList<String>();

        nn = createNeuralNetwork();
    }

    /** Creates a classifier from a user dataset with the default (Double-attribute) builder. */
    public NNCreditClassifier(String name, UserDataset ds) {
        this(name, ds, createDefaultInstanceBuilder());
    }

    /** Creates a classifier whose training set is derived from {@code ds} by {@code instanceBuilder}. */
    public NNCreditClassifier(String name, UserDataset ds,
            UserInstanceBuilder instanceBuilder) {
        this(name, instanceBuilder.createTrainingSet(ds), instanceBuilder);
    }

    /** Creates a classifier with the default name and default builder. */
    public NNCreditClassifier(TrainingSet ts) {
        this(createDefaultClassifierName(), ts, createDefaultInstanceBuilder());
    }

    /** Creates a classifier with the default name from a user dataset. */
    public NNCreditClassifier(UserDataset ds) {
        this(createDefaultClassifierName(), ds);
    }

    /**
     * Runs the instance through the network and returns the concept of the
     * output node with the largest value.
     */
    public Concept classify(Instance instance) {

        double[] x = createNNInputs(instance);

        double[] y = nn.classify(x);

        Concept c = createConceptFromNNOutput(y);

        if (verbose) {
            System.out.println("\nAssessment:\n >> This is a " + c.getName());
        }
        return c;
    }

    /** Converts the user into an instance and classifies it. */
    public Concept classify(User user) {
        if (verbose) {
            System.out.println("User:\n >> " + user.toString());
        }
        return classify(instanceBuilder.createInstance(user));
    }

    /**
     * Classifies the user and, when {@code print} is true, prints the user's
     * own category next to the assigned concept name.
     */
    public Concept classify(User u, boolean print) {
        Concept c = classify(u);
        if (print) {
            System.out.println("Actual ---> " + u.getCategory()
                    + "\nAssigned -> " + c.getName());
        }
        return c;
    }

    /* Picks the first output node holding the maximum value (ties keep the lower index). */
    private Concept createConceptFromNNOutput(double[] y) {

        int categoryIndex = 0;
        for (int i = 1, n = y.length; i < n; i++) {
            if (y[i] > y[categoryIndex]) {
                categoryIndex = i;
            }
        }

        return new CreditConcept(categories[categoryIndex]);
    }

    private UserCreditNN createNeuralNetwork() {

        String nnName = "NNUserCreditClassifierNN";

        UserCreditNN nn = new UserCreditNN(nnName);
        // set custom parameters and recreate the network
        nn.setLearningRate(learningRate);
        nn.removeAllNodesAndLayers();
        nn.create();
        return nn;
    }

    /**
     * Builds the network input vector for an instance: one value per input
     * node, taken from the selected attributes in selection order.
     *
     * @throws RuntimeException if a selected attribute is missing from the
     *         converted instance or is not a {@link DoubleAttribute}
     */
    public double[] createNNInputs(Instance instance) {

        /*
         * Converting all String attributes into Double attributes.
         */
        Instance convertedInstance = instanceBuilder.createInstance(instance);

        int nInputNodes = nn.getInputNodeCount();

        double[] x = new double[nInputNodes];

        for (int i = 0; i < nInputNodes; i++) {

            String attrName = this.availableAttributeNames.get(i);
            Attribute a = convertedInstance.getAttributeByName(attrName);

            if (a == null) {
                throw new RuntimeException(
                        "Failed to find attribute with name: '" + attrName
                                + "'. Instance: "
                                + convertedInstance.toString());
            }
            if (!(a instanceof DoubleAttribute)) {
                throw new RuntimeException(
                        "Invalid attribute type. Only "
                                + DoubleAttribute.class.getSimpleName()
                                + " attribute"
                                + " types can be used in NN. Actual attribute type: "
                                + a.getClass().getSimpleName());
            }

            x[i] = (Double) a.getValue();
        }

        return x;
    }

    /**
     * Builds the expected output vector for a training instance: 1.0 at the
     * node whose category equals the instance's concept name, 0.0 elsewhere.
     */
    public double[] createNNOutputs(Instance i) {

        int nOutputNodes = nn.getOutputNodeCount();

        // The concept name does not change while the vector is filled, so it
        // is read once instead of on every loop pass.
        String category = i.getConcept().getName();

        double[] y = new double[nOutputNodes];
        for (int n = 0; n < nOutputNodes; n++) {
            y[n] = getOutputValue(n, category);
        }

        return y;
    }

    public UserInstanceBuilder getInstanceBuilder() {
        return this.instanceBuilder;
    }

    public double getLearningRate() {
        return learningRate;
    }

    /**
     * @return the name
     */
    public String getName() {
        return name;
    }

    private double getOutputValue(int i, String category) {
        return categories[i].equals(category) ? 1.0 : 0.0;
    }

    /**
     * @return the verbose
     */
    public boolean isVerbose() {
        return verbose;
    }

    /**
     * Serializes this classifier into the serialization directory, using the
     * classifier name as the file name.
     *
     * @throws RuntimeException if the file cannot be written
     */
    public void save() {

        String filename = SERIALIZATION_PATH + this.getName();
        FileOutputStream foutStream = null;
        try {
            foutStream = new FileOutputStream(new File(filename));
            ObjectOutputStream objOutputStream = new ObjectOutputStream(
                    new BufferedOutputStream(foutStream));
            objOutputStream.writeObject(this);
            // close() flushes the whole stream chain; doing it inside the try
            // block lets a failed final write surface as an error.
            objOutputStream.close();
        } catch (IOException e) {
            throw new RuntimeException("Error while saving data into file: '"
                    + filename + "'", e);
        } finally {
            // Safety net for the failure paths; a second close is harmless.
            closeQuietly(foutStream);
        }

        System.out.println("saved classifier in file: " + filename);
    }

    /**
     * Stores the learning rate on this classifier.
     *
     * <p>NOTE(review): the network is created in the constructor with the
     * learning rate in effect at that time, and this setter does not touch the
     * network. Confirm whether callers expect a later change to reach it.
     */
    public void setLearningRate(double learningRate) {
        this.learningRate = learningRate;
    }

    public void setNTrainingIterations(int trainingIterations) {
        nTrainingIterations = trainingIterations;
    }

    /**
     * @param verbose
     *            the verbose to set
     */
    public void setVerbose(boolean verbose) {
        this.verbose = verbose;
    }

    /**
     * Trains the network on the training set for the configured number of
     * iterations and prints the elapsed time.
     *
     * @return always {@code true}
     * @throws RuntimeException if there is no training set, or if the number
     *         of selected attributes differs from the number of input nodes
     */
    public boolean train() {

        long t0 = System.currentTimeMillis();

        if (ts == null) {
            throw new RuntimeException(
                    "Can't train classifier - training dataset is null.");
        }

        if (nn.getInputNodeCount() != availableAttributeNames.size()) {
            throw new RuntimeException(
                    "Number of attributes doesn't match with the number of input nodes."
                            + "Attributes: " + availableAttributeNames.size()
                            + ", Input nodes: " + nn.getInputNodeCount());
        }

        trainNeuralNetwork(nTrainingIterations);

        System.out.print(" Neural network training completed in ");
        System.out.println((System.currentTimeMillis() - t0) + " (ms)");

        return true;
    }

    /* One iteration = one pass over every instance of the training set. */
    private void trainNeuralNetwork(int nIterations) {

        for (int i = 1; i <= nIterations; i++) {
            for (Instance instance : ts.getInstances().values()) {
                double[] nnInput = createNNInputs(instance);
                double[] nnExpectedOutput = createNNOutputs(instance);

                nn.train(nnInput, nnExpectedOutput);
            }

            if (verbose) {
                System.out.println("finished training pass: " + i + " out of "
                        + nIterations);
            }
        }
    }

    /** Appends an attribute name; the n-th added name feeds input node n-1. */
    public void trainOnAttribute(String name) {
        availableAttributeNames.add(name);
    }

    /**
     * This method facilitates the loading of training attributes: it selects
     * the eleven credit attributes in a fixed order.
     */
    public void useDefaultAttributes() {
        trainOnAttribute(CreditInstance.ATTR_NAME_JOB_CLASS);
        trainOnAttribute(CreditInstance.ATTR_NAME_INCOME_TYPE);
        trainOnAttribute(CreditInstance.ATTR_NAME_AGE);
        trainOnAttribute(CreditInstance.ATTR_NAME_CAR_OWNERSHIP);
        trainOnAttribute(CreditInstance.ATTR_NAME_CREDIT_SCORE);
        trainOnAttribute(CreditInstance.ATTR_NAME_MORTGAGE_DOWN_PAYMENT);
        trainOnAttribute(CreditInstance.ATTR_NAME_MOTOR_BICYCLE_OWNERSHIP);
        trainOnAttribute(CreditInstance.ATTR_NAME_OTHER_PROPERTY_OWNERSHIP);
        trainOnAttribute(CreditInstance.ATTR_NAME_CRIMINAL_RECORD);
        trainOnAttribute(CreditInstance.ATTR_NAME_BANKRUPTCY);
        trainOnAttribute(CreditInstance.ATTR_NAME_RETIREMENT_ACCOUNT);
    }

}
diff --git a/src/org/yooreeka/examples/credit/UserCreditNN.java b/src/org/yooreeka/examples/credit/UserCreditNN.java new file mode 100644 index 0000000..17090e1 --- /dev/null +++
b/src/org/yooreeka/examples/credit/UserCreditNN.java @@ -0,0 +1,211 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.credit; + +import org.yooreeka.algos.taxis.networks.neural.core.BaseNN; +import org.yooreeka.algos.taxis.networks.neural.core.intf.Layer; + +public class UserCreditNN extends BaseNN { + + private static final long serialVersionUID = 5049921699478904263L; + + public UserCreditNN(String name) { + super(name); + + create(); + } + + public void create() { + createNN_11_7_5(); + } + + private void createNN_11_7_5() { + + // 1. Define Layers, Nodes and Node Biases + Layer inputLayer = createInputLayer(0, // layer id + 11 // number of nodes + ); + + Layer hiddenLayer = createHiddenLayer(1, // layer id + 7, // number of nodes + new double[] { 0.5, -1, 1.5, 0.5, 1, -0.2, 0.1 } // node biases + ); + + Layer outputLayer = createOutputLayer(2, // layer id + 5, // number of nodes + new double[] { -1.5, 0.5, -1, 0.5, 1 } // node biases + ); + + setInputLayer(inputLayer); + setOutputLayer(outputLayer); + addHiddenLayer(hiddenLayer); + + // 2. Define links and weights between nodes + // Id format: + + // Weights for links from Input Layer to Hidden Layer + setLink("0:0", "1:0", 0.25); + setLink("0:0", "1:1", -0.7); + setLink("0:0", "1:2", 0.25); + setLink("0:0", "1:3", 0.25); + setLink("0:0", "1:4", -0.3); + setLink("0:0", "1:5", 0.25); + setLink("0:0", "1:6", -0.5); + + setLink("0:1", "1:0", 0.25); + setLink("0:1", "1:1", -0.5); + setLink("0:1", "1:2", 0.25); + setLink("0:1", "1:3", 0.25); + setLink("0:1", "1:4", 0.5); + setLink("0:1", "1:5", 0.25); + setLink("0:1", "1:6", 0.5); + + setLink("0:2", "1:0", 0.25); + setLink("0:2", "1:1", -0.5); + setLink("0:2", "1:2", 0.25); + setLink("0:2", "1:3", 0.25); + setLink("0:2", "1:4", -0.5); + setLink("0:2", "1:5", 0.25); + setLink("0:2", "1:6", -0.5); + + setLink("0:3", "1:0", 0.25); + setLink("0:3", "1:1", -0.5); + setLink("0:3", "1:2", -0.25); + setLink("0:3", "1:3", -0.25); + setLink("0:3", "1:4", -0.5); + setLink("0:3", "1:5", 0.25); + setLink("0:3", "1:6", 0.5); + + setLink("0:4", "1:0", 
0.25); + setLink("0:4", "1:1", -0.5); + setLink("0:4", "1:2", 0.25); + setLink("0:4", "1:3", 0.25); + setLink("0:4", "1:4", -0.5); + setLink("0:4", "1:5", 0.25); + setLink("0:4", "1:6", -0.5); + + setLink("0:5", "1:0", 0.25); + setLink("0:5", "1:1", -0.5); + setLink("0:5", "1:2", 0.25); + setLink("0:5", "1:3", 0.25); + setLink("0:5", "1:4", -0.5); + setLink("0:5", "1:5", 0.25); + setLink("0:5", "1:6", -0.5); + + setLink("0:6", "1:0", -0.25); + setLink("0:6", "1:1", 0.5); + setLink("0:6", "1:2", -0.25); + setLink("0:6", "1:3", 0.25); + setLink("0:6", "1:4", -0.5); + setLink("0:6", "1:5", 0.25); + setLink("0:6", "1:6", 0.5); + + setLink("0:7", "1:0", 0.25); + setLink("0:7", "1:1", -0.5); + setLink("0:7", "1:2", 0.25); + setLink("0:7", "1:3", 0.25); + setLink("0:7", "1:4", -0.5); + setLink("0:7", "1:5", 0.25); + setLink("0:7", "1:6", -0.5); + + setLink("0:8", "1:0", 0.25); + setLink("0:8", "1:1", -0.5); + setLink("0:8", "1:2", 0.25); + setLink("0:8", "1:3", 0.25); + setLink("0:8", "1:4", -0.5); + setLink("0:8", "1:5", 0.25); + setLink("0:8", "1:6", 0.8); + + setLink("0:9", "1:0", 0.25); + setLink("0:9", "1:1", 0.5); + setLink("0:9", "1:2", -0.25); + setLink("0:9", "1:3", -0.25); + setLink("0:9", "1:4", 0.5); + setLink("0:9", "1:5", 0.25); + setLink("0:9", "1:6", 0.5); + + setLink("0:10", "1:0", 0.25); + setLink("0:10", "1:1", -0.5); + setLink("0:10", "1:2", 0.25); + setLink("0:10", "1:3", 0.25); + setLink("0:10", "1:4", 0.5); + setLink("0:10", "1:5", 0.25); + setLink("0:10", "1:6", -0.5); + + // Weights for links from Hidden Layer to Output Layer + + setLink("1:0", "2:0", -0.5); + setLink("1:1", "2:0", 0.5); + setLink("1:2", "2:0", 0.5); + setLink("1:3", "2:0", 0.5); + setLink("1:4", "2:0", 0.5); + setLink("1:5", "2:0", -0.5); + setLink("1:6", "2:0", 0.5); + + setLink("1:0", "2:1", -0.5); + setLink("1:1", "2:1", 0.5); + setLink("1:2", "2:1", -0.5); + setLink("1:3", "2:1", -0.5); + setLink("1:4", "2:1", 0.5); + setLink("1:5", "2:1", -0.5); + setLink("1:6", "2:1", 0.5); 
+ + setLink("1:0", "2:2", -0.5); + setLink("1:1", "2:2", 0.5); + setLink("1:2", "2:2", -0.5); + setLink("1:3", "2:2", -0.5); + setLink("1:4", "2:2", 0.5); + setLink("1:5", "2:2", -0.5); + setLink("1:6", "2:2", 0.5); + + setLink("1:0", "2:3", -0.5); + setLink("1:1", "2:3", 0.5); + setLink("1:2", "2:3", -0.5); + setLink("1:3", "2:3", -0.5); + setLink("1:4", "2:3", 0.5); + setLink("1:5", "2:3", -0.5); + setLink("1:6", "2:3", 0.5); + + setLink("1:0", "2:4", -0.5); + setLink("1:1", "2:4", 0.5); + setLink("1:2", "2:4", -0.5); + setLink("1:3", "2:4", -0.5); + setLink("1:4", "2:4", 0.5); + setLink("1:5", "2:4", -0.5); + setLink("1:6", "2:4", 0.5); + + if (isVerbose()) { + System.out.println("NN created"); + } + + } + +} diff --git a/src/org/yooreeka/examples/credit/data/UseCaseData.java b/src/org/yooreeka/examples/credit/data/UseCaseData.java new file mode 100644 index 0000000..5baa5f8 --- /dev/null +++ b/src/org/yooreeka/examples/credit/data/UseCaseData.java @@ -0,0 +1,194 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit.data; + +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.examples.credit.data.users.BadUserType; +import org.yooreeka.examples.credit.data.users.DangerousUserType; +import org.yooreeka.examples.credit.data.users.ExcellentUserType; +import org.yooreeka.examples.credit.data.users.GoodUserType; +import org.yooreeka.examples.credit.data.users.User; +import org.yooreeka.examples.credit.data.users.UserType; +import org.yooreeka.examples.credit.data.users.VeryGoodUserType; +import org.yooreeka.examples.credit.util.CreditDataUtils; +import org.yooreeka.examples.credit.util.DataGenerator; + +/** + * Example for how to configure and generate file with transactions. + */ +public class UseCaseData { + + /* + * Generated transactions will be saved into this file. 
+ */ + public static String TRAINING_USERS_FILENAME = YooreekaConfigurator + .getHome() + "/data/ch06/generated-training-users.txt"; + + public static String TEST_USERS_FILENAME = YooreekaConfigurator.getHome() + + "/data/ch06/generated-test-users.txt"; + + public static void main(String[] args) { + + UseCaseData useCaseData = new UseCaseData(100000, 50000); + // UseCaseData useCaseData = new UseCaseData(10000,5000); + useCaseData.create(); + } + + DataGenerator dataGenerator = new DataGenerator(); + // INSTANCE VARIABLES + int nTrainingUsers; + + int nTestUsers; + + public UseCaseData(int nTrainingUsers, int nTestUsers) { + this.nTrainingUsers = nTrainingUsers; + this.nTestUsers = nTestUsers; + } + + public void create() { + + System.out + .println("Creating data for the credit worthiness (score) use case:"); + System.out.println(" Number of users in the training set: " + + nTrainingUsers); + System.out.println(" Number of users in the testing set: " + + nTestUsers); + System.out + .println("___________________________________________________________"); + + List trainingUserTypes = createUserTypes(nTrainingUsers); + int userIdSequenceStart = 1; + generateUsers(TRAINING_USERS_FILENAME, userIdSequenceStart, + trainingUserTypes); + + dataGenerator.setNoiseOn(true); + + List testUserTypes = createUserTypes(nTestUsers); + userIdSequenceStart = 500000; + // generateUsers(TEST_USERS_FILENAME, 2*nTrainingUsers, testUserTypes); + generateUsers(TEST_USERS_FILENAME, userIdSequenceStart, testUserTypes); + + System.out.println("Done!"); + } + + public void create(boolean overwrite) { + if (overwrite) { + TRAINING_USERS_FILENAME = YooreekaConfigurator.getHome() + + "/data/ch06/training-users.txt"; + TEST_USERS_FILENAME = YooreekaConfigurator.getHome() + + "/data/ch06/test-users.txt"; + } + create(); + } + + public List createUserTypes(int nUsers) { + List allUserTypes = new ArrayList(); + + // Excellent credit users + // 5% of the total number of users + UserType userType = new 
ExcellentUserType(); + userType.setNUsers((int) (nUsers * 0.05)); + + allUserTypes.add(userType); + + // Very good credit users + // 15% of the total number of users + userType = new VeryGoodUserType(); + userType.setNUsers((int) (nUsers * 0.15)); + + allUserTypes.add(userType); + + // Good credit users + // 50% of the total number of users + userType = new GoodUserType(); + userType.setNUsers((int) (nUsers * 0.50)); + + allUserTypes.add(userType); + + // Bad credit users + // 25% of the total number of users + userType = new BadUserType(); + userType.setNUsers((int) (nUsers * 0.25)); + + allUserTypes.add(userType); + + // Dangerous credit users + // 5% of the total number of users + userType = new DangerousUserType(); + userType.setNUsers((int) (nUsers * 0.05)); + allUserTypes.add(userType); + + return allUserTypes; + } + + public void generateUsers(String filename, int nextUserId, + List userTypes) { + + dataGenerator.setNextUserId(nextUserId); + System.out.println("Generating users..."); + List allUsers = dataGenerator.generateUsers(userTypes); + System.out.println("Saving users into '" + filename + "'"); + CreditDataUtils.saveUsers(filename, allUsers); + } + + /** + * @return the nTestUsers + */ + public int getTestUsers() { + return nTestUsers; + } + + /** + * @return the nTrainingUsers + */ + public int getTrainingUsers() { + return nTrainingUsers; + } + + /** + * @param testUsers + * the nTestUsers to set + */ + public void setTestUsers(int n) { + nTestUsers = n; + } + + /** + * @param trainingUsers + * the nTrainingUsers to set + */ + public void setTrainingUsers(int n) { + nTrainingUsers = n; + } +} diff --git a/src/org/yooreeka/examples/credit/data/UserDataset.java b/src/org/yooreeka/examples/credit/data/UserDataset.java new file mode 100644 index 0000000..9e7f3be --- /dev/null +++ b/src/org/yooreeka/examples/credit/data/UserDataset.java @@ -0,0 +1,80 @@ +/* + * ________________________________________________________________________________________ + * + 
* Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.credit.data; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.yooreeka.examples.credit.data.users.User; + +public class UserDataset { + + private Map usersByUsernameMap; + + public UserDataset(List userList) { + this.usersByUsernameMap = new HashMap(userList.size()); + + for (User e : userList) { + String username = e.getUsername(); + usersByUsernameMap.put(username, e); + } + } + + public User findUserByUsername(String username) { + return usersByUsernameMap.get(username); + } + + public int getSize() { + return usersByUsernameMap.size(); + } + + public List getUsers() { + return new ArrayList(usersByUsernameMap.values()); + } + + public void printAll() { + for (Map.Entry e : usersByUsernameMap.entrySet()) { + User u = e.getValue(); + System.out.println(u); + } + } + + public void printUser(String username) { + User e = findUserByUsername(username); + if (e != null) { + System.out.println(e.toString()); + } else { + System.out.println("User not found (username: '" + username + "')"); + } + } +} diff --git a/src/org/yooreeka/examples/credit/data/UserLoader.java b/src/org/yooreeka/examples/credit/data/UserLoader.java new file mode 100644 index 0000000..14a2dcf --- /dev/null +++ b/src/org/yooreeka/examples/credit/data/UserLoader.java @@ -0,0 +1,70 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.credit.data; + +import java.util.List; + +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.examples.credit.data.users.User; +import org.yooreeka.examples.credit.util.CreditDataUtils; + +public class UserLoader { + + public static final String TRAINING_USERS_FILE = YooreekaConfigurator + .getHome() + "/data/ch06/training-users.txt"; + + public static final String TEST_USERS_FILE = YooreekaConfigurator.getHome() + + "/data/ch06/test-users.txt"; + + public static UserDataset loadTestDataset() { + List allUsers = loadUsers(TEST_USERS_FILE); + return new UserDataset(allUsers); + } + + public static UserDataset loadTestDataset(String filename) { + List allUsers = loadUsers(filename); + return new UserDataset(allUsers); + } + + public static UserDataset loadTrainingDataset() { + List allUsers = loadUsers(TRAINING_USERS_FILE); + return new UserDataset(allUsers); + } + + public static UserDataset loadTrainingDataset(String filename) { + List allUsers = loadUsers(filename); + return new UserDataset(allUsers); + } + + public static List loadUsers(String filename) { + return CreditDataUtils.loadUsers(filename); + } +} diff --git a/src/org/yooreeka/examples/credit/data/users/BadUserType.java b/src/org/yooreeka/examples/credit/data/users/BadUserType.java new file mode 100644 index 0000000..da776d9 --- /dev/null +++ b/src/org/yooreeka/examples/credit/data/users/BadUserType.java @@ -0,0 +1,53 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.credit.data.users; + +public class BadUserType extends UserType { + + { + setAge(new int[] { 1, 8, 9, 10 }); + setBancruptcy(new int[] { 0, 1 }); + setCarOwnership(new int[] { 0, 1 }); + setCreditScore(new int[] { 1, 2, 3, 4 }); + setCriminalRecord(new int[] { 0 }); + setDownPayment(new int[] { 1, 2 }); + setIncome(new int[] { 3, 4, 5, 6 }); + setJobClass(new int[] { 4, 5 }); + setMotorcycleOwnership(new int[] { 0, 1 }); + setPropertyOwnership(new int[] { 0 }); + setRetirementAccounts(new int[] { 1, 2 }); + } + + @Override + public String getUserType() { + return UserType.BAD; + } +} \ No newline at end of file diff --git a/src/org/yooreeka/examples/credit/data/users/DangerousUserType.java b/src/org/yooreeka/examples/credit/data/users/DangerousUserType.java new file mode 100644 index 0000000..8483a2f --- /dev/null +++ b/src/org/yooreeka/examples/credit/data/users/DangerousUserType.java @@ -0,0 +1,53 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit.data.users; + +public class DangerousUserType extends UserType { + + { + setAge(new int[] { 1, 2, 9, 10 }); + setBancruptcy(new int[] { 1 }); + setCarOwnership(new int[] { 0 }); + setCreditScore(new int[] { 1, 2 }); + setCriminalRecord(new int[] { 1 }); + setDownPayment(new int[] { 1, 2 }); + setIncome(new int[] { 1, 2, 3 }); + setJobClass(new int[] { 4, 5 }); + setMotorcycleOwnership(new int[] { 0 }); + setPropertyOwnership(new int[] { 0 }); + setRetirementAccounts(new int[] { 1 }); + } + + @Override + public String getUserType() { + return UserType.DANGEROUS; + } +} diff --git a/src/org/yooreeka/examples/credit/data/users/ExcellentUserType.java b/src/org/yooreeka/examples/credit/data/users/ExcellentUserType.java new file mode 100644 index 0000000..d938a16 --- /dev/null +++ b/src/org/yooreeka/examples/credit/data/users/ExcellentUserType.java @@ -0,0 +1,53 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.credit.data.users; + +public class ExcellentUserType extends UserType { + + { + setAge(new int[] { 1, 8, 9, 10 }); + setBancruptcy(new int[] { 0 }); + setCarOwnership(new int[] { 1 }); + setCreditScore(new int[] { 6, 7, 8 }); + setCriminalRecord(new int[] { 0 }); + setDownPayment(new int[] { 4 }); + setIncome(new int[] { 7, 8, 9, 10 }); + setJobClass(new int[] { 2, 3, 4, 5 }); + setMotorcycleOwnership(new int[] { 0, 1 }); + setPropertyOwnership(new int[] { 1 }); + setRetirementAccounts(new int[] { 5, 6, 7, 8 }); + } + + @Override + public String getUserType() { + return UserType.EXCELLENT; + } +} \ No newline at end of file diff --git a/src/org/yooreeka/examples/credit/data/users/GoodUserType.java b/src/org/yooreeka/examples/credit/data/users/GoodUserType.java new file mode 100644 index 0000000..7b0927d --- /dev/null +++ b/src/org/yooreeka/examples/credit/data/users/GoodUserType.java @@ -0,0 +1,53 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit.data.users; + +public class GoodUserType extends UserType { + + { + setAge(new int[] { 2, 3, 4, 5, 6, 7, 8 }); + setBancruptcy(new int[] { 0 }); + setCarOwnership(new int[] { 1 }); + setCreditScore(new int[] { 3, 4, 5, 6 }); + setCriminalRecord(new int[] { 0 }); + setDownPayment(new int[] { 2, 3 }); + setIncome(new int[] { 5, 6, 7, 8 }); + setJobClass(new int[] { 2, 3, 4, 5 }); + setMotorcycleOwnership(new int[] { 0, 1 }); + setPropertyOwnership(new int[] { 0, 1 }); + setRetirementAccounts(new int[] { 1, 2, 3, 4 }); + } + + @Override + public String getUserType() { + return UserType.GOOD; + } +} \ No newline at end of file diff --git a/src/org/yooreeka/examples/credit/data/users/User.java b/src/org/yooreeka/examples/credit/data/users/User.java new file mode 100644 index 0000000..693d51f --- /dev/null +++ b/src/org/yooreeka/examples/credit/data/users/User.java @@ -0,0 +1,319 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * A single credit user: a username plus eleven integer-coded attributes.
 *
 * <p>
 * A user can be converted to and from a one-line text form (see
 * {@link #toExternalString()} and {@link #loadFromExternalString(String)}) in
 * which the twelve fields are separated by {@code ':'}. Because the separator
 * is not escaped, a username that itself contains {@code ':'} does not survive
 * a round trip.
 */
public class User {

    /** Separator between the fields of the external text form. */
    private static final String FIELD_SEPARATOR = ":";

    /** Number of fields in the external text form: username + 11 attributes. */
    private static final int FIELD_COUNT = 12;

    /** Number of leading username characters returned by getCategory(). */
    private static final int CATEGORY_LENGTH = 2;

    private String username;
    private int jobClass;
    private int carOwnership;
    private int bicycleOwnership;
    private int propertyOwnership;
    private int retirementAccount;
    private int creditScore;
    private int age;
    private int downPayment;
    private int bankruptcy;
    private int criminalRecord;
    private int income;

    /**
     * Creates a user with a {@code null} username and all attributes at 0.
     */
    public User() {
        // empty
    }

    /**
     * Two users are equal when they are of the same class and their username
     * and all eleven attributes are equal.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final User other = (User) obj;
        return age == other.age
                && carOwnership == other.carOwnership
                && creditScore == other.creditScore
                && income == other.income
                && jobClass == other.jobClass
                && downPayment == other.downPayment
                && bicycleOwnership == other.bicycleOwnership
                && propertyOwnership == other.propertyOwnership
                && criminalRecord == other.criminalRecord
                && bankruptcy == other.bankruptcy
                && retirementAccount == other.retirementAccount
                && (username == null ? other.username == null
                        : username.equals(other.username));
    }

    /**
     * @return the age
     */
    public int getAge() {
        return age;
    }

    /**
     * @return the bankruptcy
     */
    public int getBankruptcy() {
        return bankruptcy;
    }

    /**
     * @return the bicycleOwnership
     */
    public int getBicycleOwnership() {
        return bicycleOwnership;
    }

    /**
     * @return the carOwnership
     */
    public int getCarOwnership() {
        return carOwnership;
    }

    /**
     * Returns the first two characters of the username.
     *
     * <p>
     * NOTE(review): this presumably is the two-letter credit type code that
     * the data generator puts in front of generated usernames -- confirm
     * against the generator.
     *
     * @return the first two characters of the username
     * @throws IllegalStateException
     *             if the username is {@code null} or shorter than two
     *             characters
     */
    public String getCategory() {
        if (username == null || username.length() < CATEGORY_LENGTH) {
            throw new IllegalStateException(
                    "Cannot derive category: username must have at least "
                            + CATEGORY_LENGTH + " characters but was '"
                            + username + "'");
        }
        return username.substring(0, CATEGORY_LENGTH);
    }

    /**
     * @return the creditScore
     */
    public int getCreditScore() {
        return creditScore;
    }

    /**
     * @return the criminalRecord
     */
    public int getCriminalRecord() {
        return criminalRecord;
    }

    /**
     * @return the downPayment
     */
    public int getDownPayment() {
        return downPayment;
    }

    /**
     * @return the income
     */
    public int getIncome() {
        return income;
    }

    /**
     * @return the jobClass
     */
    public int getJobClass() {
        return jobClass;
    }

    /**
     * @return the propertyOwnership
     */
    public int getPropertyOwnership() {
        return propertyOwnership;
    }

    /**
     * @return the retirementAccount
     */
    public int getRetirementAccount() {
        return retirementAccount;
    }

    /**
     * @return the username
     */
    public String getUsername() {
        return username;
    }

    /**
     * Consistent with {@link #equals(Object)}: combines the same twelve fields.
     */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + age;
        result = prime * result + carOwnership;
        result = prime * result + creditScore;
        result = prime * result + income;
        result = prime * result + jobClass;
        result = prime * result + downPayment;
        result = prime * result + bicycleOwnership;
        result = prime * result + propertyOwnership;
        result = prime * result + criminalRecord;
        result = prime * result + bankruptcy;
        result = prime * result + retirementAccount;
        result = prime * result
                + ((username == null) ? 0 : username.hashCode());
        return result;
    }

    /**
     * Overwrites every field of this user from the text form produced by
     * {@link #toExternalString()}.
     *
     * <p>
     * Field order: username, jobClass, carOwnership, bicycleOwnership,
     * propertyOwnership, retirementAccount, creditScore, age, downPayment,
     * bankruptcy, criminalRecord, income. Fields beyond the twelfth are
     * ignored. If the text is rejected, this user is left unchanged.
     *
     * @param text
     *            the {@code ':'}-separated record
     * @throws IllegalArgumentException
     *             if {@code text} is {@code null} or has fewer than twelve
     *             fields
     * @throws NumberFormatException
     *             if any of the eleven attribute fields is not an integer
     */
    public void loadFromExternalString(String text) {
        if (text == null) {
            throw new IllegalArgumentException("User record must not be null");
        }

        String[] values = text.split(FIELD_SEPARATOR);
        if (values.length < FIELD_COUNT) {
            throw new IllegalArgumentException("Expected " + FIELD_COUNT
                    + " '" + FIELD_SEPARATOR + "'-separated fields but found "
                    + values.length + " in: '" + text + "'");
        }

        // Parse everything before touching any field, so that a malformed
        // record cannot leave this user half-overwritten.
        int[] parsed = new int[FIELD_COUNT - 1];
        for (int i = 0; i < parsed.length; i++) {
            parsed[i] = Integer.parseInt(values[i + 1]);
        }

        username = values[0];
        jobClass = parsed[0];
        carOwnership = parsed[1];
        bicycleOwnership = parsed[2];
        propertyOwnership = parsed[3];
        retirementAccount = parsed[4];
        creditScore = parsed[5];
        age = parsed[6];
        downPayment = parsed[7];
        bankruptcy = parsed[8];
        criminalRecord = parsed[9];
        income = parsed[10];
    }

    /**
     * @param age
     *            the age to set
     */
    public void setAge(int age) {
        this.age = age;
    }

    /**
     * @param bankruptcy
     *            the bankruptcy to set
     */
    public void setBankruptcy(int bankruptcy) {
        this.bankruptcy = bankruptcy;
    }

    /**
     * @param bicycleOwnership
     *            the bicycleOwnership to set
     */
    public void setBicycleOwnership(int bicycleOwnership) {
        this.bicycleOwnership = bicycleOwnership;
    }

    /**
     * @param carOwnership
     *            the carOwnership to set
     */
    public void setCarOwnership(int carOwnership) {
        this.carOwnership = carOwnership;
    }

    /**
     * @param creditScore
     *            the creditScore to set
     */
    public void setCreditScore(int creditScore) {
        this.creditScore = creditScore;
    }

    /**
     * @param criminalRecord
     *            the criminalRecord to set
     */
    public void setCriminalRecord(int criminalRecord) {
        this.criminalRecord = criminalRecord;
    }

    /**
     * @param downPayment
     *            the downPayment to set
     */
    public void setDownPayment(int downPayment) {
        this.downPayment = downPayment;
    }

    /**
     * @param income
     *            the income to set
     */
    public void setIncome(int income) {
        this.income = income;
    }

    /**
     * @param jobClass
     *            the jobClass to set
     */
    public void setJobClass(int jobClass) {
        this.jobClass = jobClass;
    }

    /**
     * @param propertyOwnership
     *            the propertyOwnership to set
     */
    public void setPropertyOwnership(int propertyOwnership) {
        this.propertyOwnership = propertyOwnership;
    }

    /**
     * @param retirementAccount
     *            the retirementAccount to set
     */
    public void setRetirementAccount(int retirementAccount) {
        this.retirementAccount = retirementAccount;
    }

    /**
     * @param username
     *            the username to set
     */
    public void setUsername(String username) {
        this.username = username;
    }

    /**
     * Returns the one-line text form of this user: the twelve fields joined
     * by {@code ':'}, in the order documented on
     * {@link #loadFromExternalString(String)}.
     *
     * @return the external text form of this user
     */
    public String toExternalString() {
        return username + FIELD_SEPARATOR + jobClass + FIELD_SEPARATOR
                + carOwnership + FIELD_SEPARATOR + bicycleOwnership
                + FIELD_SEPARATOR + propertyOwnership + FIELD_SEPARATOR
                + retirementAccount + FIELD_SEPARATOR + creditScore
                + FIELD_SEPARATOR + age + FIELD_SEPARATOR + downPayment
                + FIELD_SEPARATOR + bankruptcy + FIELD_SEPARATOR
                + criminalRecord + FIELD_SEPARATOR + income;
    }

    /**
     * @return the same text as {@link #toExternalString()}
     */
    @Override
    public String toString() {
        return toExternalString();
    }

}
/**
 * Base class for the credit profiles used to generate users.
 *
 * <p>
 * A profile holds, for each user attribute, the array of values that a user of
 * this type may take; the {@code pickX()} methods draw one of those values at
 * random. A profile can also report a "noisy" type (see
 * {@link #getNoisyType()}), which is usually its own type but occasionally one
 * of the other types.
 *
 * <p>
 * The noise levels are shared, static state. They are kept in a plain
 * {@code HashMap}, so concurrent modification through
 * {@link #addNoiseLevel(String, Double[])} is not safe.
 */
public abstract class UserType {

    public static final String EXCELLENT = "EX";
    public static final String VERY_GOOD = "VG";
    public static final String GOOD = "GD";
    public static final String BAD = "BD";
    public static final String DANGEROUS = "DN";

    /** Number of thresholds that each noise level array must provide. */
    private static final int LEVEL_COUNT = 4;

    /**
     * For each built-in type, the five possible noisy types, in the order in
     * which the thresholds select them: entry {@code i} is chosen when the
     * random draw does not exceed threshold {@code i}, and the last entry is
     * chosen when the draw exceeds every threshold.
     */
    private static final HashMap<String, String[]> NOISY_TYPE_ORDER = new HashMap<String, String[]>();

    /** Thresholds per type code; see {@link #getNoisyType()}. */
    private static volatile HashMap<String, Double[]> noiseLevels;

    static {
        NOISY_TYPE_ORDER.put(EXCELLENT, new String[] { EXCELLENT, VERY_GOOD,
                GOOD, BAD, DANGEROUS });
        NOISY_TYPE_ORDER.put(VERY_GOOD, new String[] { VERY_GOOD, GOOD,
                EXCELLENT, BAD, DANGEROUS });
        NOISY_TYPE_ORDER.put(GOOD, new String[] { GOOD, VERY_GOOD, EXCELLENT,
                BAD, DANGEROUS });
        NOISY_TYPE_ORDER.put(BAD, new String[] { BAD, GOOD, DANGEROUS,
                VERY_GOOD, EXCELLENT });
        NOISY_TYPE_ORDER.put(DANGEROUS, new String[] { DANGEROUS, BAD, GOOD,
                VERY_GOOD, EXCELLENT });

        // Set the default noise levels
        noiseLevels = new HashMap<String, Double[]>();
        noiseLevels.put(EXCELLENT, new Double[] { 1.0d, 3.0d, 7.5d, 10.0d });
        noiseLevels.put(VERY_GOOD, new Double[] { 1.0d, 3.0d, 6.0d, 10.0d });
        noiseLevels.put(GOOD, new Double[] { 1.0d, 3.0d, 4.0d, 8.0d });
        noiseLevels.put(BAD, new Double[] { 1.0d, 3.0d, 7.5d, 10.0d });
        noiseLevels.put(DANGEROUS, new Double[] { 1.0d, 4.5d, 9.0d, 13.5d });
    }

    /**
     * This method allows the insertion of custom noise levels by credit type.
     * Existing levels for the same type are replaced, and a warning is printed
     * to standard output when that happens.
     *
     * @param type
     *            the credit type code, e.g. {@link #GOOD}
     * @param levels
     *            at least four thresholds, none of them {@code null}; only
     *            the first four are used
     * @throws IllegalArgumentException
     *             if {@code levels} is {@code null}, has fewer than four
     *             elements, or has a {@code null} among its first four
     */
    public static void addNoiseLevel(String type, Double[] levels) {
        // Reject unusable thresholds here, where the caller can be identified,
        // instead of failing later inside getNoisyType().
        if (levels == null || levels.length < LEVEL_COUNT) {
            throw new IllegalArgumentException("Noise levels for credit type "
                    + type + " must contain at least " + LEVEL_COUNT
                    + " values");
        }
        for (int i = 0; i < LEVEL_COUNT; i++) {
            if (levels[i] == null) {
                throw new IllegalArgumentException(
                        "Noise level " + i + " for credit type " + type
                                + " must not be null");
            }
        }

        if (noiseLevels.containsKey(type)) {
            System.out.println("WARN: Replacing noise levels for credit type: "
                    + type);
        }
        UserType.noiseLevels.put(type, levels);
    }

    /**
     * This method returns the noise levels by credit type. The returned map
     * is the live, shared map and not a copy.
     *
     * @return the noiseLevels
     */
    public static HashMap<String, Double[]> getNoiseLevels() {
        return UserType.noiseLevels;
    }

    /**
     * This method allows the insertion of custom noise levels in bulk. The
     * given map replaces the current one, including the defaults.
     *
     * @param noiseLevels
     *            the noiseLevels to set; must not be {@code null}
     * @throws IllegalArgumentException
     *             if {@code noiseLevels} is {@code null}
     */
    public static void setNoiseLevels(HashMap<String, Double[]> noiseLevels) {
        if (noiseLevels == null) {
            throw new IllegalArgumentException("noiseLevels must not be null");
        }
        UserType.noiseLevels = noiseLevels;
    }

    private Random rnd = new Random();
    private int nUsers;
    private int[] jobClass;
    private int[] carOwnership;
    private int[] motorcycleOwnership;
    private int[] propertyOwnership;
    private int[] retirementAccounts;
    private int[] creditScore;
    private int[] age;
    private int[] downPayment;
    private int[] bancruptcy;
    private int[] criminalRecord;
    private int[] income;

    public UserType() {
        // empty
    }

    /**
     * Two profiles are equal when they are of the same class and report the
     * same type code; the eligible attribute values are not compared.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final UserType other = (UserType) obj;
        String mine = getUserType();
        String theirs = other.getUserType();
        return mine == null ? theirs == null : mine.equals(theirs);
    }

    // -----------------------------------------------------------------
    /**
     * @return the age
     */
    public int[] getAge() {
        return age;
    }

    // -----------------------------------------------------------------
    /**
     * @return the bancruptcy
     */
    public int[] getBancruptcy() {
        return bancruptcy;
    }

    // -----------------------------------------------------------------
    /**
     * @return the carOwnership
     */
    public int[] getCarOwnership() {
        return carOwnership;
    }

    // -----------------------------------------------------------------
    /**
     * @return the creditScore
     */
    public int[] getCreditScore() {
        return creditScore;
    }

    // -----------------------------------------------------------------
    /**
     * @return the criminalRecord
     */
    public int[] getCriminalRecord() {
        return criminalRecord;
    }

    // -----------------------------------------------------------------
    /**
     * @return the downPayment
     */
    public int[] getDownPayment() {
        return downPayment;
    }

    // -----------------------------------------------------------------
    /**
     * @return the income
     */
    public int[] getIncome() {
        return income;
    }

    // -----------------------------------------------------------------
    /**
     * @return the jobClass
     */
    public int[] getJobClass() {
        return jobClass;
    }

    // -----------------------------------------------------------------
    /**
     * @return the motorcycleOwnership
     */
    public int[] getMotorcycleOwnership() {
        return motorcycleOwnership;
    }

    /**
     * Returns this profile's type code with random noise applied.
     *
     * <p>
     * A value is drawn with {@code Random.nextGaussian()} and compared, in
     * order, with the four thresholds registered for this type. If the draw
     * does not exceed the first threshold the true type is returned; each
     * later threshold selects another type, and a draw above all four selects
     * the remaining one. Which type each threshold selects depends on the
     * true type.
     *
     * @return the noisy type code, or {@code null} if {@link #getUserType()}
     *         is not one of the five built-in codes
     * @throws IllegalStateException
     *             if no usable thresholds are registered for this type
     */
    public String getNoisyType() {

        double gaussian = rnd.nextGaussian();

        String userType = getUserType();

        String[] outcomes = NOISY_TYPE_ORDER.get(userType);
        if (outcomes == null) {
            // Not one of the built-in types: there is no noise model for it.
            return null;
        }

        Double[] nLevels = noiseLevels.get(userType);
        if (nLevels == null || nLevels.length < LEVEL_COUNT) {
            throw new IllegalStateException(
                    "No usable noise levels registered for credit type: "
                            + userType);
        }

        // Thresholds are tested in order, so reaching index i already implies
        // that the draw exceeded every earlier threshold.
        for (int i = 0; i < LEVEL_COUNT; i++) {
            if (gaussian <= nLevels[i]) {
                return outcomes[i];
            }
        }
        return outcomes[LEVEL_COUNT];
    }

    public int getNUsers() {
        return nUsers;
    }

    // -----------------------------------------------------------------
    /**
     * @return the propertyOwnership
     */
    public int[] getPropertyOwnership() {
        return propertyOwnership;
    }

    // -----------------------------------------------------------------
    /**
     * @return the retirementAccounts
     */
    public int[] getRetirementAccounts() {
        return retirementAccounts;
    }

    /**
     * @return the code of this credit type, e.g. {@link #GOOD}
     */
    public abstract String getUserType();

    /**
     * Consistent with {@link #equals(Object)}: based on the type code only.
     */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        String userType = getUserType();
        result = prime * result + ((userType == null) ? 0 : userType.hashCode());
        return result;
    }

    public int pickAge() {
        return pick(age, "age");
    }

    public int pickBancruptcy() {
        return pick(bancruptcy, "bancruptcy");
    }

    public int pickCarOwnership() {
        return pick(carOwnership, "carOwnership");
    }

    public int pickCreditScore() {
        return pick(creditScore, "creditScore");
    }

    public int pickCriminalRecord() {
        return pick(criminalRecord, "criminalRecord");
    }

    public int pickDownPayment() {
        return pick(downPayment, "downPayment");
    }

    public int pickIncome() {
        return pick(income, "income");
    }

    /**
     * This method, and the other "pickX()" methods in this class, select a
     * random value from the set of eligible values for a particular
     * {@code UserType}. Hence, clearly, the returned values will be
     * different for the different {@code UserType}s.
     *
     * @return a random selection from the set of eligible job classes.
     * @throws IllegalStateException
     *             if no eligible values have been set for the attribute
     */
    public int pickJobClass() {
        return pick(jobClass, "jobClass");
    }

    public int pickMotorcycleOwnership() {
        return pick(motorcycleOwnership, "motorcycleOwnership");
    }

    public int pickPropertyOwnership() {
        return pick(propertyOwnership, "propertyOwnership");
    }

    public int pickRetirementAccounts() {
        return pick(retirementAccounts, "retirementAccounts");
    }

    /**
     * @param age
     *            the age to set
     */
    public void setAge(int[] age) {
        this.age = age;
    }

    /**
     * @param bancruptcy
     *            the bancruptcy to set
     */
    public void setBancruptcy(int[] bancruptcy) {
        this.bancruptcy = bancruptcy;
    }

    /**
     * @param carOwnership
     *            the carOwnership to set
     */
    public void setCarOwnership(int[] carOwnership) {
        this.carOwnership = carOwnership;
    }

    /**
     * @param creditScore
     *            the creditScore to set
     */
    public void setCreditScore(int[] creditScore) {
        this.creditScore = creditScore;
    }

    /**
     * @param criminalRecord
     *            the criminalRecord to set
     */
    public void setCriminalRecord(int[] criminalRecord) {
        this.criminalRecord = criminalRecord;
    }

    /**
     * @param downPayment
     *            the downPayment to set
     */
    public void setDownPayment(int[] downPayment) {
        this.downPayment = downPayment;
    }

    // -----------------------------------------------------------------

    /**
     * @param income
     *            the income to set
     */
    public void setIncome(int[] income) {
        this.income = income;
    }

    /**
     * @param jobClass
     *            the jobClass to set
     */
    public void setJobClass(int[] jobClass) {
        this.jobClass = jobClass;
    }

    /**
     * @param motorcycleOwnership
     *            the motorcycleOwnership to set
     */
    public void setMotorcycleOwnership(int[] motorcycleOwnership) {
        this.motorcycleOwnership = motorcycleOwnership;
    }

    public void setNUsers(int nUsers) {
        this.nUsers = nUsers;
    }

    /**
     * @param propertyOwnership
     *            the propertyOwnership to set
     */
    public void setPropertyOwnership(int[] propertyOwnership) {
        this.propertyOwnership = propertyOwnership;
    }

    /**
     * @param retirementAccounts
     *            the retirementAccounts to set
     */
    public void setRetirementAccounts(int[] retirementAccounts) {
        this.retirementAccounts = retirementAccounts;
    }

    /**
     * Draws one of the eligible values uniformly at random.
     *
     * @param values
     *            the eligible values of one attribute
     * @param attribute
     *            the attribute name, used only in the error message
     */
    private int pick(int[] values, String attribute) {
        if (values == null || values.length == 0) {
            throw new IllegalStateException("No eligible values set for '"
                    + attribute + "' on credit type: " + getUserType());
        }
        return values[rnd.nextInt(values.length)];
    }

}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit.data.users; + +public class VeryGoodUserType extends UserType { + + { + setAge(new int[] { 1, 2, 3, 4, 5, 6, 7, 8 }); + setBancruptcy(new int[] { 0 }); + setCarOwnership(new int[] { 1 }); + setCreditScore(new int[] { 5, 6, 7 }); + setCriminalRecord(new int[] { 0 }); + setDownPayment(new int[] { 3, 4 }); + setIncome(new int[] { 4, 5, 6, 7 }); + setJobClass(new int[] { 2, 3, 4, 5 }); + setMotorcycleOwnership(new int[] { 0, 1 }); + setPropertyOwnership(new int[] { 1 }); + setRetirementAccounts(new int[] { 3, 4, 5 }); + } + + @Override + public String getUserType() { + return UserType.VERY_GOOD; + } +} \ No newline at end of file diff --git a/src/org/yooreeka/examples/credit/util/AttributeInfo.java b/src/org/yooreeka/examples/credit/util/AttributeInfo.java new file mode 100644 index 0000000..4645d3c --- /dev/null +++ b/src/org/yooreeka/examples/credit/util/AttributeInfo.java @@ -0,0 +1,68 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Mutable description of a named attribute and the smallest and largest
 * integer value associated with it.
 *
 * <p>
 * No validation is performed: the name may be {@code null} and the minimum is
 * not required to be less than or equal to the maximum.
 */
public class AttributeInfo {

    private int minValue;
    private int maxValue;
    private String name;

    /**
     * @param name
     *            the attribute name
     * @param min
     *            the minimum value recorded for the attribute
     * @param max
     *            the maximum value recorded for the attribute
     */
    public AttributeInfo(String name, int min, int max) {
        this.minValue = min;
        this.maxValue = max;
        this.name = name;
    }

    /**
     * @return the attribute name
     */
    public String getName() {
        return this.name;
    }

    /**
     * @return the minimum value
     */
    public int getMinValue() {
        return this.minValue;
    }

    /**
     * @return the maximum value
     */
    public int getMaxValue() {
        return this.maxValue;
    }

    /**
     * @param name
     *            the attribute name to set
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @param minValue
     *            the minimum value to set
     */
    public void setMinValue(int minValue) {
        this.minValue = minValue;
    }

    /**
     * @param maxValue
     *            the maximum value to set
     */
    public void setMaxValue(int maxValue) {
        this.maxValue = maxValue;
    }
}
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit.util; + +import java.util.HashMap; +import java.util.Map; + +import org.yooreeka.examples.credit.CreditInstance; + +public class AttributeUtils { + + private static Map attributeInfoMap = new HashMap(); + + static { + AttributeInfo ai = null; + + ai = new AttributeInfo(CreditInstance.ATTR_NAME_AGE, 1, 10); + attributeInfoMap.put(ai.getName(), ai); + + ai = new AttributeInfo(CreditInstance.ATTR_NAME_CAR_OWNERSHIP, 0, 1); + attributeInfoMap.put(ai.getName(), ai); + + ai = new AttributeInfo(CreditInstance.ATTR_NAME_CREDIT_SCORE, 1, 8); + attributeInfoMap.put(ai.getName(), ai); + + ai = new AttributeInfo(CreditInstance.ATTR_NAME_INCOME_TYPE, 1, 10); + attributeInfoMap.put(ai.getName(), ai); + + ai = new AttributeInfo(CreditInstance.ATTR_NAME_JOB_CLASS, 1, 5); + attributeInfoMap.put(ai.getName(), ai); + + ai = new AttributeInfo(CreditInstance.ATTR_NAME_MORTGAGE_DOWN_PAYMENT, + 1, 4); + attributeInfoMap.put(ai.getName(), ai); + + ai = new AttributeInfo( + CreditInstance.ATTR_NAME_MOTOR_BICYCLE_OWNERSHIP, 0, 1); + attributeInfoMap.put(ai.getName(), ai); + + ai = new AttributeInfo( + CreditInstance.ATTR_NAME_OTHER_PROPERTY_OWNERSHIP, 0, 1); + attributeInfoMap.put(ai.getName(), ai); + + ai = new AttributeInfo(CreditInstance.ATTR_NAME_CRIMINAL_RECORD, 0, 1); + attributeInfoMap.put(ai.getName(), ai); + + ai = new 
AttributeInfo(CreditInstance.ATTR_NAME_BANKRUPTCY, 0, 1); + attributeInfoMap.put(ai.getName(), ai); + + ai = new AttributeInfo(CreditInstance.ATTR_NAME_RETIREMENT_ACCOUNT, 1, + 8); + attributeInfoMap.put(ai.getName(), ai); + } + + public static double getNormalizedValue(String attrName, double value) { + AttributeInfo ai = attributeInfoMap.get(attrName); + return (value - ai.getMinValue()) + / (ai.getMaxValue() - ai.getMinValue()); + } +} diff --git a/src/org/yooreeka/examples/credit/util/BootstrapTrainingSetBuilder.java b/src/org/yooreeka/examples/credit/util/BootstrapTrainingSetBuilder.java new file mode 100644 index 0000000..80d8750 --- /dev/null +++ b/src/org/yooreeka/examples/credit/util/BootstrapTrainingSetBuilder.java @@ -0,0 +1,121 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit.util; + +import java.util.Map; +import java.util.Random; + +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.algos.taxis.core.intf.Instance; + +/** + * Builds bootstrap training sets from the original training set. + */ +public class BootstrapTrainingSetBuilder { + + private TrainingSet originalTrainingSet; + + /** + * + * @param originalTrainingSet + * bootstrap training sets will be derived from this training + * set. + * @param bootstrapSampleSize + * size of bootstrap training sets that should be produced. + */ + public BootstrapTrainingSetBuilder(TrainingSet originalTrainingSet) { + + this.originalTrainingSet = originalTrainingSet; + } + + public TrainingSet buildBootstrapSet() { + + int N = originalTrainingSet.getSize(); + + Map instances = originalTrainingSet.getInstances(); + + Instance[] selectedInstances = new Instance[N]; + /* + * Building a new training set of size N by sampling N instances from + * the original data set with replacement. As a result, some instances + * from the original data set will be missing and some will be + * duplicated. 
+ */ + Random rnd = new Random(); + + // pick a center + int center = rnd.nextInt(N); + + int countN = 0; + + while (countN < N) { + + if (countN % (N / 5) == 0) { + center = rnd.nextInt(N); + } + + int selectedInstanceId = pickInstanceId(N, center); + + Instance selectedInstance = instances.get(selectedInstanceId); + selectedInstances[countN] = selectedInstance; + countN++; + } + + TrainingSet tS = new TrainingSet(selectedInstances); + + return tS; + } + + private int pickInstanceId(int N, int center) { + + Random rnd = new Random(); + boolean loop = true; + int selectedInstanceId = -1; + + // create the scale factor + double scale = (N / 2) / 4.0d; + + while (loop) { + + // center the distribution to be N/2 left and right of the center + // with almost certainty + selectedInstanceId = new Double(center + rnd.nextGaussian() * scale) + .intValue(); + + // do not break the loop unless we found a valid instance + if (selectedInstanceId >= 0 && selectedInstanceId < N) { + loop = false; + } + } + return selectedInstanceId; + } + +} diff --git a/src/org/yooreeka/examples/credit/util/ClassifierResults.java b/src/org/yooreeka/examples/credit/util/ClassifierResults.java new file mode 100644 index 0000000..65c59d1 --- /dev/null +++ b/src/org/yooreeka/examples/credit/util/ClassifierResults.java @@ -0,0 +1,70 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Per-instance outcomes (correct / incorrect) of one classifier over a fixed
 * number of test instances, together with the running count of correct ones.
 */
public class ClassifierResults {
    private final String classifierId;
    private final boolean[] results;
    private int nCorrect;

    /**
     * @param classifierId
     *            identifier of the classifier the results belong to
     * @param n
     *            number of instances; every outcome starts as {@code false}
     */
    public ClassifierResults(String classifierId, int n) {
        this.classifierId = classifierId;
        this.results = new boolean[n];
        this.nCorrect = 0;
    }

    /**
     * @return the fraction of instances marked correct; {@code NaN} when the
     *         result set has size zero
     */
    public double getAccuracy() {
        return (double) nCorrect / (double) results.length;
    }

    /**
     * @return the identifier passed to the constructor
     */
    public String getClassifierId() {
        return classifierId;
    }

    /**
     * @return the number of instances this object holds outcomes for
     */
    public int getN() {
        return results.length;
    }

    /**
     * @return the number of instances currently marked correct
     */
    public int getNCorrect() {
        return nCorrect;
    }

    /**
     * @param i
     *            index of the instance
     * @return {@code true} if instance {@code i} is marked correct
     * @throws ArrayIndexOutOfBoundsException
     *             if {@code i} is outside {@code [0, getN())}
     */
    public boolean getResult(int i) {
        return results[i];
    }

    /**
     * Records the outcome for instance {@code i}. The correct count follows
     * the stored outcomes, so writing the same index again (with the same or
     * a different value) keeps {@link #getNCorrect()} consistent.
     *
     * @param i
     *            index of the instance
     * @param value
     *            {@code true} if the instance was classified correctly
     * @throws ArrayIndexOutOfBoundsException
     *             if {@code i} is outside {@code [0, getN())}
     */
    public void setResult(int i, boolean value) {
        boolean previous = results[i];
        results[i] = value;
        if (value && !previous) {
            nCorrect++;
        } else if (!value && previous) {
            // An outcome that was correct is being overwritten as incorrect.
            nCorrect--;
        }
    }
}
See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.examples.credit.data.users.User; + +public class CreditDataUtils { + + public static List loadUsers(String filename) { + List users = new ArrayList(); + + FileReader fReader = null; + try { + fReader = new FileReader(filename); + } catch (FileNotFoundException fnfX) { + fnfX.printStackTrace(); + } + + try { + BufferedReader reader = new BufferedReader(fReader); + String line = null; + while ((line = reader.readLine()) != null) { + if (line.trim().length() > 0) { + User user = new User(); + user.loadFromExternalString(line); + users.add(user); + } + } + } catch (IOException ioX) { + throw new RuntimeException("Failed to load users from file: '" + + filename + "' ", ioX); + } + + try { + fReader.close(); + } catch (IOException ioX) { + ioX.printStackTrace(); + } + + return users; + } + + public static void saveUsers(String filename, List users) { + try { + FileWriter fout = new FileWriter(filename); + BufferedWriter writer = new 
BufferedWriter(fout); + for (User user : users) { + writer.write(user.toExternalString()); + writer.write("\n"); + } + writer.flush(); + writer.close(); + } catch (IOException e) { + throw new RuntimeException("Failed to save users in file: '" + + filename + "' ", e); + } + } + + private CreditDataUtils() { + // empty + } + +} diff --git a/src/org/yooreeka/examples/credit/util/CreditErrorEstimator.java b/src/org/yooreeka/examples/credit/util/CreditErrorEstimator.java new file mode 100644 index 0000000..2d6b339 --- /dev/null +++ b/src/org/yooreeka/examples/credit/util/CreditErrorEstimator.java @@ -0,0 +1,231 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit.util; + +import org.yooreeka.algos.taxis.core.intf.Classifier; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; +import org.yooreeka.examples.credit.BaggingCreditClassifier; +import org.yooreeka.examples.credit.BoostingCreditClassifier; +import org.yooreeka.examples.credit.CreditConcept; +import org.yooreeka.examples.credit.DTCreditClassifier; +import org.yooreeka.examples.credit.NBCreditClassifier; +import org.yooreeka.examples.credit.NNCreditClassifier; +import org.yooreeka.examples.credit.data.UserDataset; +import org.yooreeka.examples.credit.data.users.User; + +public class CreditErrorEstimator { + + private Classifier classifier; + private UserInstanceBuilder instanceBuilder; + private UserDataset testDS; + private ClassifierResults classifierResults; + + int[][] confusionMatrix = new int[5][5]; + + private int correctCount = 0; + private int misclassifiedInstanceCount = 0; + private boolean verbose = true; + + public CreditErrorEstimator(UserDataset testDS, + BaggingCreditClassifier classifier) { + + this.testDS = testDS; + this.classifier = classifier; + this.instanceBuilder = classifier.getInstanceBuilder(); + this.classifierResults = new ClassifierResults(classifier.getName(), + testDS.getSize()); + } + + public CreditErrorEstimator(UserDataset testDS, + BoostingCreditClassifier classifier) { + + this.testDS = testDS; + this.classifier = classifier; + this.instanceBuilder = classifier.getInstanceBuilder(); + this.classifierResults = new 
ClassifierResults(classifier.getName(), + testDS.getSize()); + } + + public CreditErrorEstimator(UserDataset testDS, + DTCreditClassifier classifier) { + + this.testDS = testDS; + this.classifier = classifier; + this.instanceBuilder = classifier.getInstanceBuilder(); + this.classifierResults = new ClassifierResults(classifier.getName(), + testDS.getSize()); + } + + public CreditErrorEstimator(UserDataset testDS, + NBCreditClassifier classifier) { + + this.testDS = testDS; + this.classifier = classifier; + this.instanceBuilder = classifier.getInstanceBuilder(); + this.classifierResults = new ClassifierResults(classifier.getName(), + testDS.getSize()); + } + + public CreditErrorEstimator(UserDataset testDS, + NNCreditClassifier classifier) { + + this.testDS = testDS; + this.classifier = classifier; + this.instanceBuilder = classifier.getInstanceBuilder(); + this.classifierResults = new ClassifierResults(classifier.getName(), + testDS.getSize()); + } + + public double getAccuracy() { + return (double) correctCount / (double) testDS.getSize(); + } + + /** + * @return the confusionMatrix + */ + public int[][] getConfusionMatrix() { + return confusionMatrix; + } + + public int getCorrectCount() { + return correctCount; + } + + public int getMisclassifiedInstanceCount() { + return this.misclassifiedInstanceCount; + } + + public ClassifierResults getResults() { + return classifierResults; + } + + public boolean isVerbose() { + return verbose; + } + + public void run() { + + correctCount = 0; + misclassifiedInstanceCount = 0; + + int idx = 0; + + for (int i = 0; i < 5; i++) { + for (int j = 0; j < 5; j++) { + confusionMatrix[i][j] = 0; + } + } + + long tStart = System.currentTimeMillis(); + + for (User user : testDS.getUsers()) { + + Instance instance = instanceBuilder.createInstance(user); + Concept concept = classifier.classify(instance); + Concept expectedConcept = new CreditConcept(user.getCategory()); + + String actualCreditLabel = expectedConcept.getName(); + String 
predictedCreditLabel = concept.getName(); + + // Build the confusion matrix + int i = CreditConcept.getIndex(actualCreditLabel); + int j = CreditConcept.getIndex(predictedCreditLabel); + + confusionMatrix[i][j]++; + + if (actualCreditLabel.equals(predictedCreditLabel)) { + + correctCount++; + + classifierResults.setResult(idx, true); + + } else { + // Uncomment the following lines to see the details of the + // misclassifications + // System.out.print("Classified as: " + concept.getName() + + // " "); + // instance.print(); + + misclassifiedInstanceCount++; + + classifierResults.setResult(idx, false); + } + + idx++; + } + + if (verbose) { + + long tEnd = System.currentTimeMillis(); + + // SUMMARY + System.out.println(" Classification completed in " + 0.001 + * (tEnd - tStart) + " seconds.\n"); + int totalCount = testDS.getSize(); + + System.out.println(" Total test dataset txns: " + totalCount); + + System.out.println(" Classified correctly: " + getCorrectCount() + + ", Misclassified: " + getMisclassifiedInstanceCount()); + + System.out.println(" Accuracy: " + getAccuracy()); + System.out + .println("___________________________________________________________\n"); + // DETAILS + System.out.println(" CONFUSION MATRIX"); + System.out + .println("___________________________________________________________\n"); + + System.out.printf("%4s", ""); + for (int i = 0; i < 5; i++) { + System.out.printf("%7s", CreditConcept.getLabel(i)); + } + System.out.println(); + + for (int i = 0; i < 5; i++) { + System.out.printf("%4s", CreditConcept.getLabel(i)); + for (int j = 0; j < 5; j++) { + System.out.printf("%7s", confusionMatrix[i][j]); + } + System.out.println(); + } + System.out + .println("___________________________________________________________\n"); + + } + } + + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + +} diff --git a/src/org/yooreeka/examples/credit/util/DataGenerator.java b/src/org/yooreeka/examples/credit/util/DataGenerator.java new 
file mode 100644 index 0000000..9263ae3 --- /dev/null +++ b/src/org/yooreeka/examples/credit/util/DataGenerator.java @@ -0,0 +1,130 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.credit.util; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import org.yooreeka.examples.credit.data.users.User; +import org.yooreeka.examples.credit.data.users.UserType; + +public class DataGenerator { + + private long nextUserId = 1; + + private boolean isNoiseOn = false; + private HashMap userTypeDistributions; + + public DataGenerator() { + userTypeDistributions = new HashMap(); + } + + private long generateNextUniqueUserId() { + return nextUserId++; + } + + public User generateUser(UserType userType) { + + User user = new User(); + + long userId = generateNextUniqueUserId(); + + String username; + + if (isNoiseOn) { + username = userType.getNoisyType(); + } else { + username = userType.getUserType(); + } + + username = username + String.valueOf(userId); + + user.setUsername(username); + + user.setAge(userType.pickAge()); + user.setCarOwnership(userType.pickCarOwnership()); + user.setCreditScore(userType.pickCreditScore()); + user.setIncome(userType.pickIncome()); + user.setJobClass(userType.pickJobClass()); + user.setDownPayment(userType.pickDownPayment()); + user.setBicycleOwnership(userType.pickMotorcycleOwnership()); + user.setPropertyOwnership(userType.pickPropertyOwnership()); + user.setCriminalRecord(userType.pickCriminalRecord()); + user.setBankruptcy(userType.pickBancruptcy()); + user.setRetirementAccount(userType.pickRetirementAccounts()); + + return user; + } + + public List generateUsers(List userTypes) { + List allUsers = new ArrayList(); + + for (UserType userType : userTypes) { + allUsers.addAll(generateUsers(userType, userType.getNUsers())); + } + + return allUsers; + } + + public List generateUsers(UserType userType, int n) { + + List users = new ArrayList(); + + userTypeDistributions.put(userType, n); + + for (int i = 0; i < n; i++) { + User u = generateUser(userType); + users.add(u); + } + + return users; + } + + /** + * @return the isNoiseOn + */ + public boolean 
isNoiseOn() { + return isNoiseOn; + } + + public void setNextUserId(long nextUserId) { + this.nextUserId = nextUserId; + } + + /** + * @param isNoiseOn + * the isNoiseOn to set + */ + public void setNoiseOn(boolean isNoiseOn) { + this.isNoiseOn = isNoiseOn; + } +} diff --git a/src/org/yooreeka/examples/credit/util/UserInstanceBuilder.java b/src/org/yooreeka/examples/credit/util/UserInstanceBuilder.java new file mode 100644 index 0000000..9b8cb73 --- /dev/null +++ b/src/org/yooreeka/examples/credit/util/UserInstanceBuilder.java @@ -0,0 +1,167 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.credit.util; + +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.algos.taxis.core.DoubleAttribute; +import org.yooreeka.algos.taxis.core.StringAttribute; +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.algos.taxis.core.intf.Attribute; +import org.yooreeka.algos.taxis.core.intf.Instance; +import org.yooreeka.examples.credit.CreditConcept; +import org.yooreeka.examples.credit.CreditInstance; +import org.yooreeka.examples.credit.data.UserDataset; +import org.yooreeka.examples.credit.data.users.User; + +public class UserInstanceBuilder { + + private boolean useDoubleAttributes; + + public UserInstanceBuilder() { + this(false); + } + + /** + * + * @param useDoubleAttributes + * determines whether instance builder should produce instances + * with string attributes or double attributes. 
+ */ + public UserInstanceBuilder(boolean useDoubleAttributes) { + + this.useDoubleAttributes = useDoubleAttributes; + } + + private CreditInstance convertToDoubleAttributes(Instance instance) { + + CreditInstance creditInstance = (CreditInstance) instance; + + List attributes = new ArrayList(); + + for (Attribute a : creditInstance.getAtrributes()) { + DoubleAttribute da = null; + if (a instanceof StringAttribute) { + String name = a.getName(); + double value = Double.valueOf((String) a.getValue()); + // double normalizedValue = value; + double normalizedValue = AttributeUtils.getNormalizedValue( + name, value); + da = new DoubleAttribute(name, normalizedValue); + } else if (a instanceof DoubleAttribute) { + da = (DoubleAttribute) a; + } else { + throw new RuntimeException("Unexpected attribute type: " + + a.getClass().getSimpleName() + ", attribute name: " + + a.getName() + ", attribute value: " + a.getValue()); + } + + attributes.add(da); + } + + return new CreditInstance(creditInstance.getConcept(), attributes); + } + + public Instance createInstance(Instance i) { + if (useDoubleAttributes) { + return convertToDoubleAttributes(i); + } else { + return i; + } + } + + public Instance createInstance(User u) { + List attributes = new ArrayList(); + + attributes.add(new StringAttribute(CreditInstance.ATTR_NAME_JOB_CLASS, + String.valueOf(u.getJobClass()))); + + attributes.add(new StringAttribute( + CreditInstance.ATTR_NAME_INCOME_TYPE, String.valueOf(u + .getIncome()))); + + attributes.add(new StringAttribute(CreditInstance.ATTR_NAME_AGE, String + .valueOf(u.getAge()))); + + attributes.add(new StringAttribute( + CreditInstance.ATTR_NAME_CAR_OWNERSHIP, String.valueOf(u + .getCarOwnership()))); + + attributes.add(new StringAttribute( + CreditInstance.ATTR_NAME_CREDIT_SCORE, String.valueOf(u + .getCreditScore()))); + + attributes.add(new StringAttribute( + CreditInstance.ATTR_NAME_MORTGAGE_DOWN_PAYMENT, String + .valueOf(u.getDownPayment()))); + + attributes.add(new 
StringAttribute( + CreditInstance.ATTR_NAME_MOTOR_BICYCLE_OWNERSHIP, String + .valueOf(u.getBicycleOwnership()))); + + attributes.add(new StringAttribute( + CreditInstance.ATTR_NAME_OTHER_PROPERTY_OWNERSHIP, String + .valueOf(u.getPropertyOwnership()))); + + attributes.add(new StringAttribute( + CreditInstance.ATTR_NAME_CRIMINAL_RECORD, String.valueOf(u + .getCriminalRecord()))); + + attributes.add(new StringAttribute(CreditInstance.ATTR_NAME_BANKRUPTCY, + String.valueOf(u.getBankruptcy()))); + + attributes.add(new StringAttribute( + CreditInstance.ATTR_NAME_RETIREMENT_ACCOUNT, String.valueOf(u + .getRetirementAccount()))); + + CreditConcept c = new CreditConcept(u.getCategory()); + + CreditInstance instance = new CreditInstance(c, attributes); + + return createInstance(instance); + } + + public TrainingSet createTrainingSet(UserDataset ds) { + List users = ds.getUsers(); + int nUsers = users.size(); + Instance[] instances = new Instance[nUsers]; + for (int i = 0; i < nUsers; i++) { + User u = users.get(i); + instances[i] = createInstance(u); + } + + TrainingSet tS = new TrainingSet(instances); + + return tS; + } + +} diff --git a/src/org/yooreeka/examples/fraud/DTFraudClassifier.java b/src/org/yooreeka/examples/fraud/DTFraudClassifier.java new file mode 100644 index 0000000..c402c7a --- /dev/null +++ b/src/org/yooreeka/examples/fraud/DTFraudClassifier.java @@ -0,0 +1,136 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.fraud; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; + +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.tree.DecisionTreeClassifier; +import org.yooreeka.examples.fraud.data.Transaction; +import org.yooreeka.examples.fraud.data.TransactionDataset; +import org.yooreeka.examples.fraud.data.TransactionInstanceBuilder; + +public class DTFraudClassifier extends DecisionTreeClassifier { + + private static final long serialVersionUID = 5491106283513021975L; + + public static DTFraudClassifier loadClassifier(String filename) { + + Object o = null; + File f = new File(filename); + if (f.exists()) { + try { + FileInputStream fInStream = new FileInputStream(f); + BufferedInputStream bufInStream = new BufferedInputStream( + fInStream); + ObjectInputStream objInStream = new ObjectInputStream( + bufInStream); + o = objInStream.readObject(); + objInStream.close(); + } catch (Exception e) { + throw new RuntimeException( + "Error while loading data from file: '" + filename + + "'", e); + } + } else { + throw new IllegalArgumentException("File doesn't exist: '" + + filename + "'."); + } + + System.out.println("loaded classifier from file: " + filename); + + return (DTFraudClassifier) o; + + } + + public static void saveClassifier(String filename, DTFraudClassifier o) { + + try { + File f = new File(filename); + FileOutputStream foutStream = new FileOutputStream(f); + BufferedOutputStream boutStream = new BufferedOutputStream( + foutStream); + ObjectOutputStream objOutputStream = new ObjectOutputStream( + boutStream); + objOutputStream.writeObject(o); + objOutputStream.flush(); + boutStream.close(); + } catch (IOException e) { + throw new RuntimeException("Error while saving data into file: '" 
+ + filename + "'", e); + } + + System.out.println("saved classifier in file: " + filename); + } + + private TransactionInstanceBuilder instanceBuilder; + + public DTFraudClassifier(String name, TransactionDataset ds) { + + super(name, ds.createTrainingDataset()); + this.instanceBuilder = ds.getInstanceBuilder(); + } + + public DTFraudClassifier(TransactionDataset ds) { + this(DTFraudClassifier.class.getSimpleName(), ds); + } + + public Concept classify(Transaction t) { + return classify(instanceBuilder.createInstance(t)); + } + + @Override + protected Concept createConcept(String category) { + return new TransactionConcept(category); + } + + public TransactionInstanceBuilder getInstanceBuilder() { + return instanceBuilder; + } + + public void setInstanceBuilder(TransactionInstanceBuilder instanceBuilder) { + this.instanceBuilder = instanceBuilder; + } + + public void useDefaultAttributes() { + trainOnAttribute(TransactionInstance.ATTR_NAME_N_DESCRIPTION, false); + trainOnAttribute(TransactionInstance.ATTR_NAME_N_LOCATION, false); + trainOnAttribute(TransactionInstance.ATTR_NAME_N_TXN_AMT, false); + + } +} diff --git a/src/org/yooreeka/examples/fraud/NNFraudClassifier.java b/src/org/yooreeka/examples/fraud/NNFraudClassifier.java new file mode 100644 index 0000000..ee4a7a0 --- /dev/null +++ b/src/org/yooreeka/examples/fraud/NNFraudClassifier.java @@ -0,0 +1,356 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.fraud; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.algos.taxis.core.DoubleAttribute; +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.algos.taxis.core.intf.Attribute; +import org.yooreeka.algos.taxis.core.intf.Classifier; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.examples.fraud.data.Transaction; +import org.yooreeka.examples.fraud.data.TransactionDataset; +import org.yooreeka.examples.fraud.data.TransactionInstanceBuilder; + +public class NNFraudClassifier implements Classifier, java.io.Serializable { + + private static final long serialVersionUID = -1567098614540042277L; + + private static final String SERIALIZATION_PATH = YooreekaConfigurator + .getHome() + "\\data\\ch05\\"; + + public static NNFraudClassifier load(String filename) { + + Object o = null; + File f = new File(SERIALIZATION_PATH + filename); + if (f.exists()) { + try { + FileInputStream fInStream = new FileInputStream(f); + BufferedInputStream bufInStream = new BufferedInputStream( + fInStream); + ObjectInputStream objInStream = new ObjectInputStream( + bufInStream); + o = objInStream.readObject(); + objInStream.close(); + } catch (Exception e) { + throw new RuntimeException( + "Error while loading data from file: '" + filename + + "'", e); + } + } else { + throw new IllegalArgumentException("File doesn't exist: '" + + filename + "'."); + } + + System.out.println("loaded classifier from file: " + filename); + + return (NNFraudClassifier) o; + + } + + private boolean verbose = false; + + private String name; + + /* 
+ * Neural Network that will be used by this classifier. + */ + private TransactionNN nn; + + private int DEFAULT_TRAINING_ITERATIONS = 10; + + /* + * Number of times to feed training instances into the network during + * training. + */ + private int nTrainingIterations = DEFAULT_TRAINING_ITERATIONS; + + private TransactionDataset ds; + + private transient TrainingSet ts; + + private TransactionInstanceBuilder instanceBuilder; + + /* + * Attribute names that should be used as Neural Network inputs. + */ + private List availableAttributeNames; + + public NNFraudClassifier(String name, TransactionDataset ds) { + + this.name = name; + + this.ds = ds; + + this.ts = ds.createTrainingDataset(); + + this.instanceBuilder = ds.getInstanceBuilder(); + + this.availableAttributeNames = new ArrayList(); + + nn = createNeuralNetwork(); + + } + + public NNFraudClassifier(TransactionDataset ds) { + this(NNFraudClassifier.class.getSimpleName(), ds); + } + + public Concept classify(Instance instance) { + + double[] x = createNNInputs(instance); + + double[] y = nn.classify(x); + + Concept c = createConceptFromNNOutput(y); + + if (verbose) { + System.out.println("\nAssessment:\n >> This is a " + c.getName()); + } + return c; + } + + public Concept classify(String transactionId) { + setVerbose(true); + Transaction t = ds.findTransactionById(transactionId); + return classify(t); + } + + public Concept classify(Transaction t) { + if (verbose) { + System.out.println("Transaction:\n >> " + t.toString()); + } + return classify(instanceBuilder.createInstance(t)); + } + + private Concept createConceptFromNNOutput(double[] y) { + + double threshold = 0.5; + + Concept c = null; + + if (y[0] >= threshold) { + c = new TransactionConcept(TransactionConcept.CONCEPT_LABEL_FRAUD); + } else { + c = new TransactionConcept(TransactionConcept.CONCEPT_LABEL_VALID); + } + + return c; + } + + private TransactionNN createNeuralNetwork() { + + String nnName = TransactionNN.class.getSimpleName(); + + return 
new TransactionNN(nnName); + } + + public double[] createNNInputs(Instance instance) { + + int nInputNodes = nn.getInputNodeCount(); + + double[] x = new double[nInputNodes]; + + for (int i = 0; i < nInputNodes; i++) { + + String attrName = this.availableAttributeNames.get(i); + Attribute a = instance.getAttributeByName(attrName); + + if (a instanceof DoubleAttribute) { + x[i] = (Double) a.getValue(); + } else { + if (a == null) { + throw new RuntimeException( + "Failed to find attribute with name: '" + attrName + + "'. Instance: " + instance.toString()); + } else { + throw new RuntimeException( + "Invalid attribute type. Only " + + DoubleAttribute.class.getSimpleName() + + " attribute" + + " types can be used in NN. Actual attribute type: " + + a.getClass().getSimpleName()); + } + } + + } + + return x; + } + + public double[] createNNOutputs(Instance i) { + + int nOutputNodes = nn.getOutputNodeCount(); + + double[] y = new double[nOutputNodes]; + + if (TransactionConcept.CONCEPT_LABEL_FRAUD.equals(i.getConcept() + .getName())) { + y[0] = 1; + } else { + y[0] = 0; + } + return y; + } + + public TransactionInstanceBuilder getInstanceBuilder() { + return this.instanceBuilder; + } + + /** + * @return the name + */ + public String getName() { + return name; + } + + /** + * @return the verbose + */ + public boolean isVerbose() { + return verbose; + } + + public void save() { + + String filename = SERIALIZATION_PATH + this.getName(); + try { + File f = new File(filename); + FileOutputStream foutStream = new FileOutputStream(f); + BufferedOutputStream boutStream = new BufferedOutputStream( + foutStream); + ObjectOutputStream objOutputStream = new ObjectOutputStream( + boutStream); + objOutputStream.writeObject(this); + objOutputStream.flush(); + boutStream.close(); + } catch (IOException e) { + throw new RuntimeException("Error while saving data into file: '" + + filename + "'", e); + } + + System.out.println("saved classifier in file: " + filename); + } + + /** + * 
@param name + * the name to set + */ + public void setName(String name) { + this.name = name; + } + + public void setNTrainingIterations(int trainingIterations) { + nTrainingIterations = trainingIterations; + } + + /** + * @param verbose + * the verbose to set + */ + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + + public boolean train() { + + if (ts == null) { + throw new RuntimeException( + "Can't train classifier - training dataset is null."); + } + + if (nn == null) { + throw new RuntimeException( + "No Neural Network found. Can't proceed."); + } + + if (nn.getInputNodeCount() != availableAttributeNames.size()) { + throw new RuntimeException( + "Number of attributes doesn't match with the number of input nodes." + + "Attributes: " + availableAttributeNames.size() + + ", Input nodes: " + nn.getInputNodeCount()); + } + + if (nn.getOutputNodeCount() != 1) { + throw new RuntimeException("NN has " + nn.getOutputNodeCount() + + " output nodes. " + + "Classifier expects network with only one output node."); + } + + // Build and train NN + trainNeuralNetwork(nTrainingIterations); + + return true; + } + + private void trainNeuralNetwork(int nIterations) { + + for (int i = 1; i <= nIterations; i++) { + for (Instance instance : ts.getInstances().values()) { + double[] nnInput = createNNInputs(instance); + double[] nnExpectedOutput = createNNOutputs(instance); + + nn.train(nnInput, nnExpectedOutput); + } + + if (verbose) { + System.out.println("finished training pass: " + i + " out of " + + nIterations); + } + } + + } + + public void trainOnAttribute(String name) { + availableAttributeNames.add(name); + } + + /** + * This methods facilitates the loading of training attributes + */ + public void useDefaultAttributes() { + trainOnAttribute(TransactionInstance.ATTR_NAME_N_TXN_AMT); + trainOnAttribute(TransactionInstance.ATTR_NAME_N_LOCATION); + trainOnAttribute(TransactionInstance.ATTR_NAME_N_DESCRIPTION); + } +} diff --git 
a/src/org/yooreeka/examples/fraud/TransactionConcept.java b/src/org/yooreeka/examples/fraud/TransactionConcept.java new file mode 100644 index 0000000..6c8d68b --- /dev/null +++ b/src/org/yooreeka/examples/fraud/TransactionConcept.java @@ -0,0 +1,92 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.fraud; + +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; + +public class TransactionConcept implements Concept { + + public static final String CONCEPT_LABEL_FRAUD = "FRAUD_TXN"; + public static final String CONCEPT_LABEL_VALID = "VALID_TXN"; + + private String name; + + public TransactionConcept(boolean isFraud) { + if (isFraud) { + name = CONCEPT_LABEL_FRAUD; + } else { + name = CONCEPT_LABEL_VALID; + } + } + + public TransactionConcept(String name) { + this.name = name; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final TransactionConcept other = (TransactionConcept) obj; + if (name == null) { + if (other.name != null) + return false; + } else if (!name.equals(other.name)) + return false; + return true; + } + + public Instance[] getInstances() { + throw new UnsupportedOperationException("not implemented."); + } + + public String getName() { + return name; + } + + public Concept getParent() { + return null; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((name == null) ? 
0 : name.hashCode()); + return result; + } + +} diff --git a/src/org/yooreeka/examples/fraud/TransactionInstance.java b/src/org/yooreeka/examples/fraud/TransactionInstance.java new file mode 100644 index 0000000..c8b306a --- /dev/null +++ b/src/org/yooreeka/examples/fraud/TransactionInstance.java @@ -0,0 +1,99 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.fraud; + +import org.yooreeka.algos.taxis.core.intf.Attribute; +import org.yooreeka.algos.taxis.core.intf.Instance; + +public class TransactionInstance implements Instance { + + public static final String ATTR_NAME_N_TXN_AMT = "n_txnamt"; + public static final String ATTR_NAME_N_LOCATION = "n_location"; + public static final String ATTR_NAME_N_DESCRIPTION = "n_description"; + public static final String ATTR_NAME_USERID = "userid"; + public static final String ATTR_NAME_TXNID = "txnid"; + public static final String ATTR_NAME_TXN_AMT = "txnamt"; + public static final String ATTR_NAME_LOCATION_X = "location_x"; + public static final String ATTR_NAME_LOCATION_Y = "location_y"; + public static final String ATTR_NAME_DESCRIPTION = "description"; + + protected TransactionConcept concept; + protected Attribute[] attributes; + + public TransactionInstance(TransactionConcept c, Attribute[] attrs) { + this.concept = c; + this.attributes = attrs; + } + + public Attribute[] getAtrributes() { + return attributes; + } + + public Attribute getAttributeByName(String attrName) { + Attribute matchedAttribute = null; + + if (attributes != null) { + for (Attribute a : attributes) { + if (attrName.equalsIgnoreCase(a.getName())) { + matchedAttribute = a; + break; + } + } + } + + return matchedAttribute; + } + + public TransactionConcept getConcept() { + return concept; + } + + public void print() { + if (attributes != null) { + for (Attribute a : attributes) { + + if (a == null || a.getName() == null) { + System.out.print(" - "); + } else { + if (a.getValue() == null) { + System.out.print(" - "); + } else { + System.out.print(" - " + a.getName() + " = " + + a.getValue()); + } + } + } + } + + System.out.println(" --> " + getConcept().getName()); + } + +} diff --git a/src/org/yooreeka/examples/fraud/TransactionNN.java b/src/org/yooreeka/examples/fraud/TransactionNN.java new file mode 100644 index 0000000..b35640c --- /dev/null +++ 
b/src/org/yooreeka/examples/fraud/TransactionNN.java @@ -0,0 +1,106 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.fraud; + +import org.yooreeka.algos.taxis.networks.neural.core.BaseNN; +import org.yooreeka.algos.taxis.networks.neural.core.intf.Layer; + +public class TransactionNN extends BaseNN { + + private static final long serialVersionUID = -3840865001527729603L; + + public TransactionNN(String name) { + super(name); + + createNN351(); + } + + /* + * Creates: 3 -> 5 -> 1 network. + */ + private void createNN351() { + + // 1. Define Layers, Nodes and Node Biases + Layer inputLayer = createInputLayer(0, // layer id + 3 // number of nodes + ); + + Layer hiddenLayer = createHiddenLayer(1, // layer id + 5, // number of nodes + new double[] { 1, 1.5, 1, 0.5, 1 } // node biases + ); + + Layer outputLayer = createOutputLayer(2, // layer id + 1, // number of nodes + new double[] { 1.5 } // node biases + ); + + setInputLayer(inputLayer); + setOutputLayer(outputLayer); + addHiddenLayer(hiddenLayer); + + // 2. Define links and weights between nodes + // Id format: + + // Weights for links from Input Layer to Hidden Layer + setLink("0:0", "1:0", 0.25); + setLink("0:0", "1:1", -0.5); + setLink("0:0", "1:2", 0.25); + setLink("0:0", "1:3", 0.25); + setLink("0:0", "1:4", -0.5); + + setLink("0:1", "1:0", 0.25); + setLink("0:1", "1:1", -0.5); + setLink("0:1", "1:2", 0.25); + setLink("0:1", "1:3", 0.25); + setLink("0:1", "1:4", -0.5); + + setLink("0:2", "1:0", 0.25); + setLink("0:2", "1:1", -0.5); + setLink("0:2", "1:2", 0.25); + setLink("0:2", "1:3", 0.25); + setLink("0:2", "1:4", -0.5); + + // Weights for links from Hidden Layer to Output Layer + + setLink("1:0", "2:0", -0.5); + setLink("1:1", "2:0", 0.5); + setLink("1:2", "2:0", -0.5); + setLink("1:3", "2:0", -0.5); + setLink("1:4", "2:0", 0.5); + + if (isVerbose()) { + System.out.println("NN created"); + } + + } + +} diff --git a/src/org/yooreeka/examples/fraud/data/Transaction.java b/src/org/yooreeka/examples/fraud/data/Transaction.java new file mode 100644 index 0000000..f219ae5 --- /dev/null +++ 
b/src/org/yooreeka/examples/fraud/data/Transaction.java @@ -0,0 +1,124 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.fraud.data; + +public class Transaction implements java.io.Serializable { + + private static final long serialVersionUID = -4537757080789309552L; + + private String description; + + private TransactionLocation location; + + private double amount; + + private boolean fraud; + + private int userId; + + private long txnId; + + public Transaction() { + } + + public double getAmount() { + return amount; + } + + public String getDescription() { + return description; + } + + public TransactionLocation getLocation() { + return location; + } + + public long getTxnId() { + return txnId; + } + + public int getUserId() { + return userId; + } + + public boolean isFraud() { + return fraud; + } + + public void loadFromExternalString(String text) { + + String[] values = text.split(":"); + + userId = Integer.parseInt(values[0]); + txnId = Long.parseLong(values[1]); + description = values[2]; + amount = Double.parseDouble(values[3]); + double x = Double.parseDouble(values[4]); + double y = Double.parseDouble(values[5]); + location = new TransactionLocation(x, y); + fraud = Boolean.parseBoolean(values[6]); + } + + public void setAmount(double amount) { + this.amount = amount; + } + + public void setDescription(String description) { + this.description = description; + } + + public void setFraud(boolean fraud) { + this.fraud = fraud; + } + + public void setLocation(TransactionLocation location) { + this.location = location; + } + + public void setTxnId(long txnId) { + this.txnId = txnId; + } + + public void setUserId(int userId) { + this.userId = userId; + } + + public String toExternalString() { + return userId + ":" + txnId + ":" + description + ":" + amount + ":" + + location.getX() + ":" + location.getY() + ":" + fraud; + } + + @Override + public String toString() { + return toExternalString(); + } + +} diff --git a/src/org/yooreeka/examples/fraud/data/TransactionDataset.java b/src/org/yooreeka/examples/fraud/data/TransactionDataset.java new 
file mode 100644 index 0000000..b817d96 --- /dev/null +++ b/src/org/yooreeka/examples/fraud/data/TransactionDataset.java @@ -0,0 +1,134 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.fraud.data; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.examples.fraud.util.UserStatisticsCalculator; + +public class TransactionDataset implements java.io.Serializable { + + private static final long serialVersionUID = 3061645520644719411L; + + private Map> txnsByUserIdMap; + private Map txnsByTxnIdMap; + private Integer maxUserId; + private TransactionInstanceBuilder instanceBuilder; + + public TransactionDataset(List txnsList) { + this.txnsByUserIdMap = new HashMap>(); + this.txnsByTxnIdMap = new HashMap(txnsList.size()); + + for (Transaction e : txnsList) { + + txnsByTxnIdMap.put(String.valueOf(e.getTxnId()), e); + + Integer userId = e.getUserId(); + List userTxns = txnsByUserIdMap.get(userId); + if (userTxns == null) { + userTxns = new ArrayList(); + txnsByUserIdMap.put(userId, userTxns); + } + + if (maxUserId == null || e.getUserId() > maxUserId) { + maxUserId = e.getUserId(); + } + + userTxns.add(e); + } + + instanceBuilder = new TransactionInstanceBuilder(); + + } + + public void calculateUserStats() { + UserStatisticsCalculator userStatsCalculator = new UserStatisticsCalculator(); + + instanceBuilder.setUserStatisticsMap(userStatsCalculator + .calculateStatistics(this)); + } + + public TrainingSet createTrainingDataset() { + return instanceBuilder.createTrainingSet(this); + } + + public Transaction findTransactionById(String id) { + return txnsByTxnIdMap.get(id); + } + + public List findUserTxns(Integer userId) { + return new ArrayList(txnsByUserIdMap.get(userId)); + } + + /** + * @return the instanceBuilder + */ + public TransactionInstanceBuilder getInstanceBuilder() { + return instanceBuilder; + } + + public Integer getMaxUserId() { + return maxUserId; + } + + public int getSize() { + return txnsByTxnIdMap.size(); + } + + public List getTransactions() { + return new 
ArrayList(txnsByTxnIdMap.values()); + } + + public List getUsers() { + return new ArrayList(txnsByUserIdMap.keySet()); + } + + public void printAll() { + for (Map.Entry e : txnsByTxnIdMap.entrySet()) { + Transaction t = e.getValue(); + System.out.println(t); + } + } + + public void printTransaction(String id) { + Transaction e = findTransactionById(id); + if (e != null) { + System.out.println(e.toString()); + } else { + System.out.println("Transaction not found (txn id: '" + id + "')"); + } + } + +} diff --git a/src/org/yooreeka/examples/fraud/data/TransactionInstanceBuilder.java b/src/org/yooreeka/examples/fraud/data/TransactionInstanceBuilder.java new file mode 100644 index 0000000..64aea1d --- /dev/null +++ b/src/org/yooreeka/examples/fraud/data/TransactionInstanceBuilder.java @@ -0,0 +1,224 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.fraud.data; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.yooreeka.algos.taxis.core.DoubleAttribute; +import org.yooreeka.algos.taxis.core.StringAttribute; +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.algos.taxis.core.intf.Attribute; +import org.yooreeka.algos.taxis.core.intf.Instance; +import org.yooreeka.examples.fraud.TransactionConcept; +import org.yooreeka.examples.fraud.TransactionInstance; +import org.yooreeka.examples.fraud.util.UserStatistics; +import org.yooreeka.util.metrics.JaccardCoefficient; +import org.yooreeka.util.metrics.SimilarityMeasure; + +public class TransactionInstanceBuilder implements java.io.Serializable { + + private static final long serialVersionUID = -2334221990318430678L; + + /* + * For every user we keep a set of user-specific values to normalize data. + */ + private Map userStatisticsMap; + + /* + * Similarity measure that will be used to evaluate similarity between + * transaction descriptions. 
+ */ + private SimilarityMeasure descriptionSim; + + public TransactionInstanceBuilder() { + userStatisticsMap = new HashMap(); + descriptionSim = new JaccardCoefficient(); + } + + private Double calculateDescriptionSimilarity(String txnDescription, + UserStatistics u) { + + String[] termsX = tokenizeTxnDescription(txnDescription); + Set validTxnDescriptions = u.getDescriptions(); + + double bestSim = 0.0; + for (String valueY : validTxnDescriptions) { + String[] termsY = u.getDescriptionTokens(valueY); + if (termsY == null) { + termsY = tokenizeTxnDescription(valueY); + u.setDescriptionTokens(valueY, termsY); + } + double sim = descriptionSim.similarity(termsX, termsY); + if (sim > bestSim) { + bestSim = sim; + } + } + + return bestSim; + } + + public TransactionInstance createInstance(Transaction t) { + + int userId = t.getUserId(); + UserStatistics userStats = getUserStatistics(userId); + + if (userStats == null) { + throw new RuntimeException( + "Can't create instance. There are no statistics for user: " + + userId); + } + + /* + * Calculate distance between user location centroid and instance + * location + */ + TransactionLocation nLocation = normalizeLocation(t.getLocation(), + userStats); + TransactionLocation nCentroid = normalizeLocation( + userStats.getLocationCentroid(), userStats); + double nLocationDistance = nCentroid.distance(nLocation); + + double nAmt = normalizeAmount(t.getAmount(), userStats); + + double nDescriptionSim = calculateDescriptionSimilarity( + t.getDescription(), userStats); + + double nUserId = t.getUserId(); + + List attributes = new ArrayList(); + + // Attributes that will be used by NN + attributes.add(new DoubleAttribute( + TransactionInstance.ATTR_NAME_N_TXN_AMT, nAmt)); + attributes.add(new DoubleAttribute( + TransactionInstance.ATTR_NAME_N_LOCATION, nLocationDistance)); + attributes.add(new DoubleAttribute( + TransactionInstance.ATTR_NAME_N_DESCRIPTION, nDescriptionSim)); + + // Adding informational attributes + 
attributes.add(new StringAttribute( + TransactionInstance.ATTR_NAME_USERID, String.valueOf(nUserId))); + attributes.add(new StringAttribute(TransactionInstance.ATTR_NAME_TXNID, + String.valueOf(t.getTxnId()))); + + attributes.add(new DoubleAttribute( + TransactionInstance.ATTR_NAME_TXN_AMT, t.getAmount())); + attributes.add(new DoubleAttribute( + TransactionInstance.ATTR_NAME_LOCATION_X, t.getLocation() + .getX())); + attributes.add(new DoubleAttribute( + TransactionInstance.ATTR_NAME_LOCATION_Y, t.getLocation() + .getY())); + attributes.add(new StringAttribute( + TransactionInstance.ATTR_NAME_DESCRIPTION, t.getDescription())); + + TransactionConcept c = null; + if (t.isFraud()) { + c = new TransactionConcept(TransactionConcept.CONCEPT_LABEL_FRAUD); + } else { + c = new TransactionConcept(TransactionConcept.CONCEPT_LABEL_VALID); + } + + return new TransactionInstance(c, attributes.toArray(new Attribute[0])); + } + + public TrainingSet createTrainingSet(TransactionDataset data) { + List txns = data.getTransactions(); + int nTxns = txns.size(); + Instance[] instances = new Instance[nTxns]; + for (int i = 0; i < nTxns; i++) { + Transaction t = txns.get(i); + instances[i] = createInstance(t); + } + return new TrainingSet(instances); + } + + public Map getUserStatistics() { + return userStatisticsMap; + } + + public UserStatistics getUserStatistics(int userId) { + return userStatisticsMap.get(userId); + } + + /** + * @return the userStatisticsMap + */ + public Map getUserStatisticsMap() { + return userStatisticsMap; + } + + private Double normalizeAmount(Double amt, UserStatistics u) { + Double min = u.getTxnAmtMin(); + Double max = u.getTxnAmtMax(); + Double v = (amt - min) / (max - min); + return v; // Valid values should fall into [0..1] and fraud outside. 
+ } + + private TransactionLocation normalizeLocation(TransactionLocation location, + UserStatistics u) { + + double nX = (location.getX() - u.getLocationMinX()) + / (u.getLocationMaxX() - u.getLocationMinX()); + + double nY = (location.getY() - u.getLocationMinY()) + / (u.getLocationMaxY() - u.getLocationMinY()); + + return new TransactionLocation(nX, nY); + } + + public void printUserStats(int userId) { + UserStatistics userProps = userStatisticsMap.get(userId); + System.out.println("Properties for userId: " + userId + + " calculated from training data:"); + System.out.println(userProps.toString()); + } + + /** + * @param userStatisticsMap + * the userStatisticsMap to set + */ + public void setUserStatisticsMap( + Map userStatisticsMap) { + this.userStatisticsMap = userStatisticsMap; + } + + private String[] tokenizeTxnDescription(String description) { + String[] terms = description.split("\\s"); + + return terms; + } + +} diff --git a/src/org/yooreeka/examples/fraud/data/TransactionLoader.java b/src/org/yooreeka/examples/fraud/data/TransactionLoader.java new file mode 100644 index 0000000..b107668 --- /dev/null +++ b/src/org/yooreeka/examples/fraud/data/TransactionLoader.java @@ -0,0 +1,59 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.fraud.data; + +import java.util.List; + +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.examples.fraud.util.FraudDataUtils; + +public class TransactionLoader { + + public static final String TRAINING_TXNS_FILE = YooreekaConfigurator + .getHome() + "/data/ch05/fraud/training-txns.txt"; + + public static final String TEST_TXNS_FILE = YooreekaConfigurator.getHome() + + "/data/ch05/fraud/test-txns.txt"; + + public static TransactionDataset loadTestDataset() { + List allTxns = loadTxns(TEST_TXNS_FILE); + return new TransactionDataset(allTxns); + } + + public static TransactionDataset loadTrainingDataset() { + List allTxns = loadTxns(TRAINING_TXNS_FILE); + return new TransactionDataset(allTxns); + } + + public static List loadTxns(String filename) { + return FraudDataUtils.loadTransactions(filename); + } +} diff --git a/src/org/yooreeka/examples/fraud/data/TransactionLocation.java b/src/org/yooreeka/examples/fraud/data/TransactionLocation.java new file mode 100644 index 0000000..a8c5d95 --- /dev/null +++ b/src/org/yooreeka/examples/fraud/data/TransactionLocation.java 
@@ -0,0 +1,94 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * A point in the plane, used as the location of a transaction. Two locations
 * are equal when both coordinates have the same bit pattern.
 */
public class TransactionLocation implements java.io.Serializable {

    private static final long serialVersionUID = 7742289669577088001L;

    private double x;
    private double y;

    /**
     * @param x
     *            horizontal coordinate
     * @param y
     *            vertical coordinate
     */
    public TransactionLocation(double x, double y) {
        this.x = x;
        this.y = y;
    }

    /**
     * Straight-line (Euclidean) distance to another location.
     *
     * @param location
     *            the other point; must not be {@code null}
     * @return the distance between this point and {@code location}
     */
    public double distance(TransactionLocation location) {
        double sumOfSquares = (x - location.getX()) * (x - location.getX())
                + (y - location.getY()) * (y - location.getY());
        return Math.sqrt(sumOfSquares);
    }

    /**
     * Equal only to another object of exactly this class with the same
     * coordinates. As with {@link Double#equals(Object)}, NaN equals NaN and
     * 0.0 differs from -0.0.
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == this) {
            return true;
        }
        if (obj == null || obj.getClass() != getClass()) {
            return false;
        }
        TransactionLocation that = (TransactionLocation) obj;
        // Double.compare(..) == 0 holds exactly when the two values have the
        // same doubleToLongBits representation.
        return Double.compare(x, that.x) == 0
                && Double.compare(y, that.y) == 0;
    }

    public double getX() {
        return x;
    }

    public double getY() {
        return y;
    }

    /**
     * Combines the hash codes of both coordinates with the usual factor 31.
     */
    @Override
    public int hashCode() {
        // Double#hashCode() is (int) (bits ^ (bits >>> 32)) of
        // doubleToLongBits.
        int hash = 31 + Double.valueOf(x).hashCode();
        return 31 * hash + Double.valueOf(y).hashCode();
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("[x=").append(x);
        text.append(", y=").append(y).append("]");
        return text.toString();
    }

}
+ * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.fraud.util; + +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.examples.fraud.data.Transaction; +import org.yooreeka.examples.fraud.data.TransactionLocation; + +public class DataGenerator { + + private long nextTxnId = 0; + + public DataGenerator() { + // default value + this.setNextTxnId(1); + } + + private double generateAmt(TransactionSetProfile user) { + return FraudDataUtils.nextTxnAmount(user.getTxnAmtMean(), + user.getTxnAmtStd()); + } + + private String generateDescription(TransactionSetProfile userParams) { + int txnDescriptionId; + String[] txnDescriptions; + txnDescriptions = userParams.getTxnDescriptions(); + txnDescriptionId = FraudDataUtils.randomInt(txnDescriptions.length); + return txnDescriptions[txnDescriptionId]; + } + + private TransactionLocation generateLocation( + TransactionSetProfile userParams) { + + int minX = userParams.getLocationMinX(); + int maxX = userParams.getLocationMaxX(); + int minY = userParams.getLocationMinY(); + int maxY = userParams.getLocationMaxY(); + + int x = FraudDataUtils.randomInt(minX, maxX); + int y = FraudDataUtils.randomInt(minY, maxY); + + return new TransactionLocation(x, y); + } + + private long generateNextUniqueTxnId() { + return nextTxnId++; + } + + private Transaction generateTxn(TransactionSetProfile userParams) { + Transaction e = new Transaction(); + + e.setUserId(userParams.getUserId()); + e.setTxnId(generateNextUniqueTxnId()); + + // Txn Amount + double amt = generateAmt(userParams); + e.setAmount(amt); + + // Txn Description + String txnDescription = generateDescription(userParams); + e.setDescription(txnDescription); + + // Txn Location + TransactionLocation location = generateLocation(userParams); + e.setLocation(location); + + // Txn fraud flag + e.setFraud(userParams.isFraud()); + + return e; + } + + public List generateTxns(TransactionSetProfile[] allUsers) { + List allTransactions = new ArrayList(); + for (int i = 0, n = 
allUsers.length; i < n; i++) { + TransactionSetProfile user = allUsers[i]; + + for (int j = 0; j < user.getNTxns(); j++) { + allTransactions.add(generateTxn(user)); + } + + } + return allTransactions; + } + + public void setNextTxnId(long nextTxnId) { + this.nextTxnId = nextTxnId; + } +} diff --git a/src/org/yooreeka/examples/fraud/util/FraudDataUtils.java b/src/org/yooreeka/examples/fraud/util/FraudDataUtils.java new file mode 100644 index 0000000..53fb086 --- /dev/null +++ b/src/org/yooreeka/examples/fraud/util/FraudDataUtils.java @@ -0,0 +1,148 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.fraud.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.yooreeka.examples.fraud.data.Transaction; + +public class FraudDataUtils { + + private static Random rnd = new Random(); + + private static Random txnAmountRnd = new Random(); + + public static List loadTransactions(String filename) { + List txns = new ArrayList(); + try { + FileReader fReader = new FileReader(filename); + BufferedReader reader = new BufferedReader(fReader); + String line = null; + while ((line = reader.readLine()) != null) { + if (line.trim().length() > 0) { + Transaction txn = new Transaction(); + txn.loadFromExternalString(line); + txns.add(txn); + } + } + + fReader.close(); + } catch (IOException e) { + throw new RuntimeException( + "Failed to load transactions from file: '" + filename + + "' ", e); + } + + return txns; + } + + static String[] loadTxnDescriptions(String filename) { + + List descriptions = new ArrayList(); + + FileReader fReader = null; + try { + fReader = new FileReader(filename); + } catch (FileNotFoundException fnfX) { + fnfX.printStackTrace(); + } + + try { + BufferedReader reader = new BufferedReader(fReader); + String line = null; + while ((line = reader.readLine()) != null) { + if (line.trim().length() > 0) { + descriptions.add(line); + } + } + } 
catch (IOException e) { + throw new RuntimeException( + "Failed to load descriptions from file: '" + filename + + "' ", e); + } + + try { + fReader.close(); + } catch (IOException ioX) { + ioX.printStackTrace(); + } + + return descriptions.toArray(new String[descriptions.size()]); + } + + public static double nextTxnAmount(double mean, double std) { + double amt = 0.0; + do { + // deriving gaussian with our custom std and mean from Standard + // Normal Distribution. + amt = txnAmountRnd.nextGaussian() * std + mean; + } while (amt <= 0.0); + + BigDecimal db = new BigDecimal(amt); + db = db.setScale(2, BigDecimal.ROUND_HALF_UP); + return db.doubleValue(); + } + + static int randomInt(int n) { + return FraudDataUtils.randomInt(0, n); + } + + static int randomInt(int min, int max) { + return min + rnd.nextInt(max - min); + } + + static void saveTransactions(String filename, List txns) { + try { + FileWriter fout = new FileWriter(filename); + BufferedWriter writer = new BufferedWriter(fout); + for (Transaction txn : txns) { + writer.write(txn.toExternalString()); + writer.write("\n"); + } + writer.flush(); + writer.close(); + } catch (IOException e) { + throw new RuntimeException( + "Failed to load descriptions from file: '" + filename + + "' ", e); + } + } + +} diff --git a/src/org/yooreeka/examples/fraud/util/FraudErrorEstimator.java b/src/org/yooreeka/examples/fraud/util/FraudErrorEstimator.java new file mode 100644 index 0000000..347aa41 --- /dev/null +++ b/src/org/yooreeka/examples/fraud/util/FraudErrorEstimator.java @@ -0,0 +1,123 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.fraud.util; + +import org.yooreeka.algos.taxis.core.intf.Classifier; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; +import org.yooreeka.examples.fraud.DTFraudClassifier; +import org.yooreeka.examples.fraud.NNFraudClassifier; +import org.yooreeka.examples.fraud.TransactionConcept; +import org.yooreeka.examples.fraud.data.Transaction; +import org.yooreeka.examples.fraud.data.TransactionDataset; +import org.yooreeka.examples.fraud.data.TransactionInstanceBuilder; + +public class FraudErrorEstimator { + + private Classifier classifier; + private TransactionInstanceBuilder instanceBuilder; + private TransactionDataset testDS; + + private int correctCount = 0; + private int incorrectValidCount = 0; + private int incorrectFraudCount = 0; + private int totalFraudTxnsCount = 0; + + public FraudErrorEstimator(TransactionDataset testDS, + DTFraudClassifier classifier) { + + this.testDS = testDS; + this.classifier = classifier; + this.instanceBuilder = classifier.getInstanceBuilder(); + } + + public FraudErrorEstimator(TransactionDataset testDS, + NNFraudClassifier classifier) { + + this.testDS = testDS; + + if (classifier.isVerbose()) { + classifier.setVerbose(false); + } + + this.classifier = classifier; + this.instanceBuilder = classifier.getInstanceBuilder(); + } + + public int getCorrectCount() { + return correctCount; + } + + public int getIncorrectFraudCount() { + return incorrectFraudCount; + } + + public int getIncorrectValidCount() { + return incorrectValidCount; + } + + public int getTotalFraudTxnsCount() { + return totalFraudTxnsCount; + } + + public void run() { + + for (Transaction txn : testDS.getTransactions()) { + Instance i = instanceBuilder.createInstance(txn); + Concept concept = classifier.classify(i); + Concept expectedConcept = new TransactionConcept(txn.isFraud()); + + if (txn.isFraud()) { + totalFraudTxnsCount++; + } + + if 
(concept.getName().equals(expectedConcept.getName())) { + correctCount++; + } else { + // Print classified instance + i.print(); + if (!txn.isFraud()) { + incorrectValidCount++; + } else { + incorrectFraudCount++; + } + } + } + + System.out.println("Total test dataset txns: " + testDS.getSize() + + ", Number of fraud txns:" + getTotalFraudTxnsCount()); + + System.out.println("Classified correctly: " + getCorrectCount() + + ", Misclassified valid txns: " + getIncorrectValidCount() + + ", Misclassified fraud txns: " + getIncorrectFraudCount()); + } + +} diff --git a/src/org/yooreeka/examples/fraud/util/TenUsersSample.java b/src/org/yooreeka/examples/fraud/util/TenUsersSample.java new file mode 100644 index 0000000..27f4801 --- /dev/null +++ b/src/org/yooreeka/examples/fraud/util/TenUsersSample.java @@ -0,0 +1,363 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.fraud.util; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.examples.fraud.data.Transaction; + +/** + * Example of how to configure and generate files with transactions. + */ +public class TenUsersSample { + + /* + * File with descriptions to be used for valid transactions. + */ + public static String TXN_DESC_FILENAME = YooreekaConfigurator.getHome() + + "/data/ch05/fraud/descriptions.txt"; + + /* + * File with descriptions to be used for fraud transactions. + */ + public static String FRAUD_TXN_DESC_FILENAME = YooreekaConfigurator + .getHome() + "/data/ch05/fraud/fraud-descriptions.txt"; + + /* + * Generated training transactions will be saved into this file. + */ + public static String TRAINING_TXN_FILENAME = YooreekaConfigurator.getHome() + + "/data/ch05/fraud/generated-training-txns.txt"; + + public static String TEST_TXN_FILENAME = YooreekaConfigurator.getHome() + + "/data/ch05/fraud/generated-test-txns.txt"; + + public static TransactionSetProfile[] createUsersForTest() { + List allUserParams = new ArrayList(); + + String[] txnDescriptions = FraudDataUtils + .loadTxnDescriptions(TXN_DESC_FILENAME); + String[] fraudTxnDescriptions = FraudDataUtils + .loadTxnDescriptions(FRAUD_TXN_DESC_FILENAME); + + // Each user will have a set of valid and fraud txns. + // Using user ids from training set that didn't have any fraud txns. 
+ + for (int userId = 21; userId <= 22; userId++) { + allUserParams.addAll(createUserType1(userId, 100, 10, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 23; userId <= 24; userId++) { + allUserParams.addAll(createUserType2(userId, 100, 10, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 25; userId <= 26; userId++) { + allUserParams.addAll(createUserType3(userId, 100, 10, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 27; userId <= 28; userId++) { + allUserParams.addAll(createUserType4(userId, 100, 10, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 29; userId <= 30; userId++) { + allUserParams.addAll(createUserType5(userId, 100, 10, + txnDescriptions, fraudTxnDescriptions)); + } + + return allUserParams.toArray(new TransactionSetProfile[0]); + } + + public static TransactionSetProfile[] createUsersForTraining() { + List allUserParams = new ArrayList(); + + String[] txnDescriptions = FraudDataUtils + .loadTxnDescriptions(TXN_DESC_FILENAME); + String[] fraudTxnDescriptions = FraudDataUtils + .loadTxnDescriptions(FRAUD_TXN_DESC_FILENAME); + + // We have 5 types/profiles of users. + + // First, create 2 users for each profile with fraud txns. 
+ + for (int userId = 1; userId <= 2; userId++) { + allUserParams.addAll(createUserType1(userId, 300, 25, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 3; userId <= 4; userId++) { + allUserParams.addAll(createUserType2(userId, 400, 15, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 5; userId <= 6; userId++) { + allUserParams.addAll(createUserType3(userId, 300, 30, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 7; userId <= 8; userId++) { + allUserParams.addAll(createUserType4(userId, 300, 10, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 9; userId <= 10; userId++) { + allUserParams.addAll(createUserType5(userId, 600, 20, + txnDescriptions, fraudTxnDescriptions)); + } + + // Now, create a couple of users from each profile without fraud txns + // these users will be used in test dataset as well + + for (int userId = 21; userId <= 22; userId++) { + allUserParams.addAll(createUserType1(userId, 400, 0, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 23; userId <= 24; userId++) { + allUserParams.addAll(createUserType2(userId, 400, 0, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 25; userId <= 26; userId++) { + allUserParams.addAll(createUserType3(userId, 400, 0, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 27; userId <= 28; userId++) { + allUserParams.addAll(createUserType4(userId, 500, 0, + txnDescriptions, fraudTxnDescriptions)); + } + + for (int userId = 29; userId <= 30; userId++) { + allUserParams.addAll(createUserType5(userId, 600, 0, + txnDescriptions, fraudTxnDescriptions)); + } + + // Users that we will be using for test + for (int userId = 29; userId <= 30; userId++) { + allUserParams.addAll(createUserType5(userId, 600, 0, + txnDescriptions, fraudTxnDescriptions)); + } + + return allUserParams.toArray(new TransactionSetProfile[0]); + } + + /* + * Transaction sequence configuration for Type 1 User. 
+ */ + public static List createUserType1(int userId, + int nValidTxns, int nFraudTxns, String[] txnDescriptions, + String[] fraudTxnDescriptions) { + + TransactionSetProfile[] profiles = new TransactionSetProfile[2]; + + profiles[0] = new TransactionSetProfile(); + profiles[1] = new TransactionSetProfile(); + + profiles[0].setUserId(userId); + profiles[0].setNTxns(nValidTxns); + profiles[0].setTxnDescriptions(txnDescriptions); + profiles[0].setLocations(700, 700, 1000, 1000); + profiles[0].setTxnAmtMean(50); + profiles[0].setTxnAmtStd(20); + profiles[0].setFraud(false); + + profiles[1].setUserId(userId); + profiles[1].setNTxns(nFraudTxns); + profiles[1].setTxnAmtMean(4000); + profiles[1].setTxnAmtStd(100); + profiles[1].setLocations(50, 50, 200, 200); + profiles[1].setTxnDescriptions(fraudTxnDescriptions); + profiles[1].setFraud(true); + + return Arrays.asList(profiles); + + } + + /* + * Transaction sequence configuration for Type 2 User. + */ + public static List createUserType2(int userId, + int nValidTxns, int nFraudTxns, String[] txnDescriptions, + String[] fraudTxnDescriptions) { + + TransactionSetProfile[] profiles = new TransactionSetProfile[2]; + + profiles[0] = new TransactionSetProfile(); + profiles[1] = new TransactionSetProfile(); + + profiles[0].setUserId(userId); + profiles[0].setNTxns(nValidTxns); + profiles[0].setTxnDescriptions(txnDescriptions); + profiles[0].setLocations(500, 500, 1000, 1000); + profiles[0].setTxnAmtMean(60); + profiles[0].setTxnAmtStd(20); + profiles[0].setFraud(false); + + profiles[1].setUserId(userId); + profiles[1].setNTxns(nFraudTxns); + profiles[1].setTxnAmtMean(1000); + profiles[1].setTxnAmtStd(100); + profiles[1].setLocations(100, 100, 600, 600); + profiles[1].setTxnDescriptions(fraudTxnDescriptions); + profiles[1].setFraud(true); + + return Arrays.asList(profiles); + + } + + /* + * Transaction sequence configuration for Type 3 User. 
+ */ + public static List createUserType3(int userId, + int nValidTxns, int nFraudTxns, String[] txnDescriptions, + String[] fraudTxnDescriptions) { + + TransactionSetProfile[] profiles = new TransactionSetProfile[2]; + + profiles[0] = new TransactionSetProfile(); + profiles[1] = new TransactionSetProfile(); + + profiles[0].setUserId(userId); + profiles[0].setNTxns(nValidTxns); + profiles[0].setTxnDescriptions(txnDescriptions); + profiles[0].setLocations(500, 500, 800, 800); + profiles[0].setTxnAmtMean(80); + profiles[0].setTxnAmtStd(20); + profiles[0].setFraud(false); + + profiles[1].setUserId(userId); + profiles[1].setNTxns(nFraudTxns); + profiles[1].setTxnAmtMean(800); + profiles[1].setTxnAmtStd(50); + profiles[1].setLocations(100, 100, 400, 400); + profiles[1].setTxnDescriptions(fraudTxnDescriptions); + profiles[1].setFraud(true); + + return Arrays.asList(profiles); + + } + + /* + * Transaction sequence configuration for Type 4 User. + */ + public static List createUserType4(int userId, + int nValidTxns, int nFraudTxns, String[] txnDescriptions, + String[] fraudTxnDescriptions) { + + TransactionSetProfile[] profiles = new TransactionSetProfile[2]; + + profiles[0] = new TransactionSetProfile(); + profiles[1] = new TransactionSetProfile(); + + profiles[0].setUserId(userId); + profiles[0].setNTxns(nValidTxns); + profiles[0].setTxnDescriptions(txnDescriptions); + profiles[0].setLocations(100, 100, 400, 400); + profiles[0].setTxnAmtMean(200); + profiles[0].setTxnAmtStd(20); + profiles[0].setFraud(false); + + profiles[1].setUserId(userId); + profiles[1].setNTxns(nFraudTxns); + profiles[1].setTxnAmtMean(2000); + profiles[1].setTxnAmtStd(100); + profiles[1].setLocations(600, 600, 800, 800); + profiles[1].setTxnDescriptions(fraudTxnDescriptions); + profiles[1].setFraud(true); + + return Arrays.asList(profiles); + } + + /* + * Transaction sequence configuration for Type 5 User. 
+ */ + public static List createUserType5(int userId, + int nValidTxns, int nFraudTxns, String[] txnDescriptions, + String[] fraudTxnDescriptions) { + + TransactionSetProfile[] profiles = new TransactionSetProfile[2]; + profiles[0] = new TransactionSetProfile(); + profiles[1] = new TransactionSetProfile(); + + profiles[0].setUserId(userId); + profiles[0].setNTxns(nValidTxns); + profiles[0].setTxnAmtMean(700); + profiles[0].setTxnAmtStd(500); + profiles[0].setTxnDescriptions(txnDescriptions); + profiles[0].setLocations(100, 100, 400, 400); + profiles[0].setFraud(false); + + profiles[1].setUserId(userId); + profiles[1].setNTxns(nFraudTxns); + profiles[1].setTxnAmtMean(700); + profiles[1].setTxnAmtStd(100); + profiles[1].setLocations(500, 500, 700, 700); + profiles[1].setTxnDescriptions(fraudTxnDescriptions); + profiles[1].setFraud(true); + + return Arrays.asList(profiles); + } + + public static void generateTxns(String txnFilename, int startTxnId, + TransactionSetProfile[] allUsers) { + DataGenerator dataGenerator = new DataGenerator(); + dataGenerator.setNextTxnId(startTxnId); + System.out.println("Generating transactions..."); + List allTxns = dataGenerator.generateTxns(allUsers); + System.out.println("Saving transactions into '" + txnFilename + "'"); + FraudDataUtils.saveTransactions(txnFilename, allTxns); + } + + public static void main(String[] args) { + + TransactionSetProfile[] userProfiles = createUsersForTraining(); + generateTxns(TRAINING_TXN_FILENAME, 1, userProfiles); + userProfiles = createUsersForTest(); + generateTxns(TEST_TXN_FILENAME, 500000, userProfiles); + } + + public static void printTxns(String txnFilename) { + System.out.println("Loading transactions from '" + txnFilename + "'"); + List allTxns = FraudDataUtils + .loadTransactions(txnFilename); + System.out.println("Printing loaded transactions:"); + for (Transaction e : allTxns) { + System.out.println(e); + } + } +} diff --git a/src/org/yooreeka/examples/fraud/util/TransactionSetProfile.java 
b/src/org/yooreeka/examples/fraud/util/TransactionSetProfile.java new file mode 100644 index 0000000..a4f8061 --- /dev/null +++ b/src/org/yooreeka/examples/fraud/util/TransactionSetProfile.java @@ -0,0 +1,145 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.fraud.util; + +/** + * Configuration properties to control generation of user transactions. 
+ */ +public class TransactionSetProfile { + + /* + * Identifies Credit Card User. + */ + private int userId; + + private int nTxns; + + /* + * Mean value for transaction amount. + */ + private double txnAmtMean; + + /* + * Standard deviation for transaction amount. + */ + private double txnAmtStd; + + /* + * Location coordinates. + */ + private int locationMinX; + private int locationMaxX; + private int locationMinY; + private int locationMaxY; + + /* + * Descriptions that will be used for the transactions of this profile; + * the profile may describe valid or fraud transactions. + */ + private String[] txnDescriptions; + + private boolean isFraud; + + public TransactionSetProfile() { + // empty + } + + public int getLocationMaxX() { + return locationMaxX; + } + + public int getLocationMaxY() { + return locationMaxY; + } + + public int getLocationMinX() { + return locationMinX; + } + + public int getLocationMinY() { + return locationMinY; + } + + public int getNTxns() { + return nTxns; + } + + public double getTxnAmtMean() { + return txnAmtMean; + } + + public double getTxnAmtStd() { + return txnAmtStd; + } + + public String[] getTxnDescriptions() { + return txnDescriptions; + } + + public int getUserId() { + return userId; + } + + public boolean isFraud() { + return isFraud; + } + + public void setFraud(boolean isFraud) { + this.isFraud = isFraud; + } + + public void setLocations(int minX, int minY, int maxX, int maxY) { + this.locationMinX = minX; + this.locationMinY = minY; + this.locationMaxX = maxX; + this.locationMaxY = maxY; + } + + public void setNTxns(int txns) { + nTxns = txns; + } + + public void setTxnAmtMean(double txnAmtMean) { + this.txnAmtMean = txnAmtMean; + } + + public void setTxnAmtStd(double txnAmtStd) { + this.txnAmtStd = txnAmtStd; + } + + public void setTxnDescriptions(String[] txnDescriptions) { + this.txnDescriptions = txnDescriptions; + } + + public void setUserId(int userId) { + this.userId = userId; + } + +} diff --git a/src/org/yooreeka/examples/fraud/util/UserStatistics.java 
b/src/org/yooreeka/examples/fraud/util/UserStatistics.java new file mode 100644 index 0000000..47983c7 --- /dev/null +++ b/src/org/yooreeka/examples/fraud/util/UserStatistics.java @@ -0,0 +1,153 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.fraud.util; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.yooreeka.examples.fraud.data.TransactionLocation; + +/** + * Holds user-specific statistics that are calculated from training data. + */ +public class UserStatistics implements java.io.Serializable { + + private static final long serialVersionUID = -7537387975282866317L; + + private int userId; + private double txnAmtMin; + private double txnAmtMax; + private Map descriptionTokensMap; + private TransactionLocation locationCentroid; + private double locationMinX; + private double locationMaxX; + private double locationMinY; + private double locationMaxY; + + public UserStatistics() { + descriptionTokensMap = new HashMap(); + } + + public Set getDescriptions() { + return descriptionTokensMap.keySet(); + } + + public String[] getDescriptionTokens(String d) { + return this.descriptionTokensMap.get(d); + } + + public TransactionLocation getLocationCentroid() { + return locationCentroid; + } + + public double getLocationMaxX() { + return locationMaxX; + } + + public double getLocationMaxY() { + return locationMaxY; + } + + public double getLocationMinX() { + return locationMinX; + } + + public double getLocationMinY() { + return locationMinY; + } + + public Double getTxnAmtMax() { + return txnAmtMax; + } + + public Double getTxnAmtMin() { + return txnAmtMin; + } + + public int getUserId() { + return userId; + } + + public void setDescriptions(Set descriptions) { + descriptionTokensMap.clear(); + for (String d : descriptions) { + this.descriptionTokensMap.put(d, null); + } + } + + public void setDescriptionTokens(String d, String[] tokens) { + this.descriptionTokensMap.put(d, tokens); + } + + public void setLocationCentroid(TransactionLocation locationCentroid) { + this.locationCentroid = locationCentroid; + } + + public void setLocationMaxX(double locationMaxX) { + this.locationMaxX = locationMaxX; + } + + public void 
setLocationMaxY(double locationMaxY) { + this.locationMaxY = locationMaxY; + } + + public void setLocationMinX(double locationMinX) { + this.locationMinX = locationMinX; + } + + public void setLocationMinY(double locationMinY) { + this.locationMinY = locationMinY; + } + + public void setTxnAmtMax(Double txnAmountMax) { + this.txnAmtMax = txnAmountMax; + } + + public void setTxnAmtMin(Double txnAmountMin) { + this.txnAmtMin = txnAmountMin; + } + + public void setUserId(int userId) { + this.userId = userId; + } + + @Override + public String toString() { + return "[userId=" + userId + ", txnAmtMin=" + txnAmtMin + + ", txnAmtMax=" + txnAmtMax + ", locationMinX=" + locationMinX + + ", locationMaxX=" + locationMaxX + ", locationMinY=" + + locationMinY + ", locationMaxY=" + locationMaxY + + ", descriptions=" + descriptionTokensMap.keySet().toString() + + ", locationCentroid=" + locationCentroid.toString() + "]"; + } + +} diff --git a/src/org/yooreeka/examples/fraud/util/UserStatisticsCalculator.java b/src/org/yooreeka/examples/fraud/util/UserStatisticsCalculator.java new file mode 100644 index 0000000..4dc95b3 --- /dev/null +++ b/src/org/yooreeka/examples/fraud/util/UserStatisticsCalculator.java @@ -0,0 +1,164 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.fraud.util; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.yooreeka.examples.fraud.data.Transaction; +import org.yooreeka.examples.fraud.data.TransactionDataset; +import org.yooreeka.examples.fraud.data.TransactionLocation; + +public class UserStatisticsCalculator { + + public Map calculateStatistics( + TransactionDataset trainingData) { + + Map statsByUserMap = new HashMap(); + + List users = trainingData.getUsers(); + + for (Integer userId : users) { + List userTxns = trainingData.findUserTxns(userId); + + UserStatistics userProps = calculateUserProperties(userId, userTxns); + + statsByUserMap.put(userId, userProps); + } + + return statsByUserMap; + + } + + private UserStatistics calculateUserProperties(Integer userId, + List userTxns) { + + UserStatistics props = new UserStatistics(); + + props.setUserId(userId); + + /* + * Unique descriptions of non-fraud transactions from training set for + * this user. 
+ */ + Set descriptions = new HashSet(); + + /* + * Total number of non-fraud transactions from training set for this + * user. + */ + int nonFraudTxnCount = 0; + + /* + * All locations of non-fraud transaction from training set for this + * user. + */ + List locations = new ArrayList(); + + Double minAmount = null; + Double maxAmount = null; + + Double locationMinX = null; + Double locationMaxX = null; + Double locationMinY = null; + Double locationMaxY = null; + + for (Transaction t : userTxns) { + if (t.isFraud()) { + // do not use fraud transactions to calculate user statistics + } else { + nonFraudTxnCount++; + + descriptions.add(t.getDescription()); + + locations.add(t.getLocation()); + + double x = t.getLocation().getX(); + double y = t.getLocation().getY(); + + // update min/max values for location + if (locationMinX == null || x < locationMinX) { + locationMinX = x; + } + if (locationMinY == null || y < locationMinY) { + locationMinY = y; + } + if (locationMaxX == null || x > locationMaxX) { + locationMaxX = x; + } + if (locationMaxY == null || y > locationMaxY) { + locationMaxY = y; + } + + double amt = t.getAmount(); + + if (minAmount == null || amt < minAmount) { + minAmount = amt; + } + + if (maxAmount == null || amt > maxAmount) { + maxAmount = amt; + } + + } + } + + TransactionLocation locationCentroid = centroid(locations); + props.setDescriptions(descriptions); + props.setLocationCentroid(locationCentroid); + props.setTxnAmtMin(minAmount); + props.setTxnAmtMax(maxAmount); + props.setLocationMaxX(locationMaxX); + props.setLocationMinX(locationMinX); + props.setLocationMaxY(locationMaxY); + props.setLocationMinY(locationMinY); + + return props; + } + + private TransactionLocation centroid(List locations) { + double x = 0.0; + double y = 0.0; + double n = locations.size(); + + for (TransactionLocation location : locations) { + x += location.getX(); + y += location.getY(); + } + + return new TransactionLocation(x / n, y / n); + } + +} diff --git 
a/src/org/yooreeka/examples/newsgroups/NewsCrawler.java b/src/org/yooreeka/examples/newsgroups/NewsCrawler.java new file mode 100644 index 0000000..55a8bf5 --- /dev/null +++ b/src/org/yooreeka/examples/newsgroups/NewsCrawler.java @@ -0,0 +1,195 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.newsgroups; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.util.internet.crawling.core.BasicWebCrawler; +import org.yooreeka.util.internet.crawling.core.CrawlData; +import org.yooreeka.util.internet.crawling.core.URLFilter; +import org.yooreeka.util.internet.crawling.core.URLNormalizer; + +/** + * A basic news crawler. + * + * Remember to call setOffline(true) if you want to crawl the local + * files. + * + * @author Babis Marmanis + */ +public class NewsCrawler { + + public static final int DEFAULT_MAX_DEPTH = 1; + public static final int DEFAULT_MAX_DOCS = 1000; + + private BasicWebCrawler webCrawler; + + private String crawlDataDir; + + private int maxDepth = DEFAULT_MAX_DEPTH; + + private int maxDocs = DEFAULT_MAX_DOCS; + + private List seedUrls; + + /** + * This variable determines whether we will crawl the Internet or local + * files. It defaults to false (crawl the Internet); call + * setOffline(true) if you want to crawl the local files. + */ + private boolean isOffline = false; + + /* + * Directory that contains "previously unseen" documents. 
+ */ + public static final String TEST_FILES_DIR_CH7 = YooreekaConfigurator + .getHome() + "/data/ch07/test"; + + public NewsCrawler(String rootDir, int maxDepth, int maxDocs) { + + this.crawlDataDir = buildUniqueDirectoryName(rootDir); + + this.maxDepth = maxDepth; + + this.maxDocs = maxDocs; + + seedUrls = new ArrayList(); + + webCrawler = new BasicWebCrawler(crawlDataDir); + + } + + public void addSeedUrl(String val) { + URLNormalizer urlNormalizer = new URLNormalizer(); + seedUrls.add(urlNormalizer.normalizeUrl(val)); + } + + private String buildUniqueDirectoryName(String rootDir) { + return rootDir + System.getProperty("file.separator") + "crawl-" + + System.currentTimeMillis(); + } + + public CrawlData getCrawlData() { + return webCrawler.getCrawlData(); + } + + /** + * @return the crawl data directory, i.e. the root directory followed by + * a unique "crawl-" plus timestamp subdirectory name + */ + public String getCrawlDataDir() { + return crawlDataDir; + } + + public List getSeedUrls() { + return seedUrls; + } + + /** + * @return true if the crawler is set to crawl local files instead of + * the Internet + */ + public boolean isOffline() { + return isOffline; + } + + private List loadFileUrls(String dir) { + + List fileUrls = new ArrayList(); + + File dirFile = new File(dir); + + File[] docs = dirFile.listFiles(new FilenameFilter() { + public boolean accept(File dir, String name) { + return name.endsWith(".html"); + } + }); + + try { + for (File f : docs) { + URL url = f.toURI().toURL(); + fileUrls.add(url.toExternalForm()); + } + } catch (IOException e) { + throw new RuntimeException( + "Error while converting filename into URL: ", e); + } + + return fileUrls; + } + + public void run() { + + webCrawler.addSeedUrls(getSeedUrls()); + + URLFilter urlFilter = new URLFilter(); + + if (isOffline()) { + urlFilter.setAllowFileUrls(true); + urlFilter.setAllowHttpUrls(false); + } else { + urlFilter.setAllowFileUrls(false); + urlFilter.setAllowHttpUrls(true); + } + webCrawler.setURLFilter(urlFilter); + + long t0 = System.currentTimeMillis(); + + /* run crawl - crawler will fetch and parse the documents */ + 
webCrawler.fetchAndProcess(maxDepth, maxDocs); + + System.out.println("Timer (s): [Crawler processed data] --> " + + (System.currentTimeMillis() - t0) * 0.001); + } + + public void setAllSeedUrls() { + + seedUrls.clear(); + + List fileUrls = loadFileUrls(TEST_FILES_DIR_CH7); + + for (String url : fileUrls) { + addSeedUrl(url); + } + } + + /** + * @param isOffline + * the isOffline to set + */ + public void setOffline(boolean isOffline) { + this.isOffline = isOffline; + } +} diff --git a/src/org/yooreeka/examples/recommender/MovieLensRMSESample.java b/src/org/yooreeka/examples/recommender/MovieLensRMSESample.java new file mode 100644 index 0000000..d819fa0 --- /dev/null +++ b/src/org/yooreeka/examples/recommender/MovieLensRMSESample.java @@ -0,0 +1,61 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.recommender; + +import org.yooreeka.algos.reco.collab.data.MovieLensData; +import org.yooreeka.algos.reco.collab.data.MovieLensDataset; +import org.yooreeka.algos.reco.collab.evaluation.RMSEEstimator; +import org.yooreeka.algos.reco.collab.recommender.MovieLensDelphi; + +public class MovieLensRMSESample { + + public static void main(String[] args) throws Exception { + + int testSize = Integer.parseInt(args[0]); + + MovieLensDataset ds = MovieLensData.createDataset(testSize); + + // Create an instance of our recommender + MovieLensDelphi delphi = new MovieLensDelphi(ds); + + // Create an instance of the RMSE estimator + RMSEEstimator rmseEstimator = new RMSEEstimator(); + + // Calculate the RMSE + // rmseEstimator.calculateRMSE(delphi); + + // Compare RMSEs + for (int i = 0; i < 25; i++) { + delphi.setSimilarityThreshold(0.05d + i * 0.01d); + rmseEstimator.compareRMSEs(delphi); + } + } +} diff --git a/src/org/yooreeka/examples/recommender/RatingGrapher.java b/src/org/yooreeka/examples/recommender/RatingGrapher.java new file mode 100644 index 0000000..f0698c0 --- /dev/null +++ b/src/org/yooreeka/examples/recommender/RatingGrapher.java @@ -0,0 +1,174 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book 
"Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.recommender; + +import java.io.File; +import java.util.Collection; + +import org.yooreeka.algos.reco.collab.data.MovieLensDataset; +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.model.Item; +import org.yooreeka.algos.reco.collab.model.Rating; +import org.yooreeka.algos.reco.collab.model.User; +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.util.gui.XyGui; + +public class RatingGrapher { + + private static Dataset getMovieLensData() { + String dataDir = YooreekaConfigurator + .getProperty(YooreekaConfigurator.MOVIELENS_DATA_DIR); + File users = new File(dataDir, MovieLensDataset.USERS_FILENAME); + File items = new File(dataDir, MovieLensDataset.ITEMS_FILENAME); + File ratings = new File(dataDir, MovieLensDataset.RATINGS_FILENAME); + Dataset ds = new MovieLensDataset("MovieLensDataset", users, items, + ratings); + return ds; + } + + public static void main(String[] args) { + // RatingGrapher.plotAverageItemRating(); + // RatingGrapher.plotAverageUserRating(); + RatingGrapher.plotRatingsDistribution(); + + } + + /** + * Plots average item rating for MovieLens dataset. + */ + public static void plotAverageItemRating() { + Dataset ds = getMovieLensData(); + Collection items = ds.getItems(); + double[] x = new double[items.size()]; + double[] y = new double[items.size()]; + int i = 0; + for (Item item : items) { + x[i] = item.getId(); + y[i] = item.getAverageRating(); + i++; + } + + XyGui gui = new XyGui(ds.getName(), x, y); + gui.plot(); + } + + /** + * Plots average user rating for MovieLens dataset. 
+ */ + public static void plotAverageUserRating() { + Dataset ds = getMovieLensData(); + Collection users = ds.getUsers(); + double[] x = new double[users.size()]; + double[] y = new double[users.size()]; + int i = 0; + for (User user : users) { + x[i] = user.getId(); + y[i] = user.getAverageRating(); + i++; + } + + XyGui gui = new XyGui(ds.getName(), x, y); + gui.plot(); + } + + public static void plotNumberOfRatingsPerItem() { + Dataset ds = getMovieLensData(); + Collection items = ds.getItems(); + double[] x = new double[items.size()]; + double[] y = new double[items.size()]; + int i = 0; + for (Item item : items) { + x[i] = item.getId(); + y[i] = item.getAllRatings().size(); + i++; + } + + XyGui gui = new XyGui(ds.getName(), x, y); + gui.plot(); + } + + public static void plotNumberOfRatingsPerUser() { + Dataset ds = getMovieLensData(); + Collection users = ds.getUsers(); + double[] x = new double[users.size()]; + double[] y = new double[users.size()]; + int i = 0; + for (User user : users) { + x[i] = user.getId(); + y[i] = user.getAllRatings().size(); + i++; + } + + XyGui gui = new XyGui(ds.getName(), x, y); + gui.plot(); + } + + public static void plotRatingsDistribution() { + Dataset ds = getMovieLensData(); + plotRatingsDistribution( + "Ratings for all items by all users, n=" + ds.getRatingsCount(), + ds.getRatings()); + } + + private static void plotRatingsDistribution(String plotName, + Collection ratings) { + double[] x = { 1, 2, 3, 4, 5 }; + double[] y = { 0.0, 0.0, 0.0, 0.0, 0.0 }; + + if (ratings != null && ratings.size() > 0) { + for (Rating r : ratings) { + y[r.getRating() - 1]++; + } + + int nRatings = ratings.size(); + for (int i = 0, n = x.length; i < n; i++) { + y[i] = y[i] / nRatings; + } + } + XyGui gui = new XyGui(plotName, x, y); + gui.plot(); + } + + public static void plotRatingsDistributionForItem(int itemId) { + Dataset ds = getMovieLensData(); + Collection ratings = ds.getItem(itemId).getAllRatings(); + plotRatingsDistribution("Ratings 
distribution for item: " + itemId + + ", n=" + ratings.size(), ratings); + } + + public static void plotRatingsDistributionForUser(int userId) { + Dataset ds = getMovieLensData(); + Collection ratings = ds.getUser(userId).getAllRatings(); + plotRatingsDistribution("Ratings distribution for user: " + userId + + ", n=" + ratings.size(), ratings); + } + +} diff --git a/src/org/yooreeka/examples/recommender/Recommender.java b/src/org/yooreeka/examples/recommender/Recommender.java new file mode 100644 index 0000000..1ed7cdc --- /dev/null +++ b/src/org/yooreeka/examples/recommender/Recommender.java @@ -0,0 +1,119 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.recommender; + +import java.io.File; +import java.util.List; + +import org.yooreeka.algos.reco.collab.data.MovieLensDataset; +import org.yooreeka.algos.reco.collab.model.Dataset; +import org.yooreeka.algos.reco.collab.model.RecommendationType; +import org.yooreeka.algos.reco.collab.recommender.Delphi; +import org.yooreeka.algos.reco.collab.recommender.PredictedItemRating; + +/** + * @deprecated not used at the moment. + */ +@Deprecated +class Recommender { + + // private static final Logger logger = Logger.getLogger(Recommender.class); + + public static void main(String[] args) throws Exception { + Recommender m = new Recommender(args[0]); + boolean useSimilarityCacheWhenAvailable = true; + m.recommendOnMovieLens(useSimilarityCacheWhenAvailable); + } + + private Dataset dataset; + + private Recommender(String dataDir) { + // Load MovieLens dataset + File users = new File(dataDir, MovieLensDataset.USERS_FILENAME); + File items = new File(dataDir, MovieLensDataset.ITEMS_FILENAME); + File ratings = new File(dataDir, MovieLensDataset.RATINGS_FILENAME); + this.dataset = new MovieLensDataset("MovieLensDataset", users, items, + ratings); + } + + private void printFirstN(List sortedRecommendations, + int printNum) { + for (int i = 0, n = sortedRecommendations.size(); i < n && i < printNum; i++) { + System.out.println(sortedRecommendations.get(i)); + } + } + + private void printMinMax(List c) { + int minId = 0; + double minIdRating = 6.0; + int maxId = 0; + double maxIdRating = 0.0; + for (PredictedItemRating r : c) { + 
if (r.getRating() < minIdRating) { + minId = r.getItemId(); + minIdRating = r.getRating(); + } + if (r.getRating() > maxIdRating) { + maxId = r.getItemId(); + maxIdRating = r.getRating(); + } + } + System.out.println("minId=" + minId + ",minIdRating=" + maxIdRating + + ",maxId=" + maxId + ",maxIdRating=" + maxIdRating); + } + + private void recommendOnMovieLens(boolean useSimilarityCache) + throws Exception { + + long start = System.currentTimeMillis(); + Delphi delphi = new Delphi(dataset, + RecommendationType.ITEM_PENALTY_BASED, useSimilarityCache); + System.out.println("Time:" + (System.currentTimeMillis() - start) + / 1000 + "(sec)"); + List r = delphi.recommend(4); + System.out.println("4: size: " + r.size()); + printMinMax(r); + printFirstN(r, 3); + r = delphi.recommend(3); + System.out.println("3: size: " + r.size()); + printMinMax(r); + printFirstN(r, 3); + r = delphi.recommend(100); + System.out.println("100: size: " + r.size()); + printMinMax(r); + printFirstN(r, 3); + r = delphi.recommend(50); + System.out.println("50: size: " + r.size()); + printMinMax(r); + printFirstN(r, 3); + } + +} diff --git a/src/org/yooreeka/examples/search/DocRank.java b/src/org/yooreeka/examples/search/DocRank.java new file mode 100644 index 0000000..df50dd0 --- /dev/null +++ b/src/org/yooreeka/examples/search/DocRank.java @@ -0,0 +1,57 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.search; + +import org.yooreeka.algos.search.ranking.DocRankMatrixBuilder; +import org.yooreeka.algos.search.ranking.PageRankMatrixH; +import org.yooreeka.algos.search.ranking.Rank; + +/** + * A PageRank-like algorithm for documents. 
+ * + * @author Babis Marmanis + * + */ +public class DocRank extends Rank { + + DocRankMatrixBuilder docRankBuilder; + + public DocRank(String luceneIndexDir, int termsToKeep) { + docRankBuilder = new DocRankMatrixBuilder(luceneIndexDir); + docRankBuilder.setTermsToKeep(termsToKeep); + docRankBuilder.run(); + } + + @Override + public PageRankMatrixH getH() { + return docRankBuilder.getH(); + } +} diff --git a/src/org/yooreeka/examples/search/LuceneIndexer.java b/src/org/yooreeka/examples/search/LuceneIndexer.java new file mode 100644 index 0000000..66e0c73 --- /dev/null +++ b/src/org/yooreeka/examples/search/LuceneIndexer.java @@ -0,0 +1,87 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.search; + +import java.io.File; +import java.io.IOException; + +import org.yooreeka.algos.search.lucene.LuceneIndexBuilder; +import org.yooreeka.util.internet.crawling.core.CrawlData; +import org.yooreeka.util.internet.crawling.core.CrawlDataProcessor; +import org.yooreeka.util.internet.crawling.util.FileUtils; + +public class LuceneIndexer { + + private String baseDir; + + private String luceneIndexDir; + + public LuceneIndexer(String dir) { + + baseDir = dir; + luceneIndexDir = baseDir + System.getProperty("file.separator") + + "lucene-index"; + } + + public String getLuceneDir() { + + return luceneIndexDir; + } + + public void run() { + + // load existing data + CrawlData crawlData = new CrawlData(baseDir); + crawlData.init(); + + File luceneIndexRootDir = new File(getLuceneDir()); + + // Delete the index directory, if it exists + FileUtils.deleteDir(luceneIndexRootDir); + luceneIndexRootDir.mkdirs(); + + CrawlDataProcessor luceneIndexBuilder = null; + try { + luceneIndexBuilder = new LuceneIndexBuilder(luceneIndexRootDir, + crawlData); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + System.out.print("Starting the indexing ... "); + + luceneIndexBuilder.run(); + + System.out.println("Indexing completed! 
\n"); + } + +} diff --git a/src/org/yooreeka/examples/search/MySearcher.java b/src/org/yooreeka/examples/search/MySearcher.java new file mode 100644 index 0000000..c66a174 --- /dev/null +++ b/src/org/yooreeka/examples/search/MySearcher.java @@ -0,0 +1,360 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.search; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.queryparser.flexible.core.QueryNodeException; +import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.yooreeka.algos.search.data.SearchResult; +import org.yooreeka.algos.search.lucene.LuceneIndexBuilder; +import org.yooreeka.algos.search.ranking.Rank; +import org.yooreeka.algos.taxis.bayesian.NaiveBayes; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.util.internet.behavior.UserClick; +import org.yooreeka.util.internet.behavior.UserQuery; + +public class MySearcher { + + /** + * An arbitrary small value + */ + public static final double EPSILON = 0.0001; + + private static final String PRETTY_LINE = "_______________________________________________________________________"; + + private File indexFile; + private NaiveBayes learner = null; + + private boolean verbose = true; + + public MySearcher(String indexDir) { + indexFile = new File(indexDir); + } + + public boolean isVerbose() { + return verbose; + } + + private void printResults(String header, String query, + SearchResult[] values, boolean showDocTitle) { + + if (verbose) { + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + + boolean printEntrySeparator = false; + if (showDocTitle) { // multiple lines per entry + printEntrySeparator = true; + } + + pw.print("\n"); + pw.println(header); + if (query != null) { + pw.println(query); + } + pw.print("\n"); + for (int i = 0, n = values.length; i < n; i++) { + if (values[i] != null) { + if 
(showDocTitle) { + pw.printf("Document Title: %s\n", values[i].getTitle()); + } + pw.printf( + "Document URL: %-46s --> Relevance Score: %.15f\n", + values[i].getUrl(), values[i].getScore()); + if (printEntrySeparator) { + pw.printf(PRETTY_LINE); + pw.printf("\n"); + } + } else { + pw.printf("Document: %s\n", + "Not available, values[i] is NULL"); + } + } + if (!printEntrySeparator) { + pw.print(PRETTY_LINE); + } + + System.out.println(sw.toString()); + } + } + + public SearchResult[] search(String query, int numberOfMatches) { + + SearchResult[] docResults = null; + + IndexSearcher is = null; + + Directory dir = null; + try { + dir = FSDirectory.open(indexFile); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + DirectoryReader dirReader = null; + try { + + dirReader = DirectoryReader.open(dir); + is = new IndexSearcher(dirReader); + + } catch (IOException ioX) { + System.out.println("ERROR: " + ioX.getMessage()); + } + + StandardQueryParser queryParserHelper = new StandardQueryParser(); + Query q = null; + + try { + + q = queryParserHelper.parse(query, + LuceneIndexBuilder.INDEX_FIELD_CONTENT); + + } catch (QueryNodeException e) { + e.printStackTrace(); + } + + TopDocs hits = null; + try { + hits = is.search(q, numberOfMatches); + + docResults = new SearchResult[hits.scoreDocs.length]; + + for (int i = 0; i < hits.scoreDocs.length; i++) { + + Document hitDoc = is.doc(hits.scoreDocs[i].doc); + + docResults[i] = new SearchResult(hitDoc.get("docid"), + hitDoc.get("doctype"), hitDoc.get("title"), + hitDoc.get("url"), hits.scoreDocs[i].score); + } + + dirReader.close(); + dir.close(); + + } catch (IOException ioX) { + System.out.println("ERROR: " + ioX.getMessage()); + } catch (Exception e) { + e.printStackTrace(); + } + + String header = "Search results using Lucene index scores:"; + boolean showTitle = true; + printResults(header, "Query: " + query, docResults, showTitle); + + return docResults; + } + + /** + * A method 
that combines the score of an index based search and the score + * of the PageRank algorithm to achieve better relevance results. + */ + public SearchResult[] search(String query, int numberOfMatches, Rank pR) { + + SearchResult[] docResults = search(query, numberOfMatches); + + String url; + + int n = pR.getH().getSize(); + + /** + * TODO: 2.3 -- The PageRank scaling factor m (Book Section 2.3) + * + * When the number of pages in your graph are few, the PageRank values + * need some boosting. As the number of pages increases m approaches the + * value 1 quickly because 1/n goes to zero. + */ + double m = 1 - (double) 1 / n; + + // actualNumberOfMatches <= numberOfMatches + int i = 0; + + while (i < docResults.length && docResults[i] != null) { + + url = docResults[i].getUrl(); + + double hScore = docResults[i].getScore() + * Math.pow(pR.getPageRank(url), m); + + // Update the score of the results + docResults[i].setScore(hScore); + + i++; + } + + // sort results by score + SearchResult.sortByScore(docResults); + + String header = "Search results using combined Lucene scores and page rank scores:"; + boolean showTitle = false; + printResults(header, "Query: " + query, docResults, showTitle); + + return docResults; + } + + /** + * A method that combines the score of an index based search and the score + * of the PageRank algorithm to achieve better relevance results, while + * personalizing the result set based on past user clicks on the same or + * similar queries. + * + * NOTE: You would typically refactor all these search methods in order to + * consider it production quality code. Here, we repeat the code of the + * previous method, so that it is easier to read. 
+ * + * @param userID + * identifies the person who issues the query + * @param query + * is the whole query + * @param numberOfMatches + * defines the maximim number of desired matches + * @param pR + * the PageRank vector + * @return the result set + */ + public SearchResult[] search(UserQuery uQuery, int numberOfMatches, Rank pR) { + + SearchResult[] docResults = search(uQuery.getQueryString(), + numberOfMatches); + + String url; + + int docN = docResults.length; + + if (docN > 0) { + + int loop = (docN < numberOfMatches) ? docN : numberOfMatches; + + for (int i = 0; i < loop; i++) { + + url = docResults[i].getUrl(); + + UserClick uClick = new UserClick(uQuery, url); + + /** + * TODO: 2.6 -- Weighing the scores to meet your needs (Book + * Section 2.4.2) + * + * At this point, we have three scores of relevance. The + * relevance score that is based on the index search, the + * PageRank score, and the score that is based on the user's + * prior selections. There is no golden formula for everybody. + * Below we are selecting a formula that we think would make + * sense for most people. + * + * Feel free to change the formula, experiment with different + * weighting factors, to find out the choices that are most + * appropriate for your own site. 
+ * + */ + double indexScore = docResults[i].getScore(); + + double pageRankScore = pR.getPageRank(url); + + double userClickScore = 0.0; + + for (Concept bC : learner.getTset().getConceptSet()) { + if (bC.getName().equalsIgnoreCase(url)) { + userClickScore = learner.getProbability(bC, uClick); + } + } + + // Create the final score + double hScore; + + if (userClickScore == 0) { + + hScore = indexScore * pageRankScore * EPSILON; + + } else { + + hScore = indexScore * pageRankScore * userClickScore; + } + + // Update the score of the results + docResults[i].setScore(hScore); + + /* + * Uncomment this block to show the various scores in the + * BeanShell + * + * StringBuilder b = new StringBuilder(); + * + * System.out.println( + * "________________________________________________________________________________" + * ); + * + * b.append("Document : ").append(docResults[i].getUrl()). + * append("\n"); + * b.append("UserClick URL :").append(uClick.getUrl + * ()).append("\n"); b.append("\n"); + * b.append("Index score: ").append(indexScore).append(", "); + * b.append + * ("PageRank score: ").append(pageRankScore).append(", "); + * b.append("User click score: ").append(userClickScore); + * System.out.println(b.toString()); + */ + } + } + + // Sort array of results + SearchResult.sortByScore(docResults); + + String header = "Search results using combined Lucene scores, " + + "page rank scores and user clicks:"; + String query = "Query: user=" + uQuery.getUid() + ", query text=" + + uQuery.getQueryString(); + boolean showTitle = false; + printResults(header, query, docResults, showTitle); + + return docResults; + } + + public void setUserLearner(NaiveBayes nb) { + learner = nb; + } + + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + +} diff --git a/src/org/yooreeka/examples/search/PageRank.java b/src/org/yooreeka/examples/search/PageRank.java new file mode 100644 index 0000000..a571237 --- /dev/null +++ 
b/src/org/yooreeka/examples/search/PageRank.java @@ -0,0 +1,56 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.search; + +import org.yooreeka.algos.search.ranking.PageRankMatrixBuilder; +import org.yooreeka.algos.search.ranking.PageRankMatrixH; +import org.yooreeka.algos.search.ranking.Rank; +import org.yooreeka.util.internet.crawling.core.CrawlData; + +public class PageRank extends Rank { + + PageRankMatrixBuilder pageRankBuilder; + + public PageRank(CrawlData crawlData) { + try { + pageRankBuilder = new PageRankMatrixBuilder(crawlData); + pageRankBuilder.run(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + @Override + public PageRankMatrixH getH() { + return pageRankBuilder.getH(); + } + +} diff --git a/src/org/yooreeka/examples/spamfilter/EmailClassifier.java b/src/org/yooreeka/examples/spamfilter/EmailClassifier.java new file mode 100644 index 0000000..a1993f9 --- /dev/null +++ b/src/org/yooreeka/examples/spamfilter/EmailClassifier.java @@ -0,0 +1,247 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.spamfilter; + +import java.util.HashMap; +import java.util.Map; + +import org.yooreeka.algos.taxis.bayesian.NaiveBayes; +import org.yooreeka.algos.taxis.core.AttributeValue; +import org.yooreeka.algos.taxis.core.intf.Attribute; +import org.yooreeka.algos.taxis.core.intf.Concept; +import org.yooreeka.algos.taxis.core.intf.Instance; +import org.yooreeka.examples.spamfilter.data.Email; +import org.yooreeka.examples.spamfilter.data.EmailData; +import org.yooreeka.examples.spamfilter.data.EmailDataset; +import org.yooreeka.util.metrics.JaccardCoefficient; + +public class EmailClassifier extends NaiveBayes { + + private EmailDataset emailDataset; + private int topNTerms; + private boolean verbose = true; + private double jaccardThreshold = 0.25; + + public EmailClassifier(EmailDataset emailDataset, int topNTerms) { + super("EmailClassifier", emailDataset.getTrainingSet(topNTerms)); + this.emailDataset = emailDataset; + this.topNTerms = topNTerms; + } + + @Override + protected void calculateConditionalProbabilities() { + + p = new HashMap>(); + + for (Instance i : tSet.getInstances().values()) { + + // In this specific implementation we have exactly one attribute + // In general, you need a loop over the attributes + Attribute a = i.getAtrributes()[0]; + + Map aMap = p.get(i.getConcept()); + + if (aMap == null) { + aMap = new HashMap(); + 
p.put(i.getConcept(), aMap); + } + + /** + * TODO: 5.3 + */ + AttributeValue bestAttributeValue = findBestAttributeValue(aMap, a); + + if (bestAttributeValue != null) { + + bestAttributeValue.count(); + + } else { + AttributeValue aV = new AttributeValue(a.getValue()); + // register attribute as representative attribute + aMap.put(a, aV); + } + } + } + + public String classify(Email email) { + EmailInstance i = emailDataset.toEmailInstance(email, topNTerms); + Concept c = classify(i); + if (verbose) { + System.out.println("Classified " + email.getId() + " as " + + c.getName()); + } + return c.getName(); + } + + @Override + public Concept classify(Instance instance) { + return super.classify(instance); + } + + /* + * Finds best match for attribute value among existing attribute value + * representatives. + * + * @param aMap map of all attribute representatives. + * + * @param a new attribute to compare against + * + * @return representative attribute that is the best match for a new + * attribute or null if no satisfactory match was found. + */ + private AttributeValue findBestAttributeValue( + Map aMap, Attribute a) { + + JaccardCoefficient jaccardCoeff = new JaccardCoefficient(); + + String aValue = (String) a.getValue(); + String[] aTerms = aValue.split(" "); + Attribute bestMatch = null; + double bestSim = 0.0; + + /* + * Here we only check representative attribute values. Other attribute + * values associated with representative attribute values will be + * ignored by this implementation. 
+ */ + for (Attribute attr : aMap.keySet()) { + String attrValue = (String) attr.getValue(); + String[] attrTerms = attrValue.split(" "); + double sim = jaccardCoeff.similarity(aTerms, attrTerms); + if (sim > jaccardThreshold && sim > bestSim) { + bestSim = sim; + bestMatch = attr; + } + } + + return aMap.get(bestMatch); + } + + /** + * @return the jaccardThreshold + */ + public double getJaccardThreshold() { + return jaccardThreshold; + } + + @Override + public double getProbability(Instance i, Concept c) { + + double cP = 1; + + for (Attribute a : i.getAtrributes()) { + + if (a != null && attributeList.contains(a.getName())) { + + Map aMap = p.get(c); + + AttributeValue bestAttributeValue = findBestAttributeValue( + aMap, a); + + if (bestAttributeValue == null) { + + // the specific attribute value is not present for the + // current concept. + // Can you justify the following estimate? + // Can you think of a better choice? + cP *= ((double) 1 / (tSet.getSize() + 1)); + + } else { + + cP *= (bestAttributeValue.getCount() / conceptPriors.get(c)); + } + } + } + return (cP == 1) ? 
(double) 1 / tSet.getNumberOfConcepts() : cP; + } + + public void sample() { + + Email email; + // TRAINING SET + System.out.println("________________________________________________"); + System.out.println("Validating with emails from the training dataset"); + System.out.println("________________________________________________"); + email = emailDataset.findEmailById("biz-04.html"); + classify(email); + + email = emailDataset.findEmailById("usa-03.html"); + classify(email); + + // TEST SET + System.out.println("_______________________________________________"); + System.out.println("Testing with unseen emails"); + System.out.println("_______________________________________________"); + + EmailDataset testEmailDS = EmailData.createTestDataset(); + email = testEmailDS.findEmailById("biz-01.html"); + classify(email); + + email = testEmailDS.findEmailById("sport-01.html"); + classify(email); + + email = testEmailDS.findEmailById("usa-01.html"); + classify(email); + + email = testEmailDS.findEmailById("world-01.html"); + classify(email); + + email = testEmailDS.findEmailById("spam-biz-01.html"); + classify(email); + } + + /** + * @param jaccardThreshold + * the jaccardThreshold to set + */ + public void setJaccardThreshold(double jaccardThreshold) { + this.jaccardThreshold = jaccardThreshold; + } + + @Override + public boolean train() { + + if (emailDataset.getSize() == 0) { + System.out + .println("Can't train classifier - training dataset is empty."); + return false; + } + + for (String attrName : getTset().getAttributeNameSet()) { + trainOnAttribute(attrName); + } + + super.train(); + + return true; + } + +} diff --git a/src/org/yooreeka/examples/spamfilter/EmailInstance.java b/src/org/yooreeka/examples/spamfilter/EmailInstance.java new file mode 100644 index 0000000..85436c6 --- /dev/null +++ b/src/org/yooreeka/examples/spamfilter/EmailInstance.java @@ -0,0 +1,86 @@ +/* + * ________________________________________________________________________________________ + * 
+ * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.spamfilter; + +import java.util.Map; + +import org.yooreeka.algos.reco.collab.model.Content; +import org.yooreeka.algos.taxis.core.BaseConcept; +import org.yooreeka.algos.taxis.core.BaseInstance; +import org.yooreeka.algos.taxis.core.StringAttribute; +import org.yooreeka.examples.spamfilter.data.Email; + +/** + * Instance for classification. 
+ */ +public class EmailInstance extends BaseInstance { + + private static int DEFAULT_TOP_N_TERMS = 10; + + private String id; + + public EmailInstance(String emailCategory, Email email) { + this(emailCategory, email, DEFAULT_TOP_N_TERMS); + } + + public EmailInstance(String emailCategory, Email email, int topNTerms) { + super(); + this.id = email.getId(); + // email category is our concept/class + this.setConcept(new BaseConcept(emailCategory)); + + /** + * TODO: 5.3 -- Considering more attributes as part of the EmailInstance + * + * -- Separate "subject" and "body" -- timestamp -- "from" -- "to" -- + * "to" cardinality + */ + // extract top N terms from email content and subject + String text = email.getSubject() + " " + email.getTextBody(); + Content content = new Content(email.getId(), text, topNTerms); + Map tfMap = content.getTFMap(); + + attributes = new StringAttribute[1]; + + String attrName = "Email_Text_Attribute"; + String attrValue = ""; + for (Map.Entry tfEntry : tfMap.entrySet()) { + attrValue = attrValue + " " + tfEntry.getKey(); + } + attributes[0] = new StringAttribute(attrName, attrValue); + } + + @Override + public String toString() { + return id; + } + +} diff --git a/src/org/yooreeka/examples/spamfilter/data/Email.java b/src/org/yooreeka/examples/spamfilter/data/Email.java new file mode 100644 index 0000000..6432704 --- /dev/null +++ b/src/org/yooreeka/examples/spamfilter/data/Email.java @@ -0,0 +1,119 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.spamfilter.data; + +/** + * Represents one email document. + */ +public class Email { + + /* + * ID that we will use to identify email. 
+ */ + private String id; + + /* + * Email subject line + */ + private String subject; + + /* + * Email Text body + */ + private String textBody; + + private String from; + + private String to; + + int ruleFired = 0; + + public Email() { + // empty + } + + public String getFrom() { + return from; + } + + public String getId() { + return id; + } + + public int getRuleFired() { + return ruleFired; + } + + public String getSubject() { + return subject; + } + + public String getTextBody() { + return textBody; + } + + public String getTo() { + return to; + } + + public void setFrom(String from) { + this.from = from; + } + + public void setId(String id) { + this.id = id; + } + + public void setRuleFired(int ruleNum) { + System.out.println("Invoked " + this.getClass().getSimpleName() + + ".setRuleFired(" + ruleNum + "), current value ruleFired=" + + this.ruleFired + ", emailId: " + id); + this.ruleFired = ruleNum; + } + + public void setSubject(String subject) { + this.subject = subject; + } + + public void setTextBody(String textBody) { + this.textBody = textBody; + } + + public void setTo(String to) { + this.to = to; + } + + @Override + public String toString() { + return "id: " + id + "\n" + "from: " + from + "\n" + "to: " + to + "\n" + + "subject: " + subject + "\n" + textBody + "\n"; + } +} diff --git a/src/org/yooreeka/examples/spamfilter/data/EmailData.java b/src/org/yooreeka/examples/spamfilter/data/EmailData.java new file mode 100644 index 0000000..6da1161 --- /dev/null +++ b/src/org/yooreeka/examples/spamfilter/data/EmailData.java @@ -0,0 +1,223 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 
2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.examples.spamfilter.data; + +import java.io.BufferedInputStream; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.examples.spamfilter.EmailClassifier; +import org.yooreeka.util.parsing.common.ProcessedDocument; +import org.yooreeka.util.parsing.html.HTMLDocumentParser; + +public class EmailData { + + /* + * List of html files that we will treat as emails. 
+ */ + public static String[][] TRAINING_DATA = new String[][] { + { YooreekaConfigurator.getHome() + "/data/ch02/biz-02.html", + "A@sengerhost", "1@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/biz-03.html", + "B@sengerhost", "2@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/biz-04.html", + "C@sengerhost", "3@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/biz-05.html", + "D@sengerhost", "4@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/biz-06.html", + "E@sengerhost", "5@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/biz-07.html", + "F@sengerhost", "6@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/sport-02.html", + "G@sengerhost", "7@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/sport-03.html", + "H@sengerhost", "8@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/usa-02.html", + "I@sengerhost", "9@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/usa-03.html", + "J@sengerhost", "10@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/usa-04.html", + "K@sengerhost", "11@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/world-02.html", + "L@sengerhost", "12@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/world-03.html", + "M@sengerhost", "13@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/world-04.html", + "N@sengerhost", "14@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/world-05.html", + "O@sengerhost", "15@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/spam-biz-02.html", + "P@sengerhost", "16@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/spam-biz-03.html", + "Q@sengerhost", "17@host" } }; + + public static String[][] TEST_DATA = new String[][] { + { YooreekaConfigurator.getHome() + "/data/ch02/biz-01.html", + "aa@senderhost", "100@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/sport-01.html", + "bb@senderhost", "101@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/usa-01.html", + 
"cc@senderhost", "102@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/world-01.html", + "dd@senderhost", "103@host" }, + { YooreekaConfigurator.getHome() + "/data/ch02/spam-biz-01.html", + "friend@senderhost", "104@host" } }; + + public static EmailDataset createTestDataset() { + List allEmails = loadEmails(TEST_DATA); + return new EmailDataset(allEmails); + } + + public static EmailDataset createTrainingDataset() { + List allEmails = loadEmails(TRAINING_DATA); + return new EmailDataset(allEmails); + } + + public static Email loadEmailFromHtml(String htmlFile) { + + ProcessedDocument htmlDoc = processHtmlDoc(htmlFile); + Email email = new Email(); + email.setSubject(htmlDoc.getDocumentTitle()); + email.setTextBody(htmlDoc.getText()); + + return email; + } + + public static List loadEmails(String[][] allEmails) { + + List emailList = new ArrayList(); + for (String[] emailData : allEmails) { + String filename = emailData[0]; + Email email = loadEmailFromHtml(filename); + email.setFrom(emailData[1]); + email.setTo(emailData[2]); + // use filename as unique id + String id = filename.substring(filename.lastIndexOf("/") + 1); + email.setId(id); + + emailList.add(email); + } + + return emailList; + } + + public static void main(String[] args) { + // // Create and train classifier + // EmailDataset trainEmailDS = EmailData.createTrainingDataset(); + // EmailClassifier emailClassifier = new EmailClassifier(trainEmailDS, + // 10); + // emailClassifier.train(); + // + // // Let's classify some emails from training set. 
If we can't get them + // right + // // then we are in trouble :-) + // Email email = null; + // email = trainEmailDS.findEmailById("biz-04.html"); + // emailClassifier.classify(email); + // + // email = trainEmailDS.findEmailById("usa-03.html"); + // emailClassifier.classify(email); + // + // // Now, let's classify previously unseen emails + // + // EmailDataset testEmailDS = EmailData.createTestDataset(); + // email = testEmailDS.findEmailById("biz-01.html"); + // emailClassifier.classify(email); + // + // email = testEmailDS.findEmailById("sport-01.html"); + // emailClassifier.classify(email); + // + // email = testEmailDS.findEmailById("usa-01.html"); + // emailClassifier.classify(email); + // + // email = testEmailDS.findEmailById("world-01.html"); + // emailClassifier.classify(email); + // + // email = testEmailDS.findEmailById("spam-biz-01.html"); + // emailClassifier.classify(email); + + // Create and train classifier + EmailDataset trainEmailDS = EmailData.createTrainingDataset(); + EmailClassifier spamFilter = new EmailClassifier(trainEmailDS, 10); + spamFilter.train(); + + // Let's classify some emails from training set. 
If we can't get them + // right + // then we are in trouble :-) + Email email = null; + email = trainEmailDS.findEmailById("biz-04.html"); + spamFilter.classify(email); + + email = trainEmailDS.findEmailById("usa-03.html"); + spamFilter.classify(email); + + // Now, let's classify previously unseen emails + + EmailDataset testEmailDS = EmailData.createTestDataset(); + email = testEmailDS.findEmailById("biz-01.html"); + spamFilter.classify(email); + + email = testEmailDS.findEmailById("sport-01.html"); + spamFilter.classify(email); + + email = testEmailDS.findEmailById("usa-01.html"); + spamFilter.classify(email); + + email = testEmailDS.findEmailById("world-01.html"); + spamFilter.classify(email); + + email = testEmailDS.findEmailById("spam-biz-01.html"); + spamFilter.classify(email); + + } + + private static ProcessedDocument processHtmlDoc(String htmlFile) { + + ProcessedDocument doc = null; + try { + HTMLDocumentParser htmlParser = new HTMLDocumentParser(); + InputStream inputStream = new BufferedInputStream( + new FileInputStream(htmlFile)); + Reader reader = new InputStreamReader(inputStream, "UTF-8"); + doc = htmlParser.parse(reader); + } catch (Exception e) { + throw new RuntimeException("Failed to parse html from file: " + + htmlFile, e); + } + + return doc; + } +} diff --git a/src/org/yooreeka/examples/spamfilter/data/EmailDataset.java b/src/org/yooreeka/examples/spamfilter/data/EmailDataset.java new file mode 100644 index 0000000..3382ce0 --- /dev/null +++ b/src/org/yooreeka/examples/spamfilter/data/EmailDataset.java @@ -0,0 +1,137 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.examples.spamfilter.data; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.yooreeka.algos.taxis.core.TrainingSet; +import org.yooreeka.examples.spamfilter.EmailInstance; + +public class EmailDataset { + + private Map emails; + + // By default we set up an email dataset for binary classification + private boolean isBinary = true; + + public EmailDataset(List emailList) { + this.emails = new HashMap(emailList.size()); + for (Email e : emailList) { + emails.put(e.getId(), e); + } + } + + private List createEmailInstances(int topNTerms) { + List allInstances = new ArrayList(); + for (Email email : getEmails()) { + EmailInstance i = toEmailInstance(email, topNTerms); + allInstances.add(i); + } + return allInstances; + } + + public Email findEmailById(String id) { + return emails.get(id); + } + + private String getEmailCategory(Email email) { + + if (isBinary()) { + if (email.getId().startsWith("spam-")) { + return "SPAM"; + } else { + return "NOT SPAM"; + } + } else { + // relying id to have pattern: "biz-???", "world-???", ... + String[] parts = email.getId().split("-"); + if (parts.length < 2) { + throw new RuntimeException( + "Unsupported id format. 
Expected id format: '-???'"); + } + return parts[0].toUpperCase(); + } + } + + public List getEmails() { + return new ArrayList(emails.values()); + } + + public int getSize() { + return emails.size(); + } + + public TrainingSet getTrainingSet(int topNTerms) { + List allInstances = createEmailInstances(topNTerms); + EmailInstance[] instances = allInstances + .toArray(new EmailInstance[allInstances.size()]); + return new TrainingSet(instances); + } + + /** + * @return the isBinary + */ + public boolean isBinary() { + return isBinary; + } + + public void printAll() { + for (Map.Entry e : emails.entrySet()) { + Email email = e.getValue(); + System.out.println(email); + } + } + + public void printEmail(String id) { + Email e = findEmailById(id); + if (e != null) { + System.out.println(e.toString()); + } else { + System.out.println("Email not found (email id: '" + id + "')"); + } + } + + /** + * @param isBinary + * the isBinary to set + */ + public void setBinary(boolean isBinary) { + this.isBinary = isBinary; + } + + public EmailInstance toEmailInstance(Email email, int topNTerms) { + String emailCategory = getEmailCategory(email); + return new EmailInstance(emailCategory, email, topNTerms); + } +} diff --git a/src/org/yooreeka/util/C.java b/src/org/yooreeka/util/C.java new file mode 100644 index 0000000..89c70da --- /dev/null +++ b/src/org/yooreeka/util/C.java @@ -0,0 +1,64 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-2012 Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util; + +/** + * Various constants to avoid typing literals and values in the code. 
+ * + * @author Babis Marmanis + * + */ +public class C { + + /* + * NUMERICAL CONSTANTS + */ + public final static int ZERO_INT = 0; + public final static long ZERO_LONG = 0; + public final static double ZERO_DOUBLE = 0.0; + + public final static int ONE_INT = 1; + public final static long ONE_LONG = 1; + public final static double ONE_DOUBLE = 1.0; + + /* + * LITERAL CONSTANTS + */ + public static final String EMPTY_STRING=""; + public static final String LINE_FEED="\n"; + public static final String UNDERSCORE="_"; + public static final String DASH="-"; + public static final String SEMICOLON=";"; + public static final String COMMA=","; + public static final String COLON=":"; + public static final String DOT="."; + +} diff --git a/src/org/yooreeka/util/P.java b/src/org/yooreeka/util/P.java new file mode 100644 index 0000000..6a43ac1 --- /dev/null +++ b/src/org/yooreeka/util/P.java @@ -0,0 +1,57 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util; + +import java.nio.charset.Charset; + +/** + * + * @author Babis Marmanis + * + */ +public class P { + + /** + * Print a horizontal separator line (six groups of dashes separated by spaces). + */ + public static void hline() { + println("---------- ---------- ---------- ---------- ---------- ----------"); + } + + public static void main(String[] args) { + println(Charset.defaultCharset().displayName()); + println("" + P.class.getName()); + } + + public static void println(String s) { + System.out.println(s); + } +} diff --git a/src/org/yooreeka/util/gui/GraphGui.java b/src/org/yooreeka/util/gui/GraphGui.java new file mode 100644 index 0000000..ad01801 --- /dev/null +++ b/src/org/yooreeka/util/gui/GraphGui.java @@ -0,0 +1,152 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application.
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.util.gui; + +import java.awt.Color; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import javax.swing.BorderFactory; +import javax.swing.JFrame; +import javax.swing.JScrollPane; +import javax.swing.WindowConstants; + +import org.jgraph.JGraph; +import org.jgraph.graph.DefaultCellViewFactory; +import org.jgraph.graph.DefaultEdge; +import org.jgraph.graph.DefaultGraphCell; +import org.jgraph.graph.DefaultGraphModel; +import org.jgraph.graph.GraphConstants; +import org.jgraph.graph.GraphLayoutCache; +import org.jgraph.graph.GraphModel; + +public class GraphGui { + + private double nodeWidth = 500; + private double nodeHeight = 20; + + private JGraph graph = null; + + private Map nodeCells = new HashMap(); + private List edgeCells = new ArrayList(); + + public GraphGui() { + createGraph(); + } + + public void addEdge(String sourceNodeName, String targetNodeName) { + DefaultGraphCell sourceNodeCell = getNodeForEdge(sourceNodeName); + DefaultGraphCell targetNodeCell = getNodeForEdge(targetNodeName); + DefaultGraphCell edgeCell = createEdge(sourceNodeCell, targetNodeCell); + edgeCells.add(edgeCell); + } + + public void addNode(String name, String extraText, double x, double y) { + String nodeLabel = name; + if (extraText != null) { + nodeLabel += " (" + extraText + ")"; + } + DefaultGraphCell nodeCell = createCell(nodeLabel, x, y); + nodeCells.put(name, nodeCell); + } + + private DefaultGraphCell createCell(String name, double x, double y) { + DefaultGraphCell cell = new DefaultGraphCell(name); + GraphConstants.setBounds(cell.getAttributes(), new Rectangle2D.Double( + x, y, nodeWidth, nodeHeight)); + GraphConstants.setBorder(cell.getAttributes(), + BorderFactory.createRaisedBevelBorder()); + GraphConstants.setOpaque(cell.getAttributes(), true); + GraphConstants.setGradientColor(cell.getAttributes(), Color.orange); + 
cell.addPort(new Point2D.Double(0, 0)); + return cell; + } + + private DefaultGraphCell createEdge(DefaultGraphCell source, + DefaultGraphCell target) { + DefaultEdge edge = new DefaultEdge(); + source.addPort(); + edge.setSource(source.getChildAt(source.getChildCount() - 1)); + target.addPort(); + edge.setTarget(target.getChildAt(target.getChildCount() - 1)); + GraphConstants.setLabelAlongEdge(edge.getAttributes(), true); + GraphConstants.setLineEnd(edge.getAttributes(), + GraphConstants.ARROW_CLASSIC); + // GraphConstants.setRouting(edge.getAttributes(), + // GraphConstants.ROUTING_DEFAULT); + // GraphConstants.setRouting(edge.getAttributes(), + // GraphConstants.ROUTING_SIMPLE); + return edge; + } + + private void createGraph() { + GraphModel model = new DefaultGraphModel(); + GraphLayoutCache view = new GraphLayoutCache(model, + new DefaultCellViewFactory()); + graph = new JGraph(model, view); + } + + private DefaultGraphCell getNodeForEdge(String nodeName) { + DefaultGraphCell nodeCell = nodeCells.get(nodeName); + if (nodeCell == null) { + throw new RuntimeException("Node doesn't exist " + "(nodeName=" + + nodeName + ")."); + } + return nodeCell; + } + + private void insertAllCells() { + List allCells = new ArrayList(); + allCells.addAll(nodeCells.values()); + allCells.addAll(edgeCells); + + DefaultGraphCell[] cells = allCells + .toArray(new DefaultGraphCell[nodeCells.size()]); + + graph.getGraphLayoutCache().insert(cells); + graph.setEditable(false); + } + + public void showGraph() { + insertAllCells(); + JFrame frame = new JFrame(); + frame.getContentPane().add(new JScrollPane(graph)); + // frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + frame.setDefaultCloseOperation(WindowConstants.DISPOSE_ON_CLOSE); + frame.pack(); + frame.setVisible(true); + } + +} diff --git a/src/org/yooreeka/util/gui/XyGui.java b/src/org/yooreeka/util/gui/XyGui.java new file mode 100644 index 0000000..e5a6ae6 --- /dev/null +++ b/src/org/yooreeka/util/gui/XyGui.java @@ -0,0 
+1,203 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.util.gui; + +import java.awt.event.WindowEvent; + +import org.jfree.chart.ChartFactory; +import org.jfree.chart.ChartPanel; +import org.jfree.chart.JFreeChart; +import org.jfree.chart.plot.PlotOrientation; +import org.jfree.chart.util.ApplicationFrame; +import org.jfree.chart.util.RefineryUtilities; +import org.jfree.data.category.DefaultCategoryDataset; +import org.jfree.data.xy.XYSeries; +import org.jfree.data.xy.XYSeriesCollection; + +/** + * + * This is going to be a convenience class for doing basic XY plots. here is how + * it would be used within the Bean Shell interpreter: + * + * bsh % double[] x = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; bsh + * % double[] y = {1.0, 4.0, 9.0, 16.0, 20.0, 29.0, 35, 40., 42.0}; bsh % gui = + * new iweb2.util.gui.XyGui ("A plot",x,y); bsh % gui.plot(); + * + * @author Babis Marmanis + * + */ +public class XyGui extends ApplicationFrame { + + /** + * + */ + private static final long serialVersionUID = 2878334413514645876L; + + private StringBuilder errMsg; + private int loopInt; + + public XyGui(String title, double[] x, double[] y) { + + super(title); + + errMsg = new StringBuilder(); + setLoopInt(x.length); + + if (checkX(x) && checkY(x.length, y)) { + + XYSeries xydata = new XYSeries("X-Y Plot"); + + for (int i = 0; i < loopInt; i++) { + xydata.add(x[i], y[i]); + } + + XYSeriesCollection xycollection = new XYSeriesCollection(xydata); + + final JFreeChart chart = ChartFactory.createXYLineChart( + "XY Series", "X", "Y", xycollection, + PlotOrientation.VERTICAL, true, true, false); + + final ChartPanel chartPanel = new ChartPanel(chart); + chartPanel.setPreferredSize(new java.awt.Dimension(500, 270)); + setContentPane(chartPanel); + } else { + System.err.println(errMsg.toString()); + } + } + + /** + * @param title + * chart title + * @param nameForData1 + * identifier for a data group/series + * @param nameForData2 + * identifier for a data group/series + * @param items + * 
values/categories that correspond to data values + */ + public XyGui(String title, String nameForData1, String nameForData2, + String[] items, double[] data1, double[] data2) { + + super(title); + DefaultCategoryDataset dataset = new DefaultCategoryDataset(); + for (int i = 0, n = items.length; i < n; i++) { + dataset.addValue(data1[i], nameForData1, items[i]); + dataset.addValue(data2[i], nameForData2, items[i]); + } + + final JFreeChart chart = ChartFactory.createLineChart( + "User Similarity", "Items", "Rating", dataset, + PlotOrientation.VERTICAL, true, true, false); + + final ChartPanel chartPanel = new ChartPanel(chart); + chartPanel.setPreferredSize(new java.awt.Dimension(500, 270)); + setContentPane(chartPanel); + } + + private boolean checkX(double[] val) { + + boolean isOK = true; + + if (val == null || val.length <= 0) { + + errMsg.append("The array of data for the X-axis is null or does not contain data!"); + isOK = false; + } + + return isOK; + } + + private boolean checkY(int n, double[] val) { + + boolean isOK = true; + + if (val == null || val.length <= 0) { + errMsg.append("---------------------------------------------------------------------\n"); + errMsg.append("ERROR:\n"); + errMsg.append("The array of data for the Y-axis is null or does not contain data!"); + errMsg.append("---------------------------------------------------------------------\n"); + isOK = false; + } + + if (val.length > n) { + + errMsg.append("---------------------------------------------------------------------\n"); + errMsg.append("WARNING: \n"); + errMsg.append(" The length of the array for the Y-axis data is greater than \n"); + errMsg.append(" the length of the array for the X-axis data. 
\n"); + errMsg.append(" Only the first " + n + + " points will be considered in the plot."); + errMsg.append("---------------------------------------------------------------------\n"); + + } else if (val.length < n) { + + errMsg.append("---------------------------------------------------------------------\n"); + errMsg.append("WARNING:\n"); + errMsg.append(" The length of the array for the Y-axis data is less than \n"); + errMsg.append(" the length of the array for the X-axis data. \n"); + errMsg.append(" Only the first " + n + + " points of the X-will be considered in the plot."); + errMsg.append("---------------------------------------------------------------------\n"); + setLoopInt(val.length); + } + + return isOK; + } + + public void plot() { + this.pack(); + RefineryUtilities.centerFrameOnScreen(this); + this.setVisible(true); + } + + private void setLoopInt(int val) { + loopInt = val; + } + + /** + * Listens for the main window closing, and shuts down the application. + * + * @param event + * information about the window event. 
+ */ + @Override + public void windowClosing(WindowEvent event) { + if (event.getWindow() == this) { + dispose(); + + // Overriding the ApplicationFrame behavior + // Do not shutdown the JVM + // System.exit(0); + // ----------------------------------------- + } + } + +} diff --git a/src/org/yooreeka/util/internet/behavior/UserClick.java b/src/org/yooreeka/util/internet/behavior/UserClick.java new file mode 100644 index 0000000..66311d4 --- /dev/null +++ b/src/org/yooreeka/util/internet/behavior/UserClick.java @@ -0,0 +1,157 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.internet.behavior; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; + +import org.yooreeka.algos.taxis.core.BaseConcept; +import org.yooreeka.algos.taxis.core.BaseInstance; +import org.yooreeka.algos.taxis.core.StringAttribute; + +/** + * Auxiliary class that captures a user click. + * + * @author Babis Marmanis + * + */ +public class UserClick extends BaseInstance { + + UserQuery userQuery; + String url; + + public UserClick() { + super(); + } + + public UserClick(UserQuery uQ, String url) { + + super(); + + userQuery = uQ; + this.setConcept(new BaseConcept(url)); + + attributes = new StringAttribute[userQuery.getQueryTerms().length + 1]; + + attributes[0] = new StringAttribute("UserName", userQuery.getUid()); + + int j = 1; + for (String s : uQ.getQueryTerms()) { + attributes[j] = new StringAttribute("QueryTerm_" + j, s); + j++; + } + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final UserClick other = (UserClick) obj; + if (getUrl() == null) { + if (other.getUrl() != null) + return false; + } else if (!getUrl().equals(other.getUrl())) + return false; + if (userQuery == null) { + if (other.userQuery != null) + return false; + } else if (!userQuery.equals(other.userQuery)) + return false; + return true; + } + + /** + * The concept of a user click is its URL + * + * @return the url + */ + public String getUrl() { + return getConcept().getName(); 
+ } + + /** + * @return the userQuery + */ + public UserQuery getUserQuery() { + return userQuery; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + + ((getUrl() == null) ? 0 : getUrl().hashCode()); + result = prime * result + + ((userQuery == null) ? 0 : userQuery.hashCode()); + return result; + } + + @Override + public UserClick[] load(BufferedReader bR) throws IOException { + + ArrayList userClicks = new ArrayList(); + + String line; + boolean hasMoreLines = true; + + while (hasMoreLines) { + + line = bR.readLine(); + + if (line == null) { + + hasMoreLines = false; + + } else { + + String[] data = line.split(","); + + UserQuery uQ = new UserQuery(data[0], data[1]); + + UserClick userClick = new UserClick(uQ, data[2].substring(1, + data[2].length() - 1)); + + userClick.print(); + + userClicks.add(userClick); + } + } + + return userClicks.toArray(new UserClick[userClicks.size()]); + } + +} diff --git a/src/org/yooreeka/util/internet/behavior/UserQuery.java b/src/org/yooreeka/util/internet/behavior/UserQuery.java new file mode 100644 index 0000000..1a974f9 --- /dev/null +++ b/src/org/yooreeka/util/internet/behavior/UserQuery.java @@ -0,0 +1,159 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.internet.behavior; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; + +/** + * This is a class that encapsulates a personalized query + * + * @author Babis Marmanis + * + */ +public class UserQuery { + + private String uid; + private String queryString; + private String[] queryTerms; + private Query query; + + public UserQuery(String uid, String q) throws IOException { + + setUid(uid); + setQueryString(q); + + PhraseQuery query = new PhraseQuery(); + query.add(new Term("content", q)); + + Term[] terms = query.getTerms(); + queryTerms = new String[terms.length]; + + for (int i = 0; i < terms.length; i++) { + + queryTerms[i] = terms[i].text(); + } + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final UserQuery other = (UserQuery) obj; + if (queryString == null) { + if (other.queryString != null) + return false; + } else if 
(!queryString.equals(other.queryString)) + return false; + if (!Arrays.equals(queryTerms, other.queryTerms)) + return false; + if (uid == null) { + if (other.uid != null) + return false; + } else if (!uid.equals(other.uid)) + return false; + return true; + } + + public String getName() { + return UserQuery.class.getCanonicalName(); + } + + public Query getQuery() { + return query; + } + + /** + * @return the query + */ + public String getQueryString() { + return queryString; + } + + /** + * @return the queryTerms + */ + public String[] getQueryTerms() { + return queryTerms; + } + + /** + * @return the uid + */ + public String getUid() { + return uid; + } + + public UserQuery getValue() { + + return this; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + + ((queryString == null) ? 0 : queryString.hashCode()); + result = prime * result + Arrays.hashCode(queryTerms); + result = prime * result + ((uid == null) ? 0 : uid.hashCode()); + return result; + } + + public void setQuery(Query query) { + this.query = query; + } + + /** + * @param query + * the query to set + */ + public void setQueryString(String query) { + this.queryString = query; + } + + /** + * @param uid + * the uid to set + */ + public void setUid(String uid) { + this.uid = uid; + } + +} diff --git a/src/org/yooreeka/util/internet/crawling/FetchAndProcessCrawler.java b/src/org/yooreeka/util/internet/crawling/FetchAndProcessCrawler.java new file mode 100644 index 0000000..0342962 --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/FetchAndProcessCrawler.java @@ -0,0 +1,310 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book 
"Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.util.internet.crawling; + +import java.util.ArrayList; +import java.util.List; + +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.util.internet.crawling.core.BasicWebCrawler; +import org.yooreeka.util.internet.crawling.core.CrawlData; +import org.yooreeka.util.internet.crawling.core.URLFilter; +import org.yooreeka.util.internet.crawling.core.URLNormalizer; + +public class FetchAndProcessCrawler { + + public static final int DEFAULT_MAX_DEPTH = 3; + public static final int DEFAULT_MAX_DOCS = 1000; + + // INSTANCE VARIABLES + // A reference to the crawler + BasicWebCrawler webCrawler; + + // The location where we will store the fetched data + String rootDir; + + // Total number of crawlers + int numberOfCrawlers = 4; + + // total number of iterations + int maxDepth = DEFAULT_MAX_DEPTH; + + // max number of pages that will be fetched within every crawl/iteration. + int maxDocs = DEFAULT_MAX_DOCS; + + List seedUrls; + + URLFilter urlFilter; + + public FetchAndProcessCrawler(String dir, int maxDepth, int maxDocs) { + + rootDir = dir; + + // If the root directory is not set or if its length is zero + if (rootDir == null || rootDir.trim().length() == 0) { + + // Create a default location for storing the data, relative to the + // IWEB2_HOME location + rootDir = System.getProperty("iweb2.home") + + System.getProperty("file.separator") + "data"; + } + + rootDir = rootDir + System.getProperty("file.separator") + "crawl-" + + System.currentTimeMillis(); + + this.maxDepth = maxDepth; + + this.maxDocs = maxDocs; + + this.seedUrls = new ArrayList(); + + /* default url filter configuration */ + this.urlFilter = new URLFilter(); + urlFilter.setAllowFileUrls(true); + urlFilter.setAllowHttpUrls(true); + + webCrawler = new BasicWebCrawler(rootDir); + + } + + public void addDocSpam() { + + String iWeb2Home = YooreekaConfigurator.getHome(); + + addUrl("file:///" + iWeb2Home + "/data/ch02/spam-biz-01.doc"); + addUrl("file:///" + 
iWeb2Home + "/data/ch02/spam-biz-02.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/spam-biz-03.doc"); + } + + public void addUrl(String val) { + URLNormalizer urlNormalizer = new URLNormalizer(); + seedUrls.add(urlNormalizer.normalizeUrl(val)); + } + + public CrawlData getCrawlData() { + return webCrawler.getCrawlData(); + } + + /** + * @return the maxNumberOfCrawls + */ + public int getMaxNumberOfCrawls() { + return maxDepth; + } + + /** + * @return the maxNumberOfDocsPerCrawl + */ + public int getMaxNumberOfDocsPerCrawl() { + return maxDocs; + } + + /** + * @return the rootDir + */ + public String getRootDir() { + return rootDir; + } + + public List getSeedUrls() { + + return seedUrls; + } + + public void run() { + + webCrawler.addSeedUrls(getSeedUrls()); + + webCrawler.setURLFilter(urlFilter); + + long t0 = System.currentTimeMillis(); + + /* run crawl */ + webCrawler.fetchAndProcess(maxDepth, maxDocs); + + System.out.println("Timer (s): [Crawler processed data] --> " + + (System.currentTimeMillis() - t0) * 0.001); + + } + + public void setAllUrls() { + + setDefaultUrls(); + + String iWeb2Home = YooreekaConfigurator.getHome(); + + // Include the spam pages ... all of them! 
+ addUrl("file:///" + iWeb2Home + "/data/ch02/spam-01.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/spam-biz-01.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/spam-biz-02.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/spam-biz-03.html"); + } + + public void setDefaultUrls() { + + String iWeb2Home = YooreekaConfigurator.getHome(); + + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-01.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-02.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-03.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-04.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-05.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-06.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-07.html"); + + addUrl("file:///" + iWeb2Home + "/data/ch02/sport-01.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/sport-02.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/sport-03.html"); + + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-01.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-02.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-03.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-04.html"); + + addUrl("file:///" + iWeb2Home + "/data/ch02/world-01.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-02.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-03.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-04.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-05.html"); + + setFilesOnlyUrlFilter(); + } + + private void setFilesOnlyUrlFilter() { + /* configure url filter to accept only file:// urls */ + URLFilter urlFilter = new URLFilter(); + urlFilter.setAllowFileUrls(true); + urlFilter.setAllowHttpUrls(false); + setUrlFilter(urlFilter); + } + + /** + * @param maxNumberOfCrawls + * the maxNumberOfCrawls to set + */ + public void setMaxNumberOfCrawls(int maxNumberOfCrawls) { + this.maxDepth = maxNumberOfCrawls; + } + + /** 
+ * @param maxNumberOfDocsPerCrawl + * the maxNumberOfDocsPerCrawl to set + */ + public void setMaxNumberOfDocsPerCrawl(int maxNumberOfDocsPerCrawl) { + this.maxDocs = maxNumberOfDocsPerCrawl; + } + + /** + * @param rootDir + * the rootDir to set + */ + public void setRootDir(String rootDir) { + this.rootDir = rootDir; + } + + public void setUrlFilter(URLFilter urlFilter) { + this.urlFilter = urlFilter; + } + + public void setUrls(String val) { + + String iWeb2Home = YooreekaConfigurator.getHome(); + + setFilesOnlyUrlFilter(); + + this.seedUrls.clear(); + + if (val.equalsIgnoreCase("biz")) { + + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-01.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-02.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-03.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-04.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-05.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-06.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-07.html"); + + } else if (val.equalsIgnoreCase("sport")) { + + addUrl("file:///" + iWeb2Home + "/data/ch02/sport-01.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/sport-02.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/sport-03.html"); + + } else if (val.equalsIgnoreCase("usa")) { + + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-01.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-02.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-03.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-04.html"); + + } else if (val.equalsIgnoreCase("world")) { + + addUrl("file:///" + iWeb2Home + "/data/ch02/world-01.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-02.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-03.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-04.html"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-05.html"); + } else if (val.equalsIgnoreCase("biz-docs")) { + + addUrl("file:///" + 
iWeb2Home + "/data/ch02/biz-01.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-02.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-03.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-04.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-05.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-06.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/biz-07.doc"); + + } else if (val.equalsIgnoreCase("sport-docs")) { + + addUrl("file:///" + iWeb2Home + "/data/ch02/sport-01.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/sport-02.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/sport-03.doc"); + + } else if (val.equalsIgnoreCase("usa-docs")) { + + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-01.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-02.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-03.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/usa-04.doc"); + + } else if (val.equalsIgnoreCase("world-docs")) { + + addUrl("file:///" + iWeb2Home + "/data/ch02/world-01.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-02.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-03.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-04.doc"); + addUrl("file:///" + iWeb2Home + "/data/ch02/world-05.doc"); + } else { + throw new IllegalArgumentException("Unknown value: '" + val + "'"); + } + + } +} diff --git a/src/org/yooreeka/util/internet/crawling/YCrawler.java b/src/org/yooreeka/util/internet/crawling/YCrawler.java new file mode 100644 index 0000000..58f062a --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/YCrawler.java @@ -0,0 +1,197 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with 
the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.internet.crawling; + +import java.util.List; +import java.util.regex.Pattern; + +import edu.uci.ics.crawler4j.crawler.CrawlConfig; +import edu.uci.ics.crawler4j.crawler.CrawlController; +import edu.uci.ics.crawler4j.crawler.Page; +import edu.uci.ics.crawler4j.crawler.WebCrawler; +import edu.uci.ics.crawler4j.fetcher.PageFetcher; +import edu.uci.ics.crawler4j.parser.HtmlParseData; +import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig; +import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer; +import edu.uci.ics.crawler4j.url.WebURL; + +/** + * A general crawler based on the Crawler4J library. 
+ * + * {@link http://code.google.com/p/crawler4j/} + * + */ +public class YCrawler extends WebCrawler { + + // PUBLIC STATIC CONSTANTS + public final static int CONNECTION_TIMEOUT = 5000; + + // PRIVATE STATIC CONSTANTS + private final static Pattern FILTERS = Pattern + .compile(".*(\\.(css|js|bmp|gif|jpe?g" + + "|png|tiff?|mid|mp2|mp3|mp4" + + "|wav|avi|mov|mpeg|ram|m4v" + + "|rm|smil|wmv|swf|wma|zip|rar|gz))$"); + + // INSTANCE VARIABLES + + public static void main(String[] args) throws Exception { + + YCrawler crawler = new YCrawler(); + + // To change the root dir you can invoke setRootDir() here + // before the setup() + CrawlController controller = crawler.setup(); + + /* + * Start the crawl. This is a blocking operation, meaning that your code + * will reach the line after this only when crawling is finished. + */ + controller.start(YCrawler.class, crawler.getNumberOfCrawlers()); + } + + /** + * The location where we will store the fetched data. Note that this is a + * location for all the crawls of this class. If you would like to change it + * use the setRootDir() method. 
+ */ + private String rootDir; + + private int numberOfCrawlers = 5; + + private int connectionTimeout = CONNECTION_TIMEOUT; + + private int getNumberOfCrawlers() { + return numberOfCrawlers; + } + + private String getRootDir() { + + // If the root directory is not set or if its length is zero + if (rootDir == null || rootDir.trim().length() == 0) { + + // Create a default location for storing the data, relative to the + // IWEB2_HOME location + rootDir = System.getProperty("iweb2.home") + + System.getProperty("file.separator") + "data"; + } + + rootDir = rootDir + System.getProperty("file.separator") + "crawl-" + + System.currentTimeMillis(); + + return rootDir; + } + + public void setNumberOfCrawlers(int numberOfCrawlers) { + this.numberOfCrawlers = numberOfCrawlers; + } + + public void setRootDir(String rootDir) { + this.rootDir = rootDir; + } + + private CrawlController setup() { + + CrawlConfig crawlConfiguration = new CrawlConfig(); + crawlConfiguration.setConnectionTimeout(connectionTimeout); + crawlConfiguration.setCrawlStorageFolder(getRootDir()); + crawlConfiguration.setFollowRedirects(true); + crawlConfiguration.setIncludeBinaryContentInCrawling(true); + crawlConfiguration.setIncludeHttpsPages(true); + + // The default value is 100 + crawlConfiguration.setMaxConnectionsPerHost(32); + + // Try 32 Mb; the default is 1 Mb + crawlConfiguration.setMaxDownloadSize(32 * 1024 * 1024); + + // LIMIT THE MAX NUMBER OF PAGES!!! + // Unless you know what you are doing, technically and business wise ... + crawlConfiguration.setMaxPagesToFetch(64); + + PageFetcher pageFetcher = new PageFetcher(crawlConfiguration); + + RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); + + RobotstxtServer robotsTextServer = new RobotstxtServer(robotstxtConfig, + pageFetcher); + + CrawlController controller = null; + try { + controller = new CrawlController(crawlConfiguration, pageFetcher, + robotsTextServer); + } catch (Exception e) { + // TODO proper logging ... 
+ e.printStackTrace(); + } + + /* + * For each crawl, you need to add some seed urls. These are the first + * URLs that are fetched and then the crawler starts following links + * which are found in these pages + */ + controller.addSeed("http://arxiv.org/"); + + return controller; + } + + /** + * You should implement this function to specify whether the given url + * should be crawled or not (based on your crawling logic). + */ + @Override + public boolean shouldVisit(WebURL url) { + String href = url.getURL().toLowerCase(); + return !FILTERS.matcher(href).matches() + && href.startsWith("http://www.ics.uci.edu/"); + } + + /** + * This function is called when a page is fetched and ready to be processed + * by your program. + */ + @Override + public void visit(Page page) { + String url = page.getWebURL().getURL(); + System.out.println("URL: " + url); + + if (page.getParseData() instanceof HtmlParseData) { + HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); + String text = htmlParseData.getText(); + String html = htmlParseData.getHtml(); + List links = htmlParseData.getOutgoingUrls(); + + System.out.println("Text length: " + text.length()); + System.out.println("Html length: " + html.length()); + System.out.println("Number of outgoing links: " + links.size()); + } + } +} \ No newline at end of file diff --git a/src/org/yooreeka/util/internet/crawling/core/BasicWebCrawler.java b/src/org/yooreeka/util/internet/crawling/core/BasicWebCrawler.java new file mode 100644 index 0000000..b320b62 --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/core/BasicWebCrawler.java @@ -0,0 +1,332 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book 
"Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.util.internet.crawling.core; + +import java.util.List; + +import org.yooreeka.util.P; +import org.yooreeka.util.internet.crawling.db.FetchedDocsDB; +import org.yooreeka.util.internet.crawling.db.KnownUrlDB; +import org.yooreeka.util.internet.crawling.db.ProcessedDocsDB; +import org.yooreeka.util.internet.crawling.model.FetchedDocument; +import org.yooreeka.util.internet.crawling.model.KnownUrlEntry; +import org.yooreeka.util.internet.crawling.model.Outlink; +import org.yooreeka.util.internet.crawling.transport.common.Transport; +import org.yooreeka.util.internet.crawling.transport.file.FileTransport; +import org.yooreeka.util.internet.crawling.transport.http.HTTPTransport; +import org.yooreeka.util.internet.crawling.util.DocumentIdUtils; +import org.yooreeka.util.internet.crawling.util.UrlGroup; +import org.yooreeka.util.internet.crawling.util.UrlUtils; +import org.yooreeka.util.parsing.common.AbstractDocument; +import org.yooreeka.util.parsing.common.DocumentParser; +import org.yooreeka.util.parsing.common.DocumentParserFactory; +import org.yooreeka.util.parsing.common.ProcessedDocument; + +public class BasicWebCrawler { + + private CrawlData crawlData; + + private URLFilter urlFilter; + + private static final int DEFAULT_MAX_BATCH_SIZE = 50; + + private long DEFAULT_PAUSE_IN_MILLIS = 500; + private long pauseBetweenFetchesInMillis = DEFAULT_PAUSE_IN_MILLIS; + + /* + * Number of URLs to fetch and parse at a time. + */ + private int maxBatchSize = DEFAULT_MAX_BATCH_SIZE; + + /* + * Number of fetched and parsed URLs so far. 
+ */ + private int processedUrlCount = 0; + + public BasicWebCrawler(String rootDir) { + crawlData = new CrawlData(rootDir); + } + + public void addSeedUrls(List seedUrls) { + int seedUrlDepth = 0; + KnownUrlDB knownUrlsDB = crawlData.getKnownUrlsDB(); + for (String url : seedUrls) { + knownUrlsDB.addNewUrl(url, seedUrlDepth); + } + } + + public void fetchAndProcess(int maxDepth, int maxDocs) { + + boolean maxUrlsLimitReached = false; + int documentGroup = 1; + + crawlData.init(); + + if (maxBatchSize <= 0) { + throw new RuntimeException("Invalid value for maxBatchSize = " + + maxBatchSize); + } + + for (int depth = 0; depth < maxDepth; depth++) { + + int urlsProcessedAtThisDepth = 0; + + boolean noMoreUrlsAtThisDepth = false; + + while (maxUrlsLimitReached == false + && noMoreUrlsAtThisDepth == false) { + + System.out.println("Starting url group: " + documentGroup + + ", current depth: " + depth + ", total known urls: " + + crawlData.getKnownUrlsDB().getTotalUrlCount() + + ", maxDepth: " + maxDepth + ", maxDocs: " + maxDocs + + ", maxDocs per group: " + maxBatchSize + + ", pause between docs: " + + pauseBetweenFetchesInMillis + "(ms)"); + + List urlsToProcess = selectNextBatchOfUrlsToCrawl( + maxBatchSize, depth); + + /* for batch of urls create a separate document group */ + String currentGroupId = String.valueOf(documentGroup); + fetchPages(urlsToProcess, crawlData.getFetchedDocsDB(), + currentGroupId); + + // process downloaded data + processPages(currentGroupId, crawlData.getProcessedDocsDB(), + crawlData.getFetchedDocsDB()); + + // get processed doc, get links, add links to all-known-urls.dat + processLinks(currentGroupId, depth + 1, + crawlData.getProcessedDocsDB()); + + int lastProcessedBatchSize = urlsToProcess.size(); + processedUrlCount += lastProcessedBatchSize; + urlsProcessedAtThisDepth += lastProcessedBatchSize; + + System.out.println("Finished url group: " + documentGroup + + ", urls processed in this group: " + + lastProcessedBatchSize + ", current 
depth: " + depth + + ", total urls processed: " + processedUrlCount); + + documentGroup += 1; + + if (processedUrlCount >= maxDocs) { + maxUrlsLimitReached = true; + } + + if (lastProcessedBatchSize == 0) { + noMoreUrlsAtThisDepth = true; + } + } + + if (urlsProcessedAtThisDepth == 0) { + break; + } + + if (maxUrlsLimitReached) { + break; + } + + } + } + + private void fetchPages(List urls, FetchedDocsDB fetchedDocsDB, + String groupId) { + DocumentIdUtils docIdUtils = new DocumentIdUtils(); + int docSequenceInGroup = 1; + List urlGroups = UrlUtils.groupByProtocolAndHost(urls); + for (UrlGroup urlGroup : urlGroups) { + Transport t = getTransport(urlGroup.getProtocol()); + try { + t.init(); + for (String url : urlGroup.getUrls()) { + try { + FetchedDocument doc = t.fetch(url); + String documentId = docIdUtils.getDocumentId(groupId, + docSequenceInGroup); + doc.setDocumentId(documentId); + fetchedDocsDB.saveDocument(doc); + if (t.pauseRequired()) { + pause(); + } + } catch (Exception e) { + System.out + .println("Failed to fetch document from url: '" + + url + "'.\n" + e.getMessage()); + crawlData.getKnownUrlsDB().updateUrlStatus(url, + KnownUrlEntry.STATUS_PROCESSED_ERROR); + } + docSequenceInGroup++; + } + } finally { + t.clear(); + } + } + } + + public CrawlData getCrawlData() { + return crawlData; + } + + public long getPauseBetweenFetchesInMillis() { + return pauseBetweenFetchesInMillis; + } + + private Transport getTransport(String protocol) { + if ("http".equalsIgnoreCase(protocol)) { + return new HTTPTransport(); + } else if ("file".equalsIgnoreCase(protocol)) { + return new FileTransport(); + } else { + throw new RuntimeException("Unsupported protocol: '" + protocol + + "'."); + } + } + + public URLFilter getURLFilter() { + return urlFilter; + } + + public void pause() { + try { + Thread.sleep(pauseBetweenFetchesInMillis); + } catch (InterruptedException e) { + // do nothing + } + } + + private void processLinks(String groupId, int currentDepth, + 
ProcessedDocsDB parsedDocs) { + URLNormalizer urlNormalizer = new URLNormalizer(); + if (urlFilter == null) { + urlFilter = new URLFilter(); + urlFilter.setAllowFileUrls(true); + urlFilter.setAllowHttpUrls(false); + System.out + .println("Using default URLFilter configuration that only accepts 'file://' urls"); + } + + List docIds = parsedDocs.getDocumentIds(groupId); + for (String documentId : docIds) { + ProcessedDocument doc = parsedDocs.loadDocument(documentId); + // register url without any outlinks first + crawlData.getPageLinkDB().addLink(doc.getDocumentURL()); + List outlinks = doc.getOutlinks(); + for (Outlink outlink : outlinks) { + String url = outlink.getLinkUrl(); + String normalizedUrl = urlNormalizer.normalizeUrl(url); + if (urlFilter.accept(normalizedUrl)) { + crawlData.getKnownUrlsDB().addNewUrl(url, currentDepth); + crawlData.getPageLinkDB() + .addLink(doc.getDocumentURL(), url); + } + } + } + crawlData.getKnownUrlsDB().save(); + crawlData.getPageLinkDB().save(); + } + + private void processPages(String groupId, + ProcessedDocsDB parsedDocsService, FetchedDocsDB fetchedDocsDB) { + + List docIds = fetchedDocsDB.getDocumentIds(groupId); + + for (String id : docIds) { + AbstractDocument doc = null; + try { + doc = fetchedDocsDB.getDocument(id); + String url = doc.getDocumentURL(); + + String contentType = doc.getContentType(); + + DocumentParser docParser = DocumentParserFactory.getInstance() + .getDocumentParser(contentType); + + // DEBBUG + P.println(docParser.toString()); + P.println(doc.toString()); + + ProcessedDocument parsedDoc = docParser.parse(doc); + + parsedDocsService.saveDocument(parsedDoc); + + crawlData.getKnownUrlsDB().updateUrlStatus(url, + KnownUrlEntry.STATUS_PROCESSED_SUCCESS); + + } catch (Exception e) { + + if (doc != null) { + + System.out.println("ERROR:\n"); + System.out + .println("Unexpected exception while processing: '" + + id + "', "); + System.out.println(" URL='" + doc.getDocumentURL() + + "'\n"); + 
System.out.println("Exception message: " + e.getMessage()); + + } else { + System.out.println("ERROR:\n"); + System.out + .println("Unexpected exception while processing: '" + + id + "', "); + System.out.println("Exception message: " + e.getMessage()); + } + } + } + } + + private List selectNextBatchOfUrlsToCrawl(int maxBatchSize, + int depth) { + return crawlData.getKnownUrlsDB().findUnprocessedUrls(maxBatchSize, + depth); + } + + /** + * @deprecated use method that uses depth + * + * @param maxDocs + * @return + */ + @Deprecated + public List selectURLsForNextCrawl(int maxDocs) { + return crawlData.getKnownUrlsDB().findUnprocessedUrls(maxDocs); + } + + public void setPauseBetweenFetchesInMillis(long pauseBetweenFetchesInMillis) { + this.pauseBetweenFetchesInMillis = pauseBetweenFetchesInMillis; + } + + public void setURLFilter(URLFilter urlFilter) { + this.urlFilter = urlFilter; + } +} diff --git a/src/org/yooreeka/util/internet/crawling/core/CrawlData.java b/src/org/yooreeka/util/internet/crawling/core/CrawlData.java new file mode 100644 index 0000000..c35ca61 --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/core/CrawlData.java @@ -0,0 +1,99 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.internet.crawling.core; + +import java.io.File; + +import org.yooreeka.util.internet.crawling.db.FetchedDocsDB; +import org.yooreeka.util.internet.crawling.db.KnownUrlDB; +import org.yooreeka.util.internet.crawling.db.PageLinkDB; +import org.yooreeka.util.internet.crawling.db.ProcessedDocsDB; + +public class CrawlData { + + private File crawlRootDir; + + private FetchedDocsDB fetchedDocsDB; + private ProcessedDocsDB processedDocsDB; + private KnownUrlDB knownUrlsDB; + private PageLinkDB pageLinkDB; + + public CrawlData(String rootDir) { + this.crawlRootDir = new File(rootDir); + crawlRootDir.mkdirs(); + + File fetchedDocsDBRoot = new File(crawlRootDir, "fetched"); + this.fetchedDocsDB = new FetchedDocsDB(fetchedDocsDBRoot); + + File processedDocsDBRoot = new File(crawlRootDir, "processed"); + this.processedDocsDB = new ProcessedDocsDB(processedDocsDBRoot); + + File knownUrlsDBRoot = new File(crawlRootDir, "knownurls"); + this.knownUrlsDB = new KnownUrlDB(knownUrlsDBRoot); + + File pageLinkDBRoot = new File(crawlRootDir, "pagelinks"); + this.pageLinkDB = new PageLinkDB(pageLinkDBRoot); + } 
+ + public void delete() { + this.fetchedDocsDB.delete(); + this.processedDocsDB.delete(); + this.knownUrlsDB.delete(); + this.pageLinkDB.delete(); + } + + public File getCrawlRootDir() { + return crawlRootDir; + } + + public FetchedDocsDB getFetchedDocsDB() { + return fetchedDocsDB; + } + + public KnownUrlDB getKnownUrlsDB() { + return knownUrlsDB; + } + + public PageLinkDB getPageLinkDB() { + return pageLinkDB; + } + + public ProcessedDocsDB getProcessedDocsDB() { + return processedDocsDB; + } + + public void init() { + this.fetchedDocsDB.init(); + this.processedDocsDB.init(); + this.knownUrlsDB.init(); + this.pageLinkDB.init(); + } +} diff --git a/src/org/yooreeka/util/internet/crawling/core/CrawlDataProcessor.java b/src/org/yooreeka/util/internet/crawling/core/CrawlDataProcessor.java new file mode 100644 index 0000000..3ad785f --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/core/CrawlDataProcessor.java @@ -0,0 +1,46 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
/**
 * Represents module that performs processing based on crawling results.
 * <p>
 * Some module examples are:
 * </p>
 * <ul>
 * <li>Build Lucene index</li>
 * <li>Build matrix H for html pages</li>
 * <li>Build matrix H for documents</li>
 * </ul>
 */
public interface CrawlDataProcessor {

    /** Executes the processing step of this module. */
    void run();
}
+ * + */ +package org.yooreeka.util.internet.crawling.core; + +import org.yooreeka.util.internet.crawling.model.FetchedDocument; + +public class DocumentFilter { + + /* + * Supposed to detect if we've already processed document with the same + * content through some other url. + */ + public boolean duplicateContentExists(FetchedDocument doc) { + return false; + } +} diff --git a/src/org/yooreeka/util/internet/crawling/core/URLFilter.java b/src/org/yooreeka/util/internet/crawling/core/URLFilter.java new file mode 100644 index 0000000..5ed6ad0 --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/core/URLFilter.java @@ -0,0 +1,79 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Performs url filtering before url is registered in 'known urls' database.
 */
public class URLFilter {

    private static final String FILE_SCHEME = "file:";
    private static final String HTTP_SCHEME = "http:";

    private boolean allowFileUrls = true;
    private boolean allowHttpUrls = true;

    public URLFilter() {
        // empty
    }

    /**
     * Basic implementation of url filter. Only allows urls that start with
     * 'http:' and 'file:' (the scheme is compared case-insensitively), each of
     * which can be switched off individually. A <code>null</code> url is
     * rejected. Every rejected url is reported on standard output.
     *
     * <p>
     * Other features that can be added are:
     * </p>
     * <ul>
     * <li>extract host from the url and check against robots.txt</li>
     * <li>check against the list of excluded urls</li>
     * <li>user defined criteria</li>
     * </ul>
     *
     * @param url
     *            the url to check, may be <code>null</code>
     * @return <code>true</code> if the url may be registered
     */
    public boolean accept(String url) {
        boolean acceptUrl = url != null
                && ((allowFileUrls && hasScheme(url, FILE_SCHEME))
                        || (allowHttpUrls && hasScheme(url, HTTP_SCHEME)));

        if (!acceptUrl) {
            System.out.println("DEBUG: Filtered url: '" + url + "'");
        }

        return acceptUrl;
    }

    /*
     * Case-insensitive startsWith(); URL schemes are not case-sensitive.
     */
    private static boolean hasScheme(String url, String scheme) {
        return url.regionMatches(true, 0, scheme, 0, scheme.length());
    }

    public void setAllowFileUrls(boolean flag) {
        this.allowFileUrls = flag;
    }

    public void setAllowHttpUrls(boolean flag) {
        this.allowHttpUrls = flag;
    }
}
/**
 * Performs url normalization.
 */
public class URLNormalizer {

    public URLNormalizer() {
        // empty
    }

    /*
     * Round-trips a file url through java.net.URL and returns its external
     * form. A url that cannot be parsed is reported as a RuntimeException that
     * names the url and carries the parsing failure as its cause.
     */
    private String normalizeFileUrl(String fileUrl) {
        try {
            java.net.URL url = new java.net.URL(fileUrl);
            return url.toExternalForm();
        } catch (java.net.MalformedURLException e) {
            throw new RuntimeException("URL Normalization error: '" + fileUrl
                    + "'", e);
        }
    }

    /**
     * Minimal implementation: urls that start with 'file://' are rewritten to
     * the external form produced by {@link java.net.URL}; every other url,
     * including <code>null</code>, is returned unchanged.
     *
     * <p>
     * Other features that can be added are:
     * </p>
     * <ul>
     * <li>convert IP address into DNS name</li>
     * <li>lower-case DNS name</li>
     * <li>extract session id from the URL</li>
     * <li>process escape sequences</li>
     * <li>remove default port</li>
     * <li>remove fragment portion from the url</li>
     * <li>sort variables</li>
     * <li>...and a lot more</li>
     * </ul>
     *
     * @param url
     *            the url to normalize, may be <code>null</code>
     * @return the normalized url
     * @throws RuntimeException
     *             if a 'file://' url cannot be parsed
     */
    public String normalizeUrl(String url) {
        if (url == null || !url.startsWith("file://")) {
            return url;
        }
        return normalizeFileUrl(url);
    }
}
See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.internet.crawling.db; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileFilter; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.yooreeka.util.internet.crawling.model.FetchedDocument; +import org.yooreeka.util.internet.crawling.util.DocumentIdUtils; +import org.yooreeka.util.internet.crawling.util.FileUtils; + +public class FetchedDocsDB { + + private File rootDirFile = null; + private Map groupFiles = null; + private DocumentIdUtils docIdUtils = new DocumentIdUtils(); + + public FetchedDocsDB(File rootDirFile) { + this.rootDirFile = rootDirFile; + } + + /* + * Creates directories for a new group if they don't exist yet. 
+     */
+    private void createGroup(String groupId) {
+        File groupFile = groupFiles.get(groupId);
+        if (groupFile == null) {
+            groupFile = new File(rootDirFile, String.valueOf(groupId));
+            groupFile.mkdir();
+            groupFiles.put(groupFile.getName(), groupFile);
+        }
+    }
+
+    /** Removes the root directory of this store by delegating to FileUtils.deleteDir. */
+    public void delete() {
+        FileUtils.deleteDir(rootDirFile);
+    }
+
+    /** Extension of the per-document metadata file. */
+    private String geFetchedFilePropertiesExt() {
+        return ".meta";
+    }
+
+    /** Returns the ids of all registered groups in natural String order. */
+    public List<String> getAllGroupIds() {
+        List<String> groupIds = new ArrayList<String>(groupFiles.keySet());
+        Collections.sort(groupIds);
+        return groupIds;
+    }
+
+    private File getDataFile(String documentId) {
+        return getDocumentFile(documentId, getFetchedFileExt());
+    }
+
+    /**
+     * Loads a previously saved document: its raw bytes plus the values stored
+     * in its metadata file. The document id has the group id encoded in it.
+     *
+     * @param documentId id of the document to load
+     * @return the populated document
+     * @throws RuntimeException if the data file or the metadata file is
+     *             missing or cannot be read
+     */
+    public FetchedDocument getDocument(String documentId) {
+        File dataFile = getDataFile(documentId);
+        if (!dataFile.exists()) {
+            throw new RuntimeException("Document with id: '" + documentId
+                    + "' doesn't exist.");
+        }
+        FetchedDocument doc = new FetchedDocument();
+        doc.setDocumentId(documentId);
+        doc.setDocumentContent(readData(dataFile));
+
+        File propsFile = getPropertiesFile(documentId);
+        if (!propsFile.exists()) {
+            throw new RuntimeException("Properties for document with id: '"
+                    + documentId + "' don't exist.");
+        }
+        readMetaData(propsFile, doc);
+
+        return doc;
+    }
+
+    /** Resolves rootDir/groupId/sequence + ext for the given document id. */
+    private File getDocumentFile(String documentId, String ext) {
+        String groupId = docIdUtils.getDocumentGroupId(documentId);
+        File docDirFile = new File(rootDirFile, groupId);
+        String docFilename = docIdUtils.getDocumentSequence(documentId) + ext;
+        return new File(docDirFile, docFilename);
+    }
+
+    /** Returns the ids of the documents found in every registered group. */
+    public List<String> getDocumentIds() {
+        List<String> documentIds = new ArrayList<String>();
+        for (File setFile : groupFiles.values()) {
+            documentIds.addAll(getDocumentIds(setFile));
+        }
+        return documentIds;
+    }
+
+    /**
+     * Lists the documents stored in one group directory. Returns an empty
+     * list when the directory cannot be listed.
+     */
+    private List<String> getDocumentIds(File setFile) {
+        final String ext = getFetchedFileExt();
+        File[] dataFiles = setFile.listFiles(new FilenameFilter() {
+            public boolean accept(File dir, String name) {
+                return name.endsWith(ext);
+            }
+        });
+
+        List<String> documentIds = new ArrayList<String>();
+        if (dataFiles == null) {
+            // listFiles() yields null when setFile is not a readable directory
+            return documentIds;
+        }
+        String groupId = setFile.getName();
+        for (File f : dataFiles) {
+            String name = f.getName();
+            String itemId = name.substring(0, name.indexOf("."));
+            documentIds.add(docIdUtils.getDocumentId(groupId, itemId));
+        }
+        return documentIds;
+    }
+
+    public List<String> getDocumentIds(String groupId) {
+        return getDocumentIds(new File(rootDirFile, groupId));
+    }
+
+    /** Extension of the file that holds the raw document bytes. */
+    private String getFetchedFileExt() {
+        return ".fetched";
+    }
+
+    private File getPropertiesFile(String documentId) {
+        return getDocumentFile(documentId, geFetchedFilePropertiesExt());
+    }
+
+    /** Initializes the store, keeping whatever is already on disk. */
+    public void init() {
+        init(true);
+    }
+
+    /**
+     * Prepares the root directory and rebuilds the group registry.
+     *
+     * @param keepExistingData when true, existing group directories are
+     *            registered; when false, the root directory is wiped first
+     * @throws RuntimeException if the existing root directory cannot be listed
+     */
+    private void init(boolean keepExistingData) {
+        groupFiles = new HashMap<String, File>();
+        if (!rootDirFile.exists()) {
+            rootDirFile.mkdirs();
+            return;
+        }
+        if (!keepExistingData) {
+            /* Delete all existing data and start from an empty directory. */
+            FileUtils.deleteDir(rootDirFile);
+            rootDirFile.mkdirs();
+            return;
+        }
+        /* Load information about existing groups */
+        File[] existingFileGroups = rootDirFile.listFiles(new FileFilter() {
+            public boolean accept(File f) {
+                return f.isDirectory();
+            }
+        });
+        if (existingFileGroups == null) {
+            // null means "not a directory" or an I/O error, not "no groups"
+            throw new RuntimeException("Can't list directory: '"
+                    + rootDirFile.getAbsolutePath() + "'");
+        }
+        for (File groupDirFile : existingFileGroups) {
+            groupFiles.put(groupDirFile.getName(), groupDirFile);
+        }
+    }
+
+    /**
+     * Reads the whole file into memory.
+     *
+     * @throws RuntimeException if the file cannot be read completely
+     */
+    private byte[] readData(File f) {
+        byte[] data = new byte[(int) f.length()];
+        try {
+            BufferedInputStream in = new BufferedInputStream(
+                    new FileInputStream(f));
+            try {
+                // a single read() call is allowed to return fewer bytes than
+                // requested, so keep reading until the buffer is full
+                int offset = 0;
+                while (offset < data.length) {
+                    int count = in.read(data, offset, data.length - offset);
+                    if (count < 0) {
+                        throw new IOException("Unexpected end of file after "
+                                + offset + " of " + data.length + " bytes");
+                    }
+                    offset += count;
+                }
+            } finally {
+                in.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Error while reading file: '"
+                    + f.getAbsolutePath() + "'", e);
+        }
+        return data;
+    }
+
+    /**
+     * Parses the "key:value" lines of a metadata file into the document.
+     * The keys url, Content-Type and Charset (case-insensitive) populate the
+     * matching document fields, host is ignored, and every other key goes
+     * into the metadata map. Empty lines and lines without a ':' are skipped.
+     */
+    private void readMetaData(File f, FetchedDocument doc) {
+        try {
+            FileInputStream fis = new FileInputStream(f);
+            try {
+                BufferedReader reader = new BufferedReader(
+                        new InputStreamReader(fis, "UTF-8"));
+                Map<String, String> metadata = new HashMap<String, String>();
+                String line = null;
+                while ((line = reader.readLine()) != null) {
+                    if (line.length() == 0) {
+                        continue;
+                    }
+
+                    String[] values = line.split(":", 2);
+                    if (values.length < 2) {
+                        // no separator on this line, so there is no value to read
+                        continue;
+                    }
+                    String key = values[0];
+                    String value = values[1];
+                    if ("url".equalsIgnoreCase(key)) {
+                        doc.setDocumentURL(value);
+                    } else if ("host".equalsIgnoreCase(key)) {
+                        // skip, do nothing
+                    } else if ("Content-Type".equalsIgnoreCase(key)) {
+                        doc.setContentType(value);
+                    } else if ("Charset".equalsIgnoreCase(key)) {
+                        doc.setContentCharset(value);
+                    } else {
+                        metadata.put(key, value);
+                    }
+                }
+                doc.setDocumentMetadata(metadata);
+            } finally {
+                fis.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(
+                    "Error while reading metadata from file: '"
+                            + f.getAbsolutePath() + "'", e);
+        }
+
+    }
+
+    /** Writes the bytes to the file, replacing any previous content. */
+    private void saveContent(File f, byte[] content) {
+        try {
+            FileOutputStream fout = new FileOutputStream(f);
+            try {
+                BufferedOutputStream bout = new BufferedOutputStream(fout);
+                bout.write(content);
+                bout.flush();
+            } finally {
+                fout.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Persists the document content and its metadata, creating the group
+     * directory on first use.
+     */
+    public void saveDocument(FetchedDocument doc) {
+        /* create directory for current group if it doesn't exist yet. */
+        String groupId = docIdUtils.getDocumentGroupId(doc.getDocumentId());
+        createGroup(groupId);
+
+        File dataFile = getDataFile(doc.getDocumentId());
+        saveContent(dataFile, doc.getDocumentContent());
+
+        File metadataFile = getPropertiesFile(doc.getDocumentId());
+        saveMetadata(metadataFile, doc);
+    }
+
+    /**
+     * Writes url, Content-Type, Charset and all metadata entries as
+     * "key:value" lines. A document without a metadata map is written with
+     * the three fixed properties only.
+     */
+    private void saveMetadata(File f, FetchedDocument doc) {
+        try {
+            FileOutputStream fout = new FileOutputStream(f);
+            try {
+                BufferedWriter bw = new BufferedWriter(
+                        new OutputStreamWriter(fout, "UTF-8"));
+
+                writeProperty(bw, "url", doc.getDocumentURL());
+                writeProperty(bw, "Content-Type", doc.getContentType());
+                writeProperty(bw, "Charset", doc.getContentCharset());
+
+                Map<String, String> metadata = doc.getDocumentMetadata();
+                if (metadata != null) {
+                    for (Map.Entry<String, String> entry : metadata.entrySet()) {
+                        writeProperty(bw, entry.getKey(), entry.getValue());
+                    }
+                }
+                bw.flush();
+            } finally {
+                fout.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Writes one "key:value" line; a null value is written as empty. */
+    private void writeProperty(BufferedWriter w, String key, String value)
+            throws IOException {
+        w.write(key);
+        w.write(":");
+        if (value != null) {
+            w.write(value);
+        }
+        w.newLine();
+    }
+}
diff --git a/src/org/yooreeka/util/internet/crawling/db/KnownUrlDB.java b/src/org/yooreeka/util/internet/crawling/db/KnownUrlDB.java
new file mode 100644
index 0000000..29a4b4a
--- /dev/null
+++ b/src/org/yooreeka/util/internet/crawling/db/KnownUrlDB.java
@@ -0,0 +1,279 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.util.internet.crawling.db;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.yooreeka.util.internet.crawling.model.KnownUrlEntry;
+import org.yooreeka.util.internet.crawling.util.FileUtils;
+
+/**
+ * In-memory registry of known urls, split into processed and unprocessed
+ * entries and persisted to a text file with one "status|depth|url" record
+ * per line. Call {@link #init()} before use and {@link #save()} to persist.
+ */
+public class KnownUrlDB {
+
+    private static final String DB_FILENAME = "knownurlsdb.dat";
+
+    private static final String FIELD_DELIMITER = "|";
+
+    private Map<String, KnownUrlEntry> processedURLs = new HashMap<String, KnownUrlEntry>();
+
+    private Map<String, KnownUrlEntry> unprocessedURLs = new HashMap<String, KnownUrlEntry>();
+
+    private File rootDir = null;
+    private File dbFile = null;
+
+    /**
+     * @param f directory that holds (or will hold) the db file
+     */
+    public KnownUrlDB(File f) {
+        this.rootDir = f;
+    }
+
+    /**
+     * Registers the url as unprocessed unless it is already known.
+     *
+     * @return true if the url was added, false if it was already known
+     */
+    public boolean addNewUrl(String url, int depth) {
+        if (isKnownUrl(url)) {
+            return false;
+        }
+        KnownUrlEntry r = new KnownUrlEntry();
+        r.setUrl(url);
+        r.setStatus(KnownUrlEntry.STATUS_UNPROCESSED);
+        r.setDepth(depth);
+        unprocessedURLs.put(url, r);
+        return true;
+    }
+
+    /** Removes the root directory by delegating to FileUtils.deleteDir. */
+    public void delete() {
+        FileUtils.deleteDir(rootDir);
+    }
+
+    /** Returns all urls, unprocessed ones first. */
+    public List<String> findAllKnownUrls() {
+        List<String> allUrls = new ArrayList<String>();
+        allUrls.addAll(unprocessedURLs.keySet());
+        allUrls.addAll(processedURLs.keySet());
+        return allUrls;
+    }
+
+    /**
+     * Returns the processed urls whose status equals the given one, ignoring
+     * case. A null status matches nothing.
+     */
+    public List<String> findProcessedUrls(String status) {
+        List<String> selectedUrls = new ArrayList<String>();
+        if (status == null) {
+            return selectedUrls;
+        }
+        for (KnownUrlEntry urlEntry : processedURLs.values()) {
+            if (status.equalsIgnoreCase(urlEntry.getStatus())) {
+                selectedUrls.add(urlEntry.getUrl());
+            }
+        }
+        return selectedUrls;
+    }
+
+    public List<String> findUnprocessedUrls() {
+        return new ArrayList<String>(unprocessedURLs.keySet());
+    }
+
+    /**
+     * @deprecated will be removed. Use {@link #findUnprocessedUrls(int, int)}
+     *             instead.
+     *
+     * @param maxDocs maximum number of urls to return
+     * @return up to maxDocs unprocessed urls recorded at depth 0
+     */
+    @Deprecated
+    public List<String> findUnprocessedUrls(int maxDocs) {
+        return findUnprocessedUrls(maxDocs, 0);
+    }
+
+    /**
+     * Returns up to maxDocs unprocessed urls that were recorded at exactly
+     * the given depth. Entries come from a HashMap, so the order is
+     * unspecified.
+     */
+    public List<String> findUnprocessedUrls(int maxDocs, int depth) {
+        List<String> selectedUrls = new ArrayList<String>();
+
+        for (KnownUrlEntry ku : unprocessedURLs.values()) {
+            if (selectedUrls.size() >= maxDocs) {
+                break;
+            }
+            if (ku.getDepth() == depth) {
+                selectedUrls.add(ku.getUrl());
+            }
+        }
+
+        return selectedUrls;
+    }
+
+    public int getTotalUrlCount() {
+        return unprocessedURLs.size() + processedURLs.size();
+    }
+
+    /**
+     * Creates the root directory and the db file when missing, then loads
+     * all records from the db file.
+     *
+     * @throws RuntimeException if the db file cannot be created or loaded
+     */
+    public void init() {
+        rootDir.mkdirs();
+
+        this.dbFile = new File(rootDir, DB_FILENAME);
+        try {
+            // creates a new file if the file doesn't exist
+            dbFile.createNewFile();
+        } catch (IOException e) {
+            throw new RuntimeException("Can't create db file: '"
+                    + dbFile.getAbsolutePath() + "'.", e);
+        }
+
+        load();
+    }
+
+    public boolean inProcessedUrl(String url) {
+        return processedURLs.containsKey(url);
+    }
+
+    public boolean inUnprocessedUrl(String url) {
+        return unprocessedURLs.containsKey(url);
+    }
+
+    public boolean isKnownUrl(String url) {
+        return processedURLs.containsKey(url)
+                || unprocessedURLs.containsKey(url);
+    }
+
+    /** True for the two statuses that belong in the processed map. */
+    private boolean isProcessedStatus(String status) {
+        return KnownUrlEntry.STATUS_PROCESSED_SUCCESS.equalsIgnoreCase(status)
+                || KnownUrlEntry.STATUS_PROCESSED_ERROR
+                        .equalsIgnoreCase(status);
+    }
+
+    public boolean isSuccessfullyProcessed(String url) {
+        KnownUrlEntry r = processedURLs.get(url);
+        return r != null
+                && KnownUrlEntry.STATUS_PROCESSED_SUCCESS.equalsIgnoreCase(r
+                        .getStatus());
+    }
+
+    /**
+     * Reads every "status|depth|url" record of the db file. Empty lines are
+     * skipped; a record with missing fields or a non-numeric depth is
+     * reported with the offending line instead of an index/parse error.
+     */
+    private void load() {
+        try {
+            FileInputStream fis = new FileInputStream(dbFile);
+            try {
+                BufferedReader br = new BufferedReader(new InputStreamReader(
+                        fis, "UTF-8"));
+                String line = null;
+                while ((line = br.readLine()) != null) {
+                    if (line.length() == 0) {
+                        continue;
+                    }
+                    int delimiterIndex = line.indexOf(FIELD_DELIMITER);
+                    int secondDelimiterIndex = delimiterIndex < 0 ? -1 : line
+                            .indexOf(FIELD_DELIMITER, delimiterIndex
+                                    + FIELD_DELIMITER.length());
+                    if (secondDelimiterIndex < 0) {
+                        throw new RuntimeException("Malformed record in '"
+                                + dbFile.getAbsolutePath() + "': '" + line
+                                + "'");
+                    }
+                    String status = line.substring(0, delimiterIndex);
+                    int depth;
+                    try {
+                        depth = Integer.parseInt(line.substring(delimiterIndex
+                                + FIELD_DELIMITER.length(),
+                                secondDelimiterIndex));
+                    } catch (NumberFormatException e) {
+                        throw new RuntimeException("Invalid depth in record: '"
+                                + line + "'", e);
+                    }
+                    // the url is the last field, so it may itself contain '|'
+                    String url = line.substring(secondDelimiterIndex
+                            + FIELD_DELIMITER.length());
+                    loadUrl(url, status, depth);
+                }
+            } finally {
+                fis.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to load data: ", e);
+        }
+    }
+
+    /**
+     * Adds one loaded record to the map that matches its status.
+     *
+     * @throws RuntimeException on a duplicate url or an unsupported status
+     */
+    private void loadUrl(String url, String status, int depth) {
+        if (isKnownUrl(url)) {
+            throw new RuntimeException("Duplicate url: '" + url + "'");
+        }
+        KnownUrlEntry r = new KnownUrlEntry();
+        r.setUrl(url);
+        r.setStatus(status);
+        r.setDepth(depth);
+        if (isProcessedStatus(status)) {
+            processedURLs.put(url, r);
+        } else if (KnownUrlEntry.STATUS_UNPROCESSED.equalsIgnoreCase(status)) {
+            unprocessedURLs.put(url, r);
+        } else {
+            throw new RuntimeException("Unsupported status value: '"
+                    + status + "', url: '" + url + "'.");
+        }
+    }
+
+    /**
+     * Rewrites the db file with all entries, unprocessed ones first.
+     *
+     * @throws RuntimeException if the file cannot be written
+     */
+    public void save() {
+        try {
+            FileOutputStream fos = new FileOutputStream(dbFile);
+            try {
+                BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
+                        fos, "UTF-8"));
+                for (KnownUrlEntry r : unprocessedURLs.values()) {
+                    writeRecord(bw, r);
+                }
+                for (KnownUrlEntry r : processedURLs.values()) {
+                    writeRecord(bw, r);
+                }
+                bw.flush();
+            } finally {
+                fos.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to save data: ", e);
+        }
+    }
+
+    /**
+     * Changes the status of a known url and moves its entry to the map that
+     * matches the new status. The url may currently be in either map, so
+     * e.g. an "error" entry can be re-marked as "processed". A status that is
+     * none of the KnownUrlEntry constants is ignored, as before.
+     *
+     * @throws RuntimeException if the url is not known
+     */
+    public void updateUrlStatus(String url, String status) {
+        Map<String, KnownUrlEntry> target;
+        if (isProcessedStatus(status)) {
+            target = processedURLs;
+        } else if (KnownUrlEntry.STATUS_UNPROCESSED.equalsIgnoreCase(status)) {
+            target = unprocessedURLs;
+        } else {
+            return;
+        }
+
+        KnownUrlEntry r = unprocessedURLs.remove(url);
+        if (r == null) {
+            r = processedURLs.remove(url);
+        }
+        if (r == null) {
+            throw new RuntimeException("Unknown url: '" + url + "'");
+        }
+        r.setStatus(status);
+        target.put(url, r);
+    }
+
+    /** Writes one "status|depth|url" line. */
+    private void writeRecord(BufferedWriter w, KnownUrlEntry ku)
+            throws IOException {
+        w.write(ku.getStatus() + FIELD_DELIMITER
+                + String.valueOf(ku.getDepth()) + FIELD_DELIMITER + ku.getUrl());
+        w.newLine();
+    }
+
+}
diff --git a/src/org/yooreeka/util/internet/crawling/db/PageLinkDB.java b/src/org/yooreeka/util/internet/crawling/db/PageLinkDB.java
new file mode 100644
index 0000000..fce1cd0
--- /dev/null
+++ b/src/org/yooreeka/util/internet/crawling/db/PageLinkDB.java
@@ -0,0 +1,163 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.util.internet.crawling.db;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.yooreeka.util.internet.crawling.util.FileUtils;
+
+/**
+ * Keeps the outgoing and incoming links of every registered page in memory
+ * and persists the outgoing links to a text file: a "page|url" record
+ * followed by one "outlink|url" record per outgoing link of that page.
+ * Incoming links are rebuilt from the outgoing ones when the file is loaded.
+ */
+public class PageLinkDB {
+    private static final String DB_FILENAME = "pagelinkdb.dat";
+
+    private static final String FIELD_DELIMITER = "|";
+
+    private Map<String, Set<String>> pageOutLinks = new HashMap<String, Set<String>>();
+    private Map<String, Set<String>> pageInLinks = new HashMap<String, Set<String>>();
+
+    private File rootDir = null;
+    private File dbFile = null;
+
+    /**
+     * @param f directory that holds (or will hold) the db file
+     */
+    public PageLinkDB(File f) {
+        this.rootDir = f;
+    }
+
+    /** Registers a page without adding any outgoing link to it. */
+    public void addLink(String pageUrl) {
+        Set<String> outlinks = pageOutLinks.get(pageUrl);
+        if (outlinks == null) {
+            outlinks = new TreeSet<String>();
+            pageOutLinks.put(pageUrl, outlinks);
+        }
+    }
+
+    /**
+     * Records a link from pageUrl to outlinkUrl in both directions:
+     * as an outgoing link of pageUrl and an incoming link of outlinkUrl.
+     */
+    public void addLink(String pageUrl, String outlinkUrl) {
+        Set<String> outLinks = pageOutLinks.get(pageUrl);
+        if (outLinks == null) {
+            outLinks = new TreeSet<String>();
+            pageOutLinks.put(pageUrl, outLinks);
+        }
+        outLinks.add(outlinkUrl);
+
+        Set<String> inLinks = pageInLinks.get(outlinkUrl);
+        if (inLinks == null) {
+            inLinks = new TreeSet<String>();
+            pageInLinks.put(outlinkUrl, inLinks);
+        }
+        inLinks.add(pageUrl);
+    }
+
+    /** Removes the root directory by delegating to FileUtils.deleteDir. */
+    public void delete() {
+        FileUtils.deleteDir(rootDir);
+    }
+
+    /**
+     * Returns the pages that link to the url. For a url with recorded
+     * incoming links this is the internal set; otherwise a new empty set.
+     */
+    public Set<String> getInlinks(String url) {
+        Set<String> result = pageInLinks.get(url);
+        return result != null ? result : new TreeSet<String>();
+    }
+
+    /**
+     * Returns the urls the page links to. For a registered page this is the
+     * internal set; otherwise a new empty set.
+     */
+    public Set<String> getOutlinks(String url) {
+        Set<String> result = pageOutLinks.get(url);
+        return result != null ? result : new TreeSet<String>();
+    }
+
+    /**
+     * Creates the root directory and the db file when missing, then loads
+     * all records from the db file.
+     *
+     * @throws RuntimeException if the db file cannot be created or loaded
+     */
+    public void init() {
+        rootDir.mkdirs();
+
+        this.dbFile = new File(rootDir, DB_FILENAME);
+        try {
+            // creates a new file if the file doesn't exist
+            dbFile.createNewFile();
+        } catch (IOException e) {
+            throw new RuntimeException("Can't create db file: '"
+                    + dbFile.getAbsolutePath() + "'.", e);
+        }
+
+        load();
+    }
+
+    /**
+     * Rebuilds both link maps from the db file. Every "page" record registers
+     * the page, so pages saved without outgoing links survive a reload; any
+     * other record type is treated as an outgoing link of the most recent
+     * page. Empty lines are skipped.
+     *
+     * @throws RuntimeException if a record has no delimiter, if a link record
+     *             appears before the first page record, or on an I/O error
+     */
+    private void load() {
+        try {
+            FileInputStream fis = new FileInputStream(dbFile);
+            try {
+                BufferedReader br = new BufferedReader(new InputStreamReader(
+                        fis, "UTF-8"));
+                String line = null;
+                String currentPage = null;
+                while ((line = br.readLine()) != null) {
+                    if (line.length() == 0) {
+                        continue;
+                    }
+                    int delimiterIndex = line.indexOf(FIELD_DELIMITER);
+                    if (delimiterIndex < 0) {
+                        throw new RuntimeException("Malformed record in '"
+                                + dbFile.getAbsolutePath() + "': '" + line
+                                + "'");
+                    }
+                    String type = line.substring(0, delimiterIndex);
+                    String value = line.substring(delimiterIndex
+                            + FIELD_DELIMITER.length());
+                    if ("page".equalsIgnoreCase(type)) {
+                        currentPage = value;
+                        // save() writes pages that have no outlinks as well
+                        addLink(currentPage);
+                    } else {
+                        if (currentPage == null) {
+                            throw new RuntimeException(
+                                    "Link record without a preceding page record: '"
+                                            + line + "'");
+                        }
+                        addLink(currentPage, value);
+                    }
+                }
+            } finally {
+                fis.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to load data: ", e);
+        }
+    }
+
+    /**
+     * Rewrites the db file from the outgoing-link map.
+     *
+     * @throws RuntimeException if the file cannot be written
+     */
+    public void save() {
+        try {
+            FileOutputStream fos = new FileOutputStream(dbFile);
+            try {
+                BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
+                        fos, "UTF-8"));
+                for (Map.Entry<String, Set<String>> mapEntry : pageOutLinks
+                        .entrySet()) {
+                    writeRecord(bw, "page", mapEntry.getKey());
+                    for (String outlink : mapEntry.getValue()) {
+                        writeRecord(bw, "outlink", outlink);
+                    }
+                }
+                bw.flush();
+            } finally {
+                fos.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to save data: ", e);
+        }
+    }
+
+    /** Writes one "id|value" line. */
+    private void writeRecord(BufferedWriter w, String id, String value)
+            throws IOException {
+        w.write(id + FIELD_DELIMITER + value);
+        w.newLine();
+    }
+}
diff --git a/src/org/yooreeka/util/internet/crawling/db/ProcessedDocsDB.java b/src/org/yooreeka/util/internet/crawling/db/ProcessedDocsDB.java
new file mode 100644
index
0000000..6424090
--- /dev/null
+++ b/src/org/yooreeka/util/internet/crawling/db/ProcessedDocsDB.java
@@ -0,0 +1,413 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.util.internet.crawling.db;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.yooreeka.util.internet.crawling.model.Outlink;
+import org.yooreeka.util.internet.crawling.util.DocumentIdUtils;
+import org.yooreeka.util.internet.crawling.util.FileUtils;
+import org.yooreeka.util.parsing.common.ProcessedDocument;
+
+/**
+ * File-based store for processed documents. Each group gets a directory under
+ * the root with one sub-directory per {@link FileType}; every document is
+ * stored as four files (content, text, properties, outlinks) named after its
+ * sequence. Call {@link #init()} before any other method: the group registry
+ * is only created there.
+ */
+public class ProcessedDocsDB {
+
+    /** The kinds of file kept per document: file extension and sub-directory. */
+    private enum FileType {
+        CONTENT(".content", "content"),
+        TXT(".txt", "txt"),
+        PROPERTIES(".properties", "properties"),
+        OUTLINKS(".outlinks", "outlinks");
+
+        private final String ext;
+        private final String dir;
+
+        FileType(String ext, String dir) {
+            this.ext = ext;
+            this.dir = dir;
+        }
+
+        public String getDir() {
+            return dir;
+        }
+
+        public String getExt() {
+            return ext;
+        }
+    }
+
+    private File rootDirFile = null;
+    private Map<String, File> groupFiles = null;
+
+    private DocumentIdUtils docIdUtils = new DocumentIdUtils();
+
+    public ProcessedDocsDB(File rootDir) {
+        this.rootDirFile = rootDir;
+    }
+
+    /** Returns parent/dirName, creating the directory when it is missing. */
+    private File createDir(File parent, String dirName) {
+        File newDir = new File(parent, dirName);
+        if (!newDir.exists()) {
+            newDir.mkdir();
+        }
+        return newDir;
+    }
+
+    /*
+     * Creates directories for a new group if they don't exist yet.
+     */
+    private void createGroup(String groupId) {
+        File groupFile = groupFiles.get(groupId);
+        if (groupFile == null) {
+            groupFile = new File(rootDirFile, String.valueOf(groupId));
+            groupFile.mkdir();
+            createDir(groupFile, FileType.CONTENT.getDir());
+            createDir(groupFile, FileType.PROPERTIES.getDir());
+            createDir(groupFile, FileType.OUTLINKS.getDir());
+            createDir(groupFile, FileType.TXT.getDir());
+            groupFiles.put(groupFile.getName(), groupFile);
+        }
+    }
+
+    /** Removes the root directory by delegating to FileUtils.deleteDir. */
+    public void delete() {
+        FileUtils.deleteDir(rootDirFile);
+    }
+
+    public List<String> getAllGroupIds() {
+        return new ArrayList<String>(groupFiles.keySet());
+    }
+
+    /** UTF-8 bytes of the text; a null text is stored as empty content. */
+    private byte[] getBytes(String text) {
+        if (text == null) {
+            return new byte[0];
+        }
+        try {
+            return text.getBytes("UTF-8");
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException("Error while saving data: ", e);
+        }
+    }
+
+    private File getContentFile(String documentId) {
+        return getDocumentFile(documentId, FileType.CONTENT);
+    }
+
+    /** Resolves rootDir/groupId/typeDir/sequence + typeExt for the document. */
+    private File getDocumentFile(String documentId, FileType type) {
+        String groupId = docIdUtils.getDocumentGroupId(documentId);
+        File groupDirFile = new File(rootDirFile, groupId);
+        File docDirFile = new File(groupDirFile, type.getDir());
+        String itemId = docIdUtils.getDocumentSequence(documentId);
+        return new File(docDirFile, itemId + type.getExt());
+    }
+
+    /** Returns the ids of the documents found in every registered group. */
+    public List<String> getDocumentIds() {
+        List<String> documentIds = new ArrayList<String>();
+        for (File groupFile : groupFiles.values()) {
+            documentIds.addAll(getDocumentIds(groupFile));
+        }
+        return documentIds;
+    }
+
+    /**
+     * Lists the documents of one group by scanning its content directory.
+     * Returns an empty list for a null group directory or when the content
+     * directory cannot be listed.
+     */
+    private List<String> getDocumentIds(File setFile) {
+        List<String> documentIds = new ArrayList<String>();
+        if (setFile == null) {
+            return documentIds;
+        }
+        final FileType type = FileType.CONTENT;
+        File dir = new File(setFile, type.getDir());
+        File[] dataFiles = dir.listFiles(new FilenameFilter() {
+            public boolean accept(File dir, String name) {
+                return name.endsWith(type.getExt());
+            }
+        });
+        if (dataFiles == null) {
+            // listFiles() yields null when dir is missing or unreadable
+            return documentIds;
+        }
+
+        String groupId = setFile.getName();
+        for (File f : dataFiles) {
+            String name = f.getName();
+            String itemId = name.substring(0, name.indexOf("."));
+            documentIds.add(docIdUtils.getDocumentId(groupId, itemId));
+        }
+        return documentIds;
+    }
+
+    /** Returns the document ids of a registered group; empty if unknown. */
+    public List<String> getDocumentIds(String groupId) {
+        return getDocumentIds(groupFiles.get(groupId));
+    }
+
+    private File getOutlinksFile(String documentId) {
+        return getDocumentFile(documentId, FileType.OUTLINKS);
+    }
+
+    private File getPropertiesFile(String documentId) {
+        return getDocumentFile(documentId, FileType.PROPERTIES);
+    }
+
+    private String getText(byte[] data) {
+        try {
+            return new String(data, "UTF-8");
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException("Error loading data: ", e);
+        }
+    }
+
+    private File getTextFile(String documentId) {
+        return getDocumentFile(documentId, FileType.TXT);
+    }
+
+    /** Initializes the store, keeping whatever is already on disk. */
+    public void init() {
+        init(true);
+    }
+
+    /**
+     * Prepares the root directory and rebuilds the group registry.
+     *
+     * @param keepExistingData when true, existing group directories are
+     *            registered; when false, the root directory is wiped first
+     * @throws RuntimeException if the existing root directory cannot be listed
+     */
+    private void init(boolean keepExistingData) {
+        groupFiles = new HashMap<String, File>();
+
+        if (!rootDirFile.exists()) {
+            rootDirFile.mkdirs();
+            return;
+        }
+        if (!keepExistingData) {
+            /* delete all existing data and create brand new directory */
+            FileUtils.deleteDir(rootDirFile);
+            rootDirFile.mkdirs();
+            return;
+        }
+        /* load all existing file groups */
+        File[] existingFileGroups = rootDirFile.listFiles(new FileFilter() {
+            public boolean accept(File f) {
+                return f.isDirectory();
+            }
+        });
+        if (existingFileGroups == null) {
+            // null means "not a directory" or an I/O error, not "no groups"
+            throw new RuntimeException("Can't list directory: '"
+                    + rootDirFile.getAbsolutePath() + "'");
+        }
+        for (File groupDirFile : existingFileGroups) {
+            groupFiles.put(groupDirFile.getName(), groupDirFile);
+        }
+    }
+
+    /** Loads every document of the given group. */
+    public List<ProcessedDocument> loadAllDocumentsInGroup(String groupId) {
+        List<ProcessedDocument> allDocsInGroup = new ArrayList<ProcessedDocument>();
+
+        for (String docId : getDocumentIds(groupId)) {
+            allDocsInGroup.add(loadDocument(docId));
+        }
+
+        return allDocsInGroup;
+    }
+
+    private String loadContent(File f) {
+        return getText(loadData(f));
+    }
+
+    /**
+     * Reads the whole file into memory.
+     *
+     * @throws RuntimeException if the file cannot be read completely
+     */
+    private byte[] loadData(File f) {
+        byte[] data = new byte[(int) f.length()];
+        try {
+            BufferedInputStream in = new BufferedInputStream(
+                    new FileInputStream(f));
+            try {
+                // a single read() call is allowed to return fewer bytes than
+                // requested, so keep reading until the buffer is full
+                int offset = 0;
+                while (offset < data.length) {
+                    int count = in.read(data, offset, data.length - offset);
+                    if (count < 0) {
+                        throw new IOException("Unexpected end of file after "
+                                + offset + " of " + data.length + " bytes");
+                    }
+                    offset += count;
+                }
+            } finally {
+                in.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Error while reading file: '"
+                    + f.getAbsolutePath() + "'", e);
+        }
+        return data;
+    }
+
+    /**
+     * Loads previously saved document details.
+     *
+     * @param documentId id of the document to load
+     * @return the document rebuilt from its four files
+     * @throws RuntimeException if any of the files cannot be read
+     */
+    public ProcessedDocument loadDocument(String documentId) {
+        Map<String, String> properties = loadProperties(
+                getPropertiesFile(documentId), ":");
+        String content = loadContent(getContentFile(documentId));
+        String text = loadText(getTextFile(documentId));
+        List<Outlink> outlinks = loadOutlinks(getOutlinksFile(documentId));
+
+        ProcessedDocument doc = new ProcessedDocument();
+        doc.setDocumentType(properties.get("doctype"));
+        doc.setDocumentURL(properties.get("url"));
+        doc.setText(text);
+        doc.setContent(content);
+        doc.setOutlinks(outlinks);
+        doc.setDocumentId(documentId);
+        doc.setDocumentTitle(properties.get("title"));
+
+        return doc;
+    }
+
+    /** Reads "url|anchor text" lines back into Outlink objects. */
+    private List<Outlink> loadOutlinks(File f) {
+        List<Outlink> outlinks = new ArrayList<Outlink>();
+        Map<String, String> props = loadProperties(f, "|");
+
+        for (Map.Entry<String, String> entry : props.entrySet()) {
+            outlinks.add(new Outlink(entry.getKey(), entry.getValue()));
+        }
+        return outlinks;
+    }
+
+    /**
+     * Reads "key&lt;delimiter&gt;value" lines, splitting each line at the first
+     * occurrence of the delimiter. Empty lines and lines that do not contain
+     * the delimiter are skipped.
+     */
+    private Map<String, String> loadProperties(File f, String delimiter) {
+        Map<String, String> props = new HashMap<String, String>();
+        try {
+            FileInputStream fis = new FileInputStream(f);
+            try {
+                BufferedReader reader = new BufferedReader(
+                        new InputStreamReader(fis, "UTF-8"));
+                String line = null;
+                while ((line = reader.readLine()) != null) {
+                    if (line.length() == 0) {
+                        continue;
+                    }
+
+                    int delimiterIndex = line.indexOf(delimiter);
+                    if (delimiterIndex < 0) {
+                        // e.g. the tail of a multi-line value written by an
+                        // earlier version; there is no key to attach it to
+                        continue;
+                    }
+                    String key = line.substring(0, delimiterIndex);
+                    String value = line.substring(delimiterIndex
+                            + delimiter.length());
+                    props.put(key, value);
+                }
+            } finally {
+                fis.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(
+                    "Error while reading metadata from file: '"
+                            + f.getAbsolutePath() + "'", e);
+        }
+        return props;
+    }
+
+    private String loadText(File f) {
+        return getText(loadData(f));
+    }
+
+    private void saveContent(File f, String content) {
+        saveData(f, getBytes(content));
+    }
+
+    /** Writes the bytes to the file, replacing any previous content. */
+    private void saveData(File f, byte[] content) {
+        try {
+            FileOutputStream fout = new FileOutputStream(f);
+            try {
+                BufferedOutputStream bout = new BufferedOutputStream(fout);
+                bout.write(content);
+                bout.flush();
+            } finally {
+                fout.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Persists the document.
+     *
+     * @param doc document to store; its id determines group and file names
+     * @throws RuntimeException if any of the files cannot be written
+     */
+    public void saveDocument(ProcessedDocument doc) {
+        String groupId = docIdUtils.getDocumentGroupId(doc.getDocumentId());
+        createGroup(groupId);
+
+        saveContent(getContentFile(doc.getDocumentId()), doc.getContent());
+        saveText(getTextFile(doc.getDocumentId()), doc.getText());
+
+        Map<String, String> props = new HashMap<String, String>();
+        props.put("url", doc.getDocumentURL());
+        props.put("title", doc.getDocumentTitle());
+        props.put("doctype", doc.getDocumentType());
+        saveProperties(getPropertiesFile(doc.getDocumentId()), props, ":");
+
+        saveOutlinks(getOutlinksFile(doc.getDocumentId()), doc.getOutlinks());
+    }
+
+    /**
+     * Stores the outlinks as "url|anchor text" lines. Links are keyed by url,
+     * so several links to the same url collapse into one line. A null list is
+     * stored as an empty file.
+     */
+    private void saveOutlinks(File f, List<Outlink> outlinks) {
+        Map<String, String> props = new HashMap<String, String>();
+        if (outlinks != null) {
+            for (Outlink outlink : outlinks) {
+                props.put(outlink.getLinkUrl(), outlink.getText());
+            }
+        }
+        saveProperties(f, props, "|");
+    }
+
+    private void saveProperties(File f, Map<String, String> props,
+            String delimiter) {
+        try {
+            FileOutputStream fout = new FileOutputStream(f);
+            try {
+                BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
+                        fout, "UTF-8"));
+                for (Map.Entry<String, String> entry : props.entrySet()) {
+                    writeProperty(bw, entry.getKey(), entry.getValue(),
+                            delimiter);
+                }
+                bw.flush();
+            } finally {
+                fout.close();
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private void saveText(File f, String text) {
+        saveData(f, getBytes(text));
+    }
+
+    /**
+     * Writes one "key&lt;delimiter&gt;value" line; a null value is written as
+     * empty. Line breaks inside the value are replaced by spaces, because the
+     * file is read back line by line and a multi-line value (a page title,
+     * an anchor text) would otherwise be split into bogus records.
+     */
+    private void writeProperty(BufferedWriter w, String key, String value,
+            String delimiter) throws IOException {
+        w.write(key);
+        w.write(delimiter);
+        if (value != null) {
+            w.write(value.replace('\r', ' ').replace('\n', ' '));
+        }
+        w.newLine();
+    }
+}
diff --git a/src/org/yooreeka/util/internet/crawling/model/FetchedDocument.java b/src/org/yooreeka/util/internet/crawling/model/FetchedDocument.java
new file mode 100644
index 0000000..f62974c
--- /dev/null
+++ b/src/org/yooreeka/util/internet/crawling/model/FetchedDocument.java
@@ -0,0 +1,143 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ * ________________________________________________________________________________________
+ *
+ * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web "
+ * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
+ * are valuable in any software application.
+ *
+ * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
+ * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
+ *
+ * Certain library functions depend on other Open Source software libraries, which are covered
+ * by different license agreements. See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership and licensing.
+ *
+ * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.yooreeka.util.internet.crawling.model;
+
+import java.nio.charset.Charset;
+import java.util.Map;
+
+import org.yooreeka.util.P;
+import org.yooreeka.util.parsing.common.AbstractDocument;
+
+/**
+ * Collection of raw (unprocessed) data about crawled/fetched document.
+ */
+public class FetchedDocument implements AbstractDocument {
+
+    /*
+     * Document id that was assigned by the FetcherModule.
+     */
+    private String documentId;
+
+    /*
+     * Document URL. URL that was used to fetch the document.
+     */
+    private String url;
+
+    /*
+     * MIME content type that was derived from transport protocol (HTTP
+     * headers), document content or document URL.
+     */
+    private String contentType;
+
+    /*
+     * Character encoding that was derived from transport protocol (HTTP
+     * headers), document content.
+     */
+    private String contentCharset;
+
+    /*
+     * Raw document content.
+     */
+    private byte[] documentContent;
+
+    /*
+     * Various optional meta data about the document that was extracted from the
+     * protocol.
+     */
+    private Map<String, String> documentMetadata;
+
+    public FetchedDocument() {
+    }
+
+    public String getContentCharset() {
+        return contentCharset;
+    }
+
+    /**
+     * Returns the number of content bytes, or 0 when no content has been set.
+     */
+    public long getContentLength() {
+        return documentContent == null ? 0 : documentContent.length;
+    }
+
+    public String getContentType() {
+        return contentType;
+    }
+
+    public byte[] getDocumentContent() {
+        return documentContent;
+    }
+
+    public String getDocumentId() {
+        return documentId;
+    }
+
+    public Map<String, String> getDocumentMetadata() {
+        return documentMetadata;
+    }
+
+    public String getDocumentURL() {
+        return url;
+    }
+
+    /**
+     * Prints id, url, content type, charset and the decoded content through
+     * the P utility.
+     */
+    public void print() {
+        P.println("Document ID : " + this.documentId);
+        P.println("Content URL : " + this.url);
+        P.println("Content Type : " + this.contentType);
+        P.println("Content Charset: " + this.contentCharset);
+        P.hline();
+        P.println("CONTENT\n" + decodeContent());
+        P.hline();
+    }
+
+    /**
+     * Decodes the raw content with the declared charset. Falls back to UTF-8
+     * when no charset is set or its name is illegal/unsupported (the value
+     * originates from the fetched document and cannot be trusted), and to an
+     * empty string when there is no content.
+     */
+    private String decodeContent() {
+        if (documentContent == null) {
+            return "";
+        }
+        Charset charset;
+        try {
+            charset = Charset.forName(contentCharset);
+        } catch (IllegalArgumentException e) {
+            // covers a null name as well as IllegalCharsetNameException and
+            // UnsupportedCharsetException, which are both subclasses
+            charset = Charset.forName("UTF-8");
+        }
+        return new String(documentContent, charset);
+    }
+
+    public void setContentCharset(String contentCharset) {
+        this.contentCharset = contentCharset;
+    }
+
+    public void setContentType(String contentType) {
+        this.contentType = contentType;
+    }
+
+    public void setDocumentContent(byte[] data) {
+        this.documentContent = data;
+    }
+
+    public void setDocumentId(String documentId) {
+        this.documentId = documentId;
+    }
+
+    public void setDocumentMetadata(Map<String, String> metadata) {
+        this.documentMetadata = metadata;
+    }
+
+    public void setDocumentURL(String url) {
+        this.url = url;
+    }
+}
diff --git a/src/org/yooreeka/util/internet/crawling/model/KnownUrlEntry.java b/src/org/yooreeka/util/internet/crawling/model/KnownUrlEntry.java
new file mode 100644
index 0000000..3c0c1e2
--- /dev/null
+++ b/src/org/yooreeka/util/internet/crawling/model/KnownUrlEntry.java
@@ -0,0 +1,77 @@
+/*
+ * ________________________________________________________________________________________
+ *
+ * Y O O R E E K A
+ * A library for data mining, machine learning, soft computing, and mathematical analysis
+ *
________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Mutable record of a URL known to the crawler: the URL itself, its
 * processing status (one of the {@code STATUS_*} constants by convention; the
 * value is not validated) and the depth associated with it.
 */
public class KnownUrlEntry {

    /** Status value: "unprocessed". */
    public static final String STATUS_UNPROCESSED = "unprocessed";

    /** Status value: "processed". */
    public static final String STATUS_PROCESSED_SUCCESS = "processed";

    /** Status value: "error". */
    public static final String STATUS_PROCESSED_ERROR = "error";

    private String url;
    private String status;
    private int depth;

    /** Creates an entry with a null URL, a null status and depth 0. */
    public KnownUrlEntry() {
        this(null, null, 0);
    }

    /**
     * Creates a fully populated entry.
     *
     * @param url    the URL
     * @param status the processing status
     * @param depth  the depth value to store
     */
    public KnownUrlEntry(String url, String status, int depth) {
        setUrl(url0(url));
        this.status = status;
        this.depth = depth;
    }

    /* Identity helper; keeps the constructor free of overridable-call surprises. */
    private static String url0(String value) {
        return value;
    }

    public String getUrl() {
        return this.url;
    }

    public final void setUrl(String url) {
        this.url = url;
    }

    public String getStatus() {
        return this.status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public int getDepth() {
        return this.depth;
    }

    public void setDepth(int depth) {
        this.depth = depth;
    }
}
/**
 * A link found in a document: the target URL together with the text that
 * accompanies the link. Instances cannot be modified after construction
 * because the class exposes no setters.
 */
public class Outlink {

    private String linkUrl;
    private String text;

    /**
     * @param linkUrl target of the link
     * @param text    text associated with the link
     */
    public Outlink(String linkUrl, String text) {
        this.text = text;
        this.linkUrl = linkUrl;
    }

    /** @return the link target exactly as it was passed to the constructor. */
    public String getLinkUrl() {
        return this.linkUrl;
    }

    /** @return the link text exactly as it was passed to the constructor. */
    public String getText() {
        return this.text;
    }

    /** Renders the link as {@code [link:<url>, text:<text>]}. */
    @Override
    public String toString() {
        StringBuilder rendered = new StringBuilder();
        rendered.append("[link:").append(this.linkUrl);
        rendered.append(", text:").append(this.text);
        rendered.append("]");
        return rendered.toString();
    }
}
the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.util.internet.crawling.transport.common; + +import org.yooreeka.util.internet.crawling.model.FetchedDocument; + +public interface Transport { + public void clear(); + + public FetchedDocument fetch(String url) throws TransportException; + + public void init(); + + public boolean pauseRequired(); +} diff --git a/src/org/yooreeka/util/internet/crawling/transport/common/TransportException.java b/src/org/yooreeka/util/internet/crawling/transport/common/TransportException.java new file mode 100644 index 0000000..f20d037 --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/transport/common/TransportException.java @@ -0,0 +1,47 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Checked exception reported by transports when a document cannot be
 * retrieved.
 */
public class TransportException extends Exception {

    /**
     * Distinct SVUID for the org.yooreeka classes
     */
    private static final long serialVersionUID = -2821101482190551697L;

    /**
     * Creates an exception that wraps an underlying failure.
     *
     * @param message description of the failure
     * @param t       the underlying cause, may be null
     */
    public TransportException(String message, Throwable t) {
        super(message, t);
    }

    /**
     * Creates an exception without a cause.
     *
     * @param message description of the failure
     */
    public TransportException(String message) {
        super(message);
    }
}
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.internet.crawling.transport.file; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.HashMap; + +import org.yooreeka.util.internet.crawling.model.FetchedDocument; +import org.yooreeka.util.internet.crawling.transport.common.Transport; +import org.yooreeka.util.internet.crawling.transport.common.TransportException; + +public class FileTransport implements Transport { + + public FileTransport() { + } + + public void clear() { + // DO NOTHING + } + + private FetchedDocument createDocument(String targetURL) + throws IOException, FileTransportException { + FetchedDocument doc = new FetchedDocument(); + + /* + * Maximum document length. 
+ */ + int MAX_DOCUMENT_LENGTH = 512 * 1024; // 512K + + URL url = new URL(targetURL); + File f = null; + try { + f = new File(url.toURI()); + } catch (URISyntaxException e) { + throw new FileTransportException( + "Error while converting url to file path: ", e); + } + + /* IOException will be thrown for documents that exceed max length */ + byte[] data = loadData(f, MAX_DOCUMENT_LENGTH); + + String DEFAULT_CONTENT_TYPE = "text/html"; + String contentType = DEFAULT_CONTENT_TYPE; + if (f.getName().endsWith(".doc")) { + contentType = "application/msword"; + } + + String DEFAULT_CONTENT_CHARSET = "UTF-8"; + String contentCharset = DEFAULT_CONTENT_CHARSET; + + doc.setContentType(contentType); + doc.setDocumentURL(targetURL); + doc.setContentCharset(contentCharset); + doc.setDocumentContent(data); + doc.setDocumentMetadata(new HashMap()); + return doc; + } + + public FetchedDocument fetch(String documentUrl) throws TransportException { + + FetchedDocument doc = null; + try { + doc = createDocument(documentUrl); + } catch (Exception eX) { + System.out.println("ERROR:\n" + eX.getMessage()); + throw new FileTransportException("Failed to fetch url: '" + + documentUrl + "': ", eX); + } finally { + } + + return doc; + } + + public void init() { + // DO NOTHING + } + + private byte[] loadData(File f, int maxLength) throws IOException { + if (f.length() > maxLength) { + throw new IOException("The document is too long (doc: " + + f.getCanonicalPath() + ", size: " + f.length() + + ", max size: " + maxLength); + } + + InputStream in = new BufferedInputStream(new FileInputStream(f)); + byte[] data = new byte[(int) f.length()]; + int offset = 0; + int i = 0; + while ((offset < data.length) + && (i = in.read(data, offset, data.length - offset)) >= 0) { + offset += i; + } + in.close(); + return data; + } + + public boolean pauseRequired() { + return false; + } +} diff --git a/src/org/yooreeka/util/internet/crawling/transport/file/FileTransportException.java 
b/src/org/yooreeka/util/internet/crawling/transport/file/FileTransportException.java new file mode 100644 index 0000000..6181fd8 --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/transport/file/FileTransportException.java @@ -0,0 +1,49 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.util.internet.crawling.transport.file; + +import org.yooreeka.util.internet.crawling.transport.common.TransportException; + +public class FileTransportException extends TransportException { + + /** + * Unique identifier for serialization + */ + private static final long serialVersionUID = -6380601992826152509L; + + public FileTransportException(String msg) { + super(msg); + } + + public FileTransportException(String msg, Throwable t) { + super(msg, t); + } +} diff --git a/src/org/yooreeka/util/internet/crawling/transport/http/HTTPTransport.java b/src/org/yooreeka/util/internet/crawling/transport/http/HTTPTransport.java new file mode 100644 index 0000000..3554f1a --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/transport/http/HTTPTransport.java @@ -0,0 +1,260 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.internet.crawling.transport.http; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.List; + +import org.apache.http.Header; +import org.apache.http.HttpEntity; +import org.apache.http.HttpResponse; +import org.apache.http.client.CookieStore; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.protocol.ClientContext; +import org.apache.http.cookie.Cookie; +import org.apache.http.impl.client.BasicCookieStore; +import org.apache.http.impl.client.DefaultHttpClient; +import org.apache.http.protocol.BasicHttpContext; +import org.apache.http.protocol.HttpContext; +import org.apache.http.util.EntityUtils; +import org.yooreeka.util.internet.crawling.model.FetchedDocument; +import org.yooreeka.util.internet.crawling.transport.common.Transport; +import org.yooreeka.util.internet.crawling.transport.common.TransportException; + +public class HTTPTransport implements Transport { + + HttpClient httpclient = null; + CookieStore cookieStore = null; + HttpContext localContext = null; + + public HTTPTransport() { + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch2.webcrawler.transport.common.Transport#clear() + */ + public void clear() { + httpclient = null; + // initialState = null; + } + + private FetchedDocument createDocument(String targetURL, HttpEntity entity) + throws IOException, HTTPTransportException { + FetchedDocument doc = new FetchedDocument(); + + /* + * Maximum 
document length that transport will attempt to download + * without issuing a warning ... + */ + int MAX_DOCUMENT_LENGTH = 8 * 1024 * 1024; // 8Mb + + BufferedInputStream bufferedInput = null; + byte[] buffer = new byte[1024]; + + int contentLength = (int) entity.getContentLength(); + if (contentLength > MAX_DOCUMENT_LENGTH) + System.out.println("WARNING: Retrieved document larger than " + + MAX_DOCUMENT_LENGTH + " [bytes]"); + + ByteBuffer byteBuffer = ByteBuffer.allocate(contentLength); + + // Construct the BufferedInputStream object + bufferedInput = new BufferedInputStream(entity.getContent()); + + // Keep reading while there is content + // when the end of the stream has been reached, -1 is returned + while (bufferedInput.read(buffer) != -1) { + + // Process the chunk of bytes read + byteBuffer.put(buffer); + } + + /* IOException will be thrown for documents that exceed max length */ + byte[] data = byteBuffer.array(); + + /* + * Check if server sent content in compressed form and uncompress the + * content if necessary. + */ + Header contentEncodingHeader = entity.getContentEncoding(); + if (contentEncodingHeader != null) { + data = HTTPUtils.decodeContent(contentEncodingHeader.getValue(), + data); + } + + /* 'Content-Type' HTTP header value */ + String contentTypeHeaderValue = null; + Header header = entity.getContentType(); + if (header != null) { + contentTypeHeaderValue = header.getValue(); + } + + /* + * Determine MIME type of the document. + * + * It is easy if we have Content-Type http header. In cases when this + * header is missing or for protocols that don't pass metadata about the + * documents (ftp://, file://) we would have to resort to url and/or + * content analysis to determine MIME type. 
+ */ + String DEFAULT_CONTENT_TYPE = "text/html"; + String contentType = HTTPUtils.getContentType(contentTypeHeaderValue, + targetURL, data); + if (contentType == null) { + contentType = DEFAULT_CONTENT_TYPE; + } + + /* + * Determine Character encoding used in the document. In some cases it + * may be specified in the http header, in html file itself or we have + * to perform content analysis to choose the encoding. + */ + String DEFAULT_CONTENT_CHARSET = "UTF-8"; + String contentCharset = HTTPUtils.getCharset(contentTypeHeaderValue, + contentType, data); + if (contentCharset == null) { + contentCharset = DEFAULT_CONTENT_CHARSET; + } + + doc.setContentType(contentType); + doc.setDocumentURL(targetURL); + doc.setContentCharset(contentCharset); + doc.setDocumentContent(data); + doc.setDocumentMetadata(new HashMap()); + return doc; + } + + /* + * (non-Javadoc) + * + * @see + * iweb2.ch2.webcrawler.transport.common.Transport#fetch(java.lang.String) + */ + public FetchedDocument fetch(String documentUrl) throws TransportException { + + FetchedDocument doc = null; + + HttpGet httpget = new HttpGet(documentUrl); + + System.out.println("executing request " + httpget.getURI()); + + // Pass local context as a parameter + HttpResponse response = null; + try { + response = httpclient.execute(httpget, localContext); + } catch (IOException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + HttpEntity entity = response.getEntity(); + + System.out.println("----------------------------------------"); + System.out.println(response.getStatusLine()); + if (entity != null) { + System.out.println("Response content length: " + + entity.getContentLength()); + } + List cookies = cookieStore.getCookies(); + for (int i = 0; i < cookies.size(); i++) { + System.out.println("Local cookie: " + cookies.get(i)); + } + + try { + doc = createDocument(documentUrl, entity); + } catch (IOException e) { + throw new TransportException("Failed to fetch url: '" + documentUrl + + "': 
", e); + } finally { + // Consume response content + try { + EntityUtils.consume(entity); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + System.out.println("----------------------------------------"); + + // When HttpClient instance is no longer needed, + // shut down the connection manager to ensure + // immediate deallocation of all system resources + httpclient.getConnectionManager().shutdown(); + } + + return doc; + } + + /* + * (non-Javadoc) + * + * @see iweb2.ch2.webcrawler.transport.common.Transport#init() + */ + public void init() { + + System.out.println("Initializing HTTPTransport ..."); + + httpclient = new DefaultHttpClient(); + + // Create a local instance of cookie store + cookieStore = new BasicCookieStore(); + + // Create local HTTP context + localContext = new BasicHttpContext(); + + // Bind custom cookie store to the local context + localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore); + + // httpclient.getHttpConnectionManager().getParams().setConnectionTimeout(30000); + // httpclient.getHttpConnectionManager().getParams().setSoTimeout(30000); + // httpclient.setState(initialState); + // httpclient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); + // + // //httpclient.getParams().setParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, + // Boolean.TRUE); + // + // // Set default number of connections per host to 1 + // httpclient.getHttpConnectionManager(). 
+ // getParams().setMaxConnectionsPerHost( + // HostConfiguration.ANY_HOST_CONFIGURATION, 1); + // // Set max for total number of connections + // httpclient.getHttpConnectionManager().getParams().setMaxTotalConnections(10); + } + + public boolean pauseRequired() { + return true; + } +} diff --git a/src/org/yooreeka/util/internet/crawling/transport/http/HTTPTransportException.java b/src/org/yooreeka/util/internet/crawling/transport/http/HTTPTransportException.java new file mode 100644 index 0000000..497f5c6 --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/transport/http/HTTPTransportException.java @@ -0,0 +1,46 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.internet.crawling.transport.http; + +import org.yooreeka.util.internet.crawling.transport.common.TransportException; + +public class HTTPTransportException extends TransportException { + + private static final long serialVersionUID = 546574708933803471L; + + public HTTPTransportException(String msg) { + super(msg); + } + + public HTTPTransportException(String msg, Throwable t) { + super(msg, t); + } +} diff --git a/src/org/yooreeka/util/internet/crawling/transport/http/HTTPUtils.java b/src/org/yooreeka/util/internet/crawling/transport/http/HTTPUtils.java new file mode 100644 index 0000000..80ab133 --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/transport/http/HTTPUtils.java @@ -0,0 +1,142 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.internet.crawling.transport.http; + +class HTTPUtils { + + /** + * Decodes content according to content encoding. This is just a place + * holder. + * + * @param contentEncoding + * content type. + * @param encodedContent + * content received from the server + * @return decoded content. 
+ */ + public static byte[] decodeContent(String contentEncoding, + byte[] encodedContent) throws HTTPTransportException { + byte[] decodedContent = null; + if ("gzip".equalsIgnoreCase(contentEncoding)) { + throw new HTTPTransportException( + "Content-Encoding 'gzip' is not supported."); + } else if ("deflate".equalsIgnoreCase(contentEncoding)) { + throw new HTTPTransportException( + "Content-Encoding 'deflate' is not supported."); + } else if ("compress".equalsIgnoreCase(contentEncoding)) { + throw new HTTPTransportException( + "Content-Encoding 'compress' is not supported."); + } else { + decodedContent = encodedContent; + } + + return decodedContent; + } + + private static String getCharset(String contentTypeHeaderValue) { + String charset = null; + String ATTR_NAME = "charset="; + if (contentTypeHeaderValue != null) { + int i = contentTypeHeaderValue.toLowerCase().indexOf(ATTR_NAME); + if (i > -1) { + charset = contentTypeHeaderValue.substring( + i + ATTR_NAME.length()).toUpperCase(); + } + } + + return charset; + } + + /** + * Extracts charset from HTTP header. If HTTP header is missing an attempt + * can be made to determine charset based on content type and data. + * + * For example, documents with type 'text/html' can define document charset + * using 'meta' tag. Such documents should use characters compatible with + * ISO-8859-1 charset until the meta tag that defines document charset. For + * more details see: http://www.w3.org/TR/html4/charset.html#h-5.2.2 + * + * @param contentTypeHeaderValue + * @param contentType + * type of data. Can be used to interpret the data. + * @param data + * @return charset or null. + */ + public static String getCharset(String contentTypeHeaderValue, + String contentType, byte[] data) { + String charset = getCharset(contentTypeHeaderValue); + if (charset == null || charset.trim().length() == 0) { + /* + * here we can implement charset detection based on content + * analysis. 
+ */ + } + + return charset; + } + + /** + * Extracts MIME type. Ideally the value should be extracted from HTTP + * header. But if it is missing an attempt can be made to determine content + * type based on URL and/or data. + * + * @param contentTypeHeaderValue + * @param url + * document URL. + * @param data + * document content + * + * @return MIME type for document content or null if couldn't determine the + * type. + */ + public static String getContentType(String contentTypeHeaderValue, + String url, byte[] data) { + String type = null; + if (contentTypeHeaderValue != null + && contentTypeHeaderValue.trim().length() > 0) { + int i = contentTypeHeaderValue.indexOf(";"); + if (i > -1) { + type = contentTypeHeaderValue.substring(0, i); + } else { + type = contentTypeHeaderValue.substring(0); + } + } + + if (type == null) { + /* + * here url and content itself can be used to determine content + * type. + */ + } + + return type; + } +} diff --git a/src/org/yooreeka/util/internet/crawling/util/DocumentIdUtils.java b/src/org/yooreeka/util/internet/crawling/util/DocumentIdUtils.java new file mode 100644 index 0000000..b45359e --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/util/DocumentIdUtils.java @@ -0,0 +1,56 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
/**
 * Builds and parses document ids of the form {@code g<groupId>-d<sequence>},
 * for example {@code g12-d3}.
 *
 * Parsing splits at the LAST occurrence of {@code "-d"}, so group ids that
 * contain {@code '-'} and negative sequence numbers round-trip correctly.
 * A sequence value that itself contains {@code "-d"} cannot be parsed back
 * unambiguously.
 */
public class DocumentIdUtils {

    private static final String GROUP_PREFIX = "g";
    private static final String SEQUENCE_PREFIX = "d";
    private static final String ID_COMPONENTS_DELIMITER = "-";

    /** Text that separates the group part from the sequence value ("-d"). */
    private static final String SEQUENCE_MARKER = ID_COMPONENTS_DELIMITER
            + SEQUENCE_PREFIX;

    /**
     * Extracts the group id from a document id.
     *
     * @param documentId
     *            id such as {@code g12-d3}; an id without a sequence part is
     *            treated as consisting of the group part only.
     * @return the group id without its {@code g} prefix.
     */
    public String getDocumentGroupId(String documentId) {
        int markerPos = documentId.lastIndexOf(SEQUENCE_MARKER);
        String groupPart = (markerPos < 0) ? documentId : documentId
                .substring(0, markerPos);
        return groupPart.substring(GROUP_PREFIX.length());
    }

    /**
     * Builds a document id from a group id and a numeric sequence.
     */
    public String getDocumentId(String docGroupId, int docSequence) {
        return getDocumentId(docGroupId, String.valueOf(docSequence));
    }

    /**
     * Builds a document id from a group id and a sequence value.
     *
     * @return {@code "g" + docGroupId + "-d" + docSequence}.
     */
    public String getDocumentId(String docGroupId, String docSequence) {
        return GROUP_PREFIX + docGroupId + SEQUENCE_MARKER + docSequence;
    }

    /**
     * Extracts the sequence value from a document id.
     *
     * @param documentId
     *            id such as {@code g12-d3}.
     * @return the sequence value without its {@code d} prefix.
     * @throws IllegalArgumentException
     *             if the id has no {@code "-d"} sequence component.
     */
    public String getDocumentSequence(String documentId) {
        int markerPos = documentId.lastIndexOf(SEQUENCE_MARKER);
        if (markerPos < 0) {
            throw new IllegalArgumentException(
                    "Document id has no sequence component: '" + documentId
                            + "'");
        }
        return documentId.substring(markerPos + SEQUENCE_MARKER.length());
    }
}
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;

/**
 * Utility methods for files and directories.
 */
public class FileUtils {

    /** Returned instead of null when a directory cannot be listed. */
    private static final File[] NO_FILES = new File[0];

    /**
     * Deletes directory with its content.
     *
     * @param dir
     *            directory to delete.
     * @return true if delete was successful; false if <code>dir</code> is
     *         null, is not a directory, cannot be listed, or any entry could
     *         not be deleted (deletion stops at the first failure).
     */
    public static boolean deleteDir(java.io.File dir) {
        if (dir == null || !dir.isDirectory()) {
            return false;
        }

        // File.list() returns null when the directory cannot be read.
        String[] entries = dir.list();
        if (entries == null) {
            return false;
        }

        for (String filename : entries) {
            File entry = new File(dir, filename);
            boolean deleted = entry.isDirectory() ? deleteDir(entry) : entry
                    .delete();
            if (!deleted) {
                return false;
            }
        }

        return dir.delete();
    }

    /**
     * Deletes directory with its content.
     *
     * @param dir
     *            path of the directory to delete.
     * @return true if delete was successful; false for a null path or a path
     *         that is not an existing directory.
     */
    public static boolean deleteDir(String dir) {
        if (dir == null) {
            return false;
        }
        // deleteDir(File) already rejects anything that is not a directory.
        return deleteDir(new File(dir));
    }

    /**
     * Finds files that start with specified prefix.
     *
     * @param directory
     *            directory with files to search
     * @param filenamePrefix
     *            defines files that will be returned.
     * @return files with names that start with specified prefix; an empty
     *         array if the directory does not exist or cannot be listed.
     */
    public static File[] findMatchingFiles(final File directory,
            final String filenamePrefix) {
        File[] matches = directory.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.startsWith(filenamePrefix);
            }
        });
        return (matches == null) ? NO_FILES : matches;
    }

    /**
     * Makes sure that <code>dir</code> exists.
     *
     * @param dir
     *            directory to prepare.
     * @param useExisting
     *            when false an already existing directory is deleted with its
     *            content and created again; when true it is left untouched.
     * @throws IOException
     *             if the old directory can't be deleted or the new one can't
     *             be created.
     */
    public static void prepareDir(File dir, boolean useExisting)
            throws IOException {
        if (dir.exists() && !useExisting) {
            if (!FileUtils.deleteDir(dir)) {
                throw new IOException("Failed to delete directory: '"
                        + dir.getAbsolutePath() + "'");
            }
        }
        if (!dir.exists()) {
            // mkdirs() also creates missing parents; the isDirectory() check
            // tolerates the directory appearing between exists() and here.
            if (!dir.mkdirs() && !dir.isDirectory()) {
                throw new IOException("Failed to create directory: '"
                        + dir.getAbsolutePath() + "'");
            }
        }
    }

    /*
     * All methods are static. There should be no instances of this class.
     */
    private FileUtils() {
        // empty
    }

}
import java.util.ArrayList;
import java.util.List;

/**
 * Group of URLs for specific host and protocol.
 */
public class UrlGroup {
    private final String protocol;
    private final String host;
    private final List<String> urls;

    /**
     * Creates an empty group.
     *
     * @param protocol
     *            protocol shared by the URLs of this group.
     * @param host
     *            host shared by the URLs of this group.
     */
    public UrlGroup(String protocol, String host) {
        this.protocol = protocol;
        this.host = host;
        this.urls = new ArrayList<String>();
    }

    /**
     * Appends a URL to the group. The URL is stored as given; it is not
     * checked against the group's protocol or host.
     */
    public void addUrl(String url) {
        urls.add(url);
    }

    public String getHost() {
        return host;
    }

    public String getProtocol() {
        return protocol;
    }

    /**
     * @return the live internal list of URLs, in insertion order; changes
     *         made through the returned list affect this group.
     */
    public List<String> getUrls() {
        return urls;
    }

    @Override
    public String toString() {
        return "[protocol: " + protocol + ", host: " + host + ", url count: "
                + urls.size() + "]";
    }
}
See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.internet.crawling.util; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class UrlUtils { + + public static List groupByProtocolAndHost(List urls) { + Map allGroups = new HashMap(); + for (String url : urls) { + URL u = null; + try { + u = new URL(url); + String protocol = u.getProtocol(); + String host = u.getHost(); + String key = protocol + "|" + host; + UrlGroup urlGroup = allGroups.get(key); + if (urlGroup == null) { + urlGroup = new UrlGroup(protocol, host); + allGroups.put(key, urlGroup); + } + urlGroup.addUrl(url); + } catch (MalformedURLException e) { + throw new RuntimeException("Invalid url format url: '" + url + + "': ", e); + } + } + return new ArrayList(allGroups.values()); + } + +} diff --git a/src/org/yooreeka/util/internet/crawling/util/ValueToIndexMapping.java b/src/org/yooreeka/util/internet/crawling/util/ValueToIndexMapping.java new file mode 100644 index 0000000..c10050d --- /dev/null +++ b/src/org/yooreeka/util/internet/crawling/util/ValueToIndexMapping.java @@ -0,0 +1,93 @@ +/* + * ________________________________________________________________________________________ + * + 
import java.util.HashMap;
import java.util.Map;

/**
 * Maps string values to an index. This class is used for mapping strings to
 * arrays or matrices. Index is zero-based.
 */
public class ValueToIndexMapping implements java.io.Serializable {
    /**
     * Unique identifier for serialization
     */
    private static final long serialVersionUID = -2077767183898369580L;

    /*
     * Index value that will be returned for the next new string value.
     */
    private int nextIndex = 0;

    /*
     * Maintains mapping from value to index.
     */
    private final Map<String, Integer> valueMapping = new HashMap<String, Integer>();

    /*
     * Maintains mapping from index to value.
     */
    private final Map<Integer, String> indexMapping = new HashMap<Integer, String>();

    public ValueToIndexMapping() {
        // empty
    }

    /**
     * Returns index assigned to the value. For new values new index will be
     * assigned and returned. Indexes are handed out in the order in which
     * new values are first seen, starting at 0.
     */
    public int getIndex(String value) {
        Integer known = valueMapping.get(value);
        if (known != null) {
            return known;
        }

        int assigned = nextIndex++;
        valueMapping.put(value, assigned);
        indexMapping.put(assigned, value);
        return assigned;
    }

    /**
     * Current number of elements.
     */
    public int getSize() {
        return valueMapping.size();
    }

    /**
     * Returns value mapped to the index or null if mapping doesn't exist.
     */
    public String getValue(int index) {
        return indexMapping.get(index);
    }
}
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.metrics; + +import java.util.Arrays; + +/** + * + * @author Babis Marmanis + * + */ +public class CosineDistance implements NumericDistance { + + private CosineSimilarity cosin = new CosineSimilarity(); + + public double getDistance(double[] x, double[] y) { + + double sim = cosin.sim(x, y); + + if (sim < 0.0) { + throw new RuntimeException( + "Can't use this value to calculate distance." 
+ "x[]=" + + Arrays.toString(x) + ", y[]=" + + Arrays.toString(y) + ", cosin.sim(x,y)=" + sim); + } + + return 1.0 - sim; + } + +} diff --git a/src/org/yooreeka/util/metrics/CosineSimilarity.java b/src/org/yooreeka/util/metrics/CosineSimilarity.java new file mode 100644 index 0000000..4c42533 --- /dev/null +++ b/src/org/yooreeka/util/metrics/CosineSimilarity.java @@ -0,0 +1,76 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.util.metrics; + +public class CosineSimilarity implements SimilarityMeasure { + + private static final long serialVersionUID = -3470234210362615980L; + + private double getDotProduct(double[] v1, double[] v2) { + double sum = 0.0; + for (int i = 0, n = v1.length; i < n; i++) { + sum += v1[i] * v2[i]; + } + return sum; + } + + private double getNorm(double[] v) { + double sum = 0.0; + for (int i = 0, n = v.length; i < n; i++) { + sum += v[i] * v[i]; + } + return Math.sqrt(sum); + } + + public double sim(double[] v1, double[] v2) { + double a = getDotProduct(v1, v2); + double b = getNorm(v1) * getNorm(v2); + return a / b; + } + + /** + * Calculates cosine similarity between two sets of terms by converting them + * into term frequency vectors. It should be clear that, unlike numerical + * vectors, the definition of this similarity is to a large extent + * arbitrary. + */ + public double similarity(String[] x, String[] y) { + + double[][] termFrequencyVectors = TermFrequencyBuilder + .buildTermFrequencyVectors(x, y); + + double[] termFrequencyForX = termFrequencyVectors[0]; + double[] termFrequencyForY = termFrequencyVectors[1]; + + return sim(termFrequencyForX, termFrequencyForY); + } + +} diff --git a/src/org/yooreeka/util/metrics/CosineSimilarityMeasure.java b/src/org/yooreeka/util/metrics/CosineSimilarityMeasure.java new file mode 100644 index 0000000..874fe1f --- /dev/null +++ b/src/org/yooreeka/util/metrics/CosineSimilarityMeasure.java @@ -0,0 +1,56 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Cosine similarity between two numeric vectors.
 */
public class CosineSimilarityMeasure {

    /**
     * Calculates the cosine of the angle between two vectors.
     *
     * @param v1
     *            first vector.
     * @param v2
     *            second vector, same length as <code>v1</code>.
     * @return dot product divided by the product of the norms, or 0.0 if
     *         either vector has zero norm (the cosine is undefined there and
     *         the plain division would give NaN).
     * @throws IllegalArgumentException
     *             if the vectors differ in length.
     */
    public double calculate(double[] v1, double[] v2) {
        if (v1.length != v2.length) {
            throw new IllegalArgumentException(
                    "Vectors must have the same length: v1.length="
                            + v1.length + ", v2.length=" + v2.length);
        }
        double normProduct = getNorm(v1) * getNorm(v2);
        if (normProduct == 0.0) {
            return 0.0;
        }
        return getDotProduct(v1, v2) / normProduct;
    }

    private double getDotProduct(double[] v1, double[] v2) {
        double sum = 0.0;
        for (int i = 0, n = v1.length; i < n; i++) {
            sum += v1[i] * v2[i];
        }
        return sum;
    }

    private double getNorm(double[] v) {
        double sum = 0.0;
        for (int i = 0, n = v.length; i < n; i++) {
            sum += v[i] * v[i];
        }
        return Math.sqrt(sum);
    }
}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.metrics; + +public class EuclideanDistance implements NumericDistance { + + public EuclideanDistance() { + // empty + } + + public double getDistance(double[] x, double[] y) { + double sumXY2 = 0.0; + for (int i = 0, n = x.length; i < n; i++) { + sumXY2 += Math.pow(x[i] - y[i], 2); + } + return Math.sqrt(sumXY2); + } + + public double getDistance(Double[] x, Double[] y) { + double sumXY2 = 0.0; + for (int i = 0, n = x.length; i < n; i++) { + sumXY2 += Math.pow(x[i] - y[i], 2); + } + return Math.sqrt(sumXY2); + } + +} diff --git a/src/org/yooreeka/util/metrics/JaccardCoefficient.java b/src/org/yooreeka/util/metrics/JaccardCoefficient.java new file mode 100644 index 0000000..7a49a3f --- /dev/null +++ b/src/org/yooreeka/util/metrics/JaccardCoefficient.java @@ -0,0 +1,77 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.metrics; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Calculates Jaccard coefficient for two sets of items. 
+ * + * @author Babis Marmanis + */ +public class JaccardCoefficient implements SimilarityMeasure { + + private static final long serialVersionUID = -5051498381470492495L; + + public JaccardCoefficient() { + // empty + } + + public double similarity(List x, List y) { + + if (x.size() == 0 || y.size() == 0) { + return 0.0; + } + + Set unionXY = new HashSet(x); + unionXY.addAll(y); + + Set intersectionXY = new HashSet(x); + intersectionXY.retainAll(y); + + return (double) intersectionXY.size() / (double) unionXY.size(); + } + + public double similarity(String[] x, String[] y) { + double sim = 0.0d; + if ((x != null && y != null) && (x.length > 0 || y.length > 0)) { + sim = similarity(Arrays.asList(x), Arrays.asList(y)); + } else { + throw new IllegalArgumentException( + "The arguments x and y must be not NULL and either x or y must be non-empty."); + } + return sim; + } + +} diff --git a/src/org/yooreeka/util/metrics/JaccardDistance.java b/src/org/yooreeka/util/metrics/JaccardDistance.java new file mode 100644 index 0000000..6da33a6 --- /dev/null +++ b/src/org/yooreeka/util/metrics/JaccardDistance.java @@ -0,0 +1,57 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-2012 Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. 
See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.metrics; + +import java.util.List; + +import org.yooreeka.util.C; + +/** + * + * @author Babis Marmanis + * + */ +public class JaccardDistance { + + JaccardCoefficient jc; + + public JaccardDistance() { + jc = new JaccardCoefficient(); + } + + public double getDistance(List x, List y) { + + double s = jc.similarity(x, y); + + return (C.ONE_DOUBLE-s); + } + +} diff --git a/src/org/yooreeka/util/metrics/NumericDistance.java b/src/org/yooreeka/util/metrics/NumericDistance.java new file mode 100644 index 0000000..7d5218f --- /dev/null +++ b/src/org/yooreeka/util/metrics/NumericDistance.java @@ -0,0 +1,40 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
/**
 * Contract for distance functions that operate on two numeric vectors.
 *
 * @author Babis Marmanis
 *
 */
public interface NumericDistance {

    /**
     * Calculates the distance between <code>x</code> and <code>y</code>.
     *
     * @param x
     *            first vector.
     * @param y
     *            second vector; presumably expected to have the same length
     *            as <code>x</code> (the interface itself doesn't enforce it).
     * @return the distance value as defined by the implementation.
     */
    double getDistance(double[] x, double[] y);
}
/**
 * Contract for similarity measures over sets of terms. Implementations are
 * serializable.
 */
public interface SimilarityMeasure extends java.io.Serializable {

    /**
     * Calculates the similarity between two sets, each given as an array of
     * strings. The two arrays are not required to have the same length.
     *
     * @param x the terms of the first set
     * @param y the terms of the second set
     * @return the similarity value, as defined by the implementing class
     */
    double similarity(String[] x, String[] y);
}
/**
 * Builds binary term frequency vectors for a pair of term sets.
 */
public class TermFrequencyBuilder {

    /** Flag bit: the term occurs in x[]. */
    private static final int IN_X = 0x01;

    /** Flag bit: the term occurs in y[]. */
    private static final int IN_Y = 0x02;

    /**
     * Calculates term frequency vectors based on two sets of terms.
     *
     * <p>
     * Both returned vectors have one slot per distinct term found in either
     * input. A slot holds 1.0 when the term occurs in the corresponding input
     * and 0.0 otherwise; repeated occurrences of a term within one input do
     * not change the result. The slot order follows the iteration order of a
     * {@link HashMap} and is the same for both vectors.
     *
     * @param x
     *            terms of the first set; must not be null
     * @param y
     *            terms of the second set; must not be null
     * @return a two-element array: element 0 is the vector for {@code x},
     *         element 1 is the vector for {@code y}
     */
    public static double[][] buildTermFrequencyVectors(String[] x, String[] y) {

        // create a set of terms with presence flags
        Map<String, Integer> allAttributes = new HashMap<String, Integer>();
        for (String s : x) {
            allAttributes.put(s, IN_X);
        }
        for (String s : y) {
            // OR the y-bit into whatever is already recorded. Testing only
            // containsKey() would mark a term that is repeated in y[] (and
            // absent from x[]) as if it were present in both inputs.
            Integer flags = allAttributes.get(s);
            allAttributes.put(s, flags == null ? IN_Y : (flags | IN_Y));
        }

        // create term frequency vectors
        int n = allAttributes.size();
        double[] termFrequencyForX = new double[n];
        double[] termFrequencyForY = new double[n];
        int i = 0;
        for (Map.Entry<String, Integer> e : allAttributes.entrySet()) {
            // IN_X - x[] only, IN_Y - y[] only, IN_X | IN_Y - x[] and y[]
            int flags = e.getValue();
            termFrequencyForX[i] = flags & IN_X;
            termFrequencyForY[i] = (flags & IN_Y) >> 1;
            i++;
        }

        return new double[][] { termFrequencyForX, termFrequencyForY };
    }

}
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Read-only view of a document: its identity, location, content and the
 * metadata describing that content.
 *
 * @author Babis Marmanis
 */
public interface AbstractDocument {

    /**
     * @return the name of the character set of the document content
     */
    String getContentCharset();

    /**
     * @return the content type of the document
     */
    String getContentType();

    /**
     * @return the document content as bytes
     */
    byte[] getDocumentContent();

    /**
     * @return the identifier of the document
     */
    String getDocumentId();

    /**
     * @return the URL of the document
     */
    String getDocumentURL();
}
/**
 * Base class for a single entry of parsed data.
 *
 * @author Babis Marmanis
 */
public abstract class DataEntry {

    /**
     * @return the data entry represented by this object
     */
    public abstract DataEntry getDataEntry();
}
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.common; + +/** + * @author Babis Marmanis + * + */ +public class DataField { + + private String name; + private DataType dataType; + + public DataField(String name, DataType dataType) { + this.name = name; + this.dataType = dataType; + } + + public DataType getDataType() { + return dataType; + } + + public String getName() { + return name; + } + + public void setDataType(DataType dataType) { + this.dataType = dataType; + } + + public void setName(String name) { + this.name = name; + } + + public boolean validate(String s) { + boolean isValid = true; + + return isValid; + } +} diff --git a/src/org/yooreeka/util/parsing/common/DataType.java b/src/org/yooreeka/util/parsing/common/DataType.java new file mode 100644 index 0000000..67abe91 --- /dev/null +++ b/src/org/yooreeka/util/parsing/common/DataType.java @@ -0,0 +1,40 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * The value types that a data field can be declared with.
 *
 * @author Babis Marmanis
 */
public enum DataType {

    INTEGER,
    LONG,
    FLOAT,
    DOUBLE,
    STRING,
    DATE
}
See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.common; + +/** + * Interface for parsing document that was retrieved/fetched during collection + * phase. + */ +public interface DocumentParser { + + public DataEntry getDataEntry(int i); + + public ProcessedDocument parse(AbstractDocument doc) + throws DocumentParserException; + +} diff --git a/src/org/yooreeka/util/parsing/common/DocumentParserException.java b/src/org/yooreeka/util/parsing/common/DocumentParserException.java new file mode 100644 index 0000000..533aac2 --- /dev/null +++ b/src/org/yooreeka/util/parsing/common/DocumentParserException.java @@ -0,0 +1,45 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Checked exception raised when a document cannot be parsed.
 */
public class DocumentParserException extends Exception {

    // Distinct SVUID for the org.yooreeka.* classes
    private static final long serialVersionUID = 4938858042489090351L;

    /**
     * @param message
     *            description of the failure
     */
    public DocumentParserException(String message) {
        super(message);
    }

    /**
     * @param message
     *            description of the failure
     * @param cause
     *            the underlying problem that triggered this exception
     */
    public DocumentParserException(String message, Throwable cause) {
        super(message, cause);
    }
}
See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.common; + +import org.yooreeka.util.parsing.html.HTMLDocumentParser; +import org.yooreeka.util.parsing.msword.MSWordDocumentParser; + +public class DocumentParserFactory { + + private static DocumentParserFactory instance = new DocumentParserFactory(); + + public static DocumentParserFactory getInstance() { + return instance; + } + + private DocumentParserFactory() { + // empty + } + + /** + * Returns an instance of the DocumentParser based on the + * document type. + * + * @param type + * document type. 
+ * @return + * @throws DocumentParserException + */ + public DocumentParser getDocumentParser(String type) + throws DocumentParserException { + if (ProcessedDocument.TYPE_HTML.equalsIgnoreCase(type)) { + return new HTMLDocumentParser(); + } else if (ProcessedDocument.TYPE_MSWORD.equalsIgnoreCase(type)) { + return new MSWordDocumentParser(); + } else { + throw new DocumentParserException("Unsupported document type: '" + + type + "'."); + } + } +} diff --git a/src/org/yooreeka/util/parsing/common/ProcessedDocument.java b/src/org/yooreeka/util/parsing/common/ProcessedDocument.java new file mode 100644 index 0000000..6dbe7b2 --- /dev/null +++ b/src/org/yooreeka/util/parsing/common/ProcessedDocument.java @@ -0,0 +1,198 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.common; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.List; + +import org.mozilla.universalchardet.UniversalDetector; +import org.yooreeka.util.P; +import org.yooreeka.util.internet.crawling.model.Outlink; + +/** + * Represents Processed document with attributes that we are interested in. + */ +public class ProcessedDocument implements AbstractDocument { + + public static final String TYPE_TEXT = "text/plain"; + public static final String TYPE_HTML = "text/html"; + public static final String TYPE_MSWORD = "application/msword"; + + /* + * Unique document id. + */ + private String documentId; + + /* + * All document outlinks (links that document has to other documents). + */ + private List outlinks = new ArrayList(); + + /* + * URL that was used to retrieve the document. + */ + private String documentURL; + + /* + * Document title. + */ + private String title; + + /* + * Processed document content. In case of HTML doc it can be HTML with only + * relevant tags (

, ,..) preserved. + */ + private String content; + + /* + * Text extracted from the document with all formatting removed. + */ + private String text; + + /* + * Document type. + */ + private String documentType; + + public ProcessedDocument() { + } + + public String getContent() { + return this.content; + } + + @Override + public String getContentCharset() { + byte[] buf = new byte[4096]; + + ByteArrayInputStream fis = new ByteArrayInputStream(getContent() + .getBytes()); + + // (1) + UniversalDetector detector = new UniversalDetector(null); + + // (2) + int nread; + try { + while ((nread = fis.read(buf)) > 0 && !detector.isDone()) { + detector.handleData(buf, 0, nread); + } + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + // (3) + detector.dataEnd(); + + // (4) + String encoding = detector.getDetectedCharset(); + if (encoding != null) { + P.println("Detected encoding = " + encoding); + } else { + P.println("No encoding detected."); + } + + // (5) + detector.reset(); + return encoding; + } + + @Override + public String getContentType() { + return getDocumentType(); + } + + @Override + public byte[] getDocumentContent() { + return getContent().getBytes(Charset.forName(getContentCharset())); + } + + public String getDocumentId() { + return documentId; + } + + public String getDocumentTitle() { + return this.title; + } + + public String getDocumentType() { + return documentType; + } + + public String getDocumentURL() { + return documentURL; + } + + public List getOutlinks() { + return outlinks; + } + + public String getText() { + return text; + } + + public void setContent(String content) { + this.content = content; + } + + public void setDocumentId(String docId) { + this.documentId = docId; + } + + public void setDocumentTitle(String title) { + this.title = title; + } + + public void setDocumentType(String docType) { + this.documentType = docType; + } + + public void setDocumentURL(String documentURL) { + this.documentURL 
= documentURL; + } + + public void setOutlinks(List outlinks) { + this.outlinks = outlinks; + } + + public void setText(String text) { + this.text = text; + } + + @Override + public String toString() { + return "[docId: " + documentId + ", type: " + documentType + ", url: " + + documentURL + "]"; + } +} diff --git a/src/org/yooreeka/util/parsing/csv/CSVDocument.java b/src/org/yooreeka/util/parsing/csv/CSVDocument.java new file mode 100644 index 0000000..31f3006 --- /dev/null +++ b/src/org/yooreeka/util/parsing/csv/CSVDocument.java @@ -0,0 +1,93 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.csv; + +import java.util.ArrayList; + +import org.yooreeka.util.P; +import org.yooreeka.util.parsing.common.ProcessedDocument; + +/** + * A CSVDocument is an ArrayList of CSVEntrys + * + * @author Babis Marmanis + * + */ +public class CSVDocument extends ProcessedDocument { + + private CSVEntry headers; + private ArrayList csvData; + private boolean hasHeaders; + + public CSVDocument() { + csvData = new ArrayList(); + } + + public CSVDocument(ArrayList data) { + csvData = data; + } + + public CSVEntry getHeaders() { + return headers; + } + + public boolean hasHeaders() { + return hasHeaders; + } + + public void hasHeaders(boolean val) { + hasHeaders = val; + } + + /** + * @return the csvData + */ + public ArrayList getCsvData() { + return csvData; + } + + public void print(String printSeparator) { + P.hline(); + P.println(getHeaders().toString(printSeparator)); + P.hline(); + for (CSVEntry e : csvData) { + P.println(e.toString(printSeparator)); + } + P.hline(); + } + + /** + * @param headers the headers to set + */ + public void setHeaders(CSVEntry headers) { + this.headers = headers; + } +} diff --git a/src/org/yooreeka/util/parsing/csv/CSVEntry.java b/src/org/yooreeka/util/parsing/csv/CSVEntry.java new file mode 100644 index 0000000..7578f38 --- /dev/null +++ b/src/org/yooreeka/util/parsing/csv/CSVEntry.java @@ -0,0 +1,108 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft 
computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.csv; + +import org.yooreeka.util.parsing.common.DataEntry; + +/** + * A CSVEntry is simply an array of Strings. The default + * separator is the comma character, i.e. ",". 
+ * + * @author Babis Marmanis + * + */ +public class CSVEntry extends DataEntry { + + public static final String DEFAULT_SEPARATOR = ","; + private String separator; + + private String[] data; + + public CSVEntry(String csvLine) { + this(csvLine,null); + } + + public CSVEntry(String csvLine, String sepChar) { + + if (sepChar == null) { + setSeparator(CSVEntry.DEFAULT_SEPARATOR); + } else { + setSeparator(sepChar); + } + + data = csvLine.split(getSeparator()); + } + + public String[] getData() { + return data; + } + + @Override + public DataEntry getDataEntry() { + + return this; + } + + @Override + public String toString() { + + return toString(CSVEntry.DEFAULT_SEPARATOR); + } + + public String toString(String printSeparator) { + StringBuilder sb = new StringBuilder(); + int i=1; + + for (String s : data) { + if (iBabis Marmanis + * + */ +public class CSVFile { + + private File file; + + private String separator; + + private CSVDocument doc; + + // Whether a CSV file has Headers + private boolean hasHeaders; + + public CSVFile(String fileName, boolean hasHeaders, CSVSchema schema) { + + this.hasHeaders = hasHeaders; + + file = new File(fileName); + } + + public CSVEntry getHeaders() { + + CSVEntry e = null; + + if (doc.hasHeaders()) { + e = doc.getHeaders(); + } + + return e; + } + + public boolean hasHeaders() { + return hasHeaders; + } + + public CSVDocument read() throws IOException { + + FileReader fReader = new FileReader(file); + BufferedReader bReader = new BufferedReader(fReader); + + CSVParser csvParser = new CSVParser(this); + doc = csvParser.parse(bReader); + + bReader.close(); + + return doc; + } + + /** + * @return the doc + */ + public CSVDocument getDoc() { + return doc; + } + + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws IOException { + CSVSchema s = new CSVSchema(); + + DataField f1 = new DataField("Customer Id", DataType.LONG); + s.addColumn(f1); + + DataField f2 = new DataField("Customer 
Status", DataType.STRING); + s.addColumn(f2); + + DataField f3 = new DataField("Total Order amt, USD", DataType.DOUBLE); + s.addColumn(f3); + + DataField f4 = new DataField("Content Id", DataType.STRING); + s.addColumn(f4); + + DataField f5 = new DataField("Title/Journal Id", DataType.LONG); + s.addColumn(f5); + + DataField f6 = new DataField("Title/Journal Name", DataType.STRING); + s.addColumn(f6); + + DataField f7 = new DataField("Title/Journal Publisher", DataType.STRING); + s.addColumn(f7); + + // s.addColumn(DataType.STRING_DATA_TYPE); + // s.addColumn(DataType.DOUBLE_DATA_TYPE); + // s.addColumn(DataType.STRING_DATA_TYPE); + + CSVFile f = new CSVFile(args[0], true, s); + f.read(); + } + + /** + * @return the separatorChar + */ + public String getSeparator() { + return separator; + } + + /** + * @param separatorChar the separatorChar to set + */ + public void setSeparator(String val) { + separator = val; + } +} diff --git a/src/org/yooreeka/util/parsing/csv/CSVParser.java b/src/org/yooreeka/util/parsing/csv/CSVParser.java new file mode 100644 index 0000000..6808a14 --- /dev/null +++ b/src/org/yooreeka/util/parsing/csv/CSVParser.java @@ -0,0 +1,135 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.csv; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.StringReader; +import java.nio.charset.Charset; + +import org.yooreeka.util.parsing.common.AbstractDocument; +import org.yooreeka.util.parsing.common.DataEntry; +import org.yooreeka.util.parsing.common.DocumentParser; +import org.yooreeka.util.parsing.common.DocumentParserException; +import org.yooreeka.util.parsing.common.ProcessedDocument; + +/** + * + * @author Babis Marmanis + * + */ +public class CSVParser implements DocumentParser { + + /** + * + */ + private CSVDocument d; + + private CSVFile csvFile; + + private long linesParsed = 0; + + /** + * + */ + public CSVParser(CSVFile f) { + this.csvFile = f; + } + + @Override + public DataEntry getDataEntry(int i) { + return d.getCsvData().get(i); + } + + public long getLinesParsed() { + return linesParsed; + } + + @Override + public ProcessedDocument parse(AbstractDocument abstractDocument) + throws DocumentParserException { + ProcessedDocument processedDocument = null; + String content = new String(abstractDocument.getDocumentContent(), + 
Charset.forName(abstractDocument.getContentCharset())); + BufferedReader reader = new BufferedReader(new StringReader(content)); + try { + abstractDocument = parse(reader); + } catch (IOException e) { + e.printStackTrace(); + } + return processedDocument; + } + + /** + * + * @param bR + * @return + * @throws IOException + */ + public CSVDocument parse(BufferedReader bR) throws IOException { + + d = new CSVDocument(); + + linesParsed = 0; + + boolean hasMoreLines = true; + String line; + + while (hasMoreLines) { + + line = bR.readLine(); + + if (line == null) { + + hasMoreLines = false; + + } else { + + CSVEntry csvEntry = new CSVEntry(line, getSeparator()); + if (linesParsed == 0) { + d.setHeaders(csvEntry); + } else { + d.getCsvData().add(csvEntry); + } + linesParsed++; + } + } + + return d; + } + + /** + * @return the separator + */ + public String getSeparator() { + return csvFile.getSeparator(); + } +} diff --git a/src/org/yooreeka/util/parsing/csv/CSVSchema.java b/src/org/yooreeka/util/parsing/csv/CSVSchema.java new file mode 100644 index 0000000..b38042d --- /dev/null +++ b/src/org/yooreeka/util/parsing/csv/CSVSchema.java @@ -0,0 +1,58 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. 
+ * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.csv; + +import java.util.HashMap; + +import org.yooreeka.util.parsing.common.DataField; + +/** + * @author Babis Marmanis + * + */ +public class CSVSchema { + + private int column = 0; + private HashMap columnMap; + + public CSVSchema() { + columnMap = new HashMap<>(); + } + + public void addColumn(DataField field) { + columnMap.put(column, field); + column++; + } + + public int getNumberOfColumns() { + return columnMap.size(); + } +} diff --git a/src/org/yooreeka/util/parsing/html/CompositeFilter.java b/src/org/yooreeka/util/parsing/html/CompositeFilter.java new file mode 100644 index 0000000..0382168 --- /dev/null +++ b/src/org/yooreeka/util/parsing/html/CompositeFilter.java @@ -0,0 +1,64 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the 
Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.html; + +import java.util.ArrayList; +import java.util.List; + +import org.w3c.dom.Node; +import org.w3c.dom.traversal.NodeFilter; + +/* + * Combines multiple filters into one using OR logic. 
+ */ +class CompositeFilter implements NodeFilter { + + List acceptFilters = new ArrayList(); + + public CompositeFilter() { + } + + public short acceptNode(Node n) { + short result = NodeFilter.FILTER_SKIP; + for (NodeFilter f : acceptFilters) { + result = f.acceptNode(n); + if (result == NodeFilter.FILTER_ACCEPT) { + break; + } + } + return result; + } + + public void addAcceptFilter(NodeFilter nestedFilter) { + acceptFilters.add(nestedFilter); + } + +} diff --git a/src/org/yooreeka/util/parsing/html/ElementNodeFilter.java b/src/org/yooreeka/util/parsing/html/ElementNodeFilter.java new file mode 100644 index 0000000..d308823 --- /dev/null +++ b/src/org/yooreeka/util/parsing/html/ElementNodeFilter.java @@ -0,0 +1,61 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.html; + +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.traversal.NodeFilter; + +/** + * Filter for nodes that are elements with specified name and attribute. + */ +class ElementNodeFilter implements NodeFilter { + private String elementName = null; + private String attributeName = null; + + public ElementNodeFilter(String elementName, String attributeName) { + this.elementName = elementName; + this.attributeName = attributeName; + } + + public short acceptNode(Node n) { + short result = FILTER_SKIP; + if (Node.ELEMENT_NODE == n.getNodeType()) { + Element e = (Element) n; + if (e.getNodeName().equalsIgnoreCase(elementName)) { + if (e.getAttributeNode(attributeName) != null) { + result = FILTER_ACCEPT; + } + } + } + return result; + } +} diff --git a/src/org/yooreeka/util/parsing/html/HTMLDocumentParser.java b/src/org/yooreeka/util/parsing/html/HTMLDocumentParser.java new file mode 100644 index 0000000..d7e39e6 --- /dev/null +++ b/src/org/yooreeka/util/parsing/html/HTMLDocumentParser.java @@ -0,0 +1,457 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.util.parsing.html; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringWriter; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import org.apache.xerces.xni.parser.XMLDocumentFilter; +import org.cyberneko.html.filters.ElementRemover; +import org.cyberneko.html.parsers.DOMParser; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.w3c.dom.traversal.DocumentTraversal; +import org.w3c.dom.traversal.NodeFilter; +import org.w3c.dom.traversal.NodeIterator; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.yooreeka.util.P; +import org.yooreeka.util.internet.crawling.model.Outlink; +import org.yooreeka.util.parsing.common.AbstractDocument; +import org.yooreeka.util.parsing.common.DataEntry; +import org.yooreeka.util.parsing.common.DocumentParser; +import org.yooreeka.util.parsing.common.ProcessedDocument; + +/** + * Parser for HTML documents. + */ +public class HTMLDocumentParser implements DocumentParser { + + ProcessedDocument htmlDoc; + + public HTMLDocumentParser() { + // NOTHING YET + } + + public HTMLDocumentParser(Reader reader) throws HTMLDocumentParserException { + HTMLDocumentParser p = new HTMLDocumentParser(); + htmlDoc = p.parse(reader); + } + + /* + * Builds absolute URL. For relative URLs will use source document URL and + * base URL. 
+ */ + private String buildUrl(String href, String baseUrl, String documentUrl) { + + String url = null; + + String protocol = extractProtocol(href); + + if (protocol != null) { + url = href; + } else if (baseUrl != null) { + url = baseUrl + href; + } else if (href.startsWith("/")) { + try { + URL docUrl = new URL(documentUrl); + if (docUrl.getPort() == -1) { + url = docUrl.getProtocol() + "://" + docUrl.getHost() + + href; + } else { + url = docUrl.getProtocol() + "://" + docUrl.getHost() + ":" + + docUrl.getPort() + href; + } + } catch (MalformedURLException e) { + url = null; + } + } else { + url = extractParent(documentUrl) + href; + } + + return url; + } + + private String cleanText(String text) { + if (text == null) { + return null; + } + String t = text.replaceAll("[ \t]+", " "); + t = t.replaceAll("[ \t][\r\n]", "\n"); + t = t.replaceAll("[\r\n]+", "\n"); + return t; + } + + private List extractLinks(Node node, String docUrl, String baseUrl) { + if (isNoFollowForDocument(node)) { + return new ArrayList(); + } + + org.w3c.dom.Document doc = getDocumentNode(node); + DocumentTraversal traversableDoc = (DocumentTraversal) doc; + NodeFilter linkFilter = getLinkNodeFilter(); + NodeIterator iterator = traversableDoc.createNodeIterator(node, + NodeFilter.SHOW_ELEMENT, linkFilter, true); + Node currentNode = null; + + List outlinks = new ArrayList(); + + while ((currentNode = iterator.nextNode()) != null) { + String href = currentNode.getAttributes().getNamedItem("href") + .getNodeValue(); + boolean nofollow = isNoFollowPresent(currentNode); + if (nofollow == false) { + if ("BASE".equalsIgnoreCase(node.getNodeName())) { + // ignore this link + } else { + String url = buildUrl(href, baseUrl, docUrl); + if (url != null) { + String anchorText = getAnchorText(currentNode); + Outlink link = new Outlink(url, anchorText); + outlinks.add(link); + } + } + } + } + + return outlinks; + } + + private String extractParent(String url) { + String parent = url; + int i = 
url.lastIndexOf("/"); + if (i > -1) { + parent = url.substring(0, i + "/".length()); + } + return parent; + } + + /* + * Extracts url protocol if present. Handles two cases: + * + * 1. "://" 2. "mailto:" + */ + private String extractProtocol(String url) { + String protocol = null; + if (url.startsWith("mailto:")) { + protocol = "mailto"; + } else { + int i = url.indexOf("://"); + if (i > -1) { + protocol = url.substring(0, i); + } + } + return protocol; + } + + private String getAnchorText(Node currentNode) { + String text = getText(currentNode); + String cleanText = null; + if (text != null) { + cleanText = text.replaceAll("[\r\n\t]", " ").trim(); + } + return cleanText; + } + + private String getBaseUrl(Node node) { + if (node == null) { + return null; + } + org.w3c.dom.Document doc = getDocumentNode(node); + NodeList nodeList = doc.getElementsByTagName("base"); + Node baseNode = nodeList.item(0); + if (baseNode != null) { + NamedNodeMap attrs = baseNode.getAttributes(); + if (attrs != null) { + Node href = attrs.getNamedItem("href"); + if (href != null) { + return href.getNodeValue(); + } + } + } + return null; + } + + @Override + public DataEntry getDataEntry(int i) { + // TODO Auto-generated method stub + return null; + } + + private org.w3c.dom.Document getDocumentNode(Node node) { + if (node == null) { + return null; + } + + if (Node.DOCUMENT_NODE == node.getNodeType()) { + return (org.w3c.dom.Document) node; + } else { + return node.getOwnerDocument(); + } + } + + public ProcessedDocument getHtmlDoc() { + return htmlDoc; + } + + private NodeFilter getLinkNodeFilter() { + CompositeFilter linkFilter = new CompositeFilter(); + // For now doing the simplest thing possible - only consider + // elements + linkFilter.addAcceptFilter(new ElementNodeFilter("a", "href")); + /* + * Other elements to consider: + * + * linkFilter.addAcceptFilter(new LinkNodeFilter("frame", "src")); + * linkFilter.addAcceptFilter(new LinkNodeFilter("link", "href")); + */ + return 
linkFilter; + } + + private String getRobotsMeta(Node node) { + if (node == null) { + return null; + } + org.w3c.dom.Document doc = getDocumentNode(node); + NodeList nodeList = doc.getElementsByTagName("meta"); + if (nodeList != null) { + for (int i = 0, n = nodeList.getLength(); i < n; i++) { + Node currentNode = nodeList.item(i); + NamedNodeMap attrs = currentNode.getAttributes(); + if (attrs != null) { + Node contentNode = attrs.getNamedItem("content"); + Node nameNode = attrs.getNamedItem("name"); + if (nameNode != null && contentNode != null) { + if ("ROBOTS".equalsIgnoreCase(nameNode.getNodeValue())) { + if (contentNode != null) { + return contentNode.getNodeValue(); + } + } + } + } + } + } + return null; + } + + private String getText(Node node) { + if (node == null) { + return ""; + } + + org.w3c.dom.Document doc = getDocumentNode(node); + org.w3c.dom.traversal.DocumentTraversal traversable = (DocumentTraversal) doc; + int whatToShow = NodeFilter.SHOW_TEXT; + NodeIterator nodeIterator = traversable.createNodeIterator(node, + whatToShow, null, true); + + StringBuffer text = new StringBuffer(); + Node currentNode = null; + while ((currentNode = nodeIterator.nextNode()) != null) { + text.append(currentNode.getNodeValue()); + } + return text.toString(); + } + + private String getTitle(Node node) { + if (node == null) { + return ""; + } + + String cleanTitle = null; + org.w3c.dom.Document doc = getDocumentNode(node); + NodeList nodeList = doc.getElementsByTagName("title"); + Node matchedNode = nodeList.item(0); + if (matchedNode != null) { + String title = matchedNode.getTextContent(); + if (title != null) { + cleanTitle = title.replaceAll("[\r\n\t]", " ").trim(); + } + } + + return cleanTitle; + } + + private boolean isNoFollowForDocument(Node node) { + boolean noFollow = false; + + // Check + String robotsMeta = getRobotsMeta(node); + if (robotsMeta != null + && robotsMeta.toLowerCase().indexOf("nofollow") > -1) { + noFollow = true; + } + + return noFollow; + 
} + + /* + * Checks for presense of rel="nofollow" attribute. + */ + private boolean isNoFollowPresent(Node currentNode) { + Node relAttr = currentNode.getAttributes().getNamedItem("rel"); + boolean nofollow = false; + if (relAttr != null) { + String relAttrValue = relAttr.getNodeValue(); + if ("nofollow".equalsIgnoreCase(relAttrValue)) { + nofollow = true; + } + } + return nofollow; + } + + public ProcessedDocument parse(AbstractDocument doc) + throws HTMLDocumentParserException { + P.println("Entering HTMLDocumentParser.parse(FetchedDocument doc) ..."); + ProcessedDocument htmlDoc = new ProcessedDocument(); + htmlDoc.setDocumentType(ProcessedDocument.TYPE_HTML); + htmlDoc.setDocumentId(doc.getDocumentId()); + htmlDoc.setDocumentURL(doc.getDocumentURL()); + String documentCharset = doc.getContentCharset(); + + P.println("Converting the content bytes into a string ..."); + + InputStream contentBytes = new ByteArrayInputStream( + doc.getDocumentContent()); + try { + /* + * Up to this point document content was treated as byte array. Here + * we convert byte array into character based stream. Processed + * document will be stored using UTF-8 encoding. 
+ */ + InputStreamReader characterStream = new InputStreamReader( + contentBytes, documentCharset); + InputSource inputSource = new InputSource(); + inputSource.setCharacterStream(characterStream); + parseHTML(htmlDoc, inputSource); + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + throw new HTMLDocumentParserException("Document parsing error: ", e); + } + return htmlDoc; + } + + public ProcessedDocument parse(Reader reader) + throws HTMLDocumentParserException { + P.println("Entering HTMLDocumentParser.parse(Reader reader) ..."); + ProcessedDocument processedDocument = new ProcessedDocument(); + processedDocument.setDocumentType(ProcessedDocument.TYPE_HTML); + processedDocument.setDocumentId(null); + processedDocument.setDocumentURL(null); + InputSource inputSource = new InputSource(); + inputSource.setCharacterStream(reader); + parseHTML(processedDocument, inputSource); + return processedDocument; + } + + private void parseHTML(ProcessedDocument htmlDoc, InputSource inputSource) + throws HTMLDocumentParserException { + // NekoHTML parser + DOMParser parser = new DOMParser(); + + // Create filter to remove elements that we don't care about. 
+ ElementRemover remover = new ElementRemover(); + // keep only a subset of elements (text and links) + remover.acceptElement("html", null); + remover.acceptElement("meta", new String[] { "name", "content" }); + remover.acceptElement("title", null); + remover.acceptElement("body", null); + remover.acceptElement("base", new String[] { "href" }); + remover.acceptElement("b", null); + remover.acceptElement("i", null); + remover.acceptElement("u", null); + remover.acceptElement("p", null); + remover.acceptElement("br", null); + remover.acceptElement("a", new String[] { "href", "rel" }); + // completely remove these elements + remover.removeElement("script"); + remover.removeElement("style"); + + StringWriter sw = new StringWriter(); + XMLDocumentFilter writer = new HTMLWriter(sw, "UTF-8"); + + XMLDocumentFilter[] filters = { remover, writer }; + try { + parser.setProperty("http://cyberneko.org/html/properties/filters", + filters); + } catch (SAXException e) { + e.printStackTrace(); + throw new HTMLDocumentParserException("Property is not supported", + e); + } + + try { + parser.parse(inputSource); + } catch (SAXException e) { + e.printStackTrace(); + throw new HTMLDocumentParserException("Parsing error: ", e); + } catch (IOException e) { + e.printStackTrace(); + throw new HTMLDocumentParserException("Parsing error: ", e); + } + + // cleaned up html. 
+ String cleanHTML = cleanText(sw.toString()); + htmlDoc.setContent(cleanHTML); + + // just the text + Node node = parser.getDocument(); + String text = cleanText(getText(node)); + htmlDoc.setText(text); + + // content of + String title = getTitle(node); + htmlDoc.setDocumentTitle(title); + + if (htmlDoc.getDocumentURL() != null) { + String baseUrl = getBaseUrl(node); + + // links to other pages + List<Outlink> outlinks = extractLinks(node, + htmlDoc.getDocumentURL(), baseUrl); + htmlDoc.setOutlinks(outlinks); + } + } + + public void setHtmlDoc(ProcessedDocument htmlDoc) { + this.htmlDoc = htmlDoc; + } +} diff --git a/src/org/yooreeka/util/parsing/html/HTMLDocumentParserException.java b/src/org/yooreeka/util/parsing/html/HTMLDocumentParserException.java new file mode 100644 index 0000000..a839e98 --- /dev/null +++ b/src/org/yooreeka/util/parsing/html/HTMLDocumentParserException.java @@ -0,0 +1,49 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. 
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.html; + +import org.yooreeka.util.parsing.common.DocumentParserException; + +public class HTMLDocumentParserException extends DocumentParserException { + + /** + * Distinct SVUID for the org.yooreeka.* classes + */ + private static final long serialVersionUID = 3397930132653232196L; + + public HTMLDocumentParserException(String msg) { + super(msg); + } + + public HTMLDocumentParserException(String msg, Throwable t) { + super(msg, t); + } +} diff --git a/src/org/yooreeka/util/parsing/html/HTMLWriter.java b/src/org/yooreeka/util/parsing/html/HTMLWriter.java new file mode 100644 index 0000000..2c36cd2 --- /dev/null +++ b/src/org/yooreeka/util/parsing/html/HTMLWriter.java @@ -0,0 +1,119 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. 
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.html; + +import org.apache.xerces.xni.QName; +import org.apache.xerces.xni.XMLAttributes; +import org.cyberneko.html.filters.Writer; + +/** + * Extending NekoHTML Writer filter to override its behavior (most probably a + * bug). + */ +public class HTMLWriter extends Writer { + + public HTMLWriter(java.io.Writer writer, String encoding) { + super(writer, encoding); + } + + /** + * This code was copied from + * org.cyberneko.html.filters.Writer.printStartElement It overrides original + * version with minor adjustment for bug fix. + * + * Original version would wipe out value of 'content' attribute from in meta + * elements. 
In our case we are interested in: + * + * <meta name="robots" content="...."/> + */ + @Override + protected void printStartElement(QName element, XMLAttributes attributes) { + // modify META[@http-equiv='content-type']/@content value + int contentIndex = -1; + String originalContent = null; + if (element.rawname.toLowerCase().equals("meta")) { + String httpEquiv = null; + int length = attributes.getLength(); + for (int i = 0; i < length; i++) { + String aname = attributes.getQName(i).toLowerCase(); + if (aname.equals("http-equiv")) { + httpEquiv = attributes.getValue(i); + } else if (aname.equals("content")) { + contentIndex = i; + } + } + if (httpEquiv != null + && httpEquiv.toLowerCase().equals("content-type")) { + fSeenHttpEquiv = true; + String content = null; + if (contentIndex != -1) { + originalContent = attributes.getValue(contentIndex); + content = originalContent.toLowerCase(); + } + if (content != null) { + int charsetIndex = content.indexOf("charset="); + if (charsetIndex != -1) { + content = content.substring(0, charsetIndex + 8); + } else { + content += ";charset="; + } + content += fEncoding; + attributes.setValue(contentIndex, content); + } + } else { + // this is the difference from original code + contentIndex = -1; + } + } + + // print element + fPrinter.print('<'); + fPrinter.print(element.rawname); + int attrCount = attributes != null ? 
attributes.getLength() : 0; + for (int i = 0; i < attrCount; i++) { + String aname = attributes.getQName(i); + String avalue = attributes.getValue(i); + fPrinter.print(' '); + fPrinter.print(aname); + fPrinter.print("=\""); + printAttributeValue(avalue); + fPrinter.print('"'); + } + fPrinter.print('>'); + fPrinter.flush(); + + // return original META[@http-equiv]/@content value + if (contentIndex != -1) { + attributes.setValue(contentIndex, originalContent); + } + + } // printStartElement(QName,XMLAttributes) +} diff --git a/src/org/yooreeka/util/parsing/html/LinkNodeFilter.java b/src/org/yooreeka/util/parsing/html/LinkNodeFilter.java new file mode 100644 index 0000000..ac0522d --- /dev/null +++ b/src/org/yooreeka/util/parsing/html/LinkNodeFilter.java @@ -0,0 +1,58 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * DOM traversal filter that accepts elements with a given tag name (compared
 * case-insensitively) that also carry a given attribute. Every other node is
 * skipped.
 */
class LinkNodeFilter implements NodeFilter {
    private final String elementName;
    private final String attributeName;

    /**
     * @param elementName
     *            tag name an element must have, ignoring case
     * @param attributeName
     *            attribute that must be present on the element
     */
    public LinkNodeFilter(String elementName, String attributeName) {
        this.elementName = elementName;
        this.attributeName = attributeName;
    }

    /**
     * @param n
     *            node visited by the traversal
     * @return <code>FILTER_ACCEPT</code> for a matching element that has the
     *         attribute, <code>FILTER_SKIP</code> otherwise
     */
    @Override
    public short acceptNode(Node n) {
        if (n.getNodeType() != Node.ELEMENT_NODE) {
            return FILTER_SKIP;
        }

        Element candidate = (Element) n;
        boolean nameMatches = candidate.getNodeName().equalsIgnoreCase(
                elementName);
        boolean hasAttribute = nameMatches
                && candidate.getAttributeNode(attributeName) != null;

        return hasAttribute ? FILTER_ACCEPT : FILTER_SKIP;
    }
}
+ * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Composite {@link NodeFilter} that consults its nested filters in the order
 * they were added and stops at the first one that accepts the node.
 */
class MultiFilter implements NodeFilter {

    List<NodeFilter> acceptFilters = new ArrayList<NodeFilter>();

    public MultiFilter() {
    }

    /**
     * Appends a filter to the end of the chain.
     *
     * @param nestedFilter
     *            filter to consult after all previously added ones
     */
    public void addAcceptFilter(NodeFilter nestedFilter) {
        acceptFilters.add(nestedFilter);
    }

    /**
     * @param n
     *            node visited by the traversal
     * @return <code>FILTER_ACCEPT</code> as soon as any nested filter accepts;
     *         otherwise the verdict of the last nested filter, or
     *         <code>FILTER_SKIP</code> when no filter is registered
     */
    @Override
    public short acceptNode(Node n) {
        short lastVerdict = NodeFilter.FILTER_SKIP;
        for (NodeFilter candidate : acceptFilters) {
            lastVerdict = candidate.acceptNode(n);
            if (lastVerdict == NodeFilter.FILTER_ACCEPT) {
                // first acceptance wins; remaining filters are not consulted
                return lastVerdict;
            }
        }
        return lastVerdict;
    }

}
+ * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.msword; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; + +import org.textmining.extraction.TextExtractor; +import org.textmining.extraction.word.WordTextExtractorFactory; +import org.yooreeka.util.parsing.common.AbstractDocument; +import org.yooreeka.util.parsing.common.DataEntry; +import org.yooreeka.util.parsing.common.DocumentParser; +import org.yooreeka.util.parsing.common.DocumentParserException; +import org.yooreeka.util.parsing.common.ProcessedDocument; + +public class MSWordDocumentParser implements DocumentParser { + + ProcessedDocument wordDoc = new ProcessedDocument(); + + @Override + public DataEntry getDataEntry(int i) { + // TODO Auto-generated method stub + return null; + } + + /* + * Finds the first non-empty line in the document. 
+ */ + private String getTitle(String text) throws IOException { + if (text == null) { + return null; + } + String title = ""; + + StringReader sr = new StringReader(text); + BufferedReader r = new BufferedReader(sr); + String line = null; + while ((line = r.readLine()) != null) { + if (line.trim().length() > 0) { + title = line.trim(); + break; + } + } + + return title; + } + + public ProcessedDocument parse(AbstractDocument doc) + throws DocumentParserException { + + wordDoc.setDocumentType(ProcessedDocument.TYPE_MSWORD); + wordDoc.setDocumentId(doc.getDocumentId()); + wordDoc.setDocumentURL(doc.getDocumentURL()); + + InputStream contentData = new ByteArrayInputStream( + doc.getDocumentContent()); + WordTextExtractorFactory wteFactory = new WordTextExtractorFactory(); + + try { + TextExtractor txtExtractor = wteFactory.textExtractor(contentData); + String text = txtExtractor.getText(); + wordDoc.setText(text); + // using the same value as text + wordDoc.setContent(text); + wordDoc.setDocumentTitle(getTitle(text)); + } catch (Exception e) { + throw new MSWordDocumentParserException( + "MSWord Document parsing error: ", e); + } + return wordDoc; + } +} diff --git a/src/org/yooreeka/util/parsing/msword/MSWordDocumentParserException.java b/src/org/yooreeka/util/parsing/msword/MSWordDocumentParserException.java new file mode 100644 index 0000000..aacdd94 --- /dev/null +++ b/src/org/yooreeka/util/parsing/msword/MSWordDocumentParserException.java @@ -0,0 +1,49 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). 
Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +package org.yooreeka.util.parsing.msword; + +import org.yooreeka.util.parsing.common.DocumentParserException; + +public class MSWordDocumentParserException extends DocumentParserException { + + /** + * Distinct SVUID for the org.yooreeka.* classes + */ + private static final long serialVersionUID = -3005082246637918030L; + + public MSWordDocumentParserException(String msg) { + super(msg); + } + + public MSWordDocumentParserException(String msg, Throwable t) { + super(msg, t); + } +} diff --git a/src/org/yooreeka/util/text/AlphabetProjection.java b/src/org/yooreeka/util/text/AlphabetProjection.java new file mode 100644 index 0000000..268010e --- /dev/null +++ b/src/org/yooreeka/util/text/AlphabetProjection.java @@ -0,0 +1,313 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-2012 Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.text; + +import java.util.logging.Logger; + +import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.util.C; +import org.yooreeka.util.P; +import org.yooreeka.util.metrics.EuclideanDistance; + +import com.wcohen.ss.JaroWinkler; +import com.wcohen.ss.Level2Jaro; +import com.wcohen.ss.MongeElkan; +import com.wcohen.ss.NeedlemanWunsch; +import com.wcohen.ss.api.StringDistance; + +/** + * + * @author <a href="mailto:babis@marmanis.com">Babis Marmanis</a> + * + */ +public class AlphabetProjection { + + private static final Logger LOG = Logger.getLogger(AlphabetProjection.class.getName()); + + /** + * <tt>dimensionality</tt> determines the number of <tt>String</tt> vectors + * that we will use. + */ + public static final int DEFAULT_DIMENSIONALITY = 10; + private int dimensionality; + + /** + * <tt>baseLength</tt> determines the length of the <TT>String</TT> vectors + * that we will use. + */ + public static final int DEFAULT_BASELENGTH = 10; + private int baselength; + + // TODO: This covers only the English language. Create a separate character basis class + // that has all the character bases and invoke them statically as needed. + + public static final char[] DEFAULT_CHARACTER_BASIS = { 'e', 't', 'a', 'o', 'n', 'r', 'i', 's', + 'h', 'd', 'l', 'f', 'c', 'm', 'u', 'g', 'y', 'p', 'w', 'b', 'v', + 'k', 'x', 'j', 'q', 'z' }; + private char[] characterBasis; + + private String[] projectionBasis = null; + + //TODO: These should be passed to the projection class. 
Take them out and define an + // appropriate encapsulation + + // String Edit Distance Metrics + private NeedlemanWunsch needlemanWunch; + private JaroWinkler jaroWinkler; + private Level2Jaro level2Jaro; + private MongeElkan mongeElkan; + + // String Distances + private StringDistance needlemanWunchDistance = null; + private StringDistance jaroWinklerDistance = null; + private StringDistance level2JaroDistance = null; + private StringDistance mongeElkanDistance = null; + + // -------------------------------------------------------------------------------- + // CONSTRUCTORS + // -------------------------------------------------------------------------------- + public AlphabetProjection(int dim, int length, char[] charBasis) { + + LOG.setLevel(YooreekaConfigurator.getLevel(AlphabetProjection.class.getName())); + + if (dim > 0) { + dimensionality = dim; + } else { + dimensionality = AlphabetProjection.DEFAULT_DIMENSIONALITY; + } + + if (length <=0) { + baselength = length; + } else { + baselength = AlphabetProjection.DEFAULT_BASELENGTH; + } + + if (charBasis != null) { + characterBasis = charBasis; + } else { + characterBasis = AlphabetProjection.DEFAULT_CHARACTER_BASIS; + } + + // Initialize the projection + initProjection(); + + // Initialize the String edit distance metrics + initMetrics(); + } + + // -------------------------------------------------------------------------------- + // INITIALIZATION + // -------------------------------------------------------------------------------- + /** + * Initialize the configuration space. 
+ */ + private void initProjection() { + + projectionBasis = new String[dimensionality]; + + // First define the String basis onto which we will project a given + // String + for (int i = 0; i < dimensionality; i++) { + projectionBasis[i] = getEigenvector(characterBasis[i]); + } + } + + private void initMetrics() { + needlemanWunch = new NeedlemanWunsch(); + jaroWinkler = new JaroWinkler(); + level2Jaro = new Level2Jaro(); + mongeElkan = new MongeElkan(); + } + + + // -------------------------------------------------------------------------------- + // PROJECTION METHODS + // -------------------------------------------------------------------------------- + /** + * + * @param target + * the String that we want to project onto the base vectors + * @param projections + * of the <CODE>target</CODE> onto each one of the base vectors. + * + * + */ + public double[] project(String target) throws IllegalArgumentException { + + double[] projections = new double[dimensionality]; + + if (target == null) { + target = C.EMPTY_STRING; + } + + target.toLowerCase(); + + jaroWinklerDistance = jaroWinkler.getDistance(); + level2JaroDistance = level2Jaro.getDistance(); + mongeElkanDistance = mongeElkan.getDistance(); + needlemanWunchDistance = needlemanWunch.getDistance(); + + double p = 0; + + for (int i = 0; i < dimensionality; i++) { + + p=jaroWinklerDistance.score(projectionBasis[i], target); + p += level2JaroDistance.score(projectionBasis[i], target); + p += mongeElkanDistance.score(projectionBasis[i], target); + p += needlemanWunchDistance.score(projectionBasis[i], target); + + projections[i] = p*0.25; + } + + return projections; + } + + /** + * + * @param target + * the String that we want to project onto the base vectors + * @param projections + * of the <CODE>target</CODE> onto each one of the base vectors. 
+ * + * + */ + public double[] project(String target, StringDistance d) throws IllegalArgumentException { + + double[] projections = new double[dimensionality]; + + if (target == null) { + target = C.EMPTY_STRING; + } + + target.toLowerCase(); + + for (int i = 0; i < dimensionality; i++) { + + projections[i] = d.score(projectionBasis[i], target); + } + + return projections; + } + + // -------------------------------------------------------------------------------- + // AUXILIARY METHODS + // -------------------------------------------------------------------------------- + /** + * Creates instance with default parameters (suitable when you are unaware of + * best parameters to constructor. + * + * @return instance with default parameters applied. + * + */ + public static AlphabetProjection getDefault() { + return new AlphabetProjection(DEFAULT_DIMENSIONALITY, DEFAULT_BASELENGTH, DEFAULT_CHARACTER_BASIS); + } + + public double distance(String val1, String val2) { + + EuclideanDistance euclid = new EuclideanDistance(); + + return euclid.getDistance(project(val1), project(val2)); + } + + /** + * @param val + * the single character of the base vector + * + * @return the base vector for the <tt>val</tt> character. 
+ */ + public String getEigenvector(char val) { + + StringBuffer buf = new StringBuffer(); + + for (int i = 0; i < baselength; i++) { + + buf.append(val); + } + + return buf.toString(); + } + + // -------------------------------------------------------------------------------- + // MAIN METHOD + // -------------------------------------------------------------------------------- + public static void main(String[] args) throws Exception { + + AlphabetProjection aProjection = new AlphabetProjection(10,10,AlphabetProjection.DEFAULT_CHARACTER_BASIS); + + final String TEST_STRING_1 = "Андре́й Никола́евич Колмого́ров";//"Andrei Nikolaevitch Kolmogorov"; + final String TEST_STRING_2 = "Колмого́ров Андре́й Никола́евич";//"Kolmogorov Andrei Nikolaevitch"; + final String TEST_STRING_3 = "Nikolai"; + + P.println("d[T1,T2] = " + + aProjection.distance(TEST_STRING_1, TEST_STRING_2)); + P.println("d[T1,T3] = " + + aProjection.distance(TEST_STRING_1, TEST_STRING_3)); + } + + // -------------------------------------------------------------------------------- + // GETTERS -- SETTERS + // -------------------------------------------------------------------------------- + + public static int[] getDefaultProjectionProperties() { + return new int[] { DEFAULT_DIMENSIONALITY, DEFAULT_BASELENGTH }; + } + + public int getBaselength() { + return baselength; + } + + public void setBaselength(int baselength) { + this.baselength = baselength; + } + + public int getDimensionality() { + return dimensionality; + } + + public void setDimensionality(int dimensionality) { + this.dimensionality = dimensionality; + } + + /** + * @return the characterBasis + */ + public char[] getCharacterBasis() { + return characterBasis; + } + + /** + * @param characterBasis the characterBasis to set + */ + public void setCharacterBasis(char[] characterBasis) { + this.characterBasis = characterBasis; + } +}