From 5bc3d2ca243070c83b78823d9136346d80156ba5 Mon Sep 17 00:00:00 2001 From: "H. Marmanis" Date: Tue, 1 Jan 2013 17:04:21 -0500 Subject: [PATCH] Parity with the Google Code project --- .../util/SimilarityMatrixRepository.java | 7 +- src/org/yooreeka/util/P.java | 25 +++++- .../util/parsing/html/BookmarkParser.java | 86 +++++++++++++++++++ test/org/yooreeka/test/TestSandbox.java | 10 --- 4 files changed, 112 insertions(+), 16 deletions(-) create mode 100644 src/org/yooreeka/util/parsing/html/BookmarkParser.java diff --git a/src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixRepository.java b/src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixRepository.java index 48eb933..140804b 100644 --- a/src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixRepository.java +++ b/src/org/yooreeka/algos/reco/collab/similarity/util/SimilarityMatrixRepository.java @@ -43,6 +43,7 @@ import org.yooreeka.algos.reco.collab.similarity.naive.UserContentBasedSimilarity; import org.yooreeka.algos.reco.collab.similarity.naive.UserItemContentBasedSimilarity; import org.yooreeka.config.YooreekaConfigurator; +import org.yooreeka.util.P; public class SimilarityMatrixRepository { @@ -115,15 +116,13 @@ public SimilarityMatrix load(RecommendationType type, Dataset data, if (cache != null) { m = cache.get(id); if (m == null) { - System.out - .println("similarity matrix instance doesn't exist in cache: " + P.println("similarity matrix instance doesn't exist in cache: " + "id: " + id + ", cache: '" + cache.getLocation() + "'."); } else { - System.out - .println("similarity matrix instance was loaded from cache: " + P.println("similarity matrix instance was loaded from cache: " + "id: " + id + ", cache: '" diff --git a/src/org/yooreeka/util/P.java b/src/org/yooreeka/util/P.java index 6a43ac1..291f40e 100644 --- a/src/org/yooreeka/util/P.java +++ b/src/org/yooreeka/util/P.java @@ -40,12 +40,33 @@ public class P { /** - * Print a 54 character (-) horizontal line. + * Print a horizontal line with 65 characters. */ public static void hline() { println("---------- ---------- ---------- ---------- ---------- ----------"); } - + + + /** + * Auxiliary method for sending time information to the standard output. + * Time is measured in milliseconds, see the documentation + * of System.currentTimeMillis() for details. + * + */ + public static void time() { + println("Time: "+System.currentTimeMillis()); + } + + /** + * Auxiliary method for sending time information to the standard output. + * The time is given in milliseconds and in relation to a given moment + * in the past, determined by the value of the argument t. + * + */ + public static void time(long t) { + println("Time: "+ (System.currentTimeMillis()-t)); + } + public static void main(String[] args) { println(Charset.defaultCharset().displayName()); println("" + P.class.getName()); diff --git a/src/org/yooreeka/util/parsing/html/BookmarkParser.java b/src/org/yooreeka/util/parsing/html/BookmarkParser.java new file mode 100644 index 0000000..6f36a6c --- /dev/null +++ b/src/org/yooreeka/util/parsing/html/BookmarkParser.java @@ -0,0 +1,86 @@ +/* + * ________________________________________________________________________________________ + * + * Y O O R E E K A + * A library for data mining, machine learning, soft computing, and mathematical analysis + * ________________________________________________________________________________________ + * + * The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " + * (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms + * are valuable in any software application. + * + * Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko + * Copyright (c) 2009-2012 Marmanis Group LLC and individual contributors as indicated by the @author tags. + * + * Certain library functions depend on other Open Source software libraries, which are covered + * by different license agreements. See the NOTICE file distributed with this work for additional + * information regarding copyright ownership and licensing. + * + * Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.yooreeka.util.parsing.html; + +import java.io.BufferedInputStream; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; + +import org.yooreeka.util.parsing.common.ProcessedDocument; + +/** + * @author haris + * + */ +public class BookmarkParser extends HTMLDocumentParser { + + /** + * + */ + public BookmarkParser() { + // TODO Auto-generated constructor stub + } + + /** + * @param reader + * @throws HTMLDocumentParserException + */ + public BookmarkParser(Reader reader) throws HTMLDocumentParserException { + super(reader); + // TODO Auto-generated constructor stub + } + + /** + * @param args + */ + public static void main(String[] args) { + + String filename = args[0]; + BookmarkParser bookParser = null; + ProcessedDocument doc = null; + try { + bookParser = new BookmarkParser(); + InputStream inputStream = new BufferedInputStream( + new FileInputStream(filename)); + Reader reader = new InputStreamReader(inputStream, "UTF-8"); + doc = bookParser.parse(reader); + } catch (Exception e) { + throw new RuntimeException("Failed to parse html from file: " + + filename, e); + } + + //P.println(doc.getText()); + + } + +} diff --git a/test/org/yooreeka/test/TestSandbox.java b/test/org/yooreeka/test/TestSandbox.java index e356f43..f3dabd9 100644 --- a/test/org/yooreeka/test/TestSandbox.java +++ b/test/org/yooreeka/test/TestSandbox.java @@ -30,8 +30,6 @@ */ package org.yooreeka.test; -import org.yooreeka.config.YooreekaConfigurator; -import org.yooreeka.examples.newsgroups.NewsCrawler; /** @@ -49,13 +47,5 @@ public class TestSandbox { * @throws Exception */ public static void main(String[] args) throws Exception { - String rootDir = YooreekaConfigurator.getProperty(YooreekaConfigurator.CRAWL_DATA_DIR); - - NewsCrawler crawler = new NewsCrawler(rootDir, 2, 10); - - crawler.addSeedUrl("http://www.manning.com/"); - - crawler.run(); - } }