From c62576d6e260d6fbf4f966f9c10c6654fae789db Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Fri, 1 Dec 2023 14:45:25 +0100 Subject: [PATCH] Fix deprecation warnings in Wikipedia and WikipediaInfo Adds related unit tests --- .../java/org/dkpro/jwpl/api/Wikipedia.java | 374 +++++++----------- .../org/dkpro/jwpl/api/WikipediaInfo.java | 55 +-- .../distance/LevenshteinStringDistance.java | 4 +- .../org/dkpro/jwpl/api/WikipediaInfoTest.java | 60 +++ .../org/dkpro/jwpl/api/WikipediaTest.java | 17 + .../wikiapi_simple_20090119_stripped.script | 2 +- 6 files changed, 237 insertions(+), 275 deletions(-) create mode 100644 dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java index 6e73910d..9508a43c 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java @@ -47,8 +47,7 @@ */ // TODO better JavaDocs! public class Wikipedia - implements WikiConstants -{ + implements WikiConstants { private static final Logger logger = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); @@ -81,14 +80,11 @@ public class Wikipedia * Creates a new {@link Wikipedia} object accessing the database indicated by the dbConfig * parameter. * - * @param dbConfig - * A {@link DatabaseConfiguration} object telling the {@link Wikipedia} object where - * the data is stored and how it can be accessed. - * @throws WikiInitializationException - * Thrown if errors occurred while bootstrapping the {@link Wikipedia} instance. + * @param dbConfig A {@link DatabaseConfiguration} object telling the {@link Wikipedia} object where + * the data is stored and how it can be accessed. + * @throws WikiInitializationException Thrown if errors occurred while bootstrapping the {@link Wikipedia} instance. 
*/ - public Wikipedia(DatabaseConfiguration dbConfig) throws WikiInitializationException - { + public Wikipedia(DatabaseConfiguration dbConfig) throws WikiInitializationException { logger.trace("Creating Wikipedia object."); @@ -103,15 +99,13 @@ public Wikipedia(DatabaseConfiguration dbConfig) throws WikiInitializationExcept if (dbConfig.supportsCollation()) { logger.info("Wikipedia database backend supports character collation features."); - } - else { + } else { logger.debug( "Wikipedia database backend does NOT support character collation features."); } } - WikiConfig getWikConfig() - { + WikiConfig getWikConfig() { return wikiConfig; } @@ -125,35 +119,29 @@ WikiConfig getWikConfig() * For example, the article "Steam boat" could be queried with - "Steam boat" - "steam boat" - * "Steam_boat" - "steam_boat" and additionally all redirects that might point to that article. * - * @param title - * The title of the page. + * @param title The title of the page. * @return The page object for a given title. - * @throws WikiApiException - * If no page or redirect with this title exists or the title could not be properly - * parsed. + * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly + * parsed. */ - public Page getPage(String title) throws WikiApiException - { + public Page getPage(String title) throws WikiApiException { return new Page(this, title, false); } /** * Gets the page with exactly the given title.
- * + *

* Note that when using this method you are responsible for converting a normal search string * into the right wiki-style.
- * + *

* If the title is a redirect, the corresponding page is returned.
* - * @param exactTitle - * The exact title of the page. + * @param exactTitle The exact title of the page. * @return The page object for a given title. - * @throws WikiApiException - * If no page or redirect with this title exists or the title could not be properly - * parsed. + * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly + * parsed. */ - public Page getPageByExactTitle(String exactTitle) throws WikiApiException - { + public Page getPageByExactTitle(String exactTitle) throws WikiApiException { return new Page(this, exactTitle, true); } @@ -163,15 +151,12 @@ public Page getPageByExactTitle(String exactTitle) throws WikiApiException * Spaces in the title are converted to underscores, as this is a convention for Wikipedia * article titles. * - * @param title - * The title of the page. + * @param title The title of the page. * @return A set of page objects matching this title. - * @throws WikiApiException - * If no page or redirect with this title exists or the title could not be properly - * parsed. + * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly + * parsed. */ - public Set getPages(String title) throws WikiApiException - { + public Set getPages(String title) throws WikiApiException { Set ids = new HashSet<>(getPageIdsCaseInsensitive(title)); Set pages = new HashSet<>(); @@ -184,28 +169,22 @@ public Set getPages(String title) throws WikiApiException /** * Gets the page for a given pageId. * - * @param pageId - * The id of the page. + * @param pageId The id of the page. * @return The page object for a given pageId. - * @throws WikiApiException - * Thrown if errors occurred. + * @throws WikiApiException Thrown if errors occurred. */ - public Page getPage(int pageId) throws WikiApiException - { + public Page getPage(int pageId) throws WikiApiException { return new Page(this, pageId); } /** * Gets the title for a given pageId. 
* - * @param pageId - * The id of the page. + * @param pageId The id of the page. * @return The title for the given pageId. - * @throws WikiApiException - * Thrown if errors occurred. + * @throws WikiApiException Thrown if errors occurred. */ - public Title getTitle(int pageId) throws WikiApiException - { + public Title getTitle(int pageId) throws WikiApiException { Session session = this.__getHibernateSession(); session.beginTransaction(); String sql = "select p.name from PageMapLine as p where p.pageId= :pId"; @@ -221,15 +200,12 @@ public Title getTitle(int pageId) throws WikiApiException /** * Gets the page ids for a given title. - * - * @param title - * The title of the page. + * + * @param title The title of the page. * @return The id for the page with the given title. - * @throws WikiApiException - * Thrown if errors occurred. + * @throws WikiApiException Thrown if errors occurred. */ - public List getPageIds(String title) throws WikiApiException - { + public List getPageIds(String title) throws WikiApiException { Session session = this.__getHibernateSession(); session.beginTransaction(); String sql = "select p.pageID from PageMapLine as p where p.name = :pName"; @@ -251,14 +227,11 @@ public List getPageIds(String title) throws WikiApiException /** * Gets the page ids for a given title with case insensitive matching.
* - * @param title - * The title of the page. + * @param title The title of the page. * @return The ids of the pages with the given title. - * @throws WikiApiException - * Thrown if errors occurred. + * @throws WikiApiException Thrown if errors occurred. */ - public List getPageIdsCaseInsensitive(String title) throws WikiApiException - { + public List getPageIdsCaseInsensitive(String title) throws WikiApiException { title = title.toLowerCase(); title = title.replaceAll(" ", "_"); @@ -283,15 +256,12 @@ public List getPageIdsCaseInsensitive(String title) throws WikiApiExcep /** * Returns the article page for a given discussion page. * - * @param discussionPage - * the discussion page object + * @param discussionPage the discussion page object * @return The page object of the article associated with the discussion. If the parameter - * already was an article, it is returned directly. - * @throws WikiApiException - * Thrown if errors occurred. + * already was an article, it is returned directly. + * @throws WikiApiException Thrown if errors occurred. */ - public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiException - { + public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiException { if (discussionPage.isDiscussion()) { String title = discussionPage.getTitle().getPlainTitle() .replaceAll(WikiConstants.DISCUSSION_PREFIX, ""); @@ -304,8 +274,7 @@ public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiExcep title = title.split("/")[0]; } return getPage(title); - } - else { + } else { return discussionPage; } @@ -314,14 +283,11 @@ public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiExcep /** * Gets the discussion page for an article page with the given pageId. * - * @param articlePageId - * The id of the page. + * @param articlePageId The id of the page. * @return The page object for a given pageId. - * @throws WikiApiException - * Thrown if errors occurred. 
+ * @throws WikiApiException Thrown if errors occurred. */ - public Page getDiscussionPage(int articlePageId) throws WikiApiException - { + public Page getDiscussionPage(int articlePageId) throws WikiApiException { // Retrieve discussion page with article title // TODO not the prettiest solution, but currently discussions are only marked in the title return getDiscussionPage(getPage(articlePageId)); @@ -331,15 +297,12 @@ public Page getDiscussionPage(int articlePageId) throws WikiApiException * Gets the discussion page for the page with the given title. The page retrieval works as * defined in {@link #getPage(String title)} * - * @param title - * The title of the page for which the discussions should be retrieved. + * @param title The title of the page for which the discussions should be retrieved. * @return The page object for the discussion page. - * @throws WikiApiException - * If no page or redirect with this title exists or title could not be properly - * parsed. + * @throws WikiApiException If no page or redirect with this title exists or title could not be properly + * parsed. */ - public Page getDiscussionPage(String title) throws WikiApiException - { + public Page getDiscussionPage(String title) throws WikiApiException { return getDiscussionPage(getPage(title)); } @@ -347,20 +310,16 @@ public Page getDiscussionPage(String title) throws WikiApiException * Gets the discussion page for the given article page The provided page must not be a * discussion page * - * @param articlePage - * the article page for which a discussion page should be retrieved + * @param articlePage the article page for which a discussion page should be retrieved * @return The discussion page object for the given article page object - * @throws WikiApiException - * If no page or redirect with this title exists or title could not be properly - * parsed. + * @throws WikiApiException If no page or redirect with this title exists or title could not be properly + * parsed. 
*/ - public Page getDiscussionPage(Page articlePage) throws WikiApiException - { + public Page getDiscussionPage(Page articlePage) throws WikiApiException { String articleTitle = articlePage.getTitle().toString(); if (articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)) { return articlePage; - } - else { + } else { return new Page(this, WikiConstants.DISCUSSION_PREFIX + articleTitle); } } @@ -372,15 +331,12 @@ public Page getDiscussionPage(Page articlePage) throws WikiApiException * The most recent discussion page is NOT included here! It can be obtained with * {@link #getDiscussionPage(Page)}. * - * @param articlePageId - * The id of the page for which to fetch the discussion archives + * @param articlePageId The id of the page for which to fetch the discussion archives * @return The page object for the discussion page. - * @throws WikiApiException - * If no page or redirect with this title exists or title could not be properly - * parsed. + * @throws WikiApiException If no page or redirect with this title exists or title could not be properly + * parsed. */ - public Iterable getDiscussionArchives(int articlePageId) throws WikiApiException - { + public Iterable getDiscussionArchives(int articlePageId) throws WikiApiException { // Retrieve discussion archive pages with page id return getDiscussionArchives(getPage(articlePageId)); } @@ -392,18 +348,15 @@ public Iterable getDiscussionArchives(int articlePageId) throws WikiApiExc * The most recent discussion page is NOT included here! It can be obtained with * {@link #getDiscussionPage(Page)}. * - * @param title - * The title of the page for which the discussions should be retrieved. + * @param title The title of the page for which the discussions should be retrieved. * @return The page object for the discussion page. - * @throws WikiApiException - * If no page or redirect with this title exists or title could not be properly - * parsed. 
+ * @throws WikiApiException If no page or redirect with this title exists or title could not be properly + * parsed. * @deprecated Use {@link #getDiscussionArchives(int)} or {@link #getDiscussionArchives(Page)} - * instead. + * instead. */ @Deprecated(since = "2.0.0", forRemoval = true) - public Iterable getDiscussionArchives(String title) throws WikiApiException - { + public Iterable getDiscussionArchives(String title) throws WikiApiException { // Retrieve discussion archive pages with page title return getDiscussionArchives(getPage(title)); } @@ -415,16 +368,13 @@ public Iterable getDiscussionArchives(String title) throws WikiApiExceptio * The provided page Object must not be a discussion page itself! If it is a discussion page, is * returned unchanged. * - * @param articlePage - * the article page for which a discussion archives should be retrieved + * @param articlePage the article page for which a discussion archives should be retrieved * @return An iterable with the discussion archive page objects for the given article page - * object - * @throws WikiApiException - * If no page or redirect with this title exists or title could not be properly - * parsed. + * object + * @throws WikiApiException If no page or redirect with this title exists or title could not be properly + * parsed. */ - public Iterable getDiscussionArchives(Page articlePage) throws WikiApiException - { + public Iterable getDiscussionArchives(Page articlePage) throws WikiApiException { String articleTitle = articlePage.getTitle().getWikiStyleTitle(); if (!articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)) { articleTitle = WikiConstants.DISCUSSION_PREFIX + articleTitle; @@ -452,20 +402,15 @@ public Iterable getDiscussionArchives(Page articlePage) throws WikiApiExce /** * Gets the pages or redirects with a name similar to the pattern. Calling this method is quite * costly, as similarity is computed for all names. - * - * @param pPattern - * The pattern. 
- * @param pSize - * The maximum size of the result list. Only the most similar results will be - * included. + * + * @param pPattern The pattern. + * @param pSize The maximum size of the result list. Only the most similar results will be + * included. * @return A map of pages with names similar to the pattern and their distance values. Smaller - * distances are more similar. - * @throws WikiApiException - * Thrown if errors occurred. + * distances are more similar. + * @throws WikiApiException Thrown if errors occurred. */ - //// I do not want to make this public at the moment (TZ, March, 2007) - protected Map getSimilarPages(String pPattern, int pSize) throws WikiApiException - { + public Map getSimilarPages(String pPattern, int pSize) throws WikiApiException { Title title = new Title(pPattern); String pattern = title.getWikiStyleTitle(); @@ -476,20 +421,19 @@ protected Map getSimilarPages(String pPattern, int pSize) throws W // holds a mapping of the best distance values to page IDs Map distanceMap = new HashMap<>(); + final LevenshteinStringDistance lsd = new LevenshteinStringDistance(); Session session = this.__getHibernateSession(); session.beginTransaction(); - for (Object o : session.createQuery("select pml.pageID, pml.name from PageMapLine as pml") + final String query = "select new org.dkpro.jwpl.api.Wikipedia$PageTuple(pml.pageID, pml.name)" + + " from PageMapLine as pml"; + for (PageTuple o : session.createQuery(query, PageTuple.class) .list()) { - Object[] row = (Object[]) o; - int pageID = (Integer) row[0]; - String pageName = (String) row[1]; // this returns a similarity - if we want to use it, we have to change the semantics the // ordering of the results - // double distance = new Levenshtein().getSimilarity(pageName, pPattern); - double distance = new LevenshteinStringDistance().distance(pageName, pattern); + double distance = lsd.distance(o.name(), pattern); - distanceMap.put(pageID, distance); + distanceMap.put(o.id(), distance); // if there are 
more than "pSize" entries in the map remove the last one (it has the // biggest distance) @@ -510,12 +454,8 @@ protected Map getSimilarPages(String pPattern, int pSize) throws W Page page = null; try { page = this.getPage(pageID); - } - catch (WikiPageNotFoundException e) { - logger.error("Page with pageID " + pageID - + " could not be found. Fatal error. Terminating."); - e.printStackTrace(); - System.exit(1); + } catch (WikiPageNotFoundException e) { + logger.error("Page with pageID {} could not be found. Fatal error. Terminating.", pageID, e); } pageMap.put(page, distanceMap.get(pageID)); } @@ -531,27 +471,22 @@ protected Map getSimilarPages(String pPattern, int pSize) throws W *

* For example, the (possible) category "Famous steamboats" could be queried with - "Famous * steamboats" - "Famous_steamboats" - "famous steamboats" - "famous_steamboats" - * - * @param title - * The title of the category. + * + * @param title The title of the category. * @return The category object with the given title. - * @throws WikiApiException - * If no category with the given title exists. + * @throws WikiApiException If no category with the given title exists. */ - public Category getCategory(String title) throws WikiApiException - { + public Category getCategory(String title) throws WikiApiException { return new Category(this, title); } /** * Gets the category for a given pageId. - * - * @param pageId - * The id of the {@link Category}. + * + * @param pageId The id of the {@link Category}. * @return The category object or null if no category with this pageId exists. */ - public Category getCategory(int pageId) - { + public Category getCategory(int pageId) { long hibernateId = __getCategoryHibernateId(pageId); if (hibernateId == -1) { return null; @@ -559,8 +494,7 @@ public Category getCategory(int pageId) try { return new Category(this, hibernateId); - } - catch (WikiPageNotFoundException e) { + } catch (WikiPageNotFoundException e) { return null; } } @@ -568,26 +502,22 @@ public Category getCategory(int pageId) /** * This returns an iterable over all {@link Category categories}, as returning all category * objects would be much too expensive. - * + * * @return An iterable over all categories. */ - public Iterable getCategories() - { + public Iterable getCategories() { return new CategoryIterable(this); } /** * Gets the {@link Category categories} for a given {@link Page} identified by its * {@code pageTitle}. - * - * @param pageTitle - * The title of a {@link Page}, not a category. + * + * @param pageTitle The title of a {@link Page}, not a category. * @return The category objects which are associated with the given {@code pageTitle}. 
- * @throws WikiPageNotFoundException - * Thrown if no {@link Page} exists for the given {@code pageTitle}. + * @throws WikiPageNotFoundException Thrown if no {@link Page} exists for the given {@code pageTitle}. */ - public Set getCategories(String pageTitle) throws WikiPageNotFoundException - { + public Set getCategories(String pageTitle) throws WikiPageNotFoundException { if (pageTitle == null || pageTitle.length() == 0) { throw new WikiPageNotFoundException(); } @@ -603,8 +533,7 @@ public Set getCategories(String pageTitle) throws WikiPageNotFoundExce for (int hibernateId : categoryHibernateIds) { try { categorySet.add(new Category(this, hibernateId)); - } - catch (WikiPageNotFoundException e) { + } catch (WikiPageNotFoundException e) { logger.warn("Could not load Category by it's HibernateId = '" + hibernateId + "'"); } } @@ -614,13 +543,11 @@ public Set getCategories(String pageTitle) throws WikiPageNotFoundExce /** * Get all wikipedia {@link Category categories}. Returns only an iterable, as a collection may * not fit into memory for a large wikipedia. - * - * @param bufferSize - * The size of the internal page buffer. + * + * @param bufferSize The size of the internal page buffer. * @return An iterable over all categories. */ - protected Iterable getCategories(int bufferSize) - { + protected Iterable getCategories(int bufferSize) { return new CategoryIterable(this, bufferSize); } @@ -628,13 +555,12 @@ protected Iterable getCategories(int bufferSize) * Protected method that is much faster than the public version, but exposes too much * implementation details. Get a set with all category pageIDs. Returning all category objects * is much too expensive. - * + * * @return A set with all category pageIDs */ // TODO this should be replaced with the buffered category iterator, as it might produce an // HeapSpace Overflow, if there are too many categories. 
- protected Set __getCategories() - { + protected Set __getCategories() { Session session = this.__getHibernateSession(); session.beginTransaction(); String sql = "select cat.pageId from Category as cat"; @@ -647,24 +573,21 @@ protected Set __getCategories() /** * Get all wikipedia pages. Does not include redirects, as they are only pointers to real pages. * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * + * * @return An iterable over all pages. */ - public Iterable getPages() - { + public Iterable getPages() { return new PageIterable(this, false); } /** * Get all wikipedia pages. Does not include redirects, as they are only pointers to real pages. * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * - * @param bufferSize - * The size of the internal page buffer. + * + * @param bufferSize The size of the internal page buffer. * @return An iterable over all pages. */ - protected Iterable getPages(int bufferSize) - { + protected Iterable getPages(int bufferSize) { return new PageIterable(this, false, bufferSize); } @@ -678,8 +601,7 @@ protected Iterable getPages(int bufferSize) * * @return A set with all {@code pageIDs}. Returning all pages is much to expensive. */ - protected Set __getPages() - { + protected Set __getPages() { Session session = this.__getHibernateSession(); session.beginTransaction(); String sql = "select page.pageId from Page as page"; @@ -692,8 +614,7 @@ protected Set __getPages() /** * @return an iterable over all {@code pageIDs} (without redirects) */ - public Iterable getPageIds() - { + public Iterable getPageIds() { return this.__getPages(); } @@ -701,45 +622,39 @@ public Iterable getPageIds() * Get the pages that match the given query. Does not include redirects, as they are only * pointers to real pages. Attention: may be running very slow, depending on the size of the * Wikipedia! - * - * @param query - * A query object containing the query conditions. 
+ * + * @param query A query object containing the query conditions. * @return A set of pages that match the given query. - * @throws WikiApiException - * Thrown if errors occurred. + * @throws WikiApiException Thrown if errors occurred. */ - public Iterable getPages(PageQuery query) throws WikiApiException - { + public Iterable getPages(PageQuery query) throws WikiApiException { return new PageQueryIterable(this, query); } /** * Get all articles (pages MINUS disambiguationPages MINUS redirects). Returns only an iterable, * as a collection may not fit into memory for a large wikipedia. - * + * * @return An iterable of all article pages. */ - public Iterable getArticles() - { + public Iterable getArticles() { return new PageIterable(this, true); } /** * Get all titles including disambiguation pages and redirects). Returns only an iterable, as a * collection may not fit into memory for a large wikipedia. - * + * * @return An iterable of all article pages. */ - public Iterable getTitles() - { + public Iterable<Title> getTitles() { return new TitleIterable(this); } /** * @return The {@link Language} of this Wikipedia. */ - public Language getLanguage() - { + public Language getLanguage() { return this.language; } @@ -747,13 +662,11 @@ public Language getLanguage() * Tests, whether a page or redirect with the given title exists. Trying to retrieve a page that * does not exist in Wikipedia throws an exception. You may catch the exception or use this * test, depending on your task. - * - * @param title - * The title of the page. + * + * @param title The title of the page. * @return {@code True}, if a page or redirect with that title exits, {@code false} otherwise. 
*/ - public boolean existsPage(String title) - { + public boolean existsPage(String title) { if (title == null || title.isEmpty()) { return false; } @@ -761,8 +674,7 @@ public boolean existsPage(String title) Title t; try { t = new Title(title); - } - catch (WikiTitleParsingException e) { + } catch (WikiTitleParsingException e) { return false; } @@ -782,8 +694,7 @@ public boolean existsPage(String title) .setParameter("pName", encodedTitle, StandardBasicTypes.STRING); var returnValue = nativeQuery.uniqueResult(); return returnValue != null; - } - finally { + } finally { session.getTransaction().commit(); } } @@ -792,12 +703,10 @@ public boolean existsPage(String title) * Tests, whether a page with the given pageID exists. Trying to retrieve a pageID that does not * exist in Wikipedia throws an exception. * - * @param pageID - * A pageID. + * @param pageID A pageID. * @return {@code True}, if a page with that pageID exits, {@code false} otherwise. */ - public boolean existsPage(int pageID) - { + public boolean existsPage(int pageID) { // This is a hack to provide a much quicker way to test whether a page exists. // Encoding the title in this way surpasses the normal way of creating a title first. @@ -820,12 +729,10 @@ public boolean existsPage(int pageID) * Get the hibernate ID to a given pageID of a page. We need different methods for pages and * categories here, as a page and a category can have the same ID. * - * @param pageID - * A pageID that should be mapped to the corresponding hibernate ID. + * @param pageID A pageID that should be mapped to the corresponding hibernate ID. * @return The hibernateID of the page with pageID or -1, if the pageID is not valid */ - protected long __getPageHibernateId(int pageID) - { + protected long __getPageHibernateId(int pageID) { long hibernateID = -1; // first look in the id mapping cache @@ -855,12 +762,10 @@ protected long __getPageHibernateId(int pageID) * Get the hibernate ID to a given pageID of a category. 
We need different methods for pages and * categories here, as a page and a category can have the same ID. * - * @param pageID - * A pageID that should be mapped to the corresponding hibernate ID. + * @param pageID A pageID that should be mapped to the corresponding hibernate ID. * @return The hibernateID of the page with pageID or -1, if the pageID is not valid */ - protected long __getCategoryHibernateId(int pageID) - { + protected long __getCategoryHibernateId(int pageID) { long hibernateID = -1; // first look in the id mapping cache @@ -888,36 +793,32 @@ protected long __getCategoryHibernateId(int pageID) /** * @return A {@link MetaData} object containing all meta data about this instance of Wikipedia. */ - public MetaData getMetaData() - { + public MetaData getMetaData() { return this.metaData; } /** * @return The {@link DatabaseConfiguration} object that was used to create the Wikipedia - * object. + * object. */ - public DatabaseConfiguration getDatabaseConfiguration() - { + public DatabaseConfiguration getDatabaseConfiguration() { return this.dbConfig; } /** * @return Shortcut for getting a hibernate session. */ - protected Session __getHibernateSession() - { + protected Session __getHibernateSession() { return WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); } /** * The ID consists of the host, the database, and the language. This should be unique in most * cases. - * + * * @return Returns a unique ID for this Wikipedia object. 
*/ - public String getWikipediaId() - { + public String getWikipediaId() { StringBuilder sb = new StringBuilder(); sb.append(this.getDatabaseConfiguration().getHost()); sb.append("_"); @@ -928,14 +829,15 @@ public String getWikipediaId() } private static class ValueComparator - implements Comparator<Entry<Integer, Double>> - { + implements Comparator<Entry<Integer, Double>> { @Override - public int compare(Entry<Integer, Double> e1, Entry<Integer, Double> e2) - { + public int compare(Entry<Integer, Double> e1, Entry<Integer, Double> e2) { return Double.compare(e2.getValue(), e1.getValue()); } } + private record PageTuple(int id, String name) { + + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java index c4b41bdc..4fd1453e 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java @@ -22,6 +22,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; @@ -29,7 +30,6 @@ import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; import org.dkpro.jwpl.api.util.ApiUtilities; -import org.hibernate.Session; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -51,7 +51,7 @@ public class WikipediaInfo private Map<Integer, Integer> degreeDistribution; private Set<Integer> categorizedArticleSet; - private Wikipedia wiki; + private final Wikipedia wiki; /** * Get infos for the whole wikipedia. 
@@ -63,9 +63,7 @@ public class WikipediaInfo */ public WikipediaInfo(Wikipedia pWiki) throws WikiApiException { - this.wiki = pWiki; - new WikipediaInfo(this.wiki.getPages()); - + this(pWiki.getPages(), pWiki); } /** @@ -73,15 +71,21 @@ public WikipediaInfo(Wikipedia pWiki) throws WikiApiException * * @param pPages * A set of pages. Only this subset of wiki pages is used in the info object. - * + * @param pWiki + * The wiki object. * @throws WikiApiException Thrown if errors occurred. */ - public WikipediaInfo(Iterable<Page> pPages) throws WikiApiException + public WikipediaInfo(Iterable<Page> pPages, Wikipedia pWiki) throws WikiApiException { if (pPages == null) { throw new WikiApiException("The page set has to be initialized."); } + if (pWiki == null) { + throw new WikiApiException("The wiki instance is not set."); + } + + wiki = pWiki; pages = pPages; averageFanOut = -1.0; // lazy initialization => it is computed and stored when it is // accessed @@ -91,11 +95,11 @@ public WikipediaInfo(Iterable<Page> pPages) throws WikiApiException // get number of pages numberOfPages = 0; - while (pages.iterator().hasNext()) { + Iterator<Page> it = pages.iterator(); + while (it.hasNext()) { numberOfPages++; - pages.iterator().next(); + it.next(); } - } /** @@ -109,34 +113,14 @@ public WikipediaInfo(Iterable<Page> pPages) throws WikiApiException private double computeAverageFanOut(Iterable<Page> pages) { - Set<Integer> pageIDs = new HashSet<>(); - while (pages.iterator().hasNext()) { - pageIDs.add(pages.iterator().next().getPageId()); - } + final Iterator<Page> it = pages.iterator(); - if (pageIDs.isEmpty()) { - logger.warn("Cannot compute average fan-out of an empty page set."); - return 0.0; + double sum = 0; + while (it.hasNext()) { + sum += it.next().getOutlinks().size(); } - int fanOutCounter = 0; - - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - for (Object o : session.createQuery("select page.outLinks, page.pageId from Page as 
page") - .list()) { - Object[] row = (Object[]) o; - Set outLinks = (Set) row[0]; - Integer pageId = (Integer) row[1]; - - // if the current page ID is in the desired result set => add outlink value - if (pageIDs.contains(pageId)) { - fanOutCounter += outLinks.size(); - } - } - session.getTransaction().commit(); - - return (double) fanOutCounter / this.getNumberOfPages(); + return sum / this.getNumberOfPages(); } /** @@ -489,5 +473,4 @@ private int computeShortestPathLenghts(int pStartNode, CategoryGraph catGraph) } return shortestPathLengthSum; } - } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java index 1b427e5d..f73ccea6 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java @@ -69,14 +69,14 @@ public double distance(String s, String t) cost = 1; } // Step 6 - d[i][j] = Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost); + d[i][j] = minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost); } } // Step 7 return Integer.valueOf(d[n][m]).doubleValue(); } - private int Minimum(int a, int b, int c) + private int minimum(int a, int b, int c) { int min; min = a; diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java new file mode 100644 index 00000000..700f56f2 --- /dev/null +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.jwpl.api; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +public class WikipediaInfoTest + extends BaseJWPLTest +{ + + private static WikipediaInfo wikipediaInfo; + + /** + * Made this static so that following tests don't run if assumption fails. 
(With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignores tests after failed assumptions + */ + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration(); + try { + wikipediaInfo = new WikipediaInfo(new Wikipedia(db)); + } + catch (Exception e) { + fail("WikipediaInfo could not be initialized: " + e.getLocalizedMessage()); + } + } + + @Test + public void testGetAverageFanOut() { + double average = wikipediaInfo.getAverageFanOut(); + assertTrue(average > 0); + assertEquals(1.1176470588235294d, average); + //call it twice + average = wikipediaInfo.getAverageFanOut(); + assertEquals(1.1176470588235294d, average); + } + +} diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java index 8b847bd3..d3881f0e 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java @@ -26,6 +26,7 @@ import java.lang.invoke.MethodHandles; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.UUID; @@ -34,6 +35,8 @@ import org.dkpro.jwpl.api.exception.WikiTitleParsingException; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -437,6 +440,20 @@ public void testGetLanguage() assertNotNull(wiki.getLanguage()); } + @ParameterizedTest + @ValueSource(strings = { + "Wikipedia_AP", + "Wikipedia_API" + }) + public void testGetSimilarPages(String val) throws WikiApiException { + final Map<Page, Double> similarPages = wiki.getSimilarPages(val, 1); + assertNotNull(similarPages); + assertEquals(1, similarPages.size()); + Map.Entry<Page, Double> entry = 
similarPages.entrySet().iterator().next(); + assertTrue(entry.getKey().getTitle().getRawTitleText().startsWith(val)); + assertTrue(entry.getValue() <= 1); + } + /* INTERNAL TEST HELPER METHODS */ private void getNotExistingPage(String title) diff --git a/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script b/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script index 7d4f5d8f..44f92f20 100644 --- a/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script +++ b/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script @@ -29,7 +29,7 @@ SET FILES NIO TRUE SET FILES NIO SIZE 256 SET FILES LOG TRUE SET FILES LOG SIZE 200 -SET FILES CHECK 3146 +SET FILES CHECK 3225 SET DATABASE COLLATION "German" NO PAD CREATE USER SA PASSWORD DIGEST 'd41d8cd98f00b204e9800998ecf8427e' CREATE SCHEMA PUBLIC AUTHORIZATION DBA