diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java
index 6e73910d..9508a43c 100644
--- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java
+++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java
@@ -47,8 +47,7 @@
*/
// TODO better JavaDocs!
public class Wikipedia
- implements WikiConstants
-{
+ implements WikiConstants {
private static final Logger logger = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
@@ -81,14 +80,11 @@ public class Wikipedia
* Creates a new {@link Wikipedia} object accessing the database indicated by the dbConfig
* parameter.
*
- * @param dbConfig
- * A {@link DatabaseConfiguration} object telling the {@link Wikipedia} object where
- * the data is stored and how it can be accessed.
- * @throws WikiInitializationException
- * Thrown if errors occurred while bootstrapping the {@link Wikipedia} instance.
+ * @param dbConfig A {@link DatabaseConfiguration} object telling the {@link Wikipedia} object where
+ * the data is stored and how it can be accessed.
+ * @throws WikiInitializationException Thrown if errors occurred while bootstrapping the {@link Wikipedia} instance.
*/
- public Wikipedia(DatabaseConfiguration dbConfig) throws WikiInitializationException
- {
+ public Wikipedia(DatabaseConfiguration dbConfig) throws WikiInitializationException {
logger.trace("Creating Wikipedia object.");
@@ -103,15 +99,13 @@ public Wikipedia(DatabaseConfiguration dbConfig) throws WikiInitializationExcept
if (dbConfig.supportsCollation()) {
logger.info("Wikipedia database backend supports character collation features.");
- }
- else {
+ } else {
logger.debug(
"Wikipedia database backend does NOT support character collation features.");
}
}
- WikiConfig getWikConfig()
- {
+ WikiConfig getWikConfig() {
return wikiConfig;
}
@@ -125,35 +119,29 @@ WikiConfig getWikConfig()
* For example, the article "Steam boat" could be queried with "Steam boat", "steam boat",
* "Steam_boat", or "steam_boat", and additionally all redirects that might point to that article.
*
- * @param title
- * The title of the page.
+ * @param title The title of the page.
* @return The page object for a given title.
- * @throws WikiApiException
- * If no page or redirect with this title exists or the title could not be properly
- * parsed.
+ * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly
+ * parsed.
*/
- public Page getPage(String title) throws WikiApiException
- {
+ public Page getPage(String title) throws WikiApiException {
return new Page(this, title, false);
}
/**
* Gets the page with exactly the given title.
- *
+ *
* Note that when using this method you are responsible for converting a normal search string
* into the right wiki-style.
- *
+ *
* If the title is a redirect, the corresponding page is returned.
*
- * @param exactTitle
- * The exact title of the page.
+ * @param exactTitle The exact title of the page.
* @return The page object for a given title.
- * @throws WikiApiException
- * If no page or redirect with this title exists or the title could not be properly
- * parsed.
+ * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly
+ * parsed.
*/
- public Page getPageByExactTitle(String exactTitle) throws WikiApiException
- {
+ public Page getPageByExactTitle(String exactTitle) throws WikiApiException {
return new Page(this, exactTitle, true);
}
@@ -163,15 +151,12 @@ public Page getPageByExactTitle(String exactTitle) throws WikiApiException
* Spaces in the title are converted to underscores, as this is a convention for Wikipedia
* article titles.
*
- * @param title
- * The title of the page.
+ * @param title The title of the page.
* @return A set of page objects matching this title.
- * @throws WikiApiException
- * If no page or redirect with this title exists or the title could not be properly
- * parsed.
+ * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly
+ * parsed.
*/
- public Set&lt;Page&gt; getPages(String title) throws WikiApiException
- {
+ public Set&lt;Page&gt; getPages(String title) throws WikiApiException {
Set&lt;Integer&gt; ids = new HashSet&lt;&gt;(getPageIdsCaseInsensitive(title));
Set&lt;Page&gt; pages = new HashSet&lt;&gt;();
@@ -184,28 +169,22 @@ public Set getPages(String title) throws WikiApiException
/**
* Gets the page for a given pageId.
*
- * @param pageId
- * The id of the page.
+ * @param pageId The id of the page.
* @return The page object for a given pageId.
- * @throws WikiApiException
- * Thrown if errors occurred.
+ * @throws WikiApiException Thrown if errors occurred.
*/
- public Page getPage(int pageId) throws WikiApiException
- {
+ public Page getPage(int pageId) throws WikiApiException {
return new Page(this, pageId);
}
/**
* Gets the title for a given pageId.
*
- * @param pageId
- * The id of the page.
+ * @param pageId The id of the page.
* @return The title for the given pageId.
- * @throws WikiApiException
- * Thrown if errors occurred.
+ * @throws WikiApiException Thrown if errors occurred.
*/
- public Title getTitle(int pageId) throws WikiApiException
- {
+ public Title getTitle(int pageId) throws WikiApiException {
Session session = this.__getHibernateSession();
session.beginTransaction();
String sql = "select p.name from PageMapLine as p where p.pageId= :pId";
@@ -221,15 +200,12 @@ public Title getTitle(int pageId) throws WikiApiException
/**
* Gets the page ids for a given title.
- *
- * @param title
- * The title of the page.
+ *
+ * @param title The title of the page.
* @return The id for the page with the given title.
- * @throws WikiApiException
- * Thrown if errors occurred.
+ * @throws WikiApiException Thrown if errors occurred.
*/
- public List&lt;Integer&gt; getPageIds(String title) throws WikiApiException
- {
+ public List&lt;Integer&gt; getPageIds(String title) throws WikiApiException {
Session session = this.__getHibernateSession();
session.beginTransaction();
String sql = "select p.pageID from PageMapLine as p where p.name = :pName";
@@ -251,14 +227,11 @@ public List getPageIds(String title) throws WikiApiException
/**
* Gets the page ids for a given title with case insensitive matching.
*
- * @param title
- * The title of the page.
+ * @param title The title of the page.
* @return The ids of the pages with the given title.
- * @throws WikiApiException
- * Thrown if errors occurred.
+ * @throws WikiApiException Thrown if errors occurred.
*/
- public List&lt;Integer&gt; getPageIdsCaseInsensitive(String title) throws WikiApiException
- {
+ public List&lt;Integer&gt; getPageIdsCaseInsensitive(String title) throws WikiApiException {
title = title.toLowerCase();
title = title.replaceAll(" ", "_");
@@ -283,15 +256,12 @@ public List getPageIdsCaseInsensitive(String title) throws WikiApiExcep
/**
* Returns the article page for a given discussion page.
*
- * @param discussionPage
- * the discussion page object
+ * @param discussionPage the discussion page object
* @return The page object of the article associated with the discussion. If the parameter
- * already was an article, it is returned directly.
- * @throws WikiApiException
- * Thrown if errors occurred.
+ * already was an article, it is returned directly.
+ * @throws WikiApiException Thrown if errors occurred.
*/
- public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiException
- {
+ public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiException {
if (discussionPage.isDiscussion()) {
String title = discussionPage.getTitle().getPlainTitle()
.replaceAll(WikiConstants.DISCUSSION_PREFIX, "");
@@ -304,8 +274,7 @@ public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiExcep
title = title.split("/")[0];
}
return getPage(title);
- }
- else {
+ } else {
return discussionPage;
}
@@ -314,14 +283,11 @@ public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiExcep
/**
* Gets the discussion page for an article page with the given pageId.
*
- * @param articlePageId
- * The id of the page.
+ * @param articlePageId The id of the page.
* @return The page object for a given pageId.
- * @throws WikiApiException
- * Thrown if errors occurred.
+ * @throws WikiApiException Thrown if errors occurred.
*/
- public Page getDiscussionPage(int articlePageId) throws WikiApiException
- {
+ public Page getDiscussionPage(int articlePageId) throws WikiApiException {
// Retrieve discussion page with article title
// TODO not the prettiest solution, but currently discussions are only marked in the title
return getDiscussionPage(getPage(articlePageId));
@@ -331,15 +297,12 @@ public Page getDiscussionPage(int articlePageId) throws WikiApiException
* Gets the discussion page for the page with the given title. The page retrieval works as
* defined in {@link #getPage(String title)}
*
- * @param title
- * The title of the page for which the discussions should be retrieved.
+ * @param title The title of the page for which the discussions should be retrieved.
* @return The page object for the discussion page.
- * @throws WikiApiException
- * If no page or redirect with this title exists or title could not be properly
- * parsed.
+ * @throws WikiApiException If no page or redirect with this title exists or title could not be properly
+ * parsed.
*/
- public Page getDiscussionPage(String title) throws WikiApiException
- {
+ public Page getDiscussionPage(String title) throws WikiApiException {
return getDiscussionPage(getPage(title));
}
@@ -347,20 +310,16 @@ public Page getDiscussionPage(String title) throws WikiApiException
* Gets the discussion page for the given article page. If the provided page is already a
* discussion page, it is returned unchanged.
*
- * @param articlePage
- * the article page for which a discussion page should be retrieved
+ * @param articlePage the article page for which a discussion page should be retrieved
* @return The discussion page object for the given article page object
- * @throws WikiApiException
- * If no page or redirect with this title exists or title could not be properly
- * parsed.
+ * @throws WikiApiException If no page or redirect with this title exists or title could not be properly
+ * parsed.
*/
- public Page getDiscussionPage(Page articlePage) throws WikiApiException
- {
+ public Page getDiscussionPage(Page articlePage) throws WikiApiException {
String articleTitle = articlePage.getTitle().toString();
if (articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)) {
return articlePage;
- }
- else {
+ } else {
return new Page(this, WikiConstants.DISCUSSION_PREFIX + articleTitle);
}
}
@@ -372,15 +331,12 @@ public Page getDiscussionPage(Page articlePage) throws WikiApiException
* The most recent discussion page is NOT included here! It can be obtained with
* {@link #getDiscussionPage(Page)}.
*
- * @param articlePageId
- * The id of the page for which to fetch the discussion archives
+ * @param articlePageId The id of the page for which to fetch the discussion archives
* @return The page object for the discussion page.
- * @throws WikiApiException
- * If no page or redirect with this title exists or title could not be properly
- * parsed.
+ * @throws WikiApiException If no page or redirect with this title exists or title could not be properly
+ * parsed.
*/
- public Iterable&lt;Page&gt; getDiscussionArchives(int articlePageId) throws WikiApiException
- {
+ public Iterable&lt;Page&gt; getDiscussionArchives(int articlePageId) throws WikiApiException {
// Retrieve discussion archive pages with page id
return getDiscussionArchives(getPage(articlePageId));
}
@@ -392,18 +348,15 @@ public Iterable getDiscussionArchives(int articlePageId) throws WikiApiExc
* The most recent discussion page is NOT included here! It can be obtained with
* {@link #getDiscussionPage(Page)}.
*
- * @param title
- * The title of the page for which the discussions should be retrieved.
+ * @param title The title of the page for which the discussions should be retrieved.
* @return The page object for the discussion page.
- * @throws WikiApiException
- * If no page or redirect with this title exists or title could not be properly
- * parsed.
+ * @throws WikiApiException If no page or redirect with this title exists or title could not be properly
+ * parsed.
* @deprecated Use {@link #getDiscussionArchives(int)} or {@link #getDiscussionArchives(Page)}
- * instead.
+ * instead.
*/
@Deprecated(since = "2.0.0", forRemoval = true)
- public Iterable&lt;Page&gt; getDiscussionArchives(String title) throws WikiApiException
- {
+ public Iterable&lt;Page&gt; getDiscussionArchives(String title) throws WikiApiException {
// Retrieve discussion archive pages with page title
return getDiscussionArchives(getPage(title));
}
@@ -415,16 +368,13 @@ public Iterable getDiscussionArchives(String title) throws WikiApiExceptio
* The provided page object must not be a discussion page itself! If it is a discussion page, its
* title is used unchanged.
*
- * @param articlePage
- * the article page for which a discussion archives should be retrieved
+ * @param articlePage the article page for which a discussion archives should be retrieved
* @return An iterable with the discussion archive page objects for the given article page
- * object
- * @throws WikiApiException
- * If no page or redirect with this title exists or title could not be properly
- * parsed.
+ * object
+ * @throws WikiApiException If no page or redirect with this title exists or title could not be properly
+ * parsed.
*/
- public Iterable&lt;Page&gt; getDiscussionArchives(Page articlePage) throws WikiApiException
- {
+ public Iterable&lt;Page&gt; getDiscussionArchives(Page articlePage) throws WikiApiException {
String articleTitle = articlePage.getTitle().getWikiStyleTitle();
if (!articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)) {
articleTitle = WikiConstants.DISCUSSION_PREFIX + articleTitle;
@@ -452,20 +402,15 @@ public Iterable getDiscussionArchives(Page articlePage) throws WikiApiExce
/**
* Gets the pages or redirects with a name similar to the pattern. Calling this method is quite
* costly, as similarity is computed for all names.
- *
- * @param pPattern
- * The pattern.
- * @param pSize
- * The maximum size of the result list. Only the most similar results will be
- * included.
+ *
+ * @param pPattern The pattern.
+ * @param pSize The maximum size of the result list. Only the most similar results will be
+ * included.
* @return A map of pages with names similar to the pattern and their distance values. Smaller
- * distances are more similar.
- * @throws WikiApiException
- * Thrown if errors occurred.
+ * distances are more similar.
+ * @throws WikiApiException Thrown if errors occurred.
*/
- //// I do not want to make this public at the moment (TZ, March, 2007)
- protected Map&lt;Page, Double&gt; getSimilarPages(String pPattern, int pSize) throws WikiApiException
- {
+ public Map&lt;Page, Double&gt; getSimilarPages(String pPattern, int pSize) throws WikiApiException {
Title title = new Title(pPattern);
String pattern = title.getWikiStyleTitle();
@@ -476,20 +421,19 @@ protected Map getSimilarPages(String pPattern, int pSize) throws W
// holds a mapping of page IDs to their best distance values
Map&lt;Integer, Double&gt; distanceMap = new HashMap&lt;&gt;();
+ final LevenshteinStringDistance lsd = new LevenshteinStringDistance();
Session session = this.__getHibernateSession();
session.beginTransaction();
- for (Object o : session.createQuery("select pml.pageID, pml.name from PageMapLine as pml")
+ final String query = "select new org.dkpro.jwpl.api.Wikipedia$PageTuple(pml.pageID, pml.name)"
+ + " from PageMapLine as pml";
+ for (PageTuple o : session.createQuery(query, PageTuple.class)
.list()) {
- Object[] row = (Object[]) o;
- int pageID = (Integer) row[0];
- String pageName = (String) row[1];
// this returns a similarity - if we want to use it, we have to change the semantics and the
// ordering of the results
- // double distance = new Levenshtein().getSimilarity(pageName, pPattern);
- double distance = new LevenshteinStringDistance().distance(pageName, pattern);
+ double distance = lsd.distance(o.name(), pattern);
- distanceMap.put(pageID, distance);
+ distanceMap.put(o.id(), distance);
// if there are more than "pSize" entries in the map remove the last one (it has the
// biggest distance)
@@ -510,12 +454,8 @@ protected Map getSimilarPages(String pPattern, int pSize) throws W
Page page = null;
try {
page = this.getPage(pageID);
- }
- catch (WikiPageNotFoundException e) {
- logger.error("Page with pageID " + pageID
- + " could not be found. Fatal error. Terminating.");
- e.printStackTrace();
- System.exit(1);
+ } catch (WikiPageNotFoundException e) {
+ logger.error("Page with pageID {} could not be found. Fatal error. Terminating.", pageID, e);
}
pageMap.put(page, distanceMap.get(pageID));
}
@@ -531,27 +471,22 @@ protected Map getSimilarPages(String pPattern, int pSize) throws W
*
* For example, the (possible) category "Famous steamboats" could be queried with "Famous
* steamboats", "Famous_steamboats", "famous steamboats", or "famous_steamboats".
- *
- * @param title
- * The title of the category.
+ *
+ * @param title The title of the category.
* @return The category object with the given title.
- * @throws WikiApiException
- * If no category with the given title exists.
+ * @throws WikiApiException If no category with the given title exists.
*/
- public Category getCategory(String title) throws WikiApiException
- {
+ public Category getCategory(String title) throws WikiApiException {
return new Category(this, title);
}
/**
* Gets the category for a given pageId.
- *
- * @param pageId
- * The id of the {@link Category}.
+ *
+ * @param pageId The id of the {@link Category}.
* @return The category object or null if no category with this pageId exists.
*/
- public Category getCategory(int pageId)
- {
+ public Category getCategory(int pageId) {
long hibernateId = __getCategoryHibernateId(pageId);
if (hibernateId == -1) {
return null;
@@ -559,8 +494,7 @@ public Category getCategory(int pageId)
try {
return new Category(this, hibernateId);
- }
- catch (WikiPageNotFoundException e) {
+ } catch (WikiPageNotFoundException e) {
return null;
}
}
@@ -568,26 +502,22 @@ public Category getCategory(int pageId)
/**
* This returns an iterable over all {@link Category categories}, as returning all category
* objects would be much too expensive.
- *
+ *
* @return An iterable over all categories.
*/
- public Iterable&lt;Category&gt; getCategories()
- {
+ public Iterable&lt;Category&gt; getCategories() {
return new CategoryIterable(this);
}
/**
* Gets the {@link Category categories} for a given {@link Page} identified by its
* {@code pageTitle}.
- *
- * @param pageTitle
- * The title of a {@link Page}, not a category.
+ *
+ * @param pageTitle The title of a {@link Page}, not a category.
* @return The category objects which are associated with the given {@code pageTitle}.
- * @throws WikiPageNotFoundException
- * Thrown if no {@link Page} exists for the given {@code pageTitle}.
+ * @throws WikiPageNotFoundException Thrown if no {@link Page} exists for the given {@code pageTitle}.
*/
- public Set&lt;Category&gt; getCategories(String pageTitle) throws WikiPageNotFoundException
- {
+ public Set&lt;Category&gt; getCategories(String pageTitle) throws WikiPageNotFoundException {
if (pageTitle == null || pageTitle.length() == 0) {
throw new WikiPageNotFoundException();
}
@@ -603,8 +533,7 @@ public Set getCategories(String pageTitle) throws WikiPageNotFoundExce
for (int hibernateId : categoryHibernateIds) {
try {
categorySet.add(new Category(this, hibernateId));
- }
- catch (WikiPageNotFoundException e) {
+ } catch (WikiPageNotFoundException e) {
logger.warn("Could not load Category by it's HibernateId = '" + hibernateId + "'");
}
}
@@ -614,13 +543,11 @@ public Set getCategories(String pageTitle) throws WikiPageNotFoundExce
/**
* Get all wikipedia {@link Category categories}. Returns only an iterable, as a collection may
* not fit into memory for a large wikipedia.
- *
- * @param bufferSize
- * The size of the internal page buffer.
+ *
+ * @param bufferSize The size of the internal page buffer.
* @return An iterable over all categories.
*/
- protected Iterable&lt;Category&gt; getCategories(int bufferSize)
- {
+ protected Iterable&lt;Category&gt; getCategories(int bufferSize) {
return new CategoryIterable(this, bufferSize);
}
@@ -628,13 +555,12 @@ protected Iterable getCategories(int bufferSize)
* Protected method that is much faster than the public version, but exposes too many
* implementation details. Get a set with all category pageIDs. Returning all category objects
* is much too expensive.
- *
+ *
* @return A set with all category pageIDs
*/
// TODO this should be replaced with the buffered category iterator, as it might produce an
// HeapSpace Overflow, if there are too many categories.
- protected Set&lt;Integer&gt; __getCategories()
- {
+ protected Set&lt;Integer&gt; __getCategories() {
Session session = this.__getHibernateSession();
session.beginTransaction();
String sql = "select cat.pageId from Category as cat";
@@ -647,24 +573,21 @@ protected Set __getCategories()
/**
* Get all wikipedia pages. Does not include redirects, as they are only pointers to real pages.
* Returns only an iterable, as a collection may not fit into memory for a large wikipedia.
- *
+ *
* @return An iterable over all pages.
*/
- public Iterable&lt;Page&gt; getPages()
- {
+ public Iterable&lt;Page&gt; getPages() {
return new PageIterable(this, false);
}
/**
* Get all wikipedia pages. Does not include redirects, as they are only pointers to real pages.
* Returns only an iterable, as a collection may not fit into memory for a large wikipedia.
- *
- * @param bufferSize
- * The size of the internal page buffer.
+ *
+ * @param bufferSize The size of the internal page buffer.
* @return An iterable over all pages.
*/
- protected Iterable&lt;Page&gt; getPages(int bufferSize)
- {
+ protected Iterable&lt;Page&gt; getPages(int bufferSize) {
return new PageIterable(this, false, bufferSize);
}
@@ -678,8 +601,7 @@ protected Iterable getPages(int bufferSize)
*
* @return A set with all {@code pageIDs}. Returning all pages is much too expensive.
*/
- protected Set&lt;Integer&gt; __getPages()
- {
+ protected Set&lt;Integer&gt; __getPages() {
Session session = this.__getHibernateSession();
session.beginTransaction();
String sql = "select page.pageId from Page as page";
@@ -692,8 +614,7 @@ protected Set __getPages()
/**
* @return an iterable over all {@code pageIDs} (without redirects)
*/
- public Iterable&lt;Integer&gt; getPageIds()
- {
+ public Iterable&lt;Integer&gt; getPageIds() {
return this.__getPages();
}
@@ -701,45 +622,39 @@ public Iterable getPageIds()
* Get the pages that match the given query. Does not include redirects, as they are only
* pointers to real pages. Attention: this may run very slowly, depending on the size of the
* Wikipedia!
- *
- * @param query
- * A query object containing the query conditions.
+ *
+ * @param query A query object containing the query conditions.
* @return A set of pages that match the given query.
- * @throws WikiApiException
- * Thrown if errors occurred.
+ * @throws WikiApiException Thrown if errors occurred.
*/
- public Iterable&lt;Page&gt; getPages(PageQuery query) throws WikiApiException
- {
+ public Iterable&lt;Page&gt; getPages(PageQuery query) throws WikiApiException {
return new PageQueryIterable(this, query);
}
/**
* Get all articles (pages MINUS disambiguationPages MINUS redirects). Returns only an iterable,
* as a collection may not fit into memory for a large wikipedia.
- *
+ *
* @return An iterable of all article pages.
*/
- public Iterable&lt;Page&gt; getArticles()
- {
+ public Iterable&lt;Page&gt; getArticles() {
return new PageIterable(this, true);
}
/**
* Get all titles (including disambiguation pages and redirects). Returns only an iterable, as a
* collection may not fit into memory for a large wikipedia.
- *
+ *
* @return An iterable of all titles.
*/
- public Iterable&lt;Title&gt; getTitles()
- {
+ public Iterable&lt;Title&gt; getTitles() {
return new TitleIterable(this);
}
/**
* @return The {@link Language} of this Wikipedia.
*/
- public Language getLanguage()
- {
+ public Language getLanguage() {
return this.language;
}
@@ -747,13 +662,11 @@ public Language getLanguage()
* Tests whether a page or redirect with the given title exists. Trying to retrieve a page that
* does not exist in Wikipedia throws an exception. You may catch the exception or use this
* test, depending on your task.
- *
- * @param title
- * The title of the page.
+ *
+ * @param title The title of the page.
* @return {@code True}, if a page or redirect with that title exists, {@code false} otherwise.
*/
- public boolean existsPage(String title)
- {
+ public boolean existsPage(String title) {
if (title == null || title.isEmpty()) {
return false;
}
@@ -761,8 +674,7 @@ public boolean existsPage(String title)
Title t;
try {
t = new Title(title);
- }
- catch (WikiTitleParsingException e) {
+ } catch (WikiTitleParsingException e) {
return false;
}
@@ -782,8 +694,7 @@ public boolean existsPage(String title)
.setParameter("pName", encodedTitle, StandardBasicTypes.STRING);
var returnValue = nativeQuery.uniqueResult();
return returnValue != null;
- }
- finally {
+ } finally {
session.getTransaction().commit();
}
}
@@ -792,12 +703,10 @@ public boolean existsPage(String title)
* Tests whether a page with the given pageID exists. Trying to retrieve a pageID that does not
* exist in Wikipedia throws an exception.
*
- * @param pageID
- * A pageID.
+ * @param pageID A pageID.
* @return {@code True}, if a page with that pageID exists, {@code false} otherwise.
*/
- public boolean existsPage(int pageID)
- {
+ public boolean existsPage(int pageID) {
// This is a hack to provide a much quicker way to test whether a page exists.
// Encoding the title in this way surpasses the normal way of creating a title first.
@@ -820,12 +729,10 @@ public boolean existsPage(int pageID)
* Get the hibernate ID to a given pageID of a page. We need different methods for pages and
* categories here, as a page and a category can have the same ID.
*
- * @param pageID
- * A pageID that should be mapped to the corresponding hibernate ID.
+ * @param pageID A pageID that should be mapped to the corresponding hibernate ID.
* @return The hibernateID of the page with pageID or -1, if the pageID is not valid
*/
- protected long __getPageHibernateId(int pageID)
- {
+ protected long __getPageHibernateId(int pageID) {
long hibernateID = -1;
// first look in the id mapping cache
@@ -855,12 +762,10 @@ protected long __getPageHibernateId(int pageID)
* Get the hibernate ID to a given pageID of a category. We need different methods for pages and
* categories here, as a page and a category can have the same ID.
*
- * @param pageID
- * A pageID that should be mapped to the corresponding hibernate ID.
+ * @param pageID A pageID that should be mapped to the corresponding hibernate ID.
* @return The hibernateID of the page with pageID or -1, if the pageID is not valid
*/
- protected long __getCategoryHibernateId(int pageID)
- {
+ protected long __getCategoryHibernateId(int pageID) {
long hibernateID = -1;
// first look in the id mapping cache
@@ -888,36 +793,32 @@ protected long __getCategoryHibernateId(int pageID)
/**
* @return A {@link MetaData} object containing all meta data about this instance of Wikipedia.
*/
- public MetaData getMetaData()
- {
+ public MetaData getMetaData() {
return this.metaData;
}
/**
* @return The {@link DatabaseConfiguration} object that was used to create the Wikipedia
- * object.
+ * object.
*/
- public DatabaseConfiguration getDatabaseConfiguration()
- {
+ public DatabaseConfiguration getDatabaseConfiguration() {
return this.dbConfig;
}
/**
* @return Shortcut for getting a hibernate session.
*/
- protected Session __getHibernateSession()
- {
+ protected Session __getHibernateSession() {
return WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession();
}
/**
* The ID consists of the host, the database, and the language. This should be unique in most
* cases.
- *
+ *
* @return Returns a unique ID for this Wikipedia object.
*/
- public String getWikipediaId()
- {
+ public String getWikipediaId() {
StringBuilder sb = new StringBuilder();
sb.append(this.getDatabaseConfiguration().getHost());
sb.append("_");
@@ -928,14 +829,15 @@ public String getWikipediaId()
}
private static class ValueComparator
- implements Comparator&lt;Entry&lt;Integer, Double&gt;&gt;
- {
+ implements Comparator&lt;Entry&lt;Integer, Double&gt;&gt; {
@Override
- public int compare(Entry&lt;Integer, Double&gt; e1, Entry&lt;Integer, Double&gt; e2)
- {
+ public int compare(Entry&lt;Integer, Double&gt; e1, Entry&lt;Integer, Double&gt; e2) {
return Double.compare(e2.getValue(), e1.getValue());
}
}
+ private record PageTuple(int id, String name) {
+
+ }
}
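
Usage sketch: getSimilarPages(String, int) is promoted to the public API here, with the untyped Object[] projection replaced by the typed PageTuple record. A minimal example of calling it (dbConfig is an assumed, pre-configured DatabaseConfiguration; the map values are Levenshtein distances, so smaller means more similar):

    // Sketch only: 'dbConfig' is assumed to exist; getSimilarPages throws WikiApiException.
    Wikipedia wiki = new Wikipedia(dbConfig);
    Map&lt;Page, Double&gt; similar = wiki.getSimilarPages("Steam boat", 5);
    // Smaller distance means a closer match; iteration order is unspecified.
    similar.forEach((page, distance) -&gt; System.out.println(page.getTitle() + " -> " + distance));
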
diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java
index c4b41bdc..4fd1453e 100644
--- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java
+++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java
@@ -22,6 +22,7 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -29,7 +30,6 @@
import org.dkpro.jwpl.api.exception.WikiApiException;
import org.dkpro.jwpl.api.exception.WikiPageNotFoundException;
import org.dkpro.jwpl.api.util.ApiUtilities;
-import org.hibernate.Session;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -51,7 +51,7 @@ public class WikipediaInfo
private Map&lt;Integer, Integer&gt; degreeDistribution;
private Set&lt;Integer&gt; categorizedArticleSet;
- private Wikipedia wiki;
+ private final Wikipedia wiki;
/**
* Get infos for the whole wikipedia.
@@ -63,9 +63,7 @@ public class WikipediaInfo
*/
public WikipediaInfo(Wikipedia pWiki) throws WikiApiException
{
- this.wiki = pWiki;
- new WikipediaInfo(this.wiki.getPages());
-
+ this(pWiki.getPages(), pWiki);
}
/**
@@ -73,15 +71,21 @@ public WikipediaInfo(Wikipedia pWiki) throws WikiApiException
*
* @param pPages
* A set of pages. Only this subset of wiki pages is used in the info object.
- *
+ * @param pWiki
+ * The wiki object.
* @throws WikiApiException Thrown if errors occurred.
*/
- public WikipediaInfo(Iterable&lt;Page&gt; pPages) throws WikiApiException
+ public WikipediaInfo(Iterable&lt;Page&gt; pPages, Wikipedia pWiki) throws WikiApiException
{
if (pPages == null) {
throw new WikiApiException("The page set has to be initialized.");
}
+ if (pWiki == null) {
+ throw new WikiApiException("The wiki instance is not set.");
+ }
+
+ wiki = pWiki;
pages = pPages;
averageFanOut = -1.0; // lazy initialization => it is computed and stored when it is
// accessed
@@ -91,11 +95,11 @@ public WikipediaInfo(Iterable pPages) throws WikiApiException
// get number of pages
numberOfPages = 0;
- while (pages.iterator().hasNext()) {
+ Iterator&lt;Page&gt; it = pages.iterator();
+ while (it.hasNext()) {
numberOfPages++;
- pages.iterator().next();
+ it.next();
}
-
}
/**
@@ -109,34 +113,14 @@ public WikipediaInfo(Iterable pPages) throws WikiApiException
private double computeAverageFanOut(Iterable&lt;Page&gt; pages)
{
- Set&lt;Integer&gt; pageIDs = new HashSet&lt;&gt;();
- while (pages.iterator().hasNext()) {
- pageIDs.add(pages.iterator().next().getPageId());
- }
+ final Iterator&lt;Page&gt; it = pages.iterator();
- if (pageIDs.isEmpty()) {
- logger.warn("Cannot compute average fan-out of an empty page set.");
- return 0.0;
+ double sum = 0;
+ while (it.hasNext()) {
+ sum += it.next().getOutlinks().size();
}
- int fanOutCounter = 0;
-
- Session session = this.wiki.__getHibernateSession();
- session.beginTransaction();
- for (Object o : session.createQuery("select page.outLinks, page.pageId from Page as page")
- .list()) {
- Object[] row = (Object[]) o;
- Set&lt;Integer&gt; outLinks = (Set&lt;Integer&gt;) row[0];
- Integer pageId = (Integer) row[1];
-
- // if the current page ID is in the desired result set => add outlink value
- if (pageIDs.contains(pageId)) {
- fanOutCounter += outLinks.size();
- }
- }
- session.getTransaction().commit();
-
- return (double) fanOutCounter / this.getNumberOfPages();
+ return sum / this.getNumberOfPages();
}
/**
@@ -489,5 +473,4 @@ private int computeShortestPathLenghts(int pStartNode, CategoryGraph catGraph)
}
return shortestPathLengthSum;
}
-
}
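
The one-argument WikipediaInfo constructor previously built a second instance via new WikipediaInfo(this.wiki.getPages()) and discarded it, leaving the current object's fields uninitialized; it now delegates to the two-argument constructor. A minimal sketch of both entry points (again assuming a pre-configured DatabaseConfiguration named dbConfig):

    Wikipedia wiki = new Wikipedia(dbConfig);
    WikipediaInfo allPages = new WikipediaInfo(wiki);                     // whole wiki
    WikipediaInfo articles = new WikipediaInfo(wiki.getArticles(), wiki); // page subset
    double fanOut = allPages.getAverageFanOut(); // computed lazily on first access
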
diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java
index 1b427e5d..f73ccea6 100644
--- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java
+++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java
@@ -69,14 +69,14 @@ public double distance(String s, String t)
cost = 1;
}
// Step 6
- d[i][j] = Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
+ d[i][j] = minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
}
}
// Step 7
return Integer.valueOf(d[n][m]).doubleValue();
}
- private int Minimum(int a, int b, int c)
+ private int minimum(int a, int b, int c)
{
int min;
min = a;
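
The Minimum-to-minimum rename only brings the helper in line with Java naming conventions; computed distances are unchanged. A quick sanity check (not part of the patch):

    LevenshteinStringDistance lsd = new LevenshteinStringDistance();
    double d = lsd.distance("Steam_boat", "steam_boat"); // 1.0: one character substitution
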
diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java
new file mode 100644
index 00000000..700f56f2
--- /dev/null
+++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.dkpro.jwpl.api;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+public class WikipediaInfoTest
+ extends BaseJWPLTest
+{
+
+ private static WikipediaInfo wikipediaInfo;
+
+ /**
+ * Made this static so that the following tests don't run if the assumption fails. (With
+ * AT_Before, tests would also not be executed, but would be marked as passed.) This could be
+ * changed back as soon as JUnit ignores tests after failed assumptions.
+ */
+ @BeforeAll
+ public static void setupWikipedia()
+ {
+ DatabaseConfiguration db = obtainHSDLDBConfiguration();
+ try {
+ wikipediaInfo = new WikipediaInfo(new Wikipedia(db));
+ }
+ catch (Exception e) {
+ fail("WikipediaInfo could not be initialized: " + e.getLocalizedMessage());
+ }
+ }
+
+ @Test
+ public void testGetAverageFanOut() {
+ double average = wikipediaInfo.getAverageFanOut();
+ assertTrue(average > 0);
+ assertEquals(1.1176470588235294d, average);
+ // call it a second time: the lazily computed value must be stable
+ average = wikipediaInfo.getAverageFanOut();
+ assertEquals(1.1176470588235294d, average);
+ }
+
+}
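
For reference, the expected constant equals 19/17 = 1.1176470588235294, which would correspond to 19 outlinks summed over 17 pages; those per-fixture counts are inferred from the constant itself, not verified against the test database.
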
diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java
index 8b847bd3..d3881f0e 100644
--- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java
+++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java
@@ -26,6 +26,7 @@
import java.lang.invoke.MethodHandles;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import java.util.UUID;
@@ -34,6 +35,8 @@
import org.dkpro.jwpl.api.exception.WikiTitleParsingException;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -437,6 +440,20 @@ public void testGetLanguage()
assertNotNull(wiki.getLanguage());
}
+ @ParameterizedTest
+ @ValueSource(strings = {
+ "Wikipedia_AP",
+ "Wikipedia_API"
+ })
+ public void testGetSimilarPages(String val) throws WikiApiException {
+ final Map&lt;Page, Double&gt; similarPages = wiki.getSimilarPages(val, 1);
+ assertNotNull(similarPages);
+ assertEquals(1, similarPages.size());
+ Map.Entry&lt;Page, Double&gt; entry = similarPages.entrySet().iterator().next();
+ assertTrue(entry.getKey().getTitle().getRawTitleText().startsWith(val));
+ assertTrue(entry.getValue() <= 1);
+ }
+
/* INTERNAL TEST HELPER METHODS */
private void getNotExistingPage(String title)
diff --git a/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script b/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script
index 7d4f5d8f..44f92f20 100644
--- a/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script
+++ b/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script
@@ -29,7 +29,7 @@ SET FILES NIO TRUE
SET FILES NIO SIZE 256
SET FILES LOG TRUE
SET FILES LOG SIZE 200
-SET FILES CHECK 3146
+SET FILES CHECK 3225
SET DATABASE COLLATION "German" NO PAD
CREATE USER SA PASSWORD DIGEST 'd41d8cd98f00b204e9800998ecf8427e'
CREATE SCHEMA PUBLIC AUTHORIZATION DBA