From c0441c8b812f4eb870fdc36c5d97a7dd65732bfc Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Fri, 1 Dec 2023 14:45:25 +0100 Subject: [PATCH] Fix deprecation warnings in Wikipedia and WikipediaInfo Adds related unit tests --- .../java/org/dkpro/jwpl/api/Wikipedia.java | 32 +++------- .../org/dkpro/jwpl/api/WikipediaInfo.java | 61 ++++++------------- .../distance/LevenshteinStringDistance.java | 4 +- .../org/dkpro/jwpl/api/WikipediaInfoTest.java | 58 ++++++++++++++++++ .../org/dkpro/jwpl/api/WikipediaTest.java | 18 ++++++ 5 files changed, 107 insertions(+), 66 deletions(-) create mode 100644 dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java index 6e73910d..2f2b239f 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java @@ -18,16 +18,8 @@ package org.dkpro.jwpl.api; import java.lang.invoke.MethodHandles; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.Map.Entry; -import java.util.Set; -import java.util.TreeSet; import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.api.exception.WikiInitializationException; @@ -463,8 +455,7 @@ public Iterable getDiscussionArchives(Page articlePage) throws WikiApiExce * @throws WikiApiException * Thrown if errors occurred. */ - //// I do not want to make this public at the moment (TZ, March, 2007) - protected Map getSimilarPages(String pPattern, int pSize) throws WikiApiException + public Map getSimilarPages(String pPattern, int pSize) throws WikiApiException { Title title = new Title(pPattern); String pattern = title.getWikiStyleTitle(); @@ -476,20 +467,17 @@ protected Map getSimilarPages(String pPattern, int pSize) throws W // holds a mapping of the best distance values to page IDs Map distanceMap = new HashMap<>(); + final LevenshteinStringDistance lsd = new LevenshteinStringDistance(); Session session = this.__getHibernateSession(); session.beginTransaction(); - for (Object o : session.createQuery("select pml.pageID, pml.name from PageMapLine as pml") + for (PageTuple o : session.createQuery("select new org.dkpro.jwpl.api.Wikipedia$PageTuple(pml.pageID, pml.name) from PageMapLine as pml", PageTuple.class) .list()) { - Object[] row = (Object[]) o; - int pageID = (Integer) row[0]; - String pageName = (String) row[1]; // this returns a similarity - if we want to use it, we have to change the semantics the // ordering of the results - // double distance = new Levenshtein().getSimilarity(pageName, pPattern); - double distance = new LevenshteinStringDistance().distance(pageName, pattern); + double distance = lsd.distance(o.name(), pattern); - distanceMap.put(pageID, distance); + distanceMap.put(o.id(), distance); // if there are more than "pSize" entries in the map remove the last one (it has the // biggest distance) @@ -512,10 +500,7 @@ protected Map getSimilarPages(String pPattern, int pSize) throws W page = this.getPage(pageID); } catch (WikiPageNotFoundException e) { - logger.error("Page with pageID " + pageID - + " could not be found. Fatal error. Terminating."); - e.printStackTrace(); - System.exit(1); + logger.error("Page with pageID {} could not be found. Fatal error. Terminating.", pageID, e); } pageMap.put(page, distanceMap.get(pageID)); } @@ -938,4 +923,7 @@ public int compare(Entry e1, Entry e2) } } + private record PageTuple(int id, String name) { + + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java index c4b41bdc..b95a3912 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java @@ -18,13 +18,7 @@ package org.dkpro.jwpl.api; import java.lang.invoke.MethodHandles; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.*; import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; @@ -51,7 +45,7 @@ public class WikipediaInfo private Map degreeDistribution; private Set categorizedArticleSet; - private Wikipedia wiki; + private final Wikipedia wiki; /** * Get infos for the whole wikipedia. @@ -63,9 +57,7 @@ public class WikipediaInfo */ public WikipediaInfo(Wikipedia pWiki) throws WikiApiException { - this.wiki = pWiki; - new WikipediaInfo(this.wiki.getPages()); - + this(pWiki.getPages(), pWiki); } /** @@ -73,15 +65,21 @@ public WikipediaInfo(Wikipedia pWiki) throws WikiApiException * * @param pPages * A set of pages. Only this subset of wiki pages is used in the info object. - * + * @param pWiki + * The wiki object. * @throws WikiApiException Thrown if errors occurred. */ - public WikipediaInfo(Iterable pPages) throws WikiApiException + public WikipediaInfo(Iterable pPages, Wikipedia pWiki) throws WikiApiException { if (pPages == null) { throw new WikiApiException("The page set has to be initialized."); } + if (pWiki == null) { + throw new WikiApiException("The wiki instance is not set."); + } + + wiki = pWiki; pages = pPages; averageFanOut = -1.0; // lazy initialization => it is computed and stored when it is // accessed @@ -91,11 +89,11 @@ public WikipediaInfo(Iterable pPages) throws WikiApiException // get number of pages numberOfPages = 0; - while (pages.iterator().hasNext()) { + Iterator it = pages.iterator(); + while (it.hasNext()) { numberOfPages++; - pages.iterator().next(); + it.next(); } - } /** @@ -109,34 +107,14 @@ public WikipediaInfo(Iterable pPages) throws WikiApiException private double computeAverageFanOut(Iterable pages) { - Set pageIDs = new HashSet<>(); - while (pages.iterator().hasNext()) { - pageIDs.add(pages.iterator().next().getPageId()); - } + final Iterator it = pages.iterator(); - if (pageIDs.isEmpty()) { - logger.warn("Cannot compute average fan-out of an empty page set."); - return 0.0; + double sum = 0; + while (it.hasNext()) { + sum += it.next().getOutlinks().size(); } - int fanOutCounter = 0; - - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - for (Object o : session.createQuery("select page.outLinks, page.pageId from Page as page") - .list()) { - Object[] row = (Object[]) o; - Set outLinks = (Set) row[0]; - Integer pageId = (Integer) row[1]; - - // if the current page ID is in the desired result set => add outlink value - if (pageIDs.contains(pageId)) { - fanOutCounter += outLinks.size(); - } - } - session.getTransaction().commit(); - - return (double) fanOutCounter / this.getNumberOfPages(); + return sum / this.getNumberOfPages(); } /** @@ -489,5 +467,4 @@ private int computeShortestPathLenghts(int pStartNode, CategoryGraph catGraph) } return shortestPathLengthSum; } - } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java index 1b427e5d..f73ccea6 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/distance/LevenshteinStringDistance.java @@ -69,14 +69,14 @@ public double distance(String s, String t) cost = 1; } // Step 6 - d[i][j] = Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost); + d[i][j] = minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost); } } // Step 7 return Integer.valueOf(d[n][m]).doubleValue(); } - private int Minimum(int a, int b, int c) + private int minimum(int a, int b, int c) { int min; min = a; diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java new file mode 100644 index 00000000..bace5624 --- /dev/null +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaInfoTest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.jwpl.api; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +public class WikipediaInfoTest + extends BaseJWPLTest +{ + + private static WikipediaInfo wikipediaInfo; + + /** + * Made this static so that following tests don't run if assumption fails. (With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions + */ + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration(); + try { + wikipediaInfo = new WikipediaInfo(new Wikipedia(db)); + } + catch (Exception e) { + fail("WikipediaInfo could not be initialized: " + e.getLocalizedMessage()); + } + } + + @Test + public void testGetAverageFanOut() { + double average = wikipediaInfo.getAverageFanOut(); + assertTrue(average > 0); + assertEquals(1.1176470588235294d, average); + //call it twice + average = wikipediaInfo.getAverageFanOut(); + assertEquals(1.1176470588235294d, average); + } + +} diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java index 8b847bd3..7db09112 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java @@ -26,6 +26,7 @@ import java.lang.invoke.MethodHandles; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.UUID; @@ -33,7 +34,10 @@ import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; import org.dkpro.jwpl.api.exception.WikiTitleParsingException; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -437,6 +441,20 @@ public void testGetLanguage() assertNotNull(wiki.getLanguage()); } + @ParameterizedTest + @ValueSource(strings = { + "Wikipedia_AP", + "Wikipedia_API" + }) + public void testGetSimilarPages(String val) throws WikiApiException { + final Map similarPages = wiki.getSimilarPages(val, 1); + assertNotNull(similarPages); + assertEquals(1, similarPages.size()); + Map.Entry entry = similarPages.entrySet().iterator().next(); + assertTrue(entry.getKey().getTitle().getRawTitleText().startsWith(val)); + assertTrue(entry.getValue() <= 1); + } + /* INTERNAL TEST HELPER METHODS */ private void getNotExistingPage(String title)