diff --git a/CHANGELOG.md b/CHANGELOG.md index 13a764d..bab23fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Version history +## 0.9.0 + +- Updates recommended RKorAPClient version to 0.9.0 +- Added `matchStart` and `matchEnd` columns to corpusQuery results, containing the start and end positions of the match in the text +- Added `mergeDuplicateCollocates` function to merge collocation analysis results for different context positions +- Added a query column to collocation analysis results +- Improved documentation for span parameter in `collocationAnalysis` functions +- Updated `textMetadata` method to use new metadata fields API, if available, to retrieve custom metadata for a text based on its sigle +- Added new unit tests to cover the new features and changes + ## 0.8.1 - Updates recommended RKorAPClient version to 0.8.1 diff --git a/KorAPClient/__init__.py b/KorAPClient/__init__.py index 9d46471..35c4c3a 100644 --- a/KorAPClient/__init__.py +++ b/KorAPClient/__init__.py @@ -15,7 +15,7 @@ from packaging import version from rpy2.robjects.methods import RS4 -CURRENT_R_PACKAGE_VERSION = "0.8.1" +CURRENT_R_PACKAGE_VERSION = "0.9.0" KorAPClient = packages.importr('RKorAPClient') if version.parse(KorAPClient.__version__) < version.parse(CURRENT_R_PACKAGE_VERSION): @@ -206,7 +206,7 @@ def collocationAnalysis(self, node, vc="", **kwargs): - **topCollocatesLimit** - limit analysis to the n most frequent collocates in the search hits sample - **searchHitsSampleLimit** - limit the size of the search hits sample - **ignoreCollocateCase** - bool, set to True if collocate case should be ignored - - **withinSpan** - KorAP span specification for collocations to be searched within + - **withinSpan** - KorAP span specification (see the KorAP query language documentation on spans) for collocations to be searched within. 
Defaults to `base/s=s` - **exactFrequencies** - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies - **stopwords** - vector of stopwords not to be considered as collocates - **seed** - seed for random page collecting order @@ -229,6 +229,11 @@ def collocationAnalysis(self, node, vc="", **kwargs): """ return KorAPClient.collocationAnalysis(self, node, vc, **kwargs) + def mergeDuplicateCollocates(self, *args, **kwargs): + """Merge collocation analysis results for different context positions.""" + return KorAPClient.mergeDuplicateCollocates(*args, **kwargs) + + def corpusQuery(self, *args, **kwargs): """Query search term(s). @@ -237,7 +242,7 @@ def corpusQuery(self, *args, **kwargs): - **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser - **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. 
(default = True) - **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`) - - **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass"]`) + - **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass", "matchStart", "matchEnd"]`) - **verbose** - (default = `self.verbose`) Returns: diff --git a/KorAPClient/tests/test_korapclient.py b/KorAPClient/tests/test_korapclient.py index eb44d1c..d4673b3 100644 --- a/KorAPClient/tests/test_korapclient.py +++ b/KorAPClient/tests/test_korapclient.py @@ -89,6 +89,52 @@ def test_textMetadata(self): self.assertIn('creationDate', df.columns) self.assertIn('pubPlace', df.columns) self.assertIn('author', df.columns) + + def test_corpus_query_token_api(self): + q = self.kcon.corpusQuery("focus([tt/p=ADJA] {Newstickeritis})", vc="corpusSigle=/W.D17/", metadataOnly=False) + q = q.fetchNext() + matches = q.slots['collectedMatches'] + + self.assertGreater(len(matches), 10) + + unique_matches = matches['tokens.match'].unique() + self.assertEqual(len(unique_matches), 1) + self.assertEqual(unique_matches[0], "Newstickeritis") + + left_contexts = matches['tokens.left'] + self.assertTrue(any('reine' in context for context in left_contexts)) + + right_contexts = matches['tokens.right'] + self.assertTrue(any('Begriff' in context for context in right_contexts)) + + def test_match_start_and_end(self): + q = self.kcon.corpusQuery("focus([tt/p=ADJA] {Newstickeritis})", vc="corpusSigle=/W.D17/", metadataOnly=False) + q = q.fetchNext() + matches = q.slots['collectedMatches'] + + self.assertGreater(matches['matchEnd'].max(), 1000) + self.assertTrue((matches['matchEnd'] == matches['matchStart']).all()) + + def test_extended_metadata_fields_ked(self): + kcon_ked = 
KorAPConnection(KorAPUrl="https://korap.ids-mannheim.de/instance/ked", verbose=True) + q = kcon_ked.corpusQuery( + "einfache", + fields=[ + "textSigle", "pubDate", "pubPlace", "availability", "textClass", + "snippet", "tokens", "KED.cover1Herder", "KED.cover2Herder", + "KED.cover3Herder", "KED.cover4Herder", "KED.cover5Herder", + "KED.nPara", "KED.nPunct1kTks", "KED.nSent", "KED.nToks", + "KED.nToksSentMd", "KED.nTyps", "KED.rcpnt", "KED.rcpntLabel", + "KED.strtgy", "KED.strtgyLabel", "KED.topic", "KED.topicLabel", + "KED.txttyp", "KED.txttypLabel" + ] + ).fetchAll() + df = q.slots['collectedMatches'] + self.assertGreater(len(df), 0) + self.assertGreater(min(df['KED.nToks'].astype(float)), 100) + self.assertGreater(min(df['KED.nSent'].astype(float)), 8) + self.assertGreater(min(df['KED.rcpnt'].str.len()), 5) + if __name__ == '__main__': diff --git a/pyproject.toml b/pyproject.toml index 9cc876d..c32b40c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "KorAPClient" -version = "0.8.1" +version = "0.9.0" description = "Client package to access KorAP's web service API" authors = [ {name = "Marc Kupietz",email = "kupietz@ids-mannheim.de"},