Skip to content

Commit

Permalink
update functions and tests to align with RKorAPClient 0.9.0
Browse files Browse the repository at this point in the history
Change-Id: I0221c6cc0b9180bc83feb96651e0a5f204846451
  • Loading branch information
„feldmueller“ authored and kupietz committed Jan 21, 2025
1 parent e8c7adc commit 3386c1f
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 4 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Version history

## 0.9.0

- Updates recommended RKorAPClient version to 0.9.0
- Added `matchStart` and `matchEnd` columns to corpusQuery results, containing the start and end positions of the match in the text
- Added `mergeDuplicateCollocates` function to merge collocation analysis results for different context positions
- Added a `query` column to collocation analysis results
- Improved documentation for the `withinSpan` parameter in `collocationAnalysis` functions
- Updated `textMetadata` method to use new metadata fields API, if available, to retrieve custom metadata for a text based on its sigle
- Added new unit tests to cover the new features and changes

## 0.8.1

- Updates recommended RKorAPClient version to 0.8.1
Expand Down
11 changes: 8 additions & 3 deletions KorAPClient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from packaging import version
from rpy2.robjects.methods import RS4

CURRENT_R_PACKAGE_VERSION = "0.8.1"
CURRENT_R_PACKAGE_VERSION = "0.9.0"

KorAPClient = packages.importr('RKorAPClient')
if version.parse(KorAPClient.__version__) < version.parse(CURRENT_R_PACKAGE_VERSION):
Expand Down Expand Up @@ -206,7 +206,7 @@ def collocationAnalysis(self, node, vc="", **kwargs):
- **topCollocatesLimit** - limit analysis to the n most frequent collocates in the search hits sample
- **searchHitsSampleLimit** - limit the size of the search hits sample
- **ignoreCollocateCase** - bool, set to True if collocate case should be ignored
- **withinSpan** - KorAP span specification for collocations to be searched within
- **withinSpan** - KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`
- **exactFrequencies** - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
- **stopwords** - vector of stopwords not to be considered as collocates
- **seed** - seed for random page collecting order
Expand All @@ -229,6 +229,11 @@ def collocationAnalysis(self, node, vc="", **kwargs):
"""
return KorAPClient.collocationAnalysis(self, node, vc, **kwargs)

def mergeDuplicateCollocates(self, *args, **kwargs):
    """Merge collocation analysis results for different context positions.

    All positional and keyword arguments are forwarded unchanged to
    `KorAPClient.mergeDuplicateCollocates`; the connection object itself
    is not passed on.

    Returns:
        Whatever the wrapped `mergeDuplicateCollocates` call returns.
    """
    merged = KorAPClient.mergeDuplicateCollocates(*args, **kwargs)
    return merged


def corpusQuery(self, *args, **kwargs):
"""Query search term(s).
Expand All @@ -237,7 +242,7 @@ def corpusQuery(self, *args, **kwargs):
- **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
- **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
- **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
- **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass"]`)
- **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass", "matchStart", "matchEnd"]`)
- **verbose** - (default = `self.verbose`)
Returns:
Expand Down
46 changes: 46 additions & 0 deletions KorAPClient/tests/test_korapclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,52 @@ def test_textMetadata(self):
self.assertIn('creationDate', df.columns)
self.assertIn('pubPlace', df.columns)
self.assertIn('author', df.columns)

def test_corpus_query_token_api(self):
    """Check the match tokens and their left/right contexts for a focus query."""
    query = self.kcon.corpusQuery(
        "focus([tt/p=ADJA] {Newstickeritis})",
        vc="corpusSigle=/W.D17/",
        metadataOnly=False,
    )
    hits = query.fetchNext().slots['collectedMatches']

    self.assertGreater(len(hits), 10)

    # The focused part of every hit must be one and the same value.
    distinct = hits['tokens.match'].unique()
    self.assertEqual(len(distinct), 1)
    self.assertEqual(distinct[0], "Newstickeritis")

    # At least one hit contains 'reine' in its left context ...
    lefts = hits['tokens.left']
    self.assertTrue(any('reine' in ctx for ctx in lefts))

    # ... and at least one contains 'Begriff' in its right context.
    rights = hits['tokens.right']
    self.assertTrue(any('Begriff' in ctx for ctx in rights))

def test_match_start_and_end(self):
    """Check the `matchStart` and `matchEnd` columns of a focus query result."""
    fetched = self.kcon.corpusQuery(
        "focus([tt/p=ADJA] {Newstickeritis})",
        vc="corpusSigle=/W.D17/",
        metadataOnly=False,
    ).fetchNext()
    hits = fetched.slots['collectedMatches']

    ends = hits['matchEnd']
    self.assertGreater(ends.max(), 1000)
    # For this query start and end position are expected to coincide in every row.
    self.assertTrue((ends == hits['matchStart']).all())

def test_extended_metadata_fields_ked(self):
    """Query the KED instance with custom `KED.*` fields and sanity-check the values."""
    requested_fields = [
        "textSigle", "pubDate", "pubPlace", "availability", "textClass",
        "snippet", "tokens",
        "KED.cover1Herder", "KED.cover2Herder", "KED.cover3Herder",
        "KED.cover4Herder", "KED.cover5Herder",
        "KED.nPara", "KED.nPunct1kTks", "KED.nSent", "KED.nToks",
        "KED.nToksSentMd", "KED.nTyps",
        "KED.rcpnt", "KED.rcpntLabel",
        "KED.strtgy", "KED.strtgyLabel",
        "KED.topic", "KED.topicLabel",
        "KED.txttyp", "KED.txttypLabel",
    ]
    kcon_ked = KorAPConnection(KorAPUrl="https://korap.ids-mannheim.de/instance/ked", verbose=True)
    fetched = kcon_ked.corpusQuery("einfache", fields=requested_fields).fetchAll()
    df = fetched.slots['collectedMatches']

    self.assertGreater(len(df), 0)

    # Numeric fields are cast to float before comparing.
    token_counts = df['KED.nToks'].astype(float)
    self.assertGreater(min(token_counts), 100)

    sentence_counts = df['KED.nSent'].astype(float)
    self.assertGreater(min(sentence_counts), 8)

    # The shortest `KED.rcpnt` string must still be longer than five characters.
    recipient_lengths = df['KED.rcpnt'].str.len()
    self.assertGreater(min(recipient_lengths), 5)



if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "KorAPClient"
version = "0.8.1"
version = "0.9.0"
description = "Client package to access KorAP's web service API"
authors = [
{name = "Marc Kupietz",email = "[email protected]"},
Expand Down

0 comments on commit 3386c1f

Please sign in to comment.