diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b6e4761
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,129 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/LICENSE b/LICENSE
index 7c62f0b..866a0e2 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2019-2020 msg systems ag
+Copyright 2019-2021 msg systems ag
The Holmes library is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..709fa8b
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include SHORTREADME.md
+global-include *.cfg
+global-include *.csv
+global-include LICENSE
diff --git a/README.md b/README.md
index b7c8644..bad5897 100644
--- a/README.md
+++ b/README.md
@@ -7,19 +7,20 @@ Author: Richard Paul Hudson, msg syste
- [1.2 Installation](#installation)
- [1.2.1 Prerequisites](#prerequisites)
- [1.2.2 Library installation](#library-installation)
- - [1.2.3 Installing the spaCy models](#installing-the-spacy-models)
+ - [1.2.3 Installing the spaCy and coreferee models](#installing-the-spacy-and-coreferee-models)
- [1.2.4 Comments about deploying Holmes in an
enterprise
environment](#comments-about-deploying-holmes-in-an-enterprise-environment)
- - [1.2.5 Using multiprocessing](#using-multiprocessing)
- - [1.2.6 Resource requirements](#resource-requirements)
+ - [1.2.5 Resource requirements](#resource-requirements)
- [1.3 Getting started](#getting-started)
- [2. Word-level matching strategies](#word-level-matching-strategies)
- [2.1 Direct matching](#direct-matching)
- [2.2 Derivation-based matching](#derivation-based-matching)
- - [2.3 Named entity matching](#named-entity-matching)
+ - [2.3 Named-entity matching](#named-entity-matching)
- [2.4 Ontology-based matching](#ontology-based-matching)
- [2.5 Embedding-based matching](#embedding-based-matching)
+ - [2.6 Named-entity-embedding-based matching](#named-entity-embedding-based-matching)
+ - [2.7 Initial-question-word matching](#initial-question-word-matching)
- [3. Coreference resolution](#coreference-resolution)
- [4. Writing effective search
phrases](#writing-effective-search-phrases)
@@ -55,7 +56,7 @@ Author: Richard Paul Hudson, msg syste
- [6 Interfaces intended for public
use](#interfaces-intended-for-public-use)
- [6.1 `Manager`](#manager)
- - [6.2 `MultiprocessingManager`](#multiprocessing-manager)
+ - [6.2 `manager.nlp`](#manager.nlp)
- [6.3 `Ontology`](#ontology)
- [6.4 `SupervisedTopicTrainingBasis`](#supervised-topic-training-basis)
(returned from `Manager.get_supervised_topic_training_basis()`)
@@ -64,16 +65,10 @@ Author: Richard Paul Hudson, msg syste
- [6.6 `SupervisedTopicClassifier`](#supervised-topic-classifier)
(returned from `SupervisedTopicModelTrainer.classifier()` and
`Manager.deserialize_supervised_topic_classifier()`)
- - [6.7 `Match` (returned from
- `Manager.match()`)](#match)
- - [6.8 `WordMatch` (returned from
- `Manager.match().word_matches`)](#wordmatch)
- - [6.9 `Subword` (returned from `word_match.subword`)](#subword)
- - [6.10 Dictionary returned from
- `Manager.match_returning_dictionaries()`)](#dictionary)
- - [6.11 `TopicMatch`(returned from `Manager.topic_match_documents_against()`)](#topic-match)
- - [6.12 Dictionary returned from
- `Manager.topic_match_documents_returning_dictionaries_against()` and `MultiprocessingManager.topic_match_documents_returning_dictionaries_against()`](#topic-match-dictionary)
+ - [6.7 Dictionary returned from
+    `Manager.match()`](#dictionary)
+ - [6.8 Dictionary returned from
+ `Manager.topic_match_documents_against()`](#topic-match-dictionary)
- [7 A note on the license](#a-note-on-the-license)
- [8 Information for developers](#information-for-developers)
- [8.1 How it works](#how-it-works)
@@ -95,6 +90,7 @@ Author: Richard Paul Hudson, msg syste
- [8.4.2 Version 2.1.0](#version-210)
- [8.4.3 Version 2.2.0](#version-220)
- [8.4.4 Version 2.2.1](#version-221)
+ - [8.4.5 Version 3.0.0](#version-300)
### 1. Introduction
@@ -102,9 +98,11 @@ Author: Richard Paul Hudson, msg syste
#### 1.1 The basic idea
-**Holmes** is a Python 3 library (tested with version 3.7.7) that supports a number of
-use cases involving information extraction from English and German texts. In all use cases, the information extraction
-is based on analysing the semantic relationships expressed by the component parts of each sentence:
+**Holmes** is a Python 3 library (tested with version 3.9.5) running on top of
+[spaCy](https://spacy.io/) (tested with version 3.1.2) that supports a number of use cases
+involving information extraction from English and German texts. In all use cases, the information
+extraction is based on analysing the semantic relationships expressed by the component parts of
+each sentence:
- In the [chatbot](#getting-started) use case, the system is configured using one or more **search phrases**.
Holmes then looks for structures whose meanings correspond to those of these search phrases within
@@ -115,7 +113,7 @@ corresponds to one or more such words in the document. Both the fact that a sear
- The [structural extraction](#structural-extraction) use case uses exactly the same
[structural matching](#how-it-works-structural-matching) technology as the chatbot use
case, but searching takes place with respect to a pre-existing document or documents that are typically much
-longer than the snippets analysed in the chatbot use case, and the aim to extract and store structured information. For example, a set of business articles could be searched to find all the places where one company is said to be planning to
+longer than the snippets analysed in the chatbot use case, and the aim is to extract and store structured information. For example, a set of business articles could be searched to find all the places where one company is said to be planning to
take over a second company. The identities of the companies concerned could then be stored in a database.
- The [topic matching](#topic-matching) use case aims to find passages in a document or documents whose meaning
@@ -165,24 +163,20 @@ before installing Holmes.
##### 1.2.2 Library installation
-Because of a conflict between the install scripts of two of Holmes' dependencies
-(`neuralcoref` and `numpy`), `numpy` has to be installed before the Holmes installation
-script runs. Install Holmes using the following commands:
+Install Holmes using the following commands:
*Linux:*
```
-pip3 install numpy
pip3 install holmes-extractor
```
*Windows:*
```
-pip install numpy
pip install holmes-extractor
```
To upgrade from a previous Holmes version, issue the following commands and then
-[reissue the commands to download the spaCy models](#installing-the-spacy-models) to ensure
+[reissue the commands to download the spaCy and coreferee models](#installing-the-spacy-and-coreferee-models) to ensure
you have the correct versions of them:
*Linux:*
@@ -195,6 +189,9 @@ pip3 install --upgrade holmes-extractor
pip install --upgrade holmes-extractor
```
+Note that if you are upgrading to a new Holmes version that uses a different major or minor version
+of Python from the pre-existing version, you will need to upgrade Python and then follow the instructions for installing Holmes from scratch.
+
If you are working on some versions of Windows and have not used Python before,
several of Holmes' dependencies may require you to download Visual Studio and then
rerun the installation. During the Visual Studio install, it is imperative to select
@@ -206,10 +203,6 @@ If you wish to use the examples and tests, clone the source code using
git clone https://github.com/msg-systems/holmes-extractor
```
-Note that at present spaCy version 2.1.0 is installed rather than the current version
-because of a conflict between later versions of spaCy and the version of `neuralcoref` that
-was available when Holmes 2.2 was developed.
-
If you wish to experiment with changing the source code, you can
override the installed code by starting Python (type `python3` (Linux) or `python`
(Windows)) in the parent directory of the directory where your altered `holmes_extractor`
@@ -225,81 +218,71 @@ import holmes_extractor
print(holmes_extractor.__file__)
```
-
-##### 1.2.3 Installing the spaCy models
+
+##### 1.2.3 Installing the spaCy and coreferee models
-The spaCy library that Holmes builds upon requires
-[language-specific models](https://spacy.io/usage/models) that have to be downloaded
-separately before Holmes can be used. The following models are for English and German
-respectively:
+The spaCy and coreferee libraries that Holmes builds upon require
+language-specific models that have to be downloaded separately before Holmes can be used:
-*Linux:*
+*Linux/English:*
```
+python3 -m spacy download en_core_web_trf
python3 -m spacy download en_core_web_lg
-python3 -m spacy download de_core_news_md
+python3 -m coreferee install en
```
-and if you plan to run the [regression tests](#development-and-testing-guidelines):
-
+*Linux/German:*
```
-python3 -m spacy download en_core_web_sm
+python3 -m spacy download de_core_news_lg
+python3 -m coreferee install de
```
-
-*Windows:*
+*Windows/English:*
```
+python -m spacy download en_core_web_trf
python -m spacy download en_core_web_lg
-python -m spacy download de_core_news_md
+python -m coreferee install en
+```
+
+*Windows/German:*
+```
+python -m spacy download de_core_news_lg
+python -m coreferee install de
```
and if you plan to run the [regression tests](#development-and-testing-guidelines):
+*Linux:*
+```
+python3 -m spacy download en_core_web_sm
+```
+
+*Windows:*
```
python -m spacy download en_core_web_sm
```
-`en_core_web_sm` is one of the smaller models that are also available. Users of Holmes are nonetheless urged to stick to the `en_core_web_lg` and `de_core_news_md` models as they have consistently been found to yield the best results.
+You specify a spaCy model for Holmes to use [when you instantiate the Manager facade class](#getting-started). `en_core_web_trf` and `de_core_news_lg` are the models that have been found to yield the best results for English and German respectively. Because `en_core_web_trf` does not have its own word vectors, but Holmes requires word vectors for [embedding-based matching](#embedding-based-matching), the `en_core_web_lg` model is loaded as a vector source whenever `en_core_web_trf` is specified to the Manager class as the main model.
+
+The `en_core_web_trf` model requires considerably more resources than the other models; in a situation where resources are scarce, it may be a sensible compromise to use `en_core_web_lg` as the main model instead.
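+
+For example (an illustrative sketch; any of the models discussed above can be substituted), the main model is specified when the Manager is instantiated:
+
+```
+import holmes_extractor as holmes
+
+# 'en_core_web_trf' is the recommended English model; because it has no word
+# vectors of its own, Holmes also loads 'en_core_web_lg' as a vector source.
+holmes_manager = holmes.Manager(model='en_core_web_trf')
+```
+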
##### 1.2.4 Comments about deploying Holmes in an enterprise environment
-Python 3 is a language that is absent from the architecture standards of
-many large enterprises. For a number of reasons, however, it was the
-only serious contender with which to develop Holmes.
-
The best way of integrating Holmes into a non-Python environment is to
wrap it as a RESTful HTTP service and to deploy it as a
-microservice. See [here](https://github.com/msg-systems/holmes-extractor/blob/master/holmes_extractor/examples/example_search_EN_literature.py) for an example.
-
-
-##### 1.2.5 Using multiprocessing
-
-Holmes normally only occupies a single processor core. In order to improve performance, the workload of the
-[topic matching](#topic-matching) use case can be distributed amongst multiple processors using the
-[MultiprocessingManager](#multiprocessing-manager) class. This is achieved by assigning the registered
-documents to worker processes in a round-robin fashion, which implies that there is no point in starting
-more worker threads than there are documents to analyse, and that the best performance is achieved when
-all documents are of a fairly similar length.
-
-Usually, multiprocessing involves a physical copy of working process memory on Windows but not on Linux. Because of
-an issue with `neuralcoref` memory management, however, the MultiprocessingManager has to start a separate instance
-of the spaCy model for each worker process on all operating systems. In a typically configured environment, this makes it likely that
-memory will be exhausted before CPU, which should be taken into account when deciding how many processes to start.
-
-The parent process communicates with its workers via queues. On one occasion, the MultiprocessingManager was observed
-to stop working and the culprit was an old worker process that had not completed normally and was reading queue
-objects that were destined for a process that had been started subsequently. If the MultiprocessingManager hangs shortly
-after being started, the probable solution is therefore to ensure all Python processes have been killed before
-trying again.
+microservice. See [here](https://github.com/msg-systems/holmes-extractor/blob/master/examples/example_search_EN_literature.py) for an example.
-##### 1.2.6 Resource requirements
+##### 1.2.5 Resource requirements
Because Holmes performs complex, intelligent analysis, it is inevitable that it requires more hardware resources than more traditional search frameworks. The use cases that involve loading documents — [structural extraction](#structural-extraction) and [topic matching](#topic-matching) — are most immediately applicable to large but not massive corpora (e.g. all the documents belonging to a certain organisation, all the patents on a certain topic, all the books by a certain author). For cost reasons, Holmes would not be an appropriate tool with which to analyse the content of the entire Internet!
-That said, Holmes is both vertically and horizontally scalable. With sufficient hardware, both these use cases can be applied to an essentially unlimited number of documents by running Holmes on multiple machines, processing a different set of documents on each one and conflating the results. Note that this is the strategy employed by the [MultiprocessingManager](#multiprocessing-manager) to distribute [topic matching](#topic-matching) processing amongst multiple cores on a single machine and that the [TopicMatchDictionaryOrderer](https://github.com/msg-systems/holmes-extractor/blob/master/holmes_extractor/extensive_matching.py) class, which is used to conflate results from several cores, could easily be reused to conflate results received from multiple machines over the network.
+That said, Holmes is both vertically and horizontally scalable. With sufficient hardware, both these use cases can be applied to an essentially unlimited number of documents by running Holmes on multiple machines, processing a different set of documents on each one and conflating the results. Note that this strategy is already employed to distribute matching amongst multiple cores on a single machine: the [Manager](#manager) class starts a number of worker processes and distributes registered documents between them.
-Holmes holds loaded documents in memory. On the one hand, this ties in with its intended use with large but not massive corpora; on the other hand, documents that have been analysed using [coreference resolution](#coreference-resolution) are not serializable, so that it would not be technically possible to offer a persistent storage option. The performance of document loading, [structural extraction](#structural-extraction) and [topic matching](#topic-matching) all degrade heavily if the operating system has to swaps memory pages to secondary storage, because Holmes can require memory from a variety of pages to be addressed when processing a single sentence. This means it is important to supply enough RAM on each machine to hold all loaded documents.
+Holmes holds loaded documents in memory, which ties in with its intended use with large but not massive corpora. The performance of document loading, [structural extraction](#structural-extraction) and [topic matching](#topic-matching) all degrade heavily if the operating system has to swap memory pages to secondary storage, because Holmes can require memory from a variety of pages to be addressed when processing a single sentence. This means it is important to supply enough RAM on each machine to hold all loaded documents.
+
+Please note the [above comments](#installing-the-spacy-and-coreferee-models) about the relative resource requirements of the different models.
#### 1.3 Getting started
@@ -307,7 +290,7 @@ Holmes holds loaded documents in memory. On the one hand, this ties in with its
The easiest use case with which to get a quick basic idea of how Holmes works is the **chatbot** use case.
Here one or more search phrases are defined to Holmes in advance, and the
-searched 'documents' are short sentences or paragraphs typed in
+searched documents are short sentences or paragraphs typed in
interactively by an end user. In a real-life setting, the extracted
information would be used to
determine the flow of interaction with the end user. For testing and
@@ -326,7 +309,7 @@ demonstration chatbot console:
```
import holmes_extractor as holmes
-holmes_manager = holmes.Manager(model='en_core_web_lg')
+holmes_manager = holmes.Manager(model='en_core_web_lg', number_of_workers=1)
holmes_manager.register_search_phrase('A big dog chases a cat')
holmes_manager.start_chatbot_mode_console()
```
@@ -335,7 +318,7 @@ holmes_manager.start_chatbot_mode_console()
```
import holmes_extractor as holmes
-holmes_manager = holmes.Manager(model='de_core_news_md')
+holmes_manager = holmes.Manager(model='de_core_news_lg', number_of_workers=1)
holmes_manager.register_search_phrase('Ein großer Hund jagt eine Katze')
holmes_manager.start_chatbot_mode_console()
```
@@ -351,8 +334,8 @@ Ready for input
A big dog chased a cat
-Matched search phrase 'A big dog chases a cat':
-'big'->'big' (direct); 'A big dog'->'dog' (direct); 'chased'->'chase' (direct); 'a cat'->'cat' (direct)
+Matched search phrase with text 'A big dog chases a cat':
+'big'->'big' (Matches BIG directly); 'A big dog'->'dog' (Matches DOG directly); 'chased'->'chase' (Matches CHASE directly); 'a cat'->'cat' (Matches CAT directly)
```
*German:*
@@ -364,7 +347,7 @@ Ein großer Hund jagte eine Katze
Matched search phrase 'Ein großer Hund jagt eine Katze':
-'großer'->'groß' (direct); 'Ein großer Hund'->'hund' (direct); 'jagte'->'jagen' (direct); 'eine Katze'->'katze' (direct)
+'großer'->'groß' (Matches GROSS directly); 'Ein großer Hund'->'hund' (Matches HUND directly); 'jagte'->'jagen' (Matches JAGEN directly); 'eine Katze'->'katze' (Matches KATZE directly)
```
This could easily have been achieved with a simple matching algorithm, so type
@@ -386,6 +369,12 @@ The cat the big dog chased was scared
The big dog chasing the cat was a problem
There was a big dog that was chasing a cat
The cat chase by the big dog
+There was a big dog and it was chasing a cat.
+I saw a big dog. My cat was afraid of being chased by the dog.
+There was a big dog. His name was Fido. He was chasing my cat.
+A dog appeared. It was chasing a cat. It was very big.
+The cat sneaked back into our lounge because a big dog had been chasing her outside.
+Our big dog was excited because he had been chasing a cat.
```
*German:*
@@ -397,27 +386,15 @@ Die Katze wurde vom großen Hund gejagt
Die Katze wurde immer wieder durch den großen Hund gejagt
Der große Hund wollte die Katze jagen
Der große Hund entschied sich, die Katze zu jagen
-Die Katze hatte genug, vom großen Hund gejagt zu werden
Die Katze, die der große Hund gejagt hatte, hatte Angst
Dass der große Hund die Katze jagte, war ein Problem
Es gab einen großen Hund, der eine Katze jagte
Die Katzenjagd durch den großen Hund
-```
-
-In English but not presently in German, [coreference resolution](#coreference-resolution)
-is active. This means that the system can link pronouns and nouns to other pronouns and nouns
-nearby in the same text that refer to the same entities. It increases the variety of
-structures that Holmes can recognise:
-
-*English:*
-
-```
-There was a big dog and it was chasing a cat.
-I saw a big dog. My cat was afraid of being chased by the dog.
-The big dog was called Fido. He was chasing my cat.
-A dog appeared. It was chasing a cat. It was very big.
-The cat sneaked back into our lounge because a big dog had been chasing her outside.
-Our big dog was excited because he had been chasing a cat.
+Es gab einen großen Hund und er jagte eine Katze
+Es gab einen großen Hund. Er hieß Fido. Er jagte meine Katze
+Es erschien ein Hund. Er jagte eine Katze. Er war sehr groß.
+Die Katze schlich sich in unser Wohnzimmer zurück, weil ein großer Hund sie draußen gejagt hatte
+Unser großer Hund war aufgeregt, weil er eine Katze gejagt hatte
```
The demonstration is not complete without trying other sentences that
@@ -489,11 +466,11 @@ Ready for input
I met Richard Hudson and John Doe last week. They didn't want to go into town.
-Matched search phrase 'An ENTITYPERSON goes into town'; negated; uncertain; involves coreference:
-'Richard Hudson'->'ENTITYPERSON' (entity); 'go'->'go' (direct); 'into'->'into' (direct); 'town'->'town' (direct)
+Matched search phrase with text 'An ENTITYPERSON goes into town'; negated; uncertain; involves coreference:
+'Richard Hudson'->'ENTITYPERSON' (Has an entity label matching ENTITYPERSON); 'go'->'go' (Matches GO directly); 'into'->'into' (Matches INTO directly); 'town'->'town' (Matches TOWN directly)
-Matched search phrase 'An ENTITYPERSON goes into town'; negated; uncertain; involves coreference:
-'John Doe'->'ENTITYPERSON' (entity); 'go'->'go' (direct); 'into'->'into' (direct); 'town'->'town' (direct)
+Matched search phrase with text 'An ENTITYPERSON goes into town'; negated; uncertain; involves coreference:
+'John Doe'->'ENTITYPERSON' (Has an entity label matching ENTITYPERSON); 'go'->'go' (Matches GO directly); 'into'->'into' (Matches INTO directly); 'town'->'town' (Matches TOWN directly)
```
*German:*
@@ -501,14 +478,14 @@ Matched search phrase 'An ENTITYPERSON goes into town'; negated; uncertain; invo
```
Ready for input
-Richard Hudson und Max Mustermann wollten nicht mehr in die Stadt gehen
+Letzte Woche sah ich Richard Hudson und Max Mustermann. Sie wollten nicht mehr in die Stadt gehen.
-Matched search phrase 'Ein ENTITYPER geht in die Stadt'; negated; uncertain:
-'Richard Hudson'->'ENTITYPER' (entity); 'gehen'->'gehen' (direct); 'in'->'in' (direct); 'die Stadt'->'stadt' (direct)
+Matched search phrase with text 'Ein ENTITYPER geht in die Stadt'; negated; uncertain; involves coreference:
+'Richard Hudson'->'ENTITYPER' (Has an entity label matching ENTITYPER); 'gehen'->'gehen' (Matches GEHEN directly); 'in'->'in' (Matches IN directly); 'die Stadt'->'stadt' (Matches STADT directly)
-Matched search phrase 'Ein ENTITYPER geht in die Stadt'; negated; uncertain:
-'Max Mustermann'->'ENTITYPER' (entity); 'gehen'->'gehen' (direct); 'in'->'in' (direct); 'die Stadt'->'stadt' (direct)
+Matched search phrase with text 'Ein ENTITYPER geht in die Stadt'; negated; uncertain; involves coreference:
+'Max Mustermann'->'ENTITYPER' (Has an entity label matching ENTITYPER); 'gehen'->'gehen' (Matches GEHEN directly); 'in'->'in' (Matches IN directly); 'die Stadt'->'stadt' (Matches STADT directly)
```
In each of the two languages, this last example demonstrates several
@@ -532,19 +509,13 @@ For more examples, please see [section 5](#use-cases-and-examples).
### 2. Word-level matching strategies
-The same word-level matching strategies are employed with [all use cases](#use-cases-and-examples) and most
-of the comments that follow apply equally to all use cases. An exception to this principle
-is that there are different ways of configuring
-[ontology-based matching](#ontology-based-matching) and that the choices that are typically
-recommended are different for different use cases.
-
#### 2.1 Direct matching (`word_match.type=='direct'`)
Direct matching between search phrase words and document words is always
active. The strategy relies mainly on matching stem forms of words,
-e.g. matching English *buy* and *child* for *bought* and *children*,
-German *steigen* and *Kind* for *stieg* and *Kinder*. However, in order to
+e.g. matching English *buy* and *child* to *bought* and *children*,
+German *steigen* and *Kind* to *stieg* and *Kinder*. However, in order to
increase the chance of direct matching working when the parser delivers an
incorrect stem form for a word, the raw-text forms of both search-phrase and
document words are also taken into consideration during direct matching.
@@ -555,22 +526,21 @@ document words are also taken into consideration during direct matching.
Derivation-based matching involves distinct but related words that typically
belong to different word classes, e.g. English *assess* and *assessment*,
German *jagen* and *Jagd*. It is active by default but can be switched off using
-the `analyze_derivational_morphology` parameter, which is set when instantiating the [Manager](#manager) and [MultiprocessingManager](#multiprocessing-manager) classes.
+the `analyze_derivational_morphology` parameter, which is set when instantiating the [Manager](#manager) class.
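+
+For example (an illustrative sketch), derivation-based matching could be switched off as follows:
+
+```
+import holmes_extractor as holmes
+
+# 'assess' will no longer match 'assessment'
+holmes_manager = holmes.Manager(model='en_core_web_lg', analyze_derivational_morphology=False)
+```
+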
-#### 2.3 Named entity matching (`word_match.type=='entity'`)
+#### 2.3 Named-entity matching (`word_match.type=='entity'`)
-Named entity matching is activated by inserting a special named-entity
+Named-entity matching is activated by inserting a special named-entity
identifier at the desired point in a search phrase in place of a noun,
e.g.
***An ENTITYPERSON goes into town*** (English)
***Ein ENTITYPER geht in die Stadt*** (German).
-The supported named-entity identifiers depend directly on the named
-entity information supplied by the spaCy models for each language
-(descriptions copied from the [spaCy
-documentation](https://spacy.io/usage/linguistic-features#section-named-entities)):
+The supported named-entity identifiers depend directly on the named-entity information supplied
+by the spaCy models for each language (descriptions copied from an earlier version of the spaCy
+documentation):
*English:*
@@ -725,7 +695,7 @@ to be classified. In the example ontology shown above, all words in the ontology
- The **classification** ontology is used to capture relationships between classification labels: that a document
has a certain classification implies it also has any classifications to whose subtree that classification belongs.
Synonyms should be used sparingly if at all in classification ontologies because they add to the complexity of the
-neural network without adding any tangible value; and although it is technically possible to set up a classification
+neural network without adding any value; and although it is technically possible to set up a classification
ontology to use symmetric matching, there is no sensible reason for doing so. Note that a label within the
classification ontology that is not directly defined as the label of any training document
[has to be registered specifically](#supervised-topic-training-basis) using the
@@ -735,27 +705,18 @@ account when training the classifier.
#### 2.5 Embedding-based matching (`word_match.type=='embedding'`)
-For both English and German, spaCy offers **word embeddings**:
+spaCy offers **word embeddings**:
machine-learning-generated numerical vector representations of words
that capture the contexts in which each word
tends to occur. Two words with similar meaning tend to emerge with word
embeddings that are close to each other, and spaCy can measure the
-**similarity** between any two words' embeddings expressed as a decimal
+**cosine similarity** between any two words' embeddings expressed as a decimal
between 0.0 (no similarity) and 1.0 (the same word). Because *dog* and
*cat* tend to appear in similar contexts, they have a similarity of
0.80; *dog* and *horse* have less in common and have a similarity of
-0.62; and *dog* and *iron* have a similarity of only 0.25.
-
-Holmes makes use of word-embedding-based similarities using a globally
-defined **overall similarity threshold**. A match is detected between a
-search phrase and a structure within a document whenever the geometric
-mean of the similarities between the individual corresponding word pairs
-is greater than the threshold. The intuition behind this technique is
-that where a search phrase with e.g. six lexical words has matched a
-document structure where five of these words match exactly and only one
-corresponds via an embedding, the similarity that should be required to match this sixth word is less than
-when only three of the words matched exactly and two of the other words also correspond via embeddings. Embedding-based matching is only activated for nouns, adjectives and adverbs
-because the results have been found to be unsatisfactory with other word classes.
+0.62; and *dog* and *iron* have a similarity of only 0.25. Embedding-based matching
+is only activated for nouns, adjectives and adverbs because the results have been found to be
+unsatisfactory with other word classes.
It is important to understand that the fact that two words have similar
embeddings does not imply the same sort of logical relationship between
@@ -763,9 +724,19 @@ the two as when [ontology-based matching](#ontology-based-matching) is used: for
fact that *dog* and *cat* have similar embeddings means neither that a
dog is a type of cat nor that a cat is a type of dog. Whether or not
embedding-based matching is nonetheless an appropriate choice depends on
-the use case. It is more likely to be appropriate for the [topic matching](#topic-matching) and
-[supervised document classification](#supervised-document-classification) use cases than for the
-[chatbot](#chatbot) and [structural extraction](#structural-extraction) use cases.
+the functional use case.
+
+For the [chatbot](#chatbot), [structural extraction](#structural-extraction) and [supervised document classification](#supervised-document-classification) use cases, Holmes makes use of
+word-embedding-based similarities using an `overall_similarity_threshold` parameter defined globally on
+the [Manager](#manager) class. A match is detected between a
+search phrase and a structure within a document whenever the geometric
+mean of the similarities between the individual corresponding word pairs
+is greater than this threshold. The intuition behind this technique is
+that where a search phrase with e.g. six lexical words has matched a
+document structure where five of these words match exactly and only one
+corresponds via an embedding, the similarity that should be required to match this sixth word is
+less than when only three of the words matched exactly and two of the other words also correspond
+via embeddings.
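+
+As an illustrative sketch (the threshold value is arbitrary), embedding-based matching for these use cases is enabled by lowering the threshold from its default of *1.0* when instantiating the [Manager](#manager):
+
+```
+import holmes_extractor as holmes
+
+# A structure now matches if the geometric mean of the word-level similarities
+# between search phrase and document words exceeds 0.85.
+holmes_manager = holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=0.85)
+holmes_manager.register_search_phrase('A big dog chases a cat')
+```
+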
Matching a search phrase to a document begins by finding words
in the document that match the word at the root (syntactic head) of the
@@ -782,60 +753,41 @@ except the [chatbot](#chatbot) use case essentially unusable.
To avoid the typically unnecessary performance hit that results from embedding-based matching
of search phrase root words, it is controlled separately from embedding-based matching in general
using the `embedding_based_matching_on_root_words` parameter, which is set when instantiating the
-[Manager](#manager) and [MultiprocessingManager](#multiprocessing-manager) classes. You are advised to keep this setting switched off (value `False`) for most use cases.
+[Manager](#manager) class. You are advised to keep this setting switched off (value `False`) for most use cases.
+
+Neither the `overall_similarity_threshold` nor the `embedding_based_matching_on_root_words` parameter has any effect on the [topic matching](#topic-matching) use case. Here word-level embedding similarity thresholds are set using the `word_embedding_match_threshold` and `initial_question_word_embedding_match_threshold` parameters when calling the [`topic_match_documents_against` function on the Manager class](#manager-topic-match-function).
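+
+A minimal sketch (threshold values are purely illustrative, and `holmes_manager` is assumed to have been instantiated as in [section 1.3](#getting-started)):
+
+```
+# Word pairs whose embedding similarity exceeds the relevant threshold can match
+topic_match_dicts = holmes_manager.topic_match_documents_against(
+    'A big dog chases a cat',
+    word_embedding_match_threshold=0.8,
+    initial_question_word_embedding_match_threshold=0.7)
+```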
+
+
+#### 2.6 Named-entity-embedding-based matching (`word_match.type=='entity_embedding'`)
+
+A named-entity-embedding-based match occurs between a searched-document word that has a certain entity label and a search phrase or query document word whose embedding is sufficiently similar to the underlying meaning of that entity label, e.g. the word *individual* in a search phrase has a word embedding similar to the underlying meaning of the *PERSON* entity label. Note that named-entity-embedding-based matching is never active on root words regardless of the `embedding_based_matching_on_root_words` setting.
+
+
+#### 2.7 Initial-question-word matching (`word_match.type=='question'`)
+
+Initial-question-word matching is only active during [topic matching](#topic-matching). Initial question words in query phrases match entities in the searched documents that represent potential answers to the question, e.g. when comparing the query phrase *When did Peter have breakfast* to the searched-document phrase *Peter had breakfast at 8 a.m.*, the question word *When* would match the temporal adverbial phrase *at 8 a.m.*.
-Note that with [topic matching](#topic-matching), embeddings are automatically investigated in
-[certain circumstances](#how-it-works-topic-matching) regardless of the value of the
-`embedding_based_matching_on_root_words` parameter. However, switching the parameter
-on for topic matching (which is absolutely not recommended!) will still lead to embeddings being investigated for all root words that have valid word classes.
+Initial-question-word matching is switched on and off using the `initial_question_word_behaviour` parameter when calling the [`topic_match_documents_against` function on the Manager class](#manager-topic-match-function). It is only likely to be useful when topic matching is being performed in an interactive setting where the user enters short query phrases, as opposed to when it is being used to find documents on a similar topic to a pre-existing query document: initial question words are only processed at the beginning of the first sentence of the query phrase or query document.
+
+If a query phrase consists of a complex question with several elements dependent on the main verb, a finding in a searched document is only strictly an 'answer' if it contains matches to all these elements. Because recall is typically more important than precision when performing topic matching with interactive query phrases, however, Holmes will match an initial question word to a searched-document phrase wherever they correspond semantically (e.g. wherever *when* corresponds to a temporal adverbial phrase) and both depend on verbs that themselves match at the word level. One possible strategy to filter out 'incomplete answers' would be to calculate the maximum possible score for a query phrase and reject topic matches that score below a threshold scaled to this maximum.
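+
+A minimal sketch of the interactive scenario described in this section (document text and label are illustrative, and `holmes_manager` is assumed to have been instantiated as in [section 1.3](#getting-started)):
+
+```
+holmes_manager.parse_and_register_document('Peter had breakfast at 8 a.m.', 'routine')
+# The initial question word 'When' can match the temporal phrase 'at 8 a.m.',
+# which is then reported as a potential answer.
+topic_match_dicts = holmes_manager.topic_match_documents_against('When did Peter have breakfast?')
+```
+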
### 3. Coreference resolution
-As explained in the [initial examples](#getting-started), Holmes can be configured to use
-**coreference resolution** when analysing English (but not yet German). This
-means that situations are recognised where pronouns and nouns that are located near one another
-within a text refer to the same entities. The information from one mention can then
-be applied to the analysis of further mentions:
+Before Holmes analyses a searched document or query document, coreference resolution is performed using the [coreferee](https://github.com/msg-systems/coreferee)
+library running on top of spaCy. This means that situations are recognised where pronouns and nouns that are located near one another within a text refer to the same entities. The information from one mention can then be applied to the analysis of further mentions:
I saw a *big dog*. *It* was chasing a cat.
I saw a *big dog*. *The dog* was chasing a cat.
-Coreference resolution is performed using the [neuralcoref](https://github.com/huggingface/neuralcoref)
-library running on top of spaCy. The `neuralcoref` library detects chains of coreferring nouns and pronouns that can
-grow to considerable lengths when longer texts are analysed. For Holmes, it has been found
-to be appropriate to limit the consideration of coreference resolution information to a small
-number of mentions either side of a noun or pronoun within a chain — the threshold is currently set to 3 — as well as to suppress coreference between elements more than 300 words apart.
-
-Alongside the main use of coreference resolution information to increase the scope of
-structural matching between search phrases and documents, Holmes also looks for situations
-where a matched word is in a coreference chain with another word that is linked to the
-matched word in an [ontology](#ontology-based-matching) and that is more specific than the
-matched word:
+Coreferee also detects situations where a noun refers back to a named entity:
We discussed *msg systems*. *The company* had made a profit.
-If this example were to match the search phrase ***A company makes a profit*** and if
-*msg systems* were defined as a named-individual instance of *company* in the ontology, the
+If this example were to match the search phrase ***A company makes a profit***, the
coreference information that the company under discussion is msg systems is clearly
relevant and worth extracting in addition to the word(s) directly matched to the search
-phrase. Such information is captured in the [word_match.extracted_word](#wordmatch) field.
-
-A caveat applies when using coreference resolution in the context of the
-[structural extraction](#structural-extraction) use case. The `neuralcoref` library yields excellent results with
-grammatical structures of low or average complexity. However, with very complex texts, the proportion of errors in
-the detected coreference chains seems to increase significantly to an extent that is not observed either for the
-underlying spaCy syntactic parses or for the Holmes semantic interpretations of them. This is presumably because humans
-performing coreference resolution rely partially on information about the world to which the library does
-not have access. This should be borne in mind when extracting structured information from very complex documents:
-there is a danger that using coreference resolution will lead to an unacceptable proportion of the
-extracted information being incorrect.
-
-The `neuralcoref` library does not currently support
-[serialization](#manager-serialize-function): an attempt to serialize a document that has been parsed
-with coreference resolution will result in an error being raised. If you wish to serialize documents and
-are using a spaCy model for which coreference resolution is available (essentially: if you are working in
-English), you have to switch off coreference resolution when instantiating the [Manager](#manager) class by setting the `perform_coreference_resolution` parameter to `False`.
+phrase. Such information is captured in the [word_match.extracted_word](#dictionary) field.
### 4. Writing effective search phrases
@@ -847,8 +799,7 @@ The concept of search phrases has [already been introduced](#getting-started) an
chatbot use case, the structural extraction use case and to [preselection](#preselection) within the supervised
document classification use case.
-**It is crucial to understand that the tips and limitations set out in Section 4 do not apply in any way to search queries in topic matching. If you are using
-Holmes for topic matching only, you can completely ignore this section!**
+**It is crucial to understand that the tips and limitations set out in Section 4 do not apply in any way to query phrases in topic matching. If you are using Holmes for topic matching only, you can completely ignore this section!**
Structural matching between search phrases and documents is not symmetric: there
are many situations in which sentence X as a search phrase would match
@@ -989,13 +940,10 @@ that can be matched to documents.
##### 4.2.5 Coreferring pronouns
***A dog chases a cat and he chases a mouse*** (English)
+***Ein Hund jagt eine Katze und er jagt eine Maus*** (German)
Pronouns that corefer with nouns elsewhere in the search phrase are not permitted as this
would overcomplicate the library without offering any benefits.
-Whether or not this applies to a specific pronoun depends not only on the search phrase
-content, but also on whether or not [coreference resolution](#coreference-resolution)
-is available for the model being used and is [switched on](#manager). Because coreference
-resolution is not currently available for German, only an English example is given.
#### 4.3 Structures strongly discouraged in search phrases
@@ -1029,7 +977,7 @@ search phrases are expressed in the present active.
***Who chases the cat?*** (English)
***Wer jagt die Katze?*** (German)
-Although questions are supported in a limited sense as query phrases in the
+Although questions are supported as query phrases in the
[topic matching](#topic-matching) use case, they are not appropriate as search phrases.
Questions should be re-phrased as statements, in this case
@@ -1082,7 +1030,7 @@ Correlations between the resulting matches can then be established by
matching via the [`Manager.match()` function](#manager-match-function) and looking for
situations where the document token objects are shared across multiple match objects.
-One important exception to this piece of advice is when
+One possible exception to this piece of advice is when
[embedding-based matching](#embedding-based-matching) is active. Because
whether or not each word in a search phrase matches then depends on whether
or not other words in the same search phrase have been matched, large, complex
@@ -1112,7 +1060,7 @@ and to allow the corresponding nominal phrases to be matched via [derivation-bas
The chatbot use case has [already been introduced](#getting-started):
a predefined set of search phrases is used to extract
information from phrases entered interactively by an end user, which in
-this use case act as the 'documents'.
+this use case act as the documents.
The Holmes source code ships with two examples demonstrating the chatbot
use case, one for each language, with predefined ontologies. Having
@@ -1167,12 +1115,11 @@ drive a dialog flow; they are examined solely to extract and store structured in
Code for performing structural extraction would typically perform the following tasks:
-- Initialize the Holmes manager object
-- Call `Manager.register_search_phrase()` several times to define a number of search phrases specifying the
-information to be extracted
-- Call `Manager.parse_and_register_document()` several times to load a number of documents within which to search
-- Call `Manager.match()` to perform the matching
-- Query the returned match objects to obtain the extracted information and store it in a database
+- Initialize the Holmes manager object.
+- Call `Manager.register_search_phrase()` several times to define a number of search phrases specifying the information to be extracted.
+- Call `Manager.parse_and_register_document()` several times to load a number of documents within which to search.
+- Call `Manager.match()` to perform the matching.
+- Query the returned match objects to obtain the extracted information and store it in a database.
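+
+A minimal sketch of this workflow (the search phrase, document text and label are purely illustrative):
+
+```
+import holmes_extractor as holmes
+
+holmes_manager = holmes.Manager(model='en_core_web_lg')
+holmes_manager.register_search_phrase('A company takes over a company')
+holmes_manager.parse_and_register_document(
+    'The big company took over the small company last year.', 'article-1')
+for match_dict in holmes_manager.match():
+    # in a real system, the extracted information would be written to a database
+    print(match_dict)
+```
+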
#### 5.3 Topic matching
@@ -1180,23 +1127,18 @@ information to be extracted
The topic matching use case matches a **query document**, or alternatively a **query phrase**
entered ad-hoc by the user, against a set of documents pre-loaded into memory. The aim is to find the passages
in the documents whose topic most closely corresponds to the topic of the query document; the output is
-a ordered list of passages scored according to topic similarity.
+an ordered list of passages scored according to topic similarity. Additionally, if a query phrase contains an [initial question word](#initial-question-word-matching), the output will contain potential answers to the question.
Topic matching queries may contain [generic pronouns](#generic-pronouns) and
-[named entity identifiers](#named-entity-matching) just like search phrases, although the `ENTITYNOUN`
+[named-entity identifiers](#named-entity-matching) just like search phrases, although the `ENTITYNOUN`
token is not supported. However, an important difference from
search phrases is that the topic matching use case places no
-restrictions on the grammatical structures permissible within the query document. This means that query phrases
-can be expressed as questions, and indeed questions may well be the most natural way for many users to formulate query
-phrases. However, it is important to understand that Holmes is not a dedicated question answering system in that it
-makes no attempt to retrieve content based on the meanings of question words. Instead, question words are
-ignored as grammatical words; the lexical words within the question are analysed and used as a basis for
-matching in the same way as if they had been contained within a statement.
+restrictions on the grammatical structures permissible within the query document.
The Holmes source code ships with three examples demonstrating the topic matching use case with an English literature
corpus, a German literature corpus and a German legal corpus respectively. The two literature examples are hosted at
-the [Holmes demonstration website](http://holmes-demo.xt.msg.team), although users are encouraged to run [the scripts](https://github.com/msg-systems/holmes-extractor/blob/master/holmes_extractor/examples/)
-locally as well to get a feel for how they work. The German law example starts a simple interactive console and its [script](https://github.com/msg-systems/holmes-extractor/blob/master/holmes_extractor/examples/example_search_DE_law.py) contains some example queries as comments.
+the [Holmes demonstration website](http://holmes-demo.xt.msg.team), although users are encouraged to run [the scripts](https://github.com/msg-systems/holmes-extractor/blob/master/examples/)
+locally as well to get a feel for how they work. The German law example starts a simple interactive console and its [script](https://github.com/msg-systems/holmes-extractor/blob/master/examples/example_search_DE_law.py) contains some example queries as comments.
Topic matching uses a variety of strategies to find text passages that are relevant to the query. These include
resource-hungry procedures like investigating semantic relationships and comparing embeddings. Because applying these
@@ -1204,26 +1146,6 @@ across the board would prevent topic matching from scaling, Holmes only attempts
that less resource-intensive strategies have already marked as looking promising. This and the other interior workings
of topic matching are explained [here](#how-it-works-topic-matching).
-Because the decision as to which strategies to apply when are driven by thresholds that measure how often certain
-features occur within the document corpus, it is important to realise that a given result may not remain constant when
-the size of the corpus changes. This issue is especially pertinent when using the
-[MultiprocessingManager](#multiprocessing-manager) because each worker process applies its own thresholds and the
-scored results are pooled at the end. In practice, this seems to make little difference to the results that are
-actually observed, but there may be use cases where consistency and predicability are more important than performance.
-Consistency and predictability can be ensured by setting each of the
-`maximum_number_of_single_word_matches_for_relation_matching` and
-`maximum_number_of_single_word_matches_for_embedding_matching` parameters to either `0` (always off) or
-`sys.maxsize` (always on).
-
-[Embedding-based matching](#embedding-based-matching) is controlled by the overall similarity threshold set on the
-[Manager](#manager) or [MultiprocessingManager](#multiprocessing-manager) object, which measures the geometric mean
-of all the words matched within a structure. In topic matching used in the normal situation where
-`embedding_based_matching_on_root_words==False`, there are always two words involved in a match and the similarity for
-one of these words is always 1. A target value for the second word can thus be specified by setting the overall
-similarity threshold to the *square root* of that target value. For example, if pairs of words
-whose embedding similarity is higher than 0.75 are to be matched, the threshold should be set to
-the square root of 0.75, which is around 0.866.
-
#### 5.4 Supervised document classification
@@ -1242,8 +1164,7 @@ remedied by specifying a smaller number of hidden layers or a smaller number of
A trained document classification model retains no references to its training data. This is an advantage
from a data protection viewpoint, although it
[cannot presently be guaranteed](#remove-names-from-supervised-document-classification-models) that models will
-not contain individual personal or company names. It also means that models can be serialized even when
-[the training documents were not serializable](#coreference-resolution).
+not contain individual personal or company names.
A typical problem with the execution of many document classification use cases is that a new classification label
@@ -1254,7 +1175,7 @@ are not preselected as having the new classification label are then passed to th
classifier in the normal way. When enough documents exemplifying the new classification have accumulated in the system,
the model can be retrained and the preselection search phrases removed.
-Holmes ships with an example [script](https://github.com/msg-systems/holmes-extractor/blob/master/holmes_extractor/examples/example_supervised_topic_model_EN.py) demonstrating supervised document classification for English with the
+Holmes ships with an example [script](https://github.com/msg-systems/holmes-extractor/blob/master/examples/example_supervised_topic_model_EN.py) demonstrating supervised document classification for English with the
[BBC Documents dataset](http://mlg.ucd.ie/datasets/bbc.html). The script downloads the documents (for
this operation and for this operation alone, you will need to be online) and places them in a working directory.
When training is complete, the script saves the model to the working directory. If the model file is found
@@ -1290,93 +1211,83 @@ The interior workings of supervised document classification are explained [here]
``` {.python}
holmes_extractor.Manager(self, model, *, overall_similarity_threshold=1.0,
embedding_based_matching_on_root_words=False, ontology=None,
- analyze_derivational_morphology=True, perform_coreference_resolution=None, debug=False)
+    analyze_derivational_morphology=True, perform_coreference_resolution=True,
+    use_reverse_dependency_matching=True, number_of_workers=None, verbose=False)
The facade class for the Holmes library.
Parameters:
-model -- the name of the spaCy model, e.g. 'en_core_web_lg'
-overall_similarity_threshold -- the overall similarity threshold for
- embedding-based matching. Defaults to '1.0', which deactivates embedding-based matching.
-embedding_based_matching_on_root_words -- determines whether or not embedding-based
+model -- the name of the spaCy model, e.g. *en_core_web_trf*
+overall_similarity_threshold -- the overall similarity threshold for embedding-based
+ matching. Defaults to *1.0*, which deactivates embedding-based matching. Note that this
+ parameter is not relevant for topic matching, where the thresholds for embedding-based
+ matching are set on the call to *topic_match_documents_against*.
+embedding_based_matching_on_root_words -- determines whether or not embedding-based
matching should be attempted on search-phrase root tokens, which has a considerable
- performance hit. Defaults to 'False'.
-ontology -- an 'Ontology' object. Defaults to 'None' (no ontology).
+ performance hit. Defaults to *False*. Note that this parameter is not relevant for topic
+ matching.
+ontology -- an *Ontology* object. Defaults to *None* (no ontology).
analyze_derivational_morphology -- *True* if matching should be attempted between different
words from the same word family. Defaults to *True*.
-perform_coreference_resolution -- 'True', 'False', or 'None' if coreference resolution
- should be performed depending on whether the model supports it. Defaults to 'None'.
-debug -- a boolean value specifying whether debug representations should
-be outputted for parsed sentences. Defaults to 'False'.
-```
-
-``` {.python}
-Manager.parse_and_register_document(self, document_text, label='')
-
-Parameters:
-
-document_text -- the raw document text.
-label -- a label for the document which must be unique. Defaults to the
- empty string, which is intended for use cases where single documents
- (user entries) are matched to predefined search phrases.
+perform_coreference_resolution -- *True* if coreference resolution should be taken into account
+ when matching. Defaults to *True*.
+use_reverse_dependency_matching -- *True* if appropriate dependencies in documents can be
+ matched to dependencies in search phrases where the two dependencies point in opposite
+ directions. Defaults to *True*.
+number_of_workers -- the number of worker processes to use, or *None* if the number of worker
+    processes should depend on the number of available cores. Defaults to *None*.
+verbose -- a boolean value specifying whether multiprocessing messages should be outputted to
+    the console. Defaults to *False*.
```
``` {.python}
-Manager.register_parsed_document(self, document, label='')
+Manager.register_serialized_document(self, serialized_document:bytes, label:str) -> None
Parameters:
-document -- a preparsed Holmes document.
-label -- a label for the document which must be unique. Defaults to the
- empty string, which is intended for the chatbot use case where single documents
- (user entries) are matched to predefined search phrases.
+serialized_document -- a Holmes document serialized using the *serialize_document()* function.
+label -- a label for the document which must be unique. Defaults to the empty string,
+ which is intended for use cases involving single documents (typically user entries).
```
+
``` {.python}
-Manager.deserialize_and_register_document(self, document, label='')
+Manager.register_serialized_documents(self, document_dictionary:dict[str, bytes]) -> None
-Raises a 'WrongModelDeserializationError' if the model used to parse the serialized
- document does not correspond to the model with which this Manager object was created.
+Note that this function is the most efficient way of loading documents.
Parameters:
-document -- a Holmes document serialized using the
- 'serialize_document()' function.
-label -- a label for the document which must be unique. Defaults to the
- empty string, which is intended for the chatbot use case where single documents
- (user entries) are matched to predefined search phrases.
+document_dictionary -- a dictionary from labels to serialized documents.
```
``` {.python}
-Manager.remove_document(self, label)
+Manager.parse_and_register_document(self, document_text:str, label:str='') -> None
Parameters:
-label -- the label of the document to be removed.
-```
-
-``` {.python}
-Manager.remove_all_documents(self)
+document_text -- the raw document text.
+label -- a label for the document which must be unique. Defaults to the empty string,
+ which is intended for use cases involving single documents (typically user entries).
```
``` {.python}
-Manager.remove_all_search_phrases(self)
+Manager.remove_document(self, label:str) -> None
```
``` {.python}
-Manager.remove_all_search_phrases_with_label(self, label)
+Manager.remove_all_documents(self) -> None
```
``` {.python}
-Manager.document_labels(self)
+Manager.document_labels(self) -> list[str]
Returns a list of the labels of the currently registered documents.
```
-
``` {.python}
-Manager.serialize_document(self, label)
+Manager.serialize_document(self, label:str) -> bytes
Returns a serialized representation of a Holmes document that can be
persisted to a file. If 'label' is not the label of a registered document,
@@ -1388,125 +1299,120 @@ label -- the label of the document to be serialized.
```
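+
+As a sketch, a registered document could be persisted and reloaded later via
+*register_serialized_document()* (assuming a document was previously registered under the
+label 'my_doc'; the file name is invented):
+
+``` {.python}
+# Serialize the registered document and write the bytes to disk.
+with open('my_doc.bin', 'wb') as file:
+    file.write(manager.serialize_document('my_doc'))
+
+# Later, perhaps in a new process: reload the bytes and re-register the document.
+with open('my_doc.bin', 'rb') as file:
+    manager.register_serialized_document(file.read(), 'my_doc')
+```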
``` {.python}
-Manager.register_search_phrase(self, search_phrase_text, label=None)
+Manager.get_document(self, label:str='') -> Doc
+
+Returns a Holmes document. If *label* is not the label of a registered document, *None*
+ is returned instead.
Parameters:
-search_phrase_text -- the raw search phrase text.
-label -- a label for the search phrase which need not be unique.
- If label==None, the assigned label defaults to the raw search phrase text.
+label -- the label of the document to be returned.
```
-
+
``` {.python}
-Manager.match(self)
+Manager.debug_document(self, label:str='') -> Doc
-Matches the registered search phrases to the registered documents.
- Returns a list of Match objects sorted by their overall similarity
- measures in descending order. Should be called by applications wishing
- to retain references to the spaCy and Holmes information that was used
- to derive the matches.
-```
+Outputs a debug representation for a loaded document.
-``` {.python}
-Manager.match_returning_dictionaries(self)
+Parameters:
-Matches the registered search phrases to the registered documents.
- Returns a list of dictionaries describing any matches, sorted by their
- overall similarity measures in descending order. Callers of this method
- do not have to manage any further dependencies on spaCy or Holmes.
+label -- the label of the document whose debug representation should be outputted.
```
-
``` {.python}
-Manager.match_search_phrases_against(self, entry)
+Manager.register_search_phrase(self, search_phrase_text:str, label:str=None) -> SearchPhrase
-Matches the registered search phrases against a single document
- supplied to the method and returns dictionaries describing any matches.
-```
+Registers and returns a new search phrase.
+Parameters:
+
+search_phrase_text -- the raw search phrase text.
+label -- a label for the search phrase, which need not be unique.
+ If label==None, the assigned label defaults to the raw search phrase text.
+```
``` {.python}
-Manager.match_documents_against(self, search_phrase)
+Manager.remove_all_search_phrases_with_label(self, label:str) -> None
+```
-Matches the registered documents against a single search phrase
- supplied to the method and returns dictionaries describing any matches.
+```
+Manager.remove_all_search_phrases(self) -> None
```
-
+```
+Manager.list_search_phrase_labels(self) -> list[str]
+```
+
+
``` {.python}
-topic_match_documents_against(self, text_to_match, *, maximum_activation_distance=75,
- relation_score=30, reverse_only_relation_score = 20, single_word_score=5,
- single_word_any_tag_score=2, overlapping_relation_multiplier=1.5,
- embedding_penalty=0.6, ontology_penalty=0.9,
- maximum_number_of_single_word_matches_for_relation_matching = 500,
- maximum_number_of_single_word_matches_for_embedding_matching = 100,
- sideways_match_extent=100, only_one_result_per_document=False,
- number_of_results=10, document_label_filter=None):
+Manager.match(self, search_phrase_text:str=None, document_text:str=None) -> list[dict]
-Returns the results of a topic match between an entered text and the loaded documents.
+Matches search phrases to documents and returns the result as match dictionaries.
Parameters:
-text_to_match -- the text to match against the loaded documents.
-maximum_activation_distance -- the number of words it takes for a previous
- phraselet activation to reduce to zero when the library is reading through a document.
-relation_score -- the activation score added when a normal two-word relation is matched.
-reverse_only_relation_score -- the activation score added when a two-word relation
- is matched using a search phrase that can only be reverse-matched.
-single_word_score -- the activation score added when a normal single word is matched.
-single_word_any_tag_score -- the activation score added when a single word is matched
- whose tag did not correspond to the template specification.
-overlapping_relation_multiplier -- the value by which the activation score is multiplied
- when two relations were matched and the matches involved a common document word.
-embedding_penalty -- a value between 0 and 1 with which scores are multiplied when the
- match involved an embedding. The result is additionally multiplied by the overall
- similarity measure of the match.
-ontology_penalty -- a value between 0 and 1 with which scores are multiplied for each
- word match within a match that involved the ontology. For each such word match,
- the score is multiplied by the value (abs(depth) + 1) times, so that the penalty is
- higher for hyponyms and hypernyms than for synonyms and increases with the
- depth distance.
-maximum_number_of_single_word_matches_for_relation_matching -- the maximum number
- of single word matches that are used as the basis for matching relations. If more
- document words than this value correspond to each of the two words within a
- relation phraselet, matching on the phraselet is not attempted.
-maximum_number_of_single_word_matches_for_embedding_matching = the maximum number
- of single word matches that are used as the basis for matching with
- embeddings at the other word. If more than this value exist, matching with
- embeddings is not attempted because the performance hit would be too great.
-sideways_match_extent -- the maximum number of words that may be incorporated into a
- topic match either side of the word where the activation peaked.
-only_one_result_per_document -- if 'True', prevents multiple results from being returned
- for the same document.
-number_of_results -- the number of topic match objects to return.
-document_label_filter -- optionally, a string with which document labels must start to
- be considered for inclusion in the results.
+search_phrase_text -- a text from which to generate a search phrase, or *None* if the
+ preloaded search phrases should be used for matching.
+document_text -- a text from which to generate a document, or *None* if the preloaded
+ documents should be used for matching.
```
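+
+The following sketch shows both ways of calling *match()*: against preloaded documents and
+search phrases, and ad hoc with texts supplied directly (the example texts are invented):
+
+``` {.python}
+# Preload a document and a search phrase, then match them.
+manager.parse_and_register_document('The dog chased the cat.', 'doc1')
+manager.register_search_phrase('A dog chases a cat')
+for match_dict in manager.match():
+    print(match_dict['document'], match_dict['search_phrase_label'])
+
+# Alternatively, match two texts ad hoc without preloading anything.
+ad_hoc_matches = manager.match(search_phrase_text='A dog chases a cat',
+                               document_text='The dog chased the cat.')
+```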
+
``` {.python}
-topic_match_documents_returning_dictionaries_against(self, text_to_match, *,
- maximum_activation_distance=75, relation_score=30, reverse_only_relation_score = 20,
- single_word_score=5, single_word_any_tag_score=2, overlapping_relation_multiplier=1.5,
- embedding_penalty=0.6, ontology_penalty=0.9,
- maximum_number_of_single_word_matches_for_relation_matching = 500,
- maximum_number_of_single_word_matches_for_embedding_matching = 100,
- sideways_match_extent=100, only_one_result_per_document=False, number_of_results=10,
- document_label_filter=None, tied_result_quotient=0.9):
-
-Returns a list of dictionaries representing the results of a topic match between an entered text and the loaded
- documents. Callers of this method do not have to manage any further dependencies on spaCy or Holmes.
+topic_match_documents_against(self, text_to_match:str, *,
+ use_frequency_factor:bool=True,
+ maximum_activation_distance:int=75,
+ word_embedding_match_threshold:float=0.8,
+ initial_question_word_embedding_match_threshold:float=0.7,
+ relation_score:int=300,
+ reverse_only_relation_score:int=200,
+ single_word_score:int=50,
+ single_word_any_tag_score:int=20,
+ initial_question_word_answer_score:int=600,
+ initial_question_word_behaviour:str='process',
+ different_match_cutoff_score:int=15,
+ overlapping_relation_multiplier:float=1.5,
+ embedding_penalty:float=0.6,
+ ontology_penalty:float=0.9,
+ relation_matching_frequency_threshold:float=0.25,
+ embedding_matching_frequency_threshold:float=0.5,
+ sideways_match_extent:int=100,
+ only_one_result_per_document:bool=False,
+ number_of_results:int=10,
+ document_label_filter:str=None,
+ tied_result_quotient:float=0.9) -> list[dict]:
+
+Returns a list of dictionaries representing the results of a topic match between an entered text
+and the loaded documents.
-Parameters:
+Parameters:
text_to_match -- the text to match against the loaded documents.
-maximum_activation_distance -- the number of words it takes for a previous
- phraselet activation to reduce to zero when the library is reading through a document.
+use_frequency_factor -- *True* if scores should be multiplied by a factor between 0 and 1
+ expressing how rare the words matching each phraselet are in the corpus. Note that,
+ even if set to *False*, the factors are still calculated as they are required for
+ determining which relation and embedding matches should be attempted.
+maximum_activation_distance -- the number of words it takes for a previous phraselet
+ activation to reduce to zero when the library is reading through a document.
+word_embedding_match_threshold -- the cosine similarity above which two words match where
+ the search phrase word does not govern an interrogative pronoun.
+initial_question_word_embedding_match_threshold -- the cosine similarity above which two
+ words match where the search phrase word governs an interrogative pronoun.
relation_score -- the activation score added when a normal two-word relation is matched.
reverse_only_relation_score -- the activation score added when a two-word relation
is matched using a search phrase that can only be reverse-matched.
single_word_score -- the activation score added when a normal single word is matched.
single_word_any_tag_score -- the activation score added when a single word is matched
- whose tag did not correspond to the template specification.
+ whose tag would not normally allow it to be matched by phraselets.
+initial_question_word_answer_score -- the activation score added when a question word is
+ matched to a potential answer phrase.
+initial_question_word_behaviour -- 'process' if a question word in the sentence
+ constituent at the beginning of *text_to_match* is to be matched to document phrases
+ that answer it; 'exclusive' if only topic matches that involve such question words
+ are to be permitted; 'ignore' if question words are to be ignored.
+different_match_cutoff_score -- the activation threshold under which topic matches are
+ separated from one another. Note that the default value will probably be too low if
+ *use_frequency_factor* is set to *False*.
overlapping_relation_multiplier -- the value by which the activation score is multiplied
when two relations were matched and the matches involved a common document word.
embedding_penalty -- a value between 0 and 1 with which scores are multiplied when the
@@ -1517,14 +1423,11 @@ ontology_penalty -- a value between 0 and 1 with which scores are multiplied for
the score is multiplied by the value (abs(depth) + 1) times, so that the penalty is
higher for hyponyms and hypernyms than for synonyms and increases with the
depth distance.
-maximum_number_of_single_word_matches_for_relation_matching -- the maximum number
- of single word matches that are used as the basis for matching relations. If more
- document words than this value correspond to each of the two words within a
- relation phraselet, matching on the phraselet is not attempted.
-maximum_number_of_single_word_matches_for_embedding_matching = the maximum number
- of single word matches that are used as the basis for matching with
- embeddings at the other word. If more than this value exist, matching with
- embeddings is not attempted because the performance hit would be too great.
+relation_matching_frequency_threshold -- the frequency threshold above which single
+ word matches are used as the basis for attempting relation matches.
+embedding_matching_frequency_threshold -- the frequency threshold above which single
+ word matches are used as the basis for attempting relation matches with
+ embedding-based matching on the second word.
sideways_match_extent -- the maximum number of words that may be incorporated into a
topic match either side of the word where the activation peaked.
only_one_result_per_document -- if 'True', prevents multiple results from being returned
@@ -1537,8 +1440,9 @@ tied_result_quotient -- the quotient between a result and following results abov
```
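+
+A minimal sketch of a topic matching call against preloaded documents, using a few of the
+parameters above (the query text is invented):
+
+``` {.python}
+topic_match_dicts = manager.topic_match_documents_against(
+    'A dog chased a cat',
+    number_of_results=5,
+    only_one_result_per_document=True,
+    word_embedding_match_threshold=0.8)
+for topic_match_dict in topic_match_dicts:
+    print(topic_match_dict['rank'], topic_match_dict['document_label'],
+          topic_match_dict['text'])
+```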
``` {.python}
-Manager.get_supervised_topic_training_basis(self, *, classification_ontology=None,
- overlap_memory_size=10, oneshot=True, match_all_words=False, verbose=True)
+Manager.get_supervised_topic_training_basis(self, *, classification_ontology:Ontology=None,
+ overlap_memory_size:int=10, oneshot:bool=True, match_all_words:bool=False,
+ verbose:bool=True) -> SupervisedTopicTrainingBasis:
Returns an object that is used to train and generate a model for the
supervised document classification use case.
@@ -1558,14 +1462,16 @@ verbose -- if 'True', information about training progress is outputted to the co
```
``` {.python}
-Manager.deserialize_supervised_topic_classifier(self, serialized_model)
+Manager.deserialize_supervised_topic_classifier(self,
+ serialized_model:str, verbose:bool=False) -> SupervisedTopicClassifier:
Returns a classifier for the supervised document classification use case
that will use a supplied pre-trained model.
Parameters:
-serialized_model -- the pre-trained model.
+serialized_model -- the pre-trained model as returned from `SupervisedTopicClassifier.serialize_model()`.
+verbose -- if 'True', information about matching is outputted to the console.
```
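+
+As a sketch, a previously serialized model could be reloaded and used for classification as
+follows (the file name is invented; *parse_and_classify()* is documented below):
+
+``` {.python}
+# Load a model previously saved via SupervisedTopicClassifier.serialize_model().
+with open('classifier_model.txt') as file:
+    classifier = manager.deserialize_supervised_topic_classifier(file.read())
+
+# Classify a new document; zero, one or many labels may be returned.
+print(classifier.parse_and_classify('The dog chased the cat.'))
+```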
``` {.python}
@@ -1579,94 +1485,38 @@ Starts a chatbot mode console enabling the matching of pre-registered
``` {.python}
Manager.start_structural_search_mode_console(self)
-Starts a search mode console enabling the matching of pre-registered
+Starts a structural extraction mode console enabling the matching of pre-registered
documents to search phrases entered ad-hoc by the user.
```
``` {.python}
Manager.start_topic_matching_search_mode_console(self,
- only_one_result_per_document=False,
- maximum_number_of_single_word_matches_for_relation_matching=500,
- maximum_number_of_single_word_matches_for_embedding_matching=100)
+ only_one_result_per_document:bool=False, word_embedding_match_threshold:float=0.8,
+ initial_question_word_embedding_match_threshold:float=0.7):
-Starts a topic mode console enabling the matching of pre-registered
- documents to search texts entered ad-hoc by the user.
+Starts a topic matching search mode console enabling the matching of pre-registered
+ documents to query phrases entered ad-hoc by the user.
Parameters:
only_one_result_per_document -- if 'True', prevents multiple topic match
results from being returned for the same document.
-maximum_number_of_single_word_matches_for_relation_matching -- the maximum number
- of single word matches that are used as the basis for matching relations. If more
- document words than this value correspond to each of the two words within a
- relation phraselet, matching on the phraselet is not attempted.
-maximum_number_of_single_word_matches_for_embedding_matching = the maximum number
- of single word matches that are used as the basis for matching with
- embeddings at the other word. If more than this value exist, matching with
- embeddings is not attempted because the performance hit would be too great.
-```
-
-
-#### 6.2 `MultiprocessingManager`
-
-For details of the `MultiprocessingManager.document_labels()`,
-`MultiprocessingManager.topic_match_documents_returning_dictionaries_against()` and
-`MultiprocessingManager.start_topic_matching_search_mode_console()` methods, see the similarly named
-methods of the [Manager](#manager) class.
-
-``` {.python}
-holmes_extractor.MultiprocessingManager(self, model, *,
- overall_similarity_threshold=1.0, embedding_based_matching_on_root_words=False,
- ontology=None, analyze_derivational_morphology=True, perform_coreference_resolution=None,
- debug=False, verbose=True, number_of_workers=None):
-
-The facade class for the Holmes library used in a multiprocessing environment.
- This class is threadsafe.
-
-Parameters:
-
-model -- the name of the spaCy model, e.g. 'en_core_web_lg'
-overall_similarity_threshold -- the overall similarity threshold for
- embedding-based matching. Defaults to '1.0', which deactivates
- embedding-based matching.
-embedding_based_matching_on_root_words -- determines whether or not embedding-based
- matching should be attempted on search-phrase root tokens, which has a considerable
- performance hit. Defaults to 'False'.
-ontology -- an 'Ontology' object. Defaults to 'None' (no ontology).
-analyze_derivational_morphology -- *True* if matching should be attempted between different
- words from the same word family. Defaults to *True*.
-perform_coreference_resolution -- 'True', 'False', or 'None' if coreference resolution
- should be performed depending on whether the model supports it. Defaults to 'None'.
-debug -- a boolean value specifying whether debug representations should
- be outputted for parsed sentences. Defaults to 'False'.
-verbose -- a boolean value specifying whether status messages should be outputted
- to the console. Defaults to *True*
-number_of_workers -- the number of worker processes to use, or *None* if the number of worker
- processes should depend on the number of available cores. Defaults to *None*
+word_embedding_match_threshold -- the cosine similarity above which two words match where the
+ search phrase word does not govern an interrogative pronoun.
+initial_question_word_embedding_match_threshold -- the cosine similarity above which two
+ words match where the search phrase word governs an interrogative pronoun.
```
``` {.python}
-MultiprocessingManager.parse_and_register_documents(self, document_dictionary)
-
-Parameters:
-
-document_dictionary -- a dictionary from unique document labels to raw document texts.
-```
-
-``` {.python}
-MultiprocessingManager.deserialize_and_register_documents(self, serialized_document_dictionary)
-
-Parameters:
+Manager.close(self) -> None
-serialized_document_dictionary -- a dictionary from unique document labels to
- documents serialized using the *Manager.serialize_document()* method.
+Terminates the worker processes.
```
-``` {.python}
-MultiprocessingManager.close(self)
+
+#### 6.2 `manager.nlp`
-Shut down all processes associated with this instance.
-```
+`manager.nlp` is the underlying spaCy [Language](https://spacy.io/api/language/) object on which both Coreferee and Holmes have been registered as custom pipeline components. The most efficient way of parsing documents for use with Holmes is to call [`manager.nlp.pipe()`](https://spacy.io/api/language/#pipe). This yields an iterable of documents that can then be loaded into Holmes via [`manager.register_serialized_documents()`](#manager-register-serialized-documents-function).
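+
+A minimal sketch of this workflow, assuming, per the signature documented above, that the
+parsed documents can be passed to *register_serialized_documents()* directly (the example
+texts are invented):
+
+``` {.python}
+labels_to_texts = {'doc1': 'The dog chased the cat.',
+                   'doc2': 'The cat chased the mouse.'}
+
+# Parse all texts efficiently with spaCy's pipe(), then register the parsed documents
+# under their labels.
+parsed_docs = manager.nlp.pipe(list(labels_to_texts.values()))
+manager.register_serialized_documents(dict(zip(labels_to_texts.keys(), parsed_docs)))
+```
+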
#### 6.3 `Ontology`
@@ -1696,7 +1546,8 @@ Matching is case-insensitive.
Parameters:
-ontology_path -- the path from where the ontology is to be loaded, or a list of several such paths. See https://github.com/RDFLib/rdflib/.
+ontology_path -- the path from where the ontology is to be loaded,
+or a list of several such paths. See https://github.com/RDFLib/rdflib/.
owl_class_type -- optionally overrides the OWL 2 URL for types.
owl_individual_type -- optionally overrides the OWL 2 URL for individuals.
owl_type_link -- optionally overrides the RDF URL for types.
@@ -1706,13 +1557,14 @@ symmetric_matching -- if 'True', means hypernym relationships are also taken int
```
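+
+A hedged sketch of loading an ontology from an OWL file and passing it to a *Manager* (the
+file name is invented):
+
+``` {.python}
+ontology = holmes.Ontology('animals_ontology.owl')
+ontology_manager = holmes.Manager(model='en_core_web_lg', ontology=ontology)
+```
+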
-#### 6.4 `SupervisedTopicTrainingBasis` (returned from `Manager.get_supervised_topic_training_basis`)
+#### 6.4 `SupervisedTopicTrainingBasis` (returned from `Manager.get_supervised_topic_training_basis()`)
Holder object for training documents and their classifications from which one or more
-[SupervisedTopicModelTrainer](#supervised-topic-model-trainer) objects can be derived.
+[SupervisedTopicModelTrainer](#supervised-topic-model-trainer) objects can be derived. This class is NOT threadsafe.
``` {.python}
-SupervisedTopicTrainingBasis.parse_and_register_training_document(self, text, classification, label=None)
+SupervisedTopicTrainingBasis.parse_and_register_training_document(self, text, classification,
+ label=None)
Parses and registers a document to use for training.
@@ -1757,11 +1609,12 @@ Matches the phraselets derived from the training documents against the training
or additional classification labels.
```
+
``` {.python}
-SupervisedTopicTrainingBasis.train(self, *, minimum_occurrences=4, cv_threshold=1.0, mlp_activation='relu',
- mlp_solver='adam', mlp_learning_rate='constant', mlp_learning_rate_init=0.001,
- mlp_max_iter=200, mlp_shuffle=True, mlp_random_state=42, oneshot=True,
- overlap_memory_size=10, hidden_layer_sizes=None):
+SupervisedTopicTrainingBasis.train(self, *, minimum_occurrences=4, cv_threshold=1.0,
+ mlp_activation='relu', mlp_solver='adam', mlp_learning_rate='constant',
+ mlp_learning_rate_init=0.001, mlp_max_iter=200, mlp_shuffle=True, mlp_random_state=42,
+ hidden_layer_sizes=None):
Trains a model based on the prepared state.
@@ -1774,10 +1627,6 @@ cv_threshold -- the minimum coefficient of variation with which a word or relati
to occur across the explicit classification labels for the phraselet to be
accepted into the final model.
mlp_* -- see https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html.
-oneshot -- whether the same word or relationship matched multiple times within a single
- document should be counted once only (value 'True') or multiple times (value 'False')
-overlap_memory_size -- No longer has any effect - the value defined in __init__()
- is used instead. Retained for backwards compatibility.
hidden_layer_sizes -- a list where each entry is the size of a hidden layer, or 'None'
if the topology should be determined automatically.
```
@@ -1805,7 +1654,7 @@ can be serialized.
SupervisedTopicClassifier.parse_and_classify(self, text)
Returns a list containing zero, one or many document classifications. Where more
-than one classifications are returned, the labels are ordered by decreasing
+than one classification is returned, the labels are ordered by decreasing
probability.
Parameters:
@@ -1817,7 +1666,7 @@ text -- the text to parse and classify.
SupervisedTopicClassifier.classify(self, doc)
Returns a list containing zero, one or many document classifications. Where more
-than one classifications are returned, the labels are ordered by decreasing
+than one classification is returned, the labels are ordered by decreasing
probability.
Parameters:
@@ -1826,107 +1675,23 @@ doc -- the pre-parsed document to classify.
```
``` {.python}
-SupervisedTopicClassifier.serialize_model(self)
-```
-
-``` {.python}
-SupervisedTopicClassifier.deserialize_model(self, serialized_model)
-```
-
-
-#### 6.7 `Match` (returned from `Manager.match()`)
-
-``` {.python}
-A match between a search phrase and a document. The indexes refer to words.
-
-Externally relevant properties:
-
-search_phrase_label -- the label of the search phrase that matched.
-document_label -- the label of the document that matched.
-is_negated -- 'True' if this match is negated.
-is_uncertain -- 'True' if this match is uncertain.
-involves_coreference -- 'True' if this match was found using
- coreference resolution.
-overall_similarity_measure -- the overall similarity of the match, or
- '1.0' if embedding-based matching was not involved in the match.
-word_matches -- a list of WordMatch objects.
-index_within_document -- the index of the document token that matched
- the search phrase root token.
-```
-
-
-#### 6.8 `WordMatch` (returned from `Manager.match().word_matches`)
-
-``` {.python}
-A match between a searched phrase word and a document word.
+SupervisedTopicClassifier.serialize_model(self) -> str
-Properties:
-
-search_phrase_token -- the spaCy token from the search phrase.
-search_phrase_word -- the string that matched from the search phrase.
-document_token -- the spaCy token from the document.
-first_document_token -- the first token that matched from the document, which will equal
- 'document_token' except with multiword matches.
-last_document_token -- the lst token that matched from the document, which will equal
- 'document_token' except with multiword matches.
-document_word -- the string that matched from the document.
-document_subword -- the subword from the token that matched, or *None* if the match was
- with the whole token.
-type -- 'direct', 'entity', 'embedding' or 'ontology'.
-similarity_measure -- for type 'embedding', the similarity between the
- two tokens, otherwise '1.0'.
-is_negated -- 'True' if this word match leads to a match of which it
- is a part being negated.
-is_uncertain -- 'True' if this word match leads to a match of which it
- is a part being uncertain.
-structurally_matched_document_token -- the spaCy token from the document that matched
- the parent dependencies, which may be different from 'document_token' if coreference
- resolution is active.
-involves_coreference -- 'True' if 'document_token' and
- 'structurally_matched_document_token' are different.
-extracted_word -- within the coreference chain, the most specific term that corresponded to
- 'document_word' in the ontology.
-depth -- the number of hyponym relationships linking 'search_phrase_word' and
- 'extracted_word', or '0' if ontology-based matching is not active. Can be negative
- if symmetric matching is active.
-explain() -- returns a human-readable explanation of the word match from the perspective of the
- document word (e.g. to be used as a tooltip over it).
-```
-
-
-#### 6.9 `Subword` (returned from `word_match.subword`)
-
-``` {.python}
-A semantically atomic part of a word. Currently only used for German.
-
-containing_token_index -- the index of the containing token within the document.
-index -- the index of the subword within the word.
-text -- the original subword string.
-lemma -- the model-normalized representation of the subword string.
-derived_lemma -- where relevant, another lemma with which *lemma* is derivationally related
-and which can also be useful for matching in some usecases; otherwise *None*
-char_start_index -- the character index of the subword within the containing word.
-is_head -- **True**, if this subword is the head within its word, **False** otherwise.
-dependent_index -- the index of a subword that is dependent on this subword, or *None*
- if there is no such subword.
-dependency_label -- the label of the dependency between this subword and its dependent,
- or *None* if it has no dependent.
-governor_index -- the index of a subword on which this subword is dependent, or *None*
- if there is no such subword.
-governing_dependency_label -- the label of the dependency between this subword and its
- governor, or *None* if it has no governor.
+Returns a serialized model that can be reloaded using
+ *Manager.deserialize_supervised_topic_classifier()*.
```
-#### 6.10 Dictionary returned from `Manager.match_returning_dictionaries()`)
+#### 6.7 Dictionary returned from `Manager.match()`
``` {.python}
A text-only representation of a match between a search phrase and a
-document. The indexes refer to words.
+document. The indexes refer to tokens.
Properties:
-search_phrase -- the label of the search phrase.
+search_phrase_label -- the label of the search phrase.
+search_phrase_text -- the text of the search phrase.
document -- the label of the document.
index_within_document -- the index of the match within the document.
sentences_within_document -- the raw text of the sentences within the document that matched.
@@ -1937,51 +1702,45 @@ overall_similarity_measure -- the overall similarity of the match, or
'1.0' if embedding-based matching was not involved in the match.
word_matches -- an array of dictionaries with the properties:
+ search_phrase_token_index -- the index of the token that matched from the search phrase.
search_phrase_word -- the string that matched from the search phrase.
+ document_token_index -- the index of the token that matched within the document.
+ first_document_token_index -- the index of the first token that matched within the document.
+ Identical to 'document_token_index' except where the match involves a multiword phrase.
+ last_document_token_index -- the index of the last token that matched within the document
+ (NOT one more than that index). Identical to 'document_token_index' except where the match
+ involves a multiword phrase.
+ structurally_matched_document_token_index -- the index of the token within the document that
+ structurally matched the search phrase token. Is either the same as 'document_token_index' or
+ is linked to 'document_token_index' within a coreference chain.
+ document_subword_index -- the index of the token subword that matched within the document, or
+ 'None' if matching was not with a subword but with an entire token.
+ document_subword_containing_token_index -- the index of the document token that contained the
+ subword that matched, which may be different from 'document_token_index' in situations where a
+ word containing multiple subwords is split by hyphenation and a subword whose sense
+ contributes to a word is not overtly realised within that word.
document_word -- the string that matched from the document.
- document_phrase -- the phrase headed by the word that matched from the
- document.
- match_type -- 'direct', 'entity', 'embedding' or 'ontology'.
- similarity_measure -- for type 'embedding', the similarity between the
+ document_phrase -- the phrase headed by the word that matched from the document.
+ match_type -- 'direct', 'derivation', 'entity', 'embedding', 'ontology' or 'entity_embedding'.
+ negated -- 'True' if this word match is negated.
+ uncertain -- 'True' if this word match is uncertain.
+ similarity_measure -- for types 'embedding' and 'entity_embedding', the similarity between the
two tokens, otherwise '1.0'.
involves_coreference -- 'True' if the word was matched using coreference resolution.
extracted_word -- within the coreference chain, the most specific term that corresponded to
- document_word in the ontology.
+ the document_word.
+ depth -- the number of hyponym relationships linking 'search_phrase_word' and
+ 'extracted_word', or '0' if ontology-based matching is not active. Can be negative
+ if symmetric matching is active.
explanation -- creates a human-readable explanation of the word match from the perspective of the
document word (e.g. to be used as a tooltip over it).
```
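+
+As an illustration, the word-match dictionaries could be used to render tooltips roughly as
+follows (a sketch assuming match dictionaries as returned by *Manager.match()*):
+
+``` {.python}
+for match_dict in manager.match():
+    for word_match in match_dict['word_matches']:
+        # 'explanation' is designed to be shown as a tooltip over the document word.
+        print(word_match['document_word'], '->', word_match['explanation'])
+```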
-
-#### 6.11 `TopicMatch` (returned from `Manager.topic_match_documents_against()`))
-
-``` {.python}
-A topic match between some text and part of a document. The indexes refer to words.
-
-Properties:
-
-document_label -- the document label.
-index_within_document -- the index within the document where 'score' was achieved.
-subword_index -- the index of the subword within the token within the document where 'score'
- was achieved, or *None* if the match involved the whole word.
-start_index -- the start index of the topic match within the document.
-end_index -- the end index of the topic match within the document.
-sentences_start_index -- the start index within the document of the sentence that contains
- 'start_index'.
-sentences_end_index -- the end index within the document of the sentence that contains
- 'end_index'.
-relative_start_index -- the start index of the topic match relative to 'sentences_start_index'.
-relative_end_index -- the end index of the topic match relative to 'sentences_start_index'.
-score -- the similarity score of the topic match.
-text -- the text between 'sentences_start_index' and 'sentences_end_index'.
-structural_matches -- a list of `Match` objects that were used to derive this object.
-```
-
-#### 6.12 Dictionary returned from `Manager.topic_match_documents_returning_dictionaries_against()` and `Manager.topic_match_documents_returning_dictionaries_against()`
+#### 6.8 Dictionary returned from `Manager.topic_match_documents_against()`
``` {.python}
-A text-only representation of a topic match between a search text and a
-document. The indexes refer to characters.
+A text-only representation of a topic match between a search text and a document.
Properties:
@@ -1989,6 +1748,15 @@ document_label -- the label of the document.
text -- the document text that was matched.
text_to_match -- the search text.
rank -- a string representation of the scoring rank which can have the form '2=' in case of a tie.
+index_within_document -- the index of the document token where the activation peaked.
+subword_index -- the index of the subword within the document token where the activation peaked, or
+ 'None' if the activation did not peak at a specific subword.
+start_index -- the index of the first document token in the topic match.
+end_index -- the index of the last document token in the topic match (NOT one more than that index).
+sentences_start_index -- the token start index within the document of the sentence that contains
+ 'start_index'
+sentences_end_index -- the token end index within the document of the sentence that contains
+ 'end_index' (NOT one more than that index).
sentences_character_start_index_in_document -- the character index of the first character of 'text'
within the document.
sentences_character_end_index_in_document -- one more than the character index of the last
@@ -1998,13 +1766,21 @@ word_infos -- an array of arrays with the semantics:
[0] -- 'relative_start_index' -- the index of the first character in the word relative to
'sentences_character_start_index_in_document'.
- [1] -- 'relative_end_index' -- one more than the index of the last character in the word relative to
- 'sentences_character_start_index_in_document'.
- [2] -- 'type' -- 'single' for a single-word match, 'relation' if within a relation match involving two words,
- 'overlapping_relation' if within a relation match involving three or more words.
- [3] -- 'is_highest_activation' -- 'True' if this was the word at which the highest activation score reported in 'score' was achieved, otherwise 'False'.
- [4] -- 'explanation' -- a human-readable explanation of the word match from the perspective of the
- document word (e.g. to be used as a tooltip over it).
+ [1] -- 'relative_end_index' -- one more than the index of the last character in the word
+ relative to 'sentences_character_start_index_in_document'.
+ [2] -- 'type' -- 'single' for a single-word match, 'relation' if within a relation match
+ involving two words, 'overlapping_relation' if within a relation match involving three
+ or more words.
+ [3] -- 'is_highest_activation' -- 'True' if this was the word at which the highest activation
+ score reported in 'score' was achieved, otherwise 'False'.
+ [4] -- 'explanation' -- a human-readable explanation of the word match from the perspective of
+ the document word (e.g. to be used as a tooltip over it).
+
+answers -- an array of arrays with the semantics:
+
+ [0] -- the index of the first character of a potential answer to an initial question word.
+ [1] -- one more than the index of the last character of a potential answer to an initial question
+ word.
```
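+
+As an illustration, the character indexes in 'word_infos' can be used to pick the matched
+words out of 'text' (a sketch assuming *topic_match_dict* is a single result dictionary as
+returned by *Manager.topic_match_documents_against()*):
+
+``` {.python}
+text = topic_match_dict['text']
+for relative_start, relative_end, word_type, is_peak, explanation in \
+        topic_match_dict['word_infos']:
+    # Each entry describes one matched word; 'explanation' can serve as its tooltip.
+    print(text[relative_start:relative_end], word_type, is_peak, explanation)
+```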
@@ -2012,7 +1788,7 @@ word_infos -- an array of arrays with the semantics:
Holmes encompasses several concepts that build on work that the author, Richard
Paul Hudson, carried out as a young graduate and for which his former
-employer, [Definiens](https://www.definiens.com), has since been granted a
+employer, Definiens, has since been granted a
[U.S. patent](https://patents.google.com/patent/US8155946B2/en).
Definiens has kindly permitted the author to publish Holmes under the GNU General Public
License ("GPL"). As long as you abide by the terms of the GPL, this means you can
@@ -2042,17 +1818,29 @@ you are proposing involves the USA in any way.
The word-level matching and the high-level operation of structural
matching between search-phrase and document subgraphs both work more or
less as one would expect. What is perhaps more in need of further
-comment is the semantic analysis code subsumed in the `semantics.py`
-script.
+comment is the semantic analysis code subsumed in the [parsing.py](https://github.com/msg-systems/holmes-extractor/blob/master/holmes_extractor/parsing.py)
+script as well as in the `language_specific_rules.py` script for each
+language.
-`SemanticAnalyzer` is an abstract class that is subclassed for each new
+`SemanticAnalyzer` is an abstract class that is subclassed for each
language: at present by `EnglishSemanticAnalyzer` and
-`GermanSemanticAnalyzer`. At present, all functionality that is common
-to the two languages is realised in the abstract parent class.
+`GermanSemanticAnalyzer`. These classes contain most of the semantic analysis code.
+`SemanticMatchingHelper` is a second abstract class, again with a concrete
+implementation for each language, that contains semantic analysis code
+that is required at matching time. Moving this out to a separate class family
+was necessary because, on operating systems that spawn processes rather
+than forking processes (e.g. Windows), `SemanticMatchingHelper` instances
+have to be serialized when the worker processes are created: this would
+not be possible for `SemanticAnalyzer` instances because not all
+spaCy models are serializable, and would also unnecessarily consume
+large amounts of memory.
+
+At present, all functionality that is common
+to the two languages is realised in the two abstract parent classes.
Especially because English and German are closely related languages, it
is probable that functionality will need to be moved from the abstract
-parent class to specific implementing children classes when new semantic
-analyzers are added for new languages.
+parent classes to specific implementing children classes if and when new
+semantic analyzers are added for new languages.
The `HolmesDictionary` class is defined as a [spaCy extension
attribute](https://spacy.io/usage/processing-pipelines#section-custom-components-attributes)
@@ -2079,9 +1867,9 @@ exclusively for matching and are therefore neither intended nor required
to form a coherent set of linguistic theoretical entities or relationships;
whatever works best for matching is assigned on an ad-hoc basis.
-For each language, the `_matching_dep_dict` dictionary maps search-phrase semantic dependencies to matching
-document semantic dependencies and is responsible for the [asymmetry of matching between search phrases
-and documents](#general-comments).
+For each language, the `match_implication_dict` dictionary maps search-phrase semantic dependencies
+to matching document semantic dependencies and is responsible for the [asymmetry of matching
+between search phrases and documents](#general-comments).
##### 8.1.2 Topic matching
@@ -2089,77 +1877,66 @@ and documents](#general-comments).
Topic matching involves the following steps:
1. The query document or query phrase is parsed and a number of **phraselets**
-are derived from it. Single-word phraselets are extracted for every word (or subword in German) with its own meaning within the query phrase apart from a handful of stop words defined within the semantic analyzer (`SemanticAnalyzer.topic_matching_phraselet_stop_lemmas`), which are
+are derived from it. Single-word phraselets are extracted for every word (or subword in German) with its own meaning within the query phrase apart from a handful of stop words defined within the semantic matching helper (`SemanticMatchingHelper.topic_matching_phraselet_stop_lemmas`), which are
consistently ignored throughout the whole process.
2. Two-word or **relation** phraselets are extracted wherever certain grammatical structures
are found. The structures that trigger two-word phraselets differ from language to language
but typically include verb-subject, verb-object and noun-adjective pairs as well as verb-noun and noun-noun relations spanning prepositions. Each relation phraselet
has a parent (governor) word or subword and a child (governed) word or subword. The relevant
-phraselet structures for a given language are defined in `SemanticAnalyzer.phraselet_templates`.
-3. Phraselet templates where the parent word belongs to a closed word class e.g. prepositions can be defined as 'reverse_only'. This signals that matching with derived templates should only be attempted starting from the child word rather than from the parent word as normal. Phraselets are also defined as reverse-only when the parent word is one of a handful of words defined within the semantic analyzer (`SemanticAnalyzer.topic_matching_reverse_only_parent_lemmas`). This is necessary because
-matching on e.g. a parent preposition would lead to a large number of
+phraselet structures for a given language are defined in `SemanticMatchingHelper.phraselet_templates`.
+3. Both types of phraselet are assigned a **frequency factor** expressing how common or rare their words are in the corpus. Frequency factors are determined using a logarithmic calculation and range from 0.0 (very common) to 1.0 (very rare); an illustrative sketch follows this list. Each word within a relation phraselet is also assigned its own frequency factor.
+4. Phraselet templates where the parent word belongs to a closed word class, e.g. prepositions, can be defined as 'reverse_only'. This signals that matching with derived templates should only be attempted starting from the child word rather than from the parent word as normal. Phraselets are also defined as reverse-only when the parent word is one of a handful of words defined within the semantic matching helper (`SemanticMatchingHelper.topic_matching_reverse_only_parent_lemmas`) or when the frequency factor for the parent word is below the threshold for relation matching (`relation_matching_frequency_threshold`, default: 0.25). These measures are necessary because matching on e.g. a parent preposition would lead to a large number of
potential matches that would take a lot of resources to investigate: it is better to start
investigation from the less frequent word within a given relation.
-4. All single-word phraselets are matched against the document corpus. If words matching the parent
-member of a normal (not reverse-only) phraselet occur more often within the corpus than a certain threshold
-('maximum_number_of_single_word_matches_for_relation_matching'; default: 500), the phraselet is set for
-reverse-matching for the duration of this topic match only.
-5. Normal [structural matching](#how-it-works-structural-matching) is used to match against the document corpus all relation phraselets
+5. All single-word phraselets are matched against the document corpus.
+6. Normal [structural matching](#how-it-works-structural-matching) is used to match against the document corpus all relation phraselets
that are not set to reverse-matching.
-6. Reverse matching starts at all words in the corpus that match a relation phraselet child word. Every
-word governing one of these words is a potential match for the corresponding relation phraselet parent word, so
-structural matching is attempted starting at all these parent words. Reverse matching is only attempted
-for reverse-only relation phraselets where the child word occurs less frequently in the corpus
-that the 'maximum_number_of_single_word_matches_for_relation_matching' threshold.
-7. If either the parent or the child word of a relation template has been matched less frequently
-than a certain threshold within the corpus ('maximum_number_of_single_word_matches_for_embedding_matching';
-default: 100), matching at all of those words where the relation template has not already been
-matched is retried using embeddings at the other word within the relation. This is only relevant
-if the manager was started with `overall_similarity_threshold < 1.0`.
-8. The set of structural matches collected up to this point is filtered to cover cases where the same
+7. Reverse matching starts at all words in the corpus that match a relation phraselet child word. Every word governing one of these words is a potential match for the corresponding relation phraselet parent word, so structural matching is attempted starting at all these parent words. Reverse matching is only attempted for reverse-only relation phraselets where the child word's frequency factor is above the threshold for relation matching (`relation_matching_frequency_threshold`, default: 0.25).
+8. If either the parent or the child word of a relation template has a frequency factor above a configurable threshold (`embedding_matching_frequency_threshold`, default: 0.5), matching at all of those words where the relation template has not already been
+matched is retried using embeddings at the other word within the relation. A pair of words is then regarded as matching when their mutual cosine similarity is above `initial_question_word_embedding_match_threshold` (default: 0.7) in situations where the document word has an initial question word in its phrase or `word_embedding_match_threshold` (default: 0.8) in all other situations.
+9. The set of structural matches collected up to this point is filtered to cover cases where the same
document words were matched by multiple phraselets, where multiple sibling words have been matched by the same
phraselet where one sibling has a higher [embedding-based similarity](#embedding-based-matching) than the
other, and where a phraselet has matched multiple words that [corefer](#coreference-resolution) with one another.
-9. Each document is scanned from beginning to end and a psychologically inspired **activation score**
+10. Each document is scanned from beginning to end and a psychologically inspired **activation score**
is determined for each word in each document.
- - In contrast to Holmes versions < 2.1.0, activation is now tracked separately for each phraselet. Each time
+ - Activation is tracked separately for each phraselet. Each time
a match for a phraselet is encountered, the activation for that phraselet is set to the score returned by
- the match, unless the existing activation is already greater than that score.
+  the match, unless the existing activation is already greater than that score. If the parameter `use_frequency_factor` is set to `True` (the default), each score is scaled by the frequency factor of its phraselet, meaning that words that occur less frequently in the corpus give rise to higher scores.
- For as long as the activation score for a phraselet has a value above zero, it is reduced by 1 divided by a
- configurable number ('maximum_activation_distance'; default: 75) as each new word is read.
- - The score returned by a match depends on whether the match was produced by a single-word noun phraselet that matched an entire word ('single_word_score'; default: 5), another type of single-word phraselet or a noun phraselet that matched a subword ('single_word_any_tag_score'; default: 2),
- a relation phraselet produced by a reverse-only template ('reverse_only_relation_score'; default: 20) or
- any other (normally matched) relation phraselet ('relation_score'; default: 30).
+ configurable number (`maximum_activation_distance`; default: 75) as each new word is read.
+ - The score returned by a match depends on whether the match was produced by a single-word noun phraselet that matched an entire word (`single_word_score`; default: 50), another type of single-word phraselet or a noun phraselet that matched a subword (`single_word_any_tag_score`; default: 20),
+ a relation phraselet produced by a reverse-only template (`reverse_only_relation_score`; default: 200),
+ any other (normally matched) relation phraselet (`relation_score`; default: 300), or a relation
+ phraselet involving an initial question word (`initial_question_word_answer_score`; default: 600).
- Where a match involves embedding-based matching, the resulting inexactitude is
captured by multiplying the potential new activation score with the value of the
- 'Match.overall_similarity_measure' quotient that was returned for the match multiplied by a penalty value ('embedding_penalty; default: 0.6').
+ similarity measure that was returned for the match multiplied by a penalty value (`embedding_penalty`; default: 0.6).
- Where a match involves ontology-based matching, the resulting inexactitude is captured
- by multiplying the potential new activation score by a penalty value ('ontology_penalty;
+ by multiplying the potential new activation score by a penalty value (`ontology_penalty`;
  default: 0.9) once more often than the difference in depth between the two ontology entries,
i.e. once for a synonym, twice for a child, three times for a grandchild and so on.
- When the same word was involved in matches against more than one two-word phraselets, this
implies that a structure involving three or more words has been matched. The activation score returned by
each match within such a structure is multiplied by a configurable factor
- ('overlapping_relation_multiplier'; default: 1.5).
+ (`overlapping_relation_multiplier`; default: 1.5).
-10. The most relevant passages are then determined by the highest activation score peaks within the documents. Areas to either side of each peak up to a certain distance
-('sideways_match_extent'; default: 100 words) within which the activation score is higher than the number of points
-awarded for a single-word noun phraselet match (default: 5) are regarded as belonging to a contiguous passage around
-the peak that is then returned as a `TopicMatch` object. A word whose activation equals the threshold exactly is included at the beginning of the area as long as the next word where
+11. The most relevant passages are then determined by the highest activation score peaks within the documents. Areas to either side of each peak up to a certain distance
+(`sideways_match_extent`; default: 100 words) within which the activation score is higher than the `different_match_cutoff_score` (default: 15) are regarded as belonging to a contiguous passage around the peak that is then returned as a `TopicMatch` object. (Note that this default will almost certainly turn out to be too low if `use_frequency_factor` is set to `False`.) A word whose activation equals the threshold exactly is included at the beginning of the area as long as the next word where
activation increases has a score above the threshold. If the topic match peak is below the
threshold, the topic match will only consist of the peak word.
-11. Setting `only_one_result_per_document = True` prevents more than one result from being returned from the same
+12. If `initial_question_word_behaviour` is set to `process` (the default) or to `exclusive`, where a document word has [matched an initial question word](#initial-question-word-matching) from the query phrase, the subtree of the matched document word is identified as a potential answer to the question and added to the dictionary to be returned. If `initial_question_word_behaviour` is set to `exclusive`, any topic matches that do not contain answers to initial question words are discarded.
+13. Setting `only_one_result_per_document = True` prevents more than one result from being returned from the same
document; only the result from each document with the highest score will then be returned.
-12. If the results are being returned as dictionaries, the score for each topic match is used to calculate a rank.
-Adjacent topic matches whose scores differ by less than 'tied_result_quotient' (default: 0.9) are labelled as tied.
+14. Adjacent topic matches whose scores differ by less than `tied_result_quotient` (default: 0.9) are labelled as tied.
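+
+To make the frequency factors from step 3 more concrete, the sketch below shows one
+plausible logarithmic scheme; it illustrates only the general behaviour (0.0 for the most
+frequent word in the corpus, 1.0 for a word that occurs only once) rather than the exact
+calculation used by Holmes:
+
+``` {.python}
+import math
+
+def frequency_factor(word_frequency: int, highest_frequency_in_corpus: int) -> float:
+    # Hypothetical illustration only: logarithmic scaling between 0.0 (the most
+    # frequent word in the corpus) and 1.0 (a word that occurs only once).
+    if highest_frequency_in_corpus <= 1:
+        return 1.0
+    return 1.0 - (math.log(word_frequency) / math.log(highest_frequency_in_corpus))
+
+print(frequency_factor(1, 100000))       # very rare word -> 1.0
+print(frequency_factor(100000, 100000))  # most frequent word -> 0.0
+```
+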
##### 8.1.3 Supervised document classification
The supervised document classification use case relies on the same phraselets as the
[topic matching use case](#how-it-works-topic-matching), although reverse-only templates are ignored and
-a different set of stop words is used (`SemanticAnalyzer.supervised_document_classification_phraselet_stop_lemmas`).
+a different set of stop words is used (`SemanticMatchingHelper.supervised_document_classification_phraselet_stop_lemmas`).
Classifiers are built and trained as follows:
1. All phraselets are extracted from all training documents and registered with a structural matcher.
@@ -2175,16 +1952,16 @@ See [here](#improve-performance-of-supervised-document-classification-training)
performance of this step.
3. The results for each phraselet are examined and phraselets are removed from the model that do not play a
statistically significant role in predicting classifications. Phraselets are removed that did not match within
-the documents of any classification a minimum number of times ('minimum_occurrences'; default: 4) or where the
+the documents of any classification a minimum number of times (`minimum_occurrences`; default: 4) or where the
coefficient of variation (the standard deviation divided by the arithmetic mean) of the occurrences across the
-categories is below a [threshold](#supervised-topic-training-basis) ('cv_threshold'; default: 1.0).
+categories is below a threshold (`cv_threshold`; default: 1.0).
4. The phraselets that made it into the model are once again matched against each document. Matches against each
phraselet are used to determine the input values to a multilayer perceptron: the input nodes can either record
occurrence (binary) or match frequency (scalar) (`oneshot==True` vs. `oneshot==False` respectively). The outputs are the
category labels, including any additional labels determined via a classification ontology. By default, the multilayer
perceptron has three hidden layers where the first hidden layer has the same number of neurons as the input layer and
the second and third layers have sizes in between the input and the output layer with an equally sized step between
-each size; the user is however [free to specify any other topology](#supervised-topic-training-basis).
+each size; the user is however [free to specify any other topology](#supervised-topic-training-basis-train).
5. The resulting model is serializable, i.e. can be saved and reloaded.
6. When a new document is classified, the output
is zero, one or many suggested classifications; when more than one classification is suggested, the classifications
@@ -2199,7 +1976,7 @@ the complexity of some of the code, Holmes adheres to a 100-character
rather than an 80-character line width as permitted as an option there.
The complexity of what Holmes does makes development impossible without
-a robust set of over 1100 regression tests. These can be executed individually
+a robust set of over 1350 regression tests. These can be executed individually
with `unittest` or all at once by running the
[pytest](https://docs.pytest.org/en/latest/) utility from the Holmes
source code root directory. (Note that the Python 3 command on Linux
@@ -2209,9 +1986,9 @@ The `pytest` variant will only work on machines with sufficient memory resources
reduce this problem, the tests are distributed across three subdirectories, so that
`pytest` can be run three times, once from each subdirectory:
-- [en](https://github.com/msg-systems/holmes-extractor/blob/master/holmes_extractor/tests/en): tests relating to English
-- [de](https://github.com/msg-systems/holmes-extractor/blob/master/holmes_extractor/tests/de): tests relating to German
-- [common](https://github.com/msg-systems/holmes-extractor/blob/master/holmes_extractor/tests/common): language-independent tests
+- [en](https://github.com/msg-systems/holmes-extractor/blob/master/tests/en): tests relating to English
+- [de](https://github.com/msg-systems/holmes-extractor/blob/master/tests/de): tests relating to German
+- [common](https://github.com/msg-systems/holmes-extractor/blob/master/tests/common): language-independent tests
#### 8.3 Areas for further development
@@ -2220,11 +1997,8 @@ reduce this problem, the tests are distributed across three subdirectories, so t
##### 8.3.1 Additional languages
New languages can be added to Holmes by subclassing the
-`SemanticAnalyzer` class as explained [here](#how-it-works-structural-matching). Because [some of
-the linguistic features](https://spacy.io/api/annotation) returned by
-spaCy are the same for all languages except English and German, the
-additional effort required to add a *fourth* language may well be less
-than the additional effort required to add a third language.
+`SemanticAnalyzer` and `SemanticMatchingHelper` classes as explained
+[here](#how-it-works-structural-matching).
##### 8.3.2 Use of machine learning to improve matching
@@ -2245,7 +2019,7 @@ make them more compliant with data protection laws.
##### 8.3.4 Improve the performance of supervised document classification training
-As long as [embedding-based matching](#embedding-based-matching) is not active, the second step of the
+In cases where [embedding-based matching](#embedding-based-matching) is not active, the second step of the
[supervised document classification](#how-it-works-supervised-document-classification) procedure repeats
a considerable amount of processing from the first step. Retaining the relevant information from the first
step of the procedure would greatly improve training performance. This has not been attempted up to now
@@ -2273,8 +2047,8 @@ The initial open-source version.
- Upgrade to spaCy 2.1.0 and neuralcoref 4.0.0.
- Addition of new dependency `pobjp` linking parents of prepositions directly with their children.
-- Development of the multiprocessing architecture, which has the
-[MultiprocessingManager](#multiprocessing-manager) object as its facade.
+- Development of the multiprocessing architecture, which has the `MultiprocessingManager` object
+as its facade.
- Complete overhaul of [topic matching](#how-it-works-topic-matching).
- Incorporation of coreference information into Holmes document structures so it no longer needs to be calculated on the fly.
- New literature examples for both languages and the facility to serve them over RESTful HTTP.
@@ -2291,12 +2065,11 @@ same stem.
- Ontology implication rules are now calculated eagerly to improve runtime performance.
- [Ontology-based matching](#ontology-based-matching) now includes special, language-specific rules to handle hyphens within ontology entries.
- Word-match information is now included in all matches including single-word matches.
-- [Word matches](#wordmatch) and dictionaries derived from them now include human-readable explanations designed to be used as tooltips.
+- Word matches and dictionaries derived from them now include human-readable explanations designed to be used as tooltips.
- In [topic matching](#manager-topic-match-function), a penalty is now applied to ontology-based matches as well as to embedding-based matches.
- [Topic matching](#manager-topic-match-function) now includes a filter facility to specify
that only documents whose labels begin with a certain string should be searched.
-- Error handling and reporting have been improved for the
-[MultiprocessingManager](#multiprocessing-manager).
+- Error handling and reporting have been improved for the MultiprocessingManager.
- Numerous minor improvements and bugfixes.
- The [demo website](http://holmes-demo.xt.msg.team/) has been updated to reflect the changes.
@@ -2305,3 +2078,13 @@ that only documents whose labels begin with a certain string should be searched.
- Fixed bug with reverse derived lemmas and subwords (only affects German).
- Removed dead code.
+
+
+##### 8.4.5 Version 3.0.0
+
+- Moved to [coreferee](https://github.com/msg-systems/coreferee) as the source of coreference information, meaning that coreference resolution is now active for German as well as English; all documents can be serialized; and the latest spaCy version can be supported.
+- The corpus frequencies of words are now taken into account when scoring topic matches.
+- Reverse dependencies are now taken into account, so that e.g. *a man dies* can match *the dead man* although the dependencies in the two phrases point in opposite directions.
+- Merged the pre-existing `Manager` and `MultiprocessingManager` classes into a single `Manager` class with a redesigned public interface that uses worker threads for everything except supervised document classification.
+- Added support for [initial question words](#initial-question-word-matching).
+- The [demo website](http://holmes-demo.xt.msg.team/) has been updated to reflect the changes.
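+
+A minimal usage sketch of the consolidated `Manager` API (the model name, document text and
+query below are placeholders; the scripts in the `examples` directory contain complete,
+runnable versions):
+
+```python
+import json
+import holmes_extractor as holmes
+
+# A single Manager now handles parsing, document registration and matching.
+manager = holmes.Manager(model='en_core_web_lg', number_of_workers=2)
+manager.parse_and_register_document(
+    'Richard Hudson took out insurance for the next five years.', 'example document')
+# Topic matching returns JSON-serializable dictionaries describing the most relevant passages.
+print(json.dumps(manager.topic_match_documents_against(
+    'Somebody requires insurance', only_one_result_per_document=True)))
+```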
diff --git a/SHORTREADME.md b/SHORTREADME.md
index aeb828d..364c94d 100644
--- a/SHORTREADME.md
+++ b/SHORTREADME.md
@@ -1,41 +1,43 @@
-**Holmes** is a Python 3 library (tested with version 3.7.7) that supports a number of
-use cases involving information extraction from English and German texts. In all use cases, the information extraction
-is based on analysing the semantic relationships expressed by the component parts of each sentence:
+**Holmes** is a Python 3 library (tested with version 3.9.5) running on top of
+[spaCy](https://spacy.io/) (tested with version 3.1.2) that supports a number of use cases
+involving information extraction from English and German texts. In all use cases, the information
+extraction is based on analysing the semantic relationships expressed by the component parts of
+each sentence:
-- In the [chatbot](https://github.com/msg-systems/holmes-extractor/#getting-started) use case, the system is configured using one or more **search phrases**.
+- In the [chatbot](https://github.com/msg-systems/holmes-extractor#getting-started) use case, the system is configured using one or more **search phrases**.
Holmes then looks for structures whose meanings correspond to those of these search phrases within
a searched **document**, which in this case corresponds to an individual snippet of text or speech
entered by the end user. Within a match, each word with its own meaning (i.e. that does not merely fulfil a grammatical function) in the search phrase
corresponds to one or more such words in the document. Both the fact that a search phrase was matched and any structured information the search phrase extracts can be used to drive the chatbot.
-- The [structural extraction](https://github.com/msg-systems/holmes-extractor/#structural-extraction) use case uses exactly the same
-[structural matching](https://github.com/msg-systems/holmes-extractor/#how-it-works-structural-matching) technology as the chatbot use
+- The [structural extraction](https://github.com/msg-systems/holmes-extractor#structural-extraction) use case uses exactly the same
+[structural matching](https://github.com/msg-systems/holmes-extractor#how-it-works-structural-matching) technology as the chatbot use
case, but searching takes place with respect to a pre-existing document or documents that are typically much
-longer than the snippets analysed in the chatbot use case, and the aim to extract and store structured information. For example, a set of business articles could be searched to find all the places where one company is said to be planning to
+longer than the snippets analysed in the chatbot use case, and the aim is to extract and store structured information. For example, a set of business articles could be searched to find all the places where one company is said to be planning to
take over a second company. The identities of the companies concerned could then be stored in a database.
-- The [topic matching](https://github.com/msg-systems/holmes-extractor/#topic-matching) use case aims to find passages in a document or documents whose meaning
+- The [topic matching](https://github.com/msg-systems/holmes-extractor#topic-matching) use case aims to find passages in a document or documents whose meaning
is close to that of another document, which takes on the role of the **query document**, or to that of a **query phrase** entered ad-hoc by the user. Holmes extracts a number of small **phraselets** from the query phrase or
-query document, matches the documents being searched against each phraselet, and conflates the results to find the
-most relevant passages within the documents. Because there is no strict requirement that every word with its own
-meaning in the query document match a specific word or words in the searched documents, more matches are found
+query document, matches the documents being searched against each phraselet, and conflates the results to find
+the most relevant passages within the documents. Because there is no strict requirement that every
+word with its own meaning in the query document match a specific word or words in the searched documents, more matches are found
than in the structural extraction use case, but the matches do not contain structured information that can be
used in subsequent processing. The topic matching use case is demonstrated by [a website allowing searches within
the Harry Potter corpus (for English) and around 350 traditional stories (for German)](http://holmes-demo.xt.msg.team/).
-- The [supervised document classification](https://github.com/msg-systems/holmes-extractor/#supervised-document-classification) use case uses training data to
+- The [supervised document classification](https://github.com/msg-systems/holmes-extractor#supervised-document-classification) use case uses training data to
learn a classifier that assigns one or more **classification labels** to new documents based on what they are about.
It classifies a new document by matching it against phraselets that were extracted from the training documents in the
same way that phraselets are extracted from the query document in the topic matching use case. The technique is
inspired by bag-of-words-based classification algorithms that use n-grams, but aims to derive n-grams whose component
words are related semantically rather than ones that just happen to be neighbours in the surface representation of a language.
-In all four use cases, the **individual words** are matched using a [number of strategies](https://github.com/msg-systems/holmes-extractor/#word-level-matching-strategies).
+In all four use cases, the **individual words** are matched using a [number of strategies](https://github.com/msg-systems/holmes-extractor#word-level-matching-strategies).
To work out whether two grammatical structures that contain individually matching words correspond logically and
constitute a match, Holmes transforms the syntactic parse information provided by the [spaCy](https://spacy.io/) library
into semantic structures that allow texts to be compared using predicate logic. As a user of Holmes, you do not need to
understand the intricacies of how this works, although there are some
-[important tips](https://github.com/msg-systems/holmes-extractor/#writing-effective-search-phrases) around writing effective search phrases for the chatbot and
+[important tips](https://github.com/msg-systems/holmes-extractor#writing-effective-search-phrases) around writing effective search phrases for the chatbot and
structural extraction use cases that you should try and take on board.
Holmes aims to offer generalist solutions that can be used more or less out of the box with
@@ -43,7 +45,7 @@ relatively little tuning, tweaking or training and that are rapidly applicable t
At its core lies a logical, programmed, rule-based system that describes how syntactic representations in each
language express semantic relationships. Although the supervised document classification use case does incorporate a
neural network and although the spaCy library upon which Holmes builds has itself been pre-trained using machine
-learning, the essentially rule-based nature of Holmes means that the chatbot, structural matching and topic matching use
+learning, the essentially rule-based nature of Holmes means that the chatbot, structural extraction and topic matching use
cases can be put to use out of the box without any training and that the supervised document classification use case
typically requires relatively little training data, which is a great advantage because pre-labelled training data is
not available for many real-world problems.
diff --git a/examples/example_chatbot_DE_insurance.py b/examples/example_chatbot_DE_insurance.py
new file mode 100644
index 0000000..3eed589
--- /dev/null
+++ b/examples/example_chatbot_DE_insurance.py
@@ -0,0 +1,17 @@
+import os
+import holmes_extractor as holmes
+
+if __name__ in ('__main__', 'example_chatbot_DE_insurance'):
+ script_directory = os.path.dirname(os.path.realpath(__file__))
+ ontology = holmes.Ontology(os.sep.join((
+ script_directory, 'example_chatbot_DE_insurance_ontology.owl')))
+ holmes_manager = holmes.Manager(model='de_core_news_lg', ontology=ontology, number_of_workers=2)
+ holmes_manager.register_search_phrase('Jemand benötigt eine Versicherung')
+ holmes_manager.register_search_phrase('Ein ENTITYPER schließt eine Versicherung ab')
+ holmes_manager.register_search_phrase('ENTITYPER benötigt eine Versicherung')
+ holmes_manager.register_search_phrase('Eine Versicherung für einen Zeitraum')
+ holmes_manager.register_search_phrase('Eine Versicherung fängt an')
+ holmes_manager.register_search_phrase('Jemand zahlt voraus')
+
+ holmes_manager.start_chatbot_mode_console()
+ # e.g. 'Richard Hudson und Max Mustermann brauchen eine Krankenversicherung für die nächsten fünf Jahre'
diff --git a/holmes_extractor/examples/example_chatbot_DE_insurance_ontology.owl b/examples/example_chatbot_DE_insurance_ontology.owl
similarity index 100%
rename from holmes_extractor/examples/example_chatbot_DE_insurance_ontology.owl
rename to examples/example_chatbot_DE_insurance_ontology.owl
diff --git a/examples/example_chatbot_EN_insurance.py b/examples/example_chatbot_EN_insurance.py
new file mode 100644
index 0000000..03864a0
--- /dev/null
+++ b/examples/example_chatbot_EN_insurance.py
@@ -0,0 +1,20 @@
+import os
+import holmes_extractor as holmes
+
+if __name__ in ('__main__', 'example_chatbot_EN_insurance'):
+ script_directory = os.path.dirname(os.path.realpath(__file__))
+ ontology = holmes.Ontology(os.sep.join((
+ script_directory, 'example_chatbot_EN_insurance_ontology.owl')))
+ holmes_manager = holmes.Manager(
+ model='en_core_web_lg', ontology=ontology, number_of_workers=2)
+ holmes_manager.register_search_phrase('Somebody requires insurance')
+ holmes_manager.register_search_phrase('An ENTITYPERSON takes out insurance')
+ holmes_manager.register_search_phrase('A company buys payment insurance')
+ holmes_manager.register_search_phrase('An ENTITYPERSON needs insurance')
+ holmes_manager.register_search_phrase('Insurance for a period')
+ holmes_manager.register_search_phrase('An insurance begins')
+ holmes_manager.register_search_phrase('Somebody prepays')
+ holmes_manager.register_search_phrase('Somebody makes an insurance payment')
+
+ holmes_manager.start_chatbot_mode_console()
+ # e.g. 'Richard Hudson and John Doe require health insurance for the next five years'
diff --git a/holmes_extractor/examples/example_chatbot_EN_insurance_ontology.owl b/examples/example_chatbot_EN_insurance_ontology.owl
similarity index 100%
rename from holmes_extractor/examples/example_chatbot_EN_insurance_ontology.owl
rename to examples/example_chatbot_EN_insurance_ontology.owl
diff --git a/holmes_extractor/examples/example_search_DE_law.py b/examples/example_search_DE_law.py
similarity index 64%
rename from holmes_extractor/examples/example_search_DE_law.py
rename to examples/example_search_DE_law.py
index 5e7d69b..186a946 100644
--- a/holmes_extractor/examples/example_search_DE_law.py
+++ b/examples/example_search_DE_law.py
@@ -13,10 +13,11 @@ def download_and_register(url, label):
holmes_manager.parse_and_register_document(soup.get_text(), label)
# Start the Holmes Manager with the German model
-holmes_manager = holmes.Manager(model='de_core_news_md')
-download_and_register('https://www.gesetze-im-internet.de/vvg_2008/BJNR263110007.html', 'VVG_2008')
-download_and_register('https://www.gesetze-im-internet.de/vag_2016/BJNR043410015.html', 'VAG')
-holmes_manager.start_topic_matching_search_mode_console()
+if __name__ in ('__main__', 'example_search_DE_law'):
+ holmes_manager = holmes.Manager(model='de_core_news_lg', number_of_workers=2)
+ download_and_register('https://www.gesetze-im-internet.de/vvg_2008/BJNR263110007.html', 'VVG_2008')
+ download_and_register('https://www.gesetze-im-internet.de/vag_2016/BJNR043410015.html', 'VAG')
+ holmes_manager.start_topic_matching_search_mode_console(initial_question_word_embedding_match_threshold=0.7)
# Example queries:
#
diff --git a/holmes_extractor/examples/example_search_DE_literature.py b/examples/example_search_DE_literature.py
similarity index 68%
rename from holmes_extractor/examples/example_search_DE_literature.py
rename to examples/example_search_DE_literature.py
index 96a8307..a8c6943 100644
--- a/holmes_extractor/examples/example_search_DE_literature.py
+++ b/examples/example_search_DE_literature.py
@@ -11,19 +11,18 @@
HOLMES_EXTENSION = 'hdc'
flag_filename = os.sep.join((working_directory, 'STORY_PARSING_COMPLETE'))
- print('Initializing Holmes...')
+ print('Initializing Holmes (this may take some time) ...')
# Start the Holmes manager with the German model
- holmes_manager = holmes.MultiprocessingManager(
- model='de_core_news_md', overall_similarity_threshold=0.85, number_of_workers=4)
- # set number_of_workers to prevent memory exhaustion / swapping; it should never be more
- # than the number of cores on the machine
+ holmes_manager = holmes.Manager(
+ model='de_core_news_lg')
- def process_documents_from_front_page(
- manager, front_page_uri, front_page_label):
+ def process_documents_from_front_page(front_page_uri, front_page_label):
""" Download and save all the stories from a front page."""
front_page = urllib.request.urlopen(front_page_uri)
front_page_soup = BeautifulSoup(front_page, 'html.parser')
+ document_texts = []
+ labels = []
# For each story ...
for anchor in front_page_soup.find_all('a'):
if not anchor['href'].startswith('/') and not anchor['href'].startswith('https'):
@@ -44,15 +43,16 @@ def process_documents_from_front_page(
this_document_text = ' '.join(this_document_text.split())
# Create a document label from the front page label and the story name
this_document_label = ' - '.join((front_page_label, anchor.contents[0]))
- # Parse the document
- print('Parsing', this_document_label)
- manager.parse_and_register_document(this_document_text, this_document_label)
- # Save the document
- print('Saving', this_document_label)
- output_filename = os.sep.join((working_directory, this_document_label))
- output_filename = '.'.join((output_filename, HOLMES_EXTENSION))
- with open(output_filename, "w") as file:
- file.write(manager.serialize_document(this_document_label))
+ document_texts.append(this_document_text)
+ labels.append(this_document_label)
+ parsed_documents = holmes_manager.nlp.pipe(document_texts)
+ for index, parsed_document in enumerate(parsed_documents):
+ label = labels[index]
+ print('Saving', label)
+ output_filename = os.sep.join((working_directory, label))
+ output_filename = '.'.join((output_filename, HOLMES_EXTENSION))
+ with open(output_filename, "wb") as file:
+ file.write(parsed_document.to_bytes())
def load_documents_from_working_directory():
serialized_documents = {}
@@ -61,31 +61,31 @@ def load_documents_from_working_directory():
print('Loading', file)
label = file[:-4]
long_filename = os.sep.join((working_directory, file))
- with open(long_filename, "r") as file:
+ with open(long_filename, "rb") as file:
contents = file.read()
serialized_documents[label] = contents
- holmes_manager.deserialize_and_register_documents(serialized_documents)
+ print('Indexing documents (this may take some time) ...')
+ holmes_manager.register_serialized_documents(serialized_documents)
if os.path.exists(working_directory):
if not os.path.isdir(working_directory):
- raise RuntimeError(' '.join((working_directory), 'must be a directory'))
+ raise RuntimeError(' '.join((working_directory, 'must be a directory')))
else:
os.mkdir(working_directory)
if os.path.isfile(flag_filename):
load_documents_from_working_directory()
else:
- normal_holmes_manager = holmes.Manager(model='de_core_news_md')
process_documents_from_front_page(
- normal_holmes_manager, "https://maerchen.com/grimm/", 'Gebrüder Grimm')
+ "https://maerchen.com/grimm/", 'Gebrüder Grimm')
process_documents_from_front_page(
- normal_holmes_manager, "https://maerchen.com/grimm2/", 'Gebrüder Grimm')
+ "https://maerchen.com/grimm2/", 'Gebrüder Grimm')
process_documents_from_front_page(
- normal_holmes_manager, "https://maerchen.com/andersen/", 'Hans Christian Andersen')
+ "https://maerchen.com/andersen/", 'Hans Christian Andersen')
process_documents_from_front_page(
- normal_holmes_manager, "https://maerchen.com/bechstein/", 'Ludwig Bechstein')
+ "https://maerchen.com/bechstein/", 'Ludwig Bechstein')
process_documents_from_front_page(
- normal_holmes_manager, "https://maerchen.com/wolf/", 'Johann Wilhelm Wolf')
+ "https://maerchen.com/wolf/", 'Johann Wilhelm Wolf')
# Generate flag file to indicate files can be reloaded on next run
open(flag_filename, 'a').close()
load_documents_from_working_directory()
@@ -101,8 +101,8 @@ def load_documents_from_working_directory():
class RestHandler():
def on_get(self, req, resp):
- resp.body = \
- json.dumps(holmes_manager.topic_match_documents_returning_dictionaries_against(
+ resp.text = \
+ json.dumps(holmes_manager.topic_match_documents_against(
req.params['entry'][0:200], only_one_result_per_document=True))
resp.cache_control = ["s-maxage=31536000"]
diff --git a/examples/example_search_EN_literature.py b/examples/example_search_EN_literature.py
new file mode 100644
index 0000000..15218a6
--- /dev/null
+++ b/examples/example_search_EN_literature.py
@@ -0,0 +1,117 @@
+import os
+import re
+import json
+import urllib.request
+from bs4 import BeautifulSoup
+import holmes_extractor as holmes
+import falcon
+
+if __name__ in ('__main__', 'example_search_EN_literature'):
+
+ working_directory = # REPLACE WITH PATH TO WORKING DIRECTORY IN SINGLE OR DOUBLE QUOTES
+ HOLMES_EXTENSION = 'hdc'
+ flag_filename = os.sep.join((working_directory, 'STORY_PARSING_COMPLETE'))
+ print('Initializing Holmes (this may take some time) ...')
+
+ script_directory = os.path.dirname(os.path.realpath(__file__))
+ ontology = holmes.Ontology(os.sep.join((
+ script_directory, 'example_search_EN_literature_ontology.owl')))
+
+ # Start the Holmes manager with the English model
+ holmes_manager = holmes.Manager(
+ model='en_core_web_trf', ontology=ontology)
+
+ def extract_chapters_from_book(book_uri, title):
+ """ Download and save the chapters from a book."""
+
+ print()
+ print(title)
+ print()
+ book = urllib.request.urlopen(book_uri).read().decode()
+ book = re.sub("\\nPage \|.+?Rowling \\n", "", book)
+ book = re.sub("\\nP a g e \|.+?Rowling \\n", "", book)
+ book = re.sub("\\nPage \|.+?\\n", "", book)
+ book = book.replace("Harry Potter and the Half Blood Prince - J.K. Rowling", "")
+ book = book.replace("Harry Potter and the Goblet of Fire - J.K. Rowling", "")
+ book = book.replace("Harry Potter and the Deathly Hallows - J.K. Rowling", "")
+ book = book[1:]
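+        # Chapter headings are detected as runs of upper-case text preceded by blank lines;
+        # the negative look-aheads below exclude upper-case passages within the text that are
+        # not genuine chapter headings.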
+ chapter_headings = [heading for heading in re.finditer("(?<=((\\n\\n\\n\\n)|(\* \\n\\n)))((?!.*(WEASLEY WILL MAKE SURE)|(DO NOT OPEN THE PARCEL)|(HEADMISTRESS OF HOGWARTS))[A-Z][A-Z\-’., ]+)(\\n{1,2}((?!.*(WHO\-MUST))[A-Z\-’., ]+))?(?=(\\n\\n([^\\n]|(\\n\\n((“Harry!”)|(Harry’s)|(Ron’s)|(“Hagrid)|(Three o’clock))))))", book)]
+ chapter_counter = 1
+ labels = []
+ chapter_texts = []
+ chapter_dict = {}
+ for chapter_heading in chapter_headings:
+ label = ''.join((
+ 'Book ', title, ' Ch ', str(chapter_counter), " ‘",
+ chapter_heading.group().replace('\n', '').strip(), "’"))
+ labels.append(label)
+ if chapter_counter == len(chapter_headings): # last chapter
+ content = book[chapter_heading.end():]
+ else:
+ content = book[chapter_heading.end():chapter_headings[chapter_counter].start()]
+ content = content.replace('\n', '')
+ if content.endswith('& '):
+ content = content[:-2]
+ chapter_texts.append(content)
+ print('Extracted', label)
+ chapter_counter += 1
+ parsed_chapters = holmes_manager.nlp.pipe(chapter_texts)
+ for index, parsed_chapter in enumerate(parsed_chapters):
+ label = labels[index]
+ print('Saving', label)
+ output_filename = os.sep.join((working_directory, label))
+ output_filename = '.'.join((output_filename, HOLMES_EXTENSION))
+ with open(output_filename, "wb") as file:
+ file.write(parsed_chapter.to_bytes())
+
+ def load_documents_from_working_directory():
+ serialized_documents = {}
+ for file in os.listdir(working_directory):
+ if file.endswith(HOLMES_EXTENSION):
+ print('Loading', file)
+ label = file[:-4]
+ long_filename = os.sep.join((working_directory, file))
+ with open(long_filename, "rb") as file:
+ contents = file.read()
+ serialized_documents[label] = contents
+ print('Indexing documents (this may take some time) ...')
+ holmes_manager.register_serialized_documents(serialized_documents)
+
+ if os.path.exists(working_directory):
+ if not os.path.isdir(working_directory):
+ raise RuntimeError(' '.join((working_directory, 'must be a directory')))
+ else:
+ os.mkdir(working_directory)
+
+ if os.path.isfile(flag_filename):
+ load_documents_from_working_directory()
+ else:
+ extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt", "1 ‘The Philosopher\'s Stone’")
+ extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%202%20-%20The%20Chamber%20of%20Secrets.txt", "2 ‘The Chamber of Secrets’")
+ extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%203%20-%20The%20Prisoner%20of%20Azkaban.txt", "3 ‘The Prisoner of Azkaban’")
+ extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%204%20-%20The%20Goblet%20of%20Fire.txt", "4 ‘The Goblet of Fire’")
+ extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%205%20-%20The%20Order%20of%20the%20Phoenix.txt", "5 ‘The Order of the Phoenix’")
+ extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%206%20-%20The%20Half%20Blood%20Prince.txt", "6 ‘The Half Blood Prince’")
+ extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%207%20-%20The%20Deathly%20Hallows.txt", "7 ‘The Deathly Hallows’")
+ # Generate flag file to indicate files can be reloaded on next run
+ open(flag_filename, 'a').close()
+ load_documents_from_working_directory()
+
+    # Uncomment the following line to activate the interactive console
+ #holmes_manager.start_topic_matching_search_mode_console(only_one_result_per_document=True)
+
+    # The following code starts a RESTful HTTP service to perform topic searches. It is deployed
+    # as a WSGI application. An example of how to start it - issued from the directory that
+ # contains the script - is
+
+    # waitress-serve example_search_EN_literature:application
+
+ class RestHandler():
+ def on_get(self, req, resp):
+ resp.text = \
+ json.dumps(holmes_manager.topic_match_documents_against(
+ req.params['entry'][0:200]))
+ resp.cache_control = ["s-maxage=31536000"]
+
+ application = falcon.App()
+ application.add_route('/english', RestHandler())
diff --git a/holmes_extractor/examples/example_search_EN_literature_ontology.owl b/examples/example_search_EN_literature_ontology.owl
similarity index 100%
rename from holmes_extractor/examples/example_search_EN_literature_ontology.owl
rename to examples/example_search_EN_literature_ontology.owl
diff --git a/examples/example_supervised_topic_model_EN.py b/examples/example_supervised_topic_model_EN.py
new file mode 100644
index 0000000..42baf0a
--- /dev/null
+++ b/examples/example_supervised_topic_model_EN.py
@@ -0,0 +1,93 @@
+import os
+import shutil
+import urllib.request
+import zipfile
+import holmes_extractor as holmes
+
+working_directory = # REPLACE WITH PATH TO WORKING DIRECTORY IN SINGLE OR DOUBLE QUOTES
+
+if __name__ in ('__main__', 'example_supervised_topic_model_EN'):
+ def is_training_data(document_number):
+ # We use any documents with numbers ending in 8,9,0 for test and all other documents for
+ # training.
+ return document_number[-1:] not in ('8', '9', '0')
+
+ def get_document_filename_info(filename):
+ # e.g. 'bbc/business/001.txt'
+ category = filename.split('/')[1]
+ document_number = filename.split('/')[2].split('.')[0]
+ return category, document_number
+
+ def evaluate_classifier(zip_filename, classifier):
+ correct_classification_counter = wrong_classification_counter = \
+ no_classification_counter = correct_as_additional_classification_counter = 0
+ with zipfile.ZipFile(zip_filename) as bbc_zipfile:
+ for filename in (
+ filename for filename in bbc_zipfile.namelist() if
+ filename.lower().endswith('.txt') and not filename.endswith('README.TXT')):
+ category, document_number = get_document_filename_info(filename)
+ if not is_training_data(document_number):
+ with bbc_zipfile.open(filename, 'r') as test_doc:
+ test_contents = str(test_doc.read())
+ test_contents = test_contents.replace('\n', ' ').replace('\r', ' ')
+ suggested_categories = classifier.parse_and_classify(test_contents)
+ if len(suggested_categories) == 0:
+ no_classification_counter += 1
+ elif suggested_categories[0] == category:
+ correct_classification_counter += 1
+ elif category in suggested_categories:
+ correct_as_additional_classification_counter += 1
+ else:
+ wrong_classification_counter += 1
+ print(''.join((
+ filename, ': actual category ', category,
+ '; suggested categories ', str(suggested_categories))))
+ print()
+ print('Totals:')
+ print(correct_classification_counter, 'correct classifications;')
+ print(no_classification_counter, 'unclassified documents;')
+ print(wrong_classification_counter, 'incorrect classifications;')
+ print(
+ correct_as_additional_classification_counter, 'incorrect classifications where the '
+ 'correct classification was returned as an additional classification.')
+
+ def train_model(working_directory, zip_filename):
+ training_basis = holmes_manager.get_supervised_topic_training_basis()
+ with zipfile.ZipFile(zip_filename) as bbc_zipfile:
+ for filename in (
+ filename for filename in bbc_zipfile.namelist() if
+ filename.lower().endswith('.txt') and not filename.endswith('README.TXT')):
+ category, document_number = get_document_filename_info(filename)
+ if is_training_data(document_number):
+ with bbc_zipfile.open(filename, 'r') as training_doc:
+ training_contents = str(training_doc.read())
+ training_contents = training_contents.replace('\n', ' ').replace('\r', ' ')
+ training_basis.parse_and_register_training_document(
+ training_contents, category, filename)
+ training_basis.prepare()
+ classifier = training_basis.train().classifier()
+ output_filename = os.sep.join((working_directory, 'model.json'))
+ with open(output_filename, "w") as file:
+ file.write(classifier.serialize_model())
+ evaluate_classifier(zip_filename, classifier)
+ holmes_manager = holmes.Manager('en_core_web_lg', number_of_workers=1)
+
+ if os.path.exists(working_directory):
+ if not os.path.isdir(working_directory):
+ raise RuntimeError(' '.join((working_directory, 'must be a directory')))
+ else:
+ os.mkdir(working_directory)
+ zip_filename = (os.sep.join((working_directory, 'bbc-fulltext.zip')))
+ if not os.path.exists(zip_filename):
+ url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'
+ with urllib.request.urlopen(url) as response, open(zip_filename, 'wb') as out_file:
+ shutil.copyfileobj(response, out_file)
+ model_filename = os.sep.join((working_directory, 'model.json'))
+ if not os.path.exists(model_filename):
+ train_model(working_directory, zip_filename)
+ else:
+ print('Reloading existing trained model. '
+ 'Delete model.json from working directory to repeat training.')
+ with open(model_filename) as model_file:
+ classifier = holmes_manager.deserialize_supervised_topic_classifier(model_file.read())
+ evaluate_classifier(zip_filename, classifier)
diff --git a/holmes_extractor/__init__.py b/holmes_extractor/__init__.py
index 1715597..77130a1 100644
--- a/holmes_extractor/__init__.py
+++ b/holmes_extractor/__init__.py
@@ -1,5 +1,2 @@
-import logging
-logging.getLogger("rdflib").setLevel(logging.WARNING) # avoid INFO console message on startup
from holmes_extractor.manager import Manager
-from holmes_extractor.manager import MultiprocessingManager
from holmes_extractor.ontology import Ontology
diff --git a/holmes_extractor/classification.py b/holmes_extractor/classification.py
new file mode 100644
index 0000000..8712e97
--- /dev/null
+++ b/holmes_extractor/classification.py
@@ -0,0 +1,625 @@
+import uuid
+import statistics
+import jsonpickle
+from scipy.sparse import dok_matrix
+from sklearn.neural_network import MLPClassifier
+from .errors import WrongModelDeserializationError, FewerThanTwoClassificationsError, \
+ DuplicateDocumentError, NoPhraseletsAfterFilteringError, \
+ IncompatibleAnalyzeDerivationalMorphologyDeserializationError
+
+class SupervisedTopicTrainingUtils:
+
+ def __init__(self, overlap_memory_size, oneshot):
+ self.overlap_memory_size = overlap_memory_size
+ self.oneshot = oneshot
+
+ def get_labels_to_classification_frequencies_dict(
+ self, *, matches, labels_to_classifications_dict):
+ """ Builds a dictionary from search phrase (phraselet) labels to classification
+ frequencies. Depending on the training phase, which is signalled by the parameters, the
+ dictionary tracks either raw frequencies for each search phrase label or points to a
+ second dictionary from classification labels to frequencies.
+
+ Parameters:
+
+ matches -- the structural matches from which to build the dictionary
+ labels_to_classifications_dict -- a dictionary from document labels to document
+ classifications, or 'None' if the target dictionary should contain raw frequencies.
+ """
+ def increment(search_phrase_label, document_label):
+ if labels_to_classifications_dict is not None:
+ if search_phrase_label not in labels_to_frequencies_dict:
+ classification_frequency_dict = {}
+ labels_to_frequencies_dict[search_phrase_label] = classification_frequency_dict
+ else:
+ classification_frequency_dict = labels_to_frequencies_dict[search_phrase_label]
+ classification = labels_to_classifications_dict[document_label]
+ if classification in classification_frequency_dict:
+ classification_frequency_dict[classification] += 1
+ else:
+ classification_frequency_dict[classification] = 1
+ else:
+ if search_phrase_label not in labels_to_frequencies_dict:
+ labels_to_frequencies_dict[search_phrase_label] = 1
+ else:
+ labels_to_frequencies_dict[search_phrase_label] += 1
+
+ def relation_match_involves_whole_word_containing_subwords(match):
+ # Where there are subwords, we suppress relation matches with the
+ # entire word. The same rule is not applied to single-word matches because
+ # it still makes sense to track words with more than three subwords.
+ return len(match.word_matches) > 1 and \
+ len(
+ [
+ word_match for word_match in match.word_matches if
+ len(word_match.document_token._.holmes.subwords) > 0 and
+ word_match.document_subword is None]
+ ) > 0
+
+ labels_to_frequencies_dict = {}
+ matches = [
+ match for match in matches if not
+ relation_match_involves_whole_word_containing_subwords(match)]
+ matches = sorted(
+ matches, key=lambda match: (
+ match.document_label, match.index_within_document,
+ match.get_subword_index_for_sorting()))
+ for index, match in enumerate(matches):
+ if self.oneshot:
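+                # In oneshot mode each phraselet label is counted at most once per document,
+                # so track the labels already recorded for the current document.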
+ if ('this_document_label' not in locals()) or \
+ this_document_label != match.document_label:
+ this_document_label = match.document_label
+ search_phrases_added_for_this_document = set()
+ if match.search_phrase_label not in search_phrases_added_for_this_document:
+ increment(match.search_phrase_label, match.document_label)
+ search_phrases_added_for_this_document.add(match.search_phrase_label)
+ else:
+ increment(match.search_phrase_label, match.document_label)
+ if not match.from_single_word_phraselet:
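+                # For relation phraselets, look back over recent relation matches within the
+                # same document (the window is controlled by overlap_memory_size); where two
+                # relation matches share a document word, also record a combined label so the
+                # overlapping relations act as a single feature.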
+ previous_match_index = index
+ number_of_analyzed_matches_counter = 0
+ while previous_match_index > 0 and number_of_analyzed_matches_counter \
+ <= self.overlap_memory_size:
+ previous_match_index -= 1
+ previous_match = matches[previous_match_index]
+ if previous_match.document_label != match.document_label:
+ break
+ if previous_match.from_single_word_phraselet:
+ continue
+ if previous_match.search_phrase_label == match.search_phrase_label:
+ continue # otherwise coreference resolution leads to phrases being
+ # combined with themselves
+ number_of_analyzed_matches_counter += 1
+ previous_word_match_doc_indexes = [
+ word_match.get_document_index() for word_match in
+ previous_match.word_matches]
+ for word_match in match.word_matches:
+ if word_match.get_document_index() in previous_word_match_doc_indexes:
+ # the same word is involved in both matches, so combine them
+ # into a new label
+ label_parts = sorted((
+ previous_match.search_phrase_label, match.search_phrase_label))
+ combined_label = '/'.join((label_parts[0], label_parts[1]))
+ if self.oneshot:
+ if combined_label not in search_phrases_added_for_this_document:
+ increment(combined_label, match.document_label)
+ search_phrases_added_for_this_document.add(combined_label)
+ else:
+ increment(combined_label, match.document_label)
+ return labels_to_frequencies_dict
+
+ def record_matches(
+ self, *, phraselet_labels_to_search_phrases, semantic_matching_helper,
+ structural_matcher, sorted_label_dict, doc_label, doc, matrix, row_index,
+ overall_similarity_threshold):
+ """ Matches a document against the currently stored phraselets and records the matches
+ in a matrix.
+
+ Parameters:
+
+ phraselet_labels_to_search_phrases -- a dictionary from search phrase (phraselet)
+ labels to search phrase objects.
+ semantic_matching_helper -- the semantic matching helper to use.
+ structural_matcher -- the structural matcher to use for comparisons.
+ sorted_label_dict -- a dictionary from search phrase (phraselet) labels to their own
+ alphabetic sorting indexes.
+ doc_label -- the document label, or '' if there is none.
+ doc -- the document to be matched.
+ matrix -- the matrix within which to record the matches.
+ row_index -- the row number within the matrix corresponding to the document.
+ overall_similarity_threshold -- the threshold for embedding-based matching.
+ """
+ document_labels_to_documents = {doc_label: doc}
+ corpus_index_dict = {}
+ semantic_matching_helper.add_to_corpus_index(corpus_index_dict, doc, doc_label)
+ found = False
+ for label, occurrences in \
+ self.get_labels_to_classification_frequencies_dict(
+ matches=structural_matcher.match(
+ document_labels_to_documents=document_labels_to_documents,
+ corpus_index_dict=corpus_index_dict,
+ search_phrases=phraselet_labels_to_search_phrases.values(),
+ match_depending_on_single_words=None,
+ compare_embeddings_on_root_words=False,
+ compare_embeddings_on_non_root_words=True,
+ reverse_matching_corpus_word_positions=None,
+ embedding_reverse_matching_corpus_word_positions=None,
+ process_initial_question_words=False,
+ overall_similarity_threshold=overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=1.0),
+ labels_to_classifications_dict=None
+ ).items():
+ if self.oneshot:
+ occurrences = 1
+ if label in sorted_label_dict: # may not be the case for compound labels
+ label_index = sorted_label_dict[label]
+ matrix[row_index, label_index] = occurrences
+ found = True
+ return found
+
+class SupervisedTopicTrainingBasis:
+ """ Holder object for training documents and their classifications from which one or more
+ 'SupervisedTopicModelTrainer' objects can be derived. This class is *NOT* threadsafe.
+ """
+ def __init__(
+ self, *, linguistic_object_factory, structural_matcher, classification_ontology,
+ overlap_memory_size, oneshot, match_all_words, overall_similarity_threshold, verbose):
+ """ Parameters:
+
+ linguistic_object_factory -- the linguistic object factory to use
+ structural_matcher -- the structural matcher to use.
+ classification_ontology -- an Ontology object incorporating relationships between
+ classification labels.
+ overlap_memory_size -- how many non-word phraselet matches to the left should be
+ checked for words in common with a current match.
+ oneshot -- whether the same word or relationship matched multiple times should be
+ counted once only (value 'True') or multiple times (value 'False')
+ match_all_words -- whether all single words should be taken into account
+ (value 'True') or only single words with noun tags (value 'False')
+ overall_similarity_threshold -- the overall similarity threshold for embedding-based
+ matching. Defaults to *1.0*, which deactivates embedding-based matching.
+        verbose -- if 'True', information about training progress is output to the console.
+ """
+ self.linguistic_object_factory = linguistic_object_factory
+ self.structural_matcher = structural_matcher
+ self.semantic_analyzer = linguistic_object_factory.semantic_analyzer
+ self.semantic_matching_helper = linguistic_object_factory.semantic_matching_helper
+ self.overall_similarity_threshold = overall_similarity_threshold
+ self.classification_ontology = classification_ontology
+ self.utils = SupervisedTopicTrainingUtils(overlap_memory_size, oneshot)
+ self.match_all_words = match_all_words
+ self.verbose = verbose
+
+ self.training_document_labels_to_documents = {}
+ self.corpus_index_dict = {}
+ self.training_documents_labels_to_classifications_dict = {}
+ self.additional_classification_labels = set()
+ self.classification_implication_dict = {}
+ self.labels_to_classification_frequencies = None
+ self.phraselet_labels_to_phraselet_infos = {}
+ self.classifications = None
+
+ def parse_and_register_training_document(self, text, classification, label=None):
+ """ Parses and registers a document to use for training.
+
+ Parameters:
+
+ text -- the document text
+ classification -- the classification label
+ label -- a label with which to identify the document in verbose training output,
+ or 'None' if a random label should be assigned.
+ """
+ self.register_training_document(self.semantic_analyzer.parse(text), classification, label)
+
+ def register_training_document(self, doc, classification, label):
+ """ Registers a pre-parsed document to use for training.
+
+ Parameters:
+
+ doc -- the document
+ classification -- the classification label
+ label -- a label with which to identify the document in verbose training output,
+ or 'None' if a random label should be assigned.
+ """
+ if self.labels_to_classification_frequencies is not None:
+ raise RuntimeError(
+ "register_training_document() may not be called once prepare() has been called")
+ if label is None:
+ label = str(uuid.uuid4())
+ if label in self.training_document_labels_to_documents:
+ raise DuplicateDocumentError(label)
+ if self.verbose:
+ print('Registering document', label)
+ self.training_document_labels_to_documents[label] = doc
+ self.semantic_matching_helper.add_to_corpus_index(self.corpus_index_dict, doc, label)
+ self.linguistic_object_factory.add_phraselets_to_dict(
+ doc,
+ phraselet_labels_to_phraselet_infos=
+ self.phraselet_labels_to_phraselet_infos,
+ replace_with_hypernym_ancestors=True,
+ match_all_words=self.match_all_words,
+ ignore_relation_phraselets=False,
+ include_reverse_only=False,
+ stop_lemmas=self.semantic_matching_helper.\
+ supervised_document_classification_phraselet_stop_lemmas,
+ stop_tags=self.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=None,
+ words_to_corpus_frequencies=None,
+ maximum_corpus_frequency=None,
+ process_initial_question_words=False)
+ self.training_documents_labels_to_classifications_dict[label] = classification
+
+ def register_additional_classification_label(self, label):
+ """ Register an additional classification label which no training document has explicitly
+ but that should be assigned to documents whose explicit labels are related to the
+ additional classification label via the classification ontology.
+ """
+ if self.labels_to_classification_frequencies is not None:
+ raise RuntimeError(
+ "register_additional_classification_label() may not be called once prepare() has "\
+                "been called")
+ if self.classification_ontology is not None and \
+ self.classification_ontology.contains(label):
+ self.additional_classification_labels.add(label)
+
+ def prepare(self):
+ """ Matches the phraselets derived from the training documents against the training
+ documents to generate frequencies that also include combined labels, and examines the
+ explicit classification labels, the additional classification labels and the
+ classification ontology to derive classification implications.
+
+ Once this method has been called, the instance no longer accepts new training documents
+ or additional classification labels.
+ """
+ if self.labels_to_classification_frequencies is not None:
+ raise RuntimeError(
+ "prepare() may only be called once")
+ if self.verbose:
+ print('Matching documents against all phraselets')
+ search_phrases = self.linguistic_object_factory.create_search_phrases_from_phraselet_infos(
+ self.phraselet_labels_to_phraselet_infos.values()).values()
+ self.labels_to_classification_frequencies = self.utils.\
+ get_labels_to_classification_frequencies_dict(
+ matches=self.structural_matcher.match(
+ document_labels_to_documents=self.training_document_labels_to_documents,
+ corpus_index_dict=self.corpus_index_dict,
+ search_phrases=search_phrases,
+ match_depending_on_single_words=None,
+ compare_embeddings_on_root_words=False,
+ compare_embeddings_on_non_root_words=True,
+ reverse_matching_corpus_word_positions=None,
+ embedding_reverse_matching_corpus_word_positions=None,
+ process_initial_question_words=False,
+ overall_similarity_threshold=self.overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=1.0),
+ labels_to_classifications_dict=
+ self.training_documents_labels_to_classifications_dict)
+ self.classifications = sorted(set(
+ self.training_documents_labels_to_classifications_dict.values()
+ ).union(self.additional_classification_labels))
+ if len(self.classifications) < 2:
+ raise FewerThanTwoClassificationsError(len(self.classifications))
+ if self.classification_ontology is not None:
+ for parent in self.classifications:
+ for child in self.classifications:
+ if self.classification_ontology.matches(parent, child):
+ if child in self.classification_implication_dict.keys():
+ self.classification_implication_dict[child].append(parent)
+ else:
+ self.classification_implication_dict[child] = [parent]
+
+ def train(
+ self, *, minimum_occurrences=4, cv_threshold=1.0, mlp_activation='relu',
+ mlp_solver='adam', mlp_learning_rate='constant', mlp_learning_rate_init=0.001,
+ mlp_max_iter=200, mlp_shuffle=True, mlp_random_state=42, hidden_layer_sizes=None):
+ """ Trains a model based on the prepared state.
+
+ Parameters:
+
+ minimum_occurrences -- the minimum number of times a word or relationship has to
+ occur in the context of at least one single classification for the phraselet
+ to be accepted into the final model.
+ cv_threshold -- the minimum coefficient of variation a word or relationship has
+ to occur with respect to explicit classification labels for the phraselet to be
+ accepted into the final model.
+ mlp_* -- see https://scikit-learn.org/stable/modules/generated/
+ sklearn.neural_network.MLPClassifier.html.
+ hidden_layer_sizes -- a tuple containing the number of neurons in each hidden layer, or
+ 'None' if the topology should be determined automatically.
+ """
+
+ if self.labels_to_classification_frequencies is None:
+ raise RuntimeError("train() may only be called after prepare() has been called")
+ return SupervisedTopicModelTrainer(
+ training_basis=self,
+ linguistic_object_factory=self.linguistic_object_factory,
+ structural_matcher=self.structural_matcher,
+ labels_to_classification_frequencies=self.labels_to_classification_frequencies,
+ phraselet_infos=self.phraselet_labels_to_phraselet_infos.values(),
+ minimum_occurrences=minimum_occurrences,
+ cv_threshold=cv_threshold,
+ mlp_activation=mlp_activation,
+ mlp_solver=mlp_solver,
+ mlp_learning_rate=mlp_learning_rate,
+ mlp_learning_rate_init=mlp_learning_rate_init,
+ mlp_max_iter=mlp_max_iter,
+ mlp_shuffle=mlp_shuffle,
+ mlp_random_state=mlp_random_state,
+ hidden_layer_sizes=hidden_layer_sizes,
+ utils=self.utils
+ )
+
+class SupervisedTopicModelTrainer:
+ """ Worker object used to train and generate models. This class is *NOT* threadsafe."""
+
+ def __init__(
+ self, *, training_basis, linguistic_object_factory, structural_matcher,
+ labels_to_classification_frequencies, phraselet_infos, minimum_occurrences,
+ cv_threshold, mlp_activation, mlp_solver, mlp_learning_rate, mlp_learning_rate_init,
+ mlp_max_iter, mlp_shuffle, mlp_random_state, hidden_layer_sizes, utils):
+
+ self.utils = utils
+ self.semantic_analyzer = linguistic_object_factory.semantic_analyzer
+ self.linguistic_object_factory = linguistic_object_factory
+ self.semantic_matching_helper = linguistic_object_factory.semantic_matching_helper
+ self.structural_matcher = structural_matcher
+ self.training_basis = training_basis
+ self.minimum_occurrences = minimum_occurrences
+ self.cv_threshold = cv_threshold
+ self.labels_to_classification_frequencies, self.phraselet_infos = self.filter(
+ labels_to_classification_frequencies, phraselet_infos)
+
+ if len(self.phraselet_infos) == 0:
+ raise NoPhraseletsAfterFilteringError(
+ ''.join((
+ 'minimum_occurrences: ', str(minimum_occurrences), '; cv_threshold: ',
+ str(cv_threshold)))
+ )
+
+ phraselet_labels_to_search_phrases = \
+ self.linguistic_object_factory.create_search_phrases_from_phraselet_infos(
+ self.phraselet_infos)
+ self.sorted_label_dict = {}
+ for index, label in enumerate(sorted(self.labels_to_classification_frequencies.keys())):
+ self.sorted_label_dict[label] = index
+ self.input_matrix = dok_matrix((
+ len(self.training_basis.training_document_labels_to_documents),
+ len(self.sorted_label_dict)))
+ self.output_matrix = dok_matrix((
+ len(self.training_basis.training_document_labels_to_documents),
+ len(self.training_basis.classifications)))
+
+ if self.training_basis.verbose:
+ print('Matching documents against filtered phraselets')
+ for index, document_label in enumerate(
+ sorted(self.training_basis.training_document_labels_to_documents.keys())):
+ self.utils.record_matches(
+ semantic_matching_helper=self.semantic_matching_helper,
+ structural_matcher=self.structural_matcher,
+ phraselet_labels_to_search_phrases=phraselet_labels_to_search_phrases,
+ sorted_label_dict=self.sorted_label_dict,
+ doc_label=document_label,
+ doc=self.training_basis.training_document_labels_to_documents[document_label].doc,
+ matrix=self.input_matrix,
+ row_index=index,
+ overall_similarity_threshold=self.training_basis.overall_similarity_threshold)
+ self.record_classifications_for_training(document_label, index)
+ self._hidden_layer_sizes = hidden_layer_sizes
+ if self._hidden_layer_sizes is None:
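+            # Default topology: the first hidden layer has the same width as the input layer,
+            # and the second and third layers step evenly from there towards the number of
+            # output classifications.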
+ start = len(self.sorted_label_dict)
+ step = (len(self.training_basis.classifications) - len(self.sorted_label_dict)) / 3
+ self._hidden_layer_sizes = (start, int(start+step), int(start+(2*step)))
+ if self.training_basis.verbose:
+ print('Hidden layer sizes:', self._hidden_layer_sizes)
+ self._mlp = MLPClassifier(
+ activation=mlp_activation,
+ solver=mlp_solver,
+ hidden_layer_sizes=self._hidden_layer_sizes,
+ learning_rate=mlp_learning_rate,
+ learning_rate_init=mlp_learning_rate_init,
+ max_iter=mlp_max_iter,
+ shuffle=mlp_shuffle,
+ verbose=self.training_basis.verbose,
+ random_state=mlp_random_state)
+ self._mlp.fit(self.input_matrix, self.output_matrix)
+ if self.training_basis.verbose and self._mlp.n_iter_ < mlp_max_iter:
+ print('MLP neural network converged after', self._mlp.n_iter_, 'iterations.')
+
+ def filter(self, labels_to_classification_frequencies, phraselet_infos):
+ """ Filters the phraselets in memory based on minimum_occurrences and cv_threshold. """
+
+ accepted = 0
+ underminimum_occurrences = 0
+ under_minimum_cv = 0
+ new_labels_to_classification_frequencies = {}
+ for label, classification_frequencies in labels_to_classification_frequencies.items():
+ at_least_minimum = False
+ working_classification_frequencies = classification_frequencies.copy()
+ for classification in working_classification_frequencies:
+ if working_classification_frequencies[classification] >= self.minimum_occurrences:
+ at_least_minimum = True
+ if not at_least_minimum:
+ underminimum_occurrences += 1
+ continue
+ frequency_list = list(working_classification_frequencies.values())
+ # We only want to take explicit classification labels into account, i.e. ignore the
+ # classification ontology.
+ number_of_classification_labels = \
+ len(set(
+ self.training_basis.training_documents_labels_to_classifications_dict.values())
+ )
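+            # Pad with zeros and truncate so that the coefficient of variation is calculated
+            # over exactly one value per explicit classification label, including
+            # classifications in which the phraselet never occurred.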
+ frequency_list.extend([0] * number_of_classification_labels)
+ frequency_list = frequency_list[:number_of_classification_labels]
+ if statistics.pstdev(frequency_list) / statistics.mean(frequency_list) >= \
+ self.cv_threshold:
+ accepted += 1
+ new_labels_to_classification_frequencies[label] = classification_frequencies
+ else:
+ under_minimum_cv += 1
+ if self.training_basis.verbose:
+ print(
+ 'Filtered: accepted', accepted, '; removed minimum occurrences',
+ underminimum_occurrences, '; removed cv threshold',
+ under_minimum_cv)
+ new_phraselet_infos = [
+ phraselet_info for phraselet_info in phraselet_infos if
+ phraselet_info.label in new_labels_to_classification_frequencies.keys()]
+ return new_labels_to_classification_frequencies, new_phraselet_infos
+
+ def record_classifications_for_training(self, document_label, index):
+ classification = self.training_basis.training_documents_labels_to_classifications_dict[
+ document_label]
+ classification_index = self.training_basis.classifications.index(classification)
+ self.output_matrix[index, classification_index] = 1
+ if classification in self.training_basis.classification_implication_dict:
+ for implied_classification in \
+ self.training_basis.classification_implication_dict[classification]:
+ implied_classification_index = self.training_basis.classifications.index(
+ implied_classification)
+ self.output_matrix[index, implied_classification_index] = 1
+
+ def classifier(self):
+ """ Returns a supervised topic classifier which contains no explicit references to the
+ training data and that can be serialized.
+ """
+ self._mlp.verbose = False # we no longer require output once we are using the model
+ # to classify new documents
+ model = SupervisedTopicClassifierModel(
+ semantic_analyzer_model=self.semantic_analyzer.model,
+ structural_matcher_ontology=self.structural_matcher.ontology,
+ phraselet_infos=self.phraselet_infos,
+ mlp=self._mlp,
+ sorted_label_dict=self.sorted_label_dict,
+ classifications=self.training_basis.classifications,
+ overlap_memory_size=self.utils.overlap_memory_size,
+ oneshot=self.utils.oneshot,
+ analyze_derivational_morphology=
+ self.structural_matcher.analyze_derivational_morphology)
+ return SupervisedTopicClassifier(
+ self.semantic_analyzer, self.linguistic_object_factory,
+ self.structural_matcher, model, self.training_basis.overall_similarity_threshold,
+ self.training_basis.verbose)
+
+class SupervisedTopicClassifierModel:
+ """ A serializable classifier model.
+
+ Parameters:
+
+ semantic_analyzer_model -- a string specifying the spaCy model with which this instance
+ was generated and with which it must be used.
+ structural_matcher_ontology -- the ontology used for matching documents against this model
+ (not the classification ontology!)
+ phraselet_infos -- the phraselets used for structural matching
+ mlp -- the neural network
+ sorted_label_dict -- a dictionary from search phrase (phraselet) labels to their own
+ alphabetic sorting indexes.
+ classifications -- an ordered list of classification labels corresponding to the
+ neural network outputs
+ overlap_memory_size -- how many non-word phraselet matches to the left should be
+ checked for words in common with a current match.
+ oneshot -- whether the same word or relationship matched multiple times should be
+ counted once only (value 'True') or multiple times (value 'False')
+ analyze_derivational_morphology -- the value of this manager parameter that was in force
+ when the model was built. The same value has to be in force when the model is
+ deserialized and reused.
+ """
+
+ def __init__(
+ self, semantic_analyzer_model, structural_matcher_ontology,
+ phraselet_infos, mlp, sorted_label_dict, classifications, overlap_memory_size,
+ oneshot, analyze_derivational_morphology):
+ self.semantic_analyzer_model = semantic_analyzer_model
+ self.structural_matcher_ontology = structural_matcher_ontology
+ self.phraselet_infos = phraselet_infos
+ self.mlp = mlp
+ self.sorted_label_dict = sorted_label_dict
+ self.classifications = classifications
+ self.overlap_memory_size = overlap_memory_size
+ self.oneshot = oneshot
+ self.analyze_derivational_morphology = analyze_derivational_morphology
+
+class SupervisedTopicClassifier:
+ """Classifies new documents based on a pre-trained model."""
+
+ def __init__(self, semantic_analyzer, linguistic_object_factory, structural_matcher, model,
+ overall_similarity_threshold, verbose):
+ self.semantic_analyzer = semantic_analyzer
+ self.linguistic_object_factory = linguistic_object_factory
+ self.semantic_matching_helper = linguistic_object_factory.semantic_matching_helper
+ self.structural_matcher = structural_matcher
+ self.model = model
+ self.overall_similarity_threshold = overall_similarity_threshold
+ self.verbose = verbose
+ self.utils = SupervisedTopicTrainingUtils(model.overlap_memory_size, model.oneshot)
+ if self.semantic_analyzer.model != model.semantic_analyzer_model:
+ raise WrongModelDeserializationError(model.semantic_analyzer_model)
+ if hasattr(model, 'analyze_derivational_morphology'): # backwards compatibility
+ analyze_derivational_morphology = model.analyze_derivational_morphology
+ else:
+ analyze_derivational_morphology = False
+ if self.structural_matcher.analyze_derivational_morphology != \
+ analyze_derivational_morphology:
+ print(
+ ''.join((
+ 'manager: ', str(self.structural_matcher.analyze_derivational_morphology),
+ '; model: ', str(analyze_derivational_morphology))))
+ raise IncompatibleAnalyzeDerivationalMorphologyDeserializationError(
+ ''.join((
+ 'manager: ', str(self.structural_matcher.analyze_derivational_morphology),
+ '; model: ', str(analyze_derivational_morphology))))
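+        # Matching must use the ontology with which the model was built, so it replaces any
+        # ontology configured on the manager's matching objects.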
+ self.structural_matcher.ontology = model.structural_matcher_ontology
+ self.linguistic_object_factory.ontology = model.structural_matcher_ontology
+ self.semantic_matching_helper = self.structural_matcher.semantic_matching_helper
+ self.semantic_matching_helper.ontology = model.structural_matcher_ontology
+ self.semantic_matching_helper.ontology_reverse_derivational_dict = \
+ self.linguistic_object_factory.get_ontology_reverse_derivational_dict()
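+        # Regenerate the search phrases from the phraselet definitions stored in the model so
+        # that new documents can be matched against them.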
+ self.phraselet_labels_to_search_phrases = \
+ self.linguistic_object_factory.create_search_phrases_from_phraselet_infos(
+ model.phraselet_infos)
+
+ def parse_and_classify(self, text):
+ """ Returns a list containing zero, one or many document classifications. Where more
+ than one classification is returned, the labels are ordered by decreasing
+ probability.
+
+ Parameter:
+
+ text -- the text to parse and classify.
+ """
+ return self.classify(self.semantic_analyzer.parse(text))
+
+ def classify(self, doc):
+ """ Returns a list containing zero, one or many document classifications. Where more
+ than one classification is returned, the labels are ordered by decreasing
+ probability.
+
+ Parameter:
+
+ doc -- the pre-parsed document to classify.
+ """
+
+ if self.model is None:
+ raise RuntimeError('No model defined')
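+        # A one-row sparse matrix with a column for each phraselet label, in the same column
+        # order that was used when the model was trained.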
+ new_document_matrix = dok_matrix((1, len(self.model.sorted_label_dict)))
+ if not self.utils.record_matches(
+ semantic_matching_helper=self.semantic_matching_helper,
+ structural_matcher=self.structural_matcher,
+ phraselet_labels_to_search_phrases=self.phraselet_labels_to_search_phrases,
+ sorted_label_dict=self.model.sorted_label_dict,
+ doc=doc,
+ doc_label='',
+ matrix=new_document_matrix,
+ row_index=0,
+ overall_similarity_threshold=self.overall_similarity_threshold):
+ return []
+ else:
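+            # The multi-label prediction has one row per document; nonzero()[1] yields the
+            # column indexes of every classification predicted for this single document.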
+ classification_indexes = self.model.mlp.predict(new_document_matrix).nonzero()[1]
+ if len(classification_indexes) > 1:
+ probabilities = self.model.mlp.predict_proba(new_document_matrix)
+ classification_indexes = sorted(
+ classification_indexes, key=lambda index: 1-probabilities[0, index])
+ return list(map(
+ lambda index: self.model.classifications[index], classification_indexes))
+
+ def serialize_model(self):
+ return jsonpickle.encode(self.model)
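+
+# Minimal usage sketch (hypothetical object names; it assumes the manager continues to expose
+# deserialize_supervised_topic_classifier() as in the previous example scripts):
+#
+#     serialized_model = trainer.classifier().serialize_model()
+#     classifier = holmes_manager.deserialize_supervised_topic_classifier(serialized_model)
+#     print(classifier.parse_and_classify('A text whose classifications are required.'))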
diff --git a/holmes_extractor/config.cfg b/holmes_extractor/config.cfg
new file mode 100644
index 0000000..0176ae9
--- /dev/null
+++ b/holmes_extractor/config.cfg
@@ -0,0 +1,3 @@
+[vector_nlps]
+# Names of models for which a second model is used as a source of vocabularies and vectors.
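+# For example, en_core_web_trf ships without static word vectors, so en_core_web_lg supplies
+# its vocabulary and vectors.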
+en_core_web_trf = en_core_web_lg
diff --git a/holmes_extractor/consoles.py b/holmes_extractor/consoles.py
index 500d365..75b85f6 100644
--- a/holmes_extractor/consoles.py
+++ b/holmes_extractor/consoles.py
@@ -1,14 +1,16 @@
-from .errors import *
+from .errors import SearchPhraseContainsNegationError, SearchPhraseContainsConjunctionError,\
+ SearchPhraseContainsCoreferringPronounError, SearchPhraseWithoutMatchableWordsError,\
+ NoSearchPhraseError, SearchPhraseContainsMultipleClausesError
class HolmesConsoles:
"""Manages the consoles."""
def __init__(self, holmes):
- self._holmes = holmes
- self._semantic_analyzer = holmes.semantic_analyzer
- self._structural_matcher = holmes.structural_matcher
+ self.holmes = holmes
+ self.semantic_analyzer = holmes.semantic_analyzer
+ self.structural_matcher = holmes.structural_matcher
- def _match_description(self, match_dict):
+ def match_description(self, match_dict):
"""Returns a user-readable representation of a match dictionary."""
match_description_to_return = ''
if match_dict['negated']:
@@ -26,7 +28,7 @@ def _match_description(self, match_dict):
str(overall_similarity_measure)))
return match_description_to_return
- def _string_representation_of_word_match(self, word_match):
+ def string_representation_of_word_match(self, word_match):
"""Returns a user-readable representation of a word match."""
if word_match['document_word'] != word_match['extracted_word']:
extracted_word = ''.join(("(refers to '", word_match['extracted_word'], "')"))
@@ -34,65 +36,65 @@ def _string_representation_of_word_match(self, word_match):
extracted_word = ''
string = ''.join((
"'", word_match['document_phrase'], "'", extracted_word, "->'",
- word_match['search_phrase_word'], "' (", word_match['match_type']))
- if float(word_match['similarity_measure']) < 1.0:
- string = ''.join((string, ': ', word_match['similarity_measure']))
- string = ''.join((string, ")"))
+ word_match['search_phrase_word'], "' (", word_match['explanation'][:-1], ")"))
return string
- def _common(self):
+ def common(self):
"""Contains functionality common to both consoles."""
- print("Holmes version 2.2 written by richard.hudson@msg.group")
- print("Language is", self._semantic_analyzer.language_name)
- print("Model is", self._semantic_analyzer.model)
- if self._structural_matcher.ontology is None:
+ print("Holmes version 3.0 written by richard.hudson@msg.group")
+ print("Note that the consoles do not display all information available when using Holmes programmatically.")
+ print()
+ print("Language is", self.semantic_analyzer.language_name)
+ print("Model is", self.semantic_analyzer.model)
+ if self.structural_matcher.ontology is None:
print("No ontology is being used")
else:
- print("Ontology is", self._structural_matcher.ontology.path)
- if self._structural_matcher.ontology.symmetric_matching:
+ print("Ontology is", self.structural_matcher.ontology.path)
+ if self.structural_matcher.ontology.symmetric_matching:
print("Symmetric matching is ON")
else:
print("Symmetric matching is OFF")
- if self._structural_matcher.perform_coreference_resolution:
+ if self.structural_matcher.perform_coreference_resolution:
print("Coreference resolution is ON")
else:
print("Coreference resolution is OFF")
- if self._structural_matcher.analyze_derivational_morphology:
+ if self.structural_matcher.analyze_derivational_morphology:
print("Derivational morphology analysis is ON")
else:
print("Derivational morphology analysis is OFF")
+ if self.structural_matcher.use_reverse_dependency_matching:
+ print("Reverse dependency matching is ON")
+ else:
+            print("Reverse dependency matching is OFF")
+
+ def print_document_info(self):
+ document_labels = self.holmes.document_labels()
+ if len(document_labels) == 0:
+ raise RuntimeError('No documents registered.')
+ document_labels_string = '; '.join(''.join(("'", l, "'")) for l in document_labels)
+ print(': '.join(('Documents', document_labels_string)))
+
+ def start_chatbot_mode(self):
+ """Starts a chatbot mode console enabling the matching of pre-registered
+ search phrases to short example documents entered ad-hoc by the user.
+ """
+ self.common()
print(
"Overall similarity threshold is", str(
- self._structural_matcher.overall_similarity_threshold))
- if self._structural_matcher.overall_similarity_threshold < 1.0:
- if self._structural_matcher.embedding_based_matching_on_root_words:
+ self.holmes.overall_similarity_threshold))
+ if self.holmes.overall_similarity_threshold < 1.0:
+ if self.structural_matcher.embedding_based_matching_on_root_words:
print("Embedding-based matching on root words is ON")
else:
print("Embedding-based matching on root words is OFF")
-
-
- def start_chatbot_mode(self):
- """Starts a chatbot mode console enabling the matching of pre-registered search phrases
- to documents (chatbot entries) entered ad-hoc by the user.
- """
- self._common()
+ print()
print('Chatbot mode')
print()
- if len(self._holmes.threadsafe_container._search_phrases) == 0:
+ if len(self.holmes.search_phrases) == 0:
raise RuntimeError('No search_phrases registered.')
# Display search phrases
- for search_phrase in self._holmes.threadsafe_container._search_phrases:
- print(''.join(("Search phrase '", search_phrase.doc.text, "'")))
- # only has an effect when debug==True
- self._semantic_analyzer.debug_structures(search_phrase.doc)
- if self._structural_matcher.ontology is not None:
- for token in search_phrase.matchable_tokens:
- lemma = token._.holmes.lemma
- matching_terms = self._structural_matcher.ontology.get_words_matching(
- lemma)
- if len(matching_terms) > 0:
- print(lemma, 'also matches', matching_terms)
- print()
+ for search_phrase in self.holmes.search_phrases:
+ print(''.join(("Search phrase '", search_phrase.doc_text, "'")))
print()
print('Ready for input')
@@ -102,28 +104,34 @@ def start_chatbot_mode(self):
print()
if search_sentence in ('exit', 'exit()', 'bye'):
break
- match_dicts = self._holmes.match_search_phrases_against(entry=search_sentence)
+ match_dicts = self.holmes.match(document_text=search_sentence)
for match_dict in match_dicts:
print()
print(''.join((
- "Matched search phrase '",
- match_dict['search_phrase'], "'", self._match_description(match_dict),
+ "Matched search phrase with text '",
+ match_dict['search_phrase_text'], "'", self.match_description(match_dict),
":")))
word_matches_string = '; '.join(map(
- self._string_representation_of_word_match, match_dict['word_matches']))
+ self.string_representation_of_word_match, match_dict['word_matches']))
print(word_matches_string)
- def start_structural_search_mode(self):
- """Starts a structural search mode console enabling the matching of pre-registered documents
- to search phrases entered ad-hoc by the user.
+ def start_structural_extraction_mode(self):
+ """Starts a structural extraction mode console enabling the matching of pre-registered
+ documents to search phrases entered ad-hoc by the user.
"""
- self._common()
- print('Structural search mode')
+ self.common()
+ print(
+ "Overall similarity threshold is", str(
+ self.holmes.overall_similarity_threshold))
+ if self.holmes.overall_similarity_threshold < 1.0:
+ if self.structural_matcher.embedding_based_matching_on_root_words:
+ print("Embedding-based matching on root words is ON")
+ else:
+ print("Embedding-based matching on root words is OFF")
print()
- if len(self._holmes.document_labels()) == 0:
- raise RuntimeError('No documents registered.')
- document_labels = '; '.join(self._holmes.document_labels())
- print(': '.join(('Documents', document_labels)))
+ print('Structural extraction mode')
+ print()
+ self.print_document_info()
print()
while True:
print('Ready for phrases')
@@ -138,7 +146,7 @@ def start_structural_search_mode(self):
print()
match_dicts = []
try:
- match_dicts = self._holmes.match_documents_against(search_phrase_text=search_phrase)
+ match_dicts = self.holmes.match(search_phrase_text=search_phrase)
if len(match_dicts) == 0:
print('No structural matching results were returned.')
else:
@@ -174,31 +182,27 @@ def start_structural_search_mode(self):
print(''.join((
"Matched document '", match_dict['document'],
"' at index ", str(match_dict['index_within_document']),
- self._match_description(match_dict), ":")))
+ self.match_description(match_dict), ":")))
print(''.join(('"', match_dict['sentences_within_document'], '"')))
word_matches_string = '; '.join(
- map(self._string_representation_of_word_match, match_dict['word_matches']))
+ map(self.string_representation_of_word_match, match_dict['word_matches']))
print(word_matches_string)
def start_topic_matching_search_mode(
self, only_one_result_per_document,
- maximum_number_of_single_word_matches_for_relation_matching,
- maximum_number_of_single_word_matches_for_embedding_matching):
+ word_embedding_match_threshold,
+ initial_question_word_embedding_match_threshold):
"""Starts a topic matching search mode console enabling the matching of pre-registered
documents to search texts entered ad-hoc by the user.
-
- Parameters:
-
- only_one_result_per_document -- if 'True', prevents multiple topic match
- results from being returned for the same document.
"""
- self._common()
+ self.common()
+ print("The embedding similarity threshold for normal words is", str(
+ word_embedding_match_threshold))
+ print("The embedding similarity threshold for initial question words is", str(
+ initial_question_word_embedding_match_threshold))
print('Topic matching search mode')
print()
- if len(self._holmes.document_labels()) == 0:
- raise RuntimeError('No documents registered.')
- document_labels = '; '.join(self._holmes.document_labels())
- print(': '.join(('Documents', document_labels)))
+ self.print_document_info()
print()
while True:
print('Ready for search texts')
@@ -211,18 +215,17 @@ def start_topic_matching_search_mode(
if search_text in ('exit', 'exit()', 'bye'):
break
print()
- print('Performing topic matching ...')
+ print('Performing topic match searching ...')
try:
print()
topic_match_dicts = \
- self._holmes.topic_match_documents_returning_dictionaries_against(
+ self.holmes.topic_match_documents_against(
search_text,
number_of_results=5,
only_one_result_per_document=only_one_result_per_document,
- maximum_number_of_single_word_matches_for_relation_matching=
- maximum_number_of_single_word_matches_for_relation_matching,
- maximum_number_of_single_word_matches_for_embedding_matching=
- maximum_number_of_single_word_matches_for_embedding_matching)
+ word_embedding_match_threshold=word_embedding_match_threshold,
+ initial_question_word_embedding_match_threshold=
+ initial_question_word_embedding_match_threshold)
except NoSearchPhraseError:
pass
if topic_match_dicts is None or len(topic_match_dicts) == 0:
@@ -235,16 +238,23 @@ def start_topic_matching_search_mode(
print('Topic matching results:')
print()
for topic_match_dict in topic_match_dicts:
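+                # Each answer is a pair of character offsets into the topic match text; the
+                # offsets are sliced out so the answer strings can be shown with the result.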
+ textual_answers = []
+ for dict_answer in topic_match_dict['answers']:
+ textual_answers.append(''.join(("'",
+ topic_match_dict['text'][dict_answer[0]: dict_answer[1]], "'")))
+ answers_string = ''.join(('; question answer ', '; '.join(textual_answers))) if \
+ len(textual_answers) > 0 else ''
output = ''.join((
topic_match_dict['rank'],
'. Document ',
topic_match_dict['document_label'],
'; sentences at character indexes ',
- str(topic_match_dict['sentences_character_start_index_in_document']),
+ str(topic_match_dict['sentences_character_start_index']),
'-',
- str(topic_match_dict['sentences_character_end_index_in_document']),
+ str(topic_match_dict['sentences_character_end_index']),
'; score ',
str(topic_match_dict['score']),
+ answers_string,
':'
))
print(output)
diff --git a/holmes_extractor/errors.py b/holmes_extractor/errors.py
index 6dae7c7..01eeb81 100644
--- a/holmes_extractor/errors.py
+++ b/holmes_extractor/errors.py
@@ -27,10 +27,7 @@ class DuplicateDocumentError(HolmesError):
class NoSearchPhraseError(HolmesError):
pass
-class NoSearchedDocumentError(HolmesError):
- pass
-
-class SerializationNotSupportedError(HolmesError):
+class NoDocumentError(HolmesError):
pass
class WrongModelDeserializationError(HolmesError):
@@ -48,7 +45,7 @@ class FewerThanTwoClassificationsError(HolmesError):
class NoPhraseletsAfterFilteringError(HolmesError):
pass
-class EmbeddingThresholdGreaterThanRelationThresholdError(HolmesError):
+class EmbeddingThresholdLessThanRelationThresholdError(HolmesError):
pass
class IncompatibleAnalyzeDerivationalMorphologyDeserializationError(HolmesError):
diff --git a/holmes_extractor/examples/example_chatbot_DE_insurance.py b/holmes_extractor/examples/example_chatbot_DE_insurance.py
deleted file mode 100644
index a7434b8..0000000
--- a/holmes_extractor/examples/example_chatbot_DE_insurance.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import os
-import holmes_extractor as holmes
-
-script_directory = os.path.dirname(os.path.realpath(__file__))
-ontology = holmes.Ontology(os.sep.join((
- script_directory, 'example_chatbot_DE_insurance_ontology.owl')))
-holmes_manager = holmes.Manager(model='de_core_news_md', ontology=ontology)
-holmes_manager.register_search_phrase('Jemand benötigt eine Versicherung')
-holmes_manager.register_search_phrase('Ein ENTITYPER schließt eine Versicherung ab')
-holmes_manager.register_search_phrase('ENTITYPER benötigt eine Versicherung')
-holmes_manager.register_search_phrase('Eine Versicherung für einen Zeitraum')
-holmes_manager.register_search_phrase('Eine Versicherung fängt an')
-holmes_manager.register_search_phrase('Jemand zahlt voraus')
-
-holmes_manager.start_chatbot_mode_console()
diff --git a/holmes_extractor/examples/example_chatbot_EN_insurance.py b/holmes_extractor/examples/example_chatbot_EN_insurance.py
deleted file mode 100644
index d18076f..0000000
--- a/holmes_extractor/examples/example_chatbot_EN_insurance.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import os
-import holmes_extractor as holmes
-
-script_directory = os.path.dirname(os.path.realpath(__file__))
-ontology = holmes.Ontology(os.sep.join((
- script_directory, 'example_chatbot_EN_insurance_ontology.owl')))
-holmes_manager = holmes.Manager(
- model='en_core_web_lg', ontology=ontology, perform_coreference_resolution=True)
-holmes_manager.register_search_phrase('Somebody requires insurance')
-holmes_manager.register_search_phrase('An ENTITYPERSON takes out insurance')
-holmes_manager.register_search_phrase('A company buys payment insurance')
-holmes_manager.register_search_phrase('An ENTITYPERSON needs insurance')
-holmes_manager.register_search_phrase('Insurance for a period')
-holmes_manager.register_search_phrase('An insurance begins')
-holmes_manager.register_search_phrase('Somebody prepays')
-holmes_manager.register_search_phrase('Somebody makes an insurance payment')
-
-holmes_manager.start_chatbot_mode_console()
diff --git a/holmes_extractor/examples/example_search_EN_literature.py b/holmes_extractor/examples/example_search_EN_literature.py
deleted file mode 100644
index 1645eb0..0000000
--- a/holmes_extractor/examples/example_search_EN_literature.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import urllib.request
-import re
-import os
-import json
-import falcon
-import holmes_extractor as holmes
-
-if __name__ in ('__main__', 'example_search_EN_literature'):
-
- script_directory = os.path.dirname(os.path.realpath(__file__))
- ontology = holmes.Ontology(os.sep.join((
- script_directory, 'example_search_EN_literature_ontology.owl')))
- print('Initializing Holmes...')
- #Start the Holmes manager with the English model
- holmes_manager = holmes.MultiprocessingManager(
- model='en_core_web_lg', overall_similarity_threshold=0.9, ontology=ontology,
- number_of_workers=4)
- # set number_of_workers to prevent memory exhaustion / swapping; it should never be more
- # than the number of cores
-
- def extract_chapters_from_book(book_uri, title):
- """ Download and save the chapters from a book."""
-
- print()
- print(title)
- print()
- book = urllib.request.urlopen(book_uri).read().decode()
- book = re.sub("\\nPage \|.+?Rowling \\n", "", book)
- book = re.sub("\\nP a g e \|.+?Rowling \\n", "", book)
- book = re.sub("\\nPage \|.+?\\n", "", book)
- book = book.replace("Harry Potter and the Half Blood Prince - J.K. Rowling", "")
- book = book.replace("Harry Potter and the Goblet of Fire - J.K. Rowling", "")
- book = book.replace("Harry Potter and the Deathly Hallows - J.K. Rowling", "")
- book = book[1:]
- chapter_headings = [heading for heading in re.finditer("(?<=((\\n\\n\\n\\n)|(\* \\n\\n)))((?!.*(WEASLEY WILL MAKE SURE)|(DO NOT OPEN THE PARCEL)|(HEADMISTRESS OF HOGWARTS))[A-Z][A-Z\-’., ]+)(\\n{1,2}((?!.*(WHO\-MUST))[A-Z\-’., ]+))?(?=(\\n\\n([^\\n]|(\\n\\n((“Harry!”)|(Harry’s)|(Ron’s)|(“Hagrid)|(Three o’clock))))))", book)]
- chapter_counter = 1
- chapter_dict = {}
- for chapter_heading in chapter_headings:
- label = ''.join((
- 'Book ', title, '; Ch ', str(chapter_counter), ': ',
- chapter_heading.group().replace('\n', ''))).strip()
- if chapter_counter == len(chapter_headings): # last chapter
- content = book[chapter_heading.end():]
- else:
- content = book[chapter_heading.end():chapter_headings[chapter_counter].start()]
- content = content.replace('\n', '')
- if content.endswith('& '):
- content = content[:-2]
- print('Extracted', label)
- chapter_dict[label] = content
- chapter_counter += 1
- holmes_manager.parse_and_register_documents(chapter_dict)
-
- extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt", '1: The Philosopher\'s Stone')
- extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%202%20-%20The%20Chamber%20of%20Secrets.txt", '2: The Chamber of Secrets')
- extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%203%20-%20The%20Prisoner%20of%20Azkaban.txt", '3: The Prisoner of Azkaban')
- extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%204%20-%20The%20Goblet%20of%20Fire.txt", '4: The Goblet of Fire')
- extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%205%20-%20The%20Order%20of%20the%20Phoenix.txt", '5: The Order of the Phoenix')
- extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%206%20-%20The%20Half%20Blood%20Prince.txt", '6: The Half Blood Prince')
- extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%207%20-%20The%20Deathly%20Hallows.txt", '7: The Deathly Hallows')
-
- #Comment following lines in to activate interactive console
- #holmes_manager.start_topic_matching_search_mode_console(
- # only_one_result_per_document=True,
- # maximum_number_of_single_word_matches_for_relation_matching=300,
- # maximum_number_of_single_word_matches_for_embedding_matching=50)
- #Only return one topic match per story
-
- # The following code starts a RESTful Http service to perform topic searches. It is deployed as
- # as WSGI application. An example of how to start it - issued from the directory that
- # contains the script - is
-
- # waitress-serve example_search_EN_literature:application
-
- class RestHandler():
- def on_get(self, req, resp):
- resp.body = \
- json.dumps(
- holmes_manager.topic_match_documents_returning_dictionaries_against(
- req.params['entry'][0:200]))
- resp.cache_control = ["s-maxage=31536000"]
-
- application = falcon.App()
- application.add_route('/english', RestHandler())
diff --git a/holmes_extractor/examples/example_supervised_topic_model_EN.py b/holmes_extractor/examples/example_supervised_topic_model_EN.py
deleted file mode 100644
index 4b59c23..0000000
--- a/holmes_extractor/examples/example_supervised_topic_model_EN.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import os
-import shutil
-import urllib.request
-import zipfile
-import holmes_extractor as holmes
-
-working_directory = # REPLACE WITH PATH TO WORKING DIRECTORY IN SINGLE OR DOUBLE QUOTES
-
-def is_training_data(document_number):
- # We use any documents with numbers ending in 8,9,0 for test and all other documents for
- # training.
- return document_number[-1:] not in ('8', '9', '0')
-
-def get_document_filename_info(filename):
- # e.g. 'bbc/business/001.txt'
- category = filename.split('/')[1]
- document_number = filename.split('/')[2].split('.')[0]
- return category, document_number
-
-def evaluate_classifier(zip_filename, classifier):
- correct_classification_counter = wrong_classification_counter = \
- no_classification_counter = correct_as_additional_classification_counter = 0
- with zipfile.ZipFile(zip_filename) as bbc_zipfile:
- for filename in (
- filename for filename in bbc_zipfile.namelist() if
- filename.lower().endswith('.txt') and not filename.endswith('README.TXT')):
- category, document_number = get_document_filename_info(filename)
- if not is_training_data(document_number):
- with bbc_zipfile.open(filename, 'r') as test_doc:
- test_contents = str(test_doc.read())
- test_contents = test_contents.replace('\n', ' ').replace('\r', ' ')
- suggested_categories = classifier.parse_and_classify(test_contents)
- if len(suggested_categories) == 0:
- no_classification_counter += 1
- elif suggested_categories[0] == category:
- correct_classification_counter += 1
- elif category in suggested_categories:
- correct_as_additional_classification_counter += 1
- else:
- wrong_classification_counter += 1
- print(''.join((
- filename, ': actual category ', category,
- '; suggested categories ', str(suggested_categories))))
- print()
- print('Totals:')
- print(correct_classification_counter, 'correct classifications;')
- print(no_classification_counter, 'unclassified documents;')
- print(wrong_classification_counter, 'incorrect classifications;')
- print(
- correct_as_additional_classification_counter, 'incorrect classifications where the '
- 'correct classification was returned as an additional classification.')
-
-def train_model(working_directory, zip_filename):
- training_basis = holmes_manager.get_supervised_topic_training_basis()
- with zipfile.ZipFile(zip_filename) as bbc_zipfile:
- for filename in (
- filename for filename in bbc_zipfile.namelist() if
- filename.lower().endswith('.txt') and not filename.endswith('README.TXT')):
- category, document_number = get_document_filename_info(filename)
- if is_training_data(document_number):
- with bbc_zipfile.open(filename, 'r') as training_doc:
- training_contents = str(training_doc.read())
- training_contents = training_contents.replace('\n', ' ').replace('\r', ' ')
- training_basis.parse_and_register_training_document(
- training_contents, category, filename)
- training_basis.prepare()
- classifier = training_basis.train().classifier()
- output_filename = os.sep.join((working_directory, 'model.json'))
- with open(output_filename, "w") as file:
- file.write(classifier.serialize_model())
- evaluate_classifier(zip_filename, classifier)
-holmes_manager = holmes.Manager('en_core_web_lg')
-
-if os.path.exists(working_directory):
- if not os.path.isdir(working_directory):
- raise RuntimeError(' '.join((working_directory), 'must be a directory'))
-else:
- os.mkdir(working_directory)
-zip_filename = (os.sep.join((working_directory, 'bbc-fulltext.zip')))
-if not os.path.exists(zip_filename):
- url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'
- with urllib.request.urlopen(url) as response, open(zip_filename, 'wb') as out_file:
- shutil.copyfileobj(response, out_file)
-model_filename = os.sep.join((working_directory, 'model.json'))
-if not os.path.exists(model_filename):
- train_model(working_directory, zip_filename)
-else:
- print('Reloading existing trained model. Delete model.json from working directory to repeat '\
- 'training.')
- with open(model_filename) as model_file:
- classifier = holmes_manager.deserialize_supervised_topic_classifier(model_file.read())
- evaluate_classifier(zip_filename, classifier)
diff --git a/holmes_extractor/extensive_matching.py b/holmes_extractor/extensive_matching.py
deleted file mode 100644
index aaeda10..0000000
--- a/holmes_extractor/extensive_matching.py
+++ /dev/null
@@ -1,1552 +0,0 @@
-import uuid
-import statistics
-import jsonpickle
-from scipy.sparse import dok_matrix
-from sklearn.neural_network import MLPClassifier
-from .structural_matching import Index
-from .errors import WrongModelDeserializationError, FewerThanTwoClassificationsError, \
- DuplicateDocumentError, NoPhraseletsAfterFilteringError, \
- EmbeddingThresholdGreaterThanRelationThresholdError, \
- IncompatibleAnalyzeDerivationalMorphologyDeserializationError
-
-class TopicMatch:
- """A topic match between some text and part of a document. Note that the end indexes refer
- to the token in question rather than to the following token.
-
- Properties:
-
- document_label -- the document label.
- index_within_document -- the index of the token within the document where 'score' was achieved.
- subword_index -- the index of the subword within the token within the document where 'score'
- was achieved, or *None* if the match involved the whole word.
- start_index -- the start index of the topic match within the document.
- end_index -- the end index of the topic match within the document.
- sentences_start_index -- the start index within the document of the sentence that contains
- 'start_index'
- sentences_end_index -- the end index within the document of the sentence that contains
- 'end_index'
- relative_start_index -- the start index of the topic match relative to 'sentences_start_index'
- relative_end_index -- the end index of the topic match relative to 'sentences_start_index'
- score -- the similarity score of the topic match
- text -- the text between 'sentences_start_index' and 'sentences_end_index'
- structural_matches -- a list of `Match` objects that were used to derive this object.
- """
-
- def __init__(
- self, document_label, index_within_document, subword_index, start_index, end_index,
- sentences_start_index, sentences_end_index, score, text, structural_matches):
- self.document_label = document_label
- self.index_within_document = index_within_document
- self.subword_index = subword_index
- self.start_index = start_index
- self.end_index = end_index
- self.sentences_start_index = sentences_start_index
- self.sentences_end_index = sentences_end_index
- self.score = score
- self.text = text
- self.structural_matches = structural_matches
-
- @property
- def relative_start_index(self):
- return self.start_index - self.sentences_start_index
-
- @property
- def relative_end_index(self):
- return self.end_index - self.sentences_start_index
-
-class PhraseletActivationTracker:
- """ Tracks the activation for a specific phraselet - the most recent score
- and the position within the document at which that score was calculated.
- """
- def __init__(self, position, score):
- self.position = position
- self.score = score
-
-class TopicMatcher:
- """A topic matcher object. See manager.py for details of the properties."""
-
- def __init__(
- self, *, semantic_analyzer, structural_matcher, indexed_documents,
- maximum_activation_distance, relation_score, reverse_only_relation_score,
- single_word_score, single_word_any_tag_score, overlapping_relation_multiplier,
- embedding_penalty, ontology_penalty,
- maximum_number_of_single_word_matches_for_relation_matching,
- maximum_number_of_single_word_matches_for_embedding_matching,
- sideways_match_extent, only_one_result_per_document, number_of_results,
- document_label_filter):
- if maximum_number_of_single_word_matches_for_embedding_matching > \
- maximum_number_of_single_word_matches_for_relation_matching:
- raise EmbeddingThresholdGreaterThanRelationThresholdError(' '.join((
- 'embedding',
- str(maximum_number_of_single_word_matches_for_embedding_matching),
- 'relation',
- str(maximum_number_of_single_word_matches_for_relation_matching))))
- self._semantic_analyzer = semantic_analyzer
- self.structural_matcher = structural_matcher
- self.indexed_documents = indexed_documents
- self._ontology = structural_matcher.ontology
- self.maximum_activation_distance = maximum_activation_distance
- self.relation_score = relation_score
- self.reverse_only_relation_score = reverse_only_relation_score
- self.single_word_score = single_word_score
- self.single_word_any_tag_score = single_word_any_tag_score
- self.overlapping_relation_multiplier = overlapping_relation_multiplier
- self.embedding_penalty = embedding_penalty
- self.ontology_penalty = ontology_penalty
- self.maximum_number_of_single_word_matches_for_relation_matching = \
- maximum_number_of_single_word_matches_for_relation_matching
- self.maximum_number_of_single_word_matches_for_embedding_matching = \
- maximum_number_of_single_word_matches_for_embedding_matching
- self.sideways_match_extent = sideways_match_extent
- self.only_one_result_per_document = only_one_result_per_document
- self.number_of_results = number_of_results
- self.document_label_filter = document_label_filter
- self._words_to_phraselet_word_match_infos = {}
-
-
- def _get_word_match_from_match(self, match, parent):
- ## child if parent==False
- for word_match in match.word_matches:
- if parent and word_match.search_phrase_token.dep_ == 'ROOT':
- return word_match
- if not parent and word_match.search_phrase_token.dep_ != 'ROOT':
- return word_match
- raise RuntimeError(''.join(('Word match not found with parent==', str(parent))))
-
- def _add_to_dict_list(self, dictionary, key, value):
- if key in dictionary:
- dictionary[key].append(value)
- else:
- dictionary[key] = [value]
-
- def _add_to_dict_set(self, dictionary, key, value):
- if not key in dictionary:
- dictionary[key] = set()
- dictionary[key].add(value)
-
- def topic_match_documents_against(self, text_to_match):
- """ Performs a topic match against the loaded documents.
-
- Property:
-
- text_to_match -- the text to match against the documents.
- """
-
- class CorpusWordPosition:
- def __init__(self, document_label, index):
- self.document_label = document_label
- self.index = index
-
- def __eq__(self, other):
- return isinstance(other, CorpusWordPosition) and self.index == other.index and \
- self.document_label == other.document_label
-
- def __hash__(self):
- return hash((self.document_label, self.index))
-
- def __str__(self):
- return ':'.join((self.document_label, str(self.index)))
-
- class PhraseletWordMatchInfo:
- def __init__(self):
- self.single_word_match_corpus_words = set()
- # The indexes at which the single word phraselet for this word was matched.
-
- self.phraselet_labels_to_parent_match_corpus_words = {}
- # Dictionary from phraselets with this word as the parent to indexes where the
- # phraselet was matched.
-
- self.phraselet_labels_to_child_match_corpus_words = {}
- # Dictionary from phraselets with this word as the child to indexes where the
- # phraselet was matched.
-
- self.parent_match_corpus_words_to_matches = {}
- # Dictionary from indexes where phraselets with this word as the parent were matched
- # to the match objects.
-
- self.child_match_corpus_words_to_matches = {}
- # Dictionary from indexes where phraselets with this word as the child were matched
- # to the match objects.
-
- def get_phraselet_word_match_info(word):
- if word in self._words_to_phraselet_word_match_infos:
- return self._words_to_phraselet_word_match_infos[word]
- else:
- phraselet_word_match_info = PhraseletWordMatchInfo()
- self._words_to_phraselet_word_match_infos[word] = phraselet_word_match_info
- return phraselet_word_match_info
-
- def set_phraselet_to_reverse_only_where_too_many_single_word_matches(phraselet):
- """ Where the parent word of a phraselet matched too often in the corpus, the phraselet
- is set to reverse matching only to improve performance.
- """
- parent_token = phraselet.root_token
- parent_word = parent_token._.holmes.lemma_or_derived_lemma()
- if parent_word in self._words_to_phraselet_word_match_infos:
- parent_phraselet_word_match_info = self._words_to_phraselet_word_match_infos[
- parent_word]
- parent_single_word_match_corpus_words = \
- parent_phraselet_word_match_info.single_word_match_corpus_words
- if len(parent_single_word_match_corpus_words) > \
- self.maximum_number_of_single_word_matches_for_relation_matching:
- phraselet.treat_as_reverse_only_during_initial_relation_matching = True
-
- def get_indexes_for_reverse_matching(
- *, phraselet,
- parent_document_labels_to_indexes_for_direct_retry_sets,
- parent_document_labels_to_indexes_for_embedding_retry_sets,
- child_document_labels_to_indexes_for_embedding_retry_sets):
- """
- parent_document_labels_to_indexes_for_direct_retry_sets -- indexes where matching
- against a reverse matching phraselet should be attempted. These are ascertained
- by examining the child words.
- parent_document_labels_to_indexes_for_embedding_retry_sets -- indexes where matching
- against a phraselet should be attempted with embedding-based matching on the
- parent (root) word. These are ascertained by examining the child words.
- child_document_labels_to_indexes_for_embedding_retry_sets -- indexes where matching
- against a phraselet should be attempted with embedding-based matching on the
- child (non-root) word. These are ascertained by examining the parent words.
- """
-
- parent_token = phraselet.root_token
- parent_word = parent_token._.holmes.lemma_or_derived_lemma()
- if parent_word in self._words_to_phraselet_word_match_infos and not \
- phraselet.reverse_only and not \
- phraselet.treat_as_reverse_only_during_initial_relation_matching:
- parent_phraselet_word_match_info = self._words_to_phraselet_word_match_infos[
- parent_word]
- parent_single_word_match_corpus_words = \
- parent_phraselet_word_match_info.single_word_match_corpus_words
- if phraselet.label in parent_phraselet_word_match_info.\
- phraselet_labels_to_parent_match_corpus_words:
- parent_relation_match_corpus_words = \
- parent_phraselet_word_match_info.\
- phraselet_labels_to_parent_match_corpus_words[phraselet.label]
- else:
- parent_relation_match_corpus_words = []
- if len(parent_single_word_match_corpus_words) <= \
- self.maximum_number_of_single_word_matches_for_embedding_matching:
- # we deliberately use the number of single matches rather than the difference
- # because the deciding factor should be whether or not enough match information
- # has been returned without checking the embeddings
- for corpus_word_position in parent_single_word_match_corpus_words.difference(
- parent_relation_match_corpus_words):
- self._add_to_dict_set(
- child_document_labels_to_indexes_for_embedding_retry_sets,
- corpus_word_position.document_label, Index(
- corpus_word_position.index.token_index,
- corpus_word_position.index.subword_index))
- child_token = [token for token in phraselet.matchable_tokens if token.i !=
- parent_token.i][0]
- child_word = child_token._.holmes.lemma_or_derived_lemma()
- if child_word in self._words_to_phraselet_word_match_infos:
- child_phraselet_word_match_info = \
- self._words_to_phraselet_word_match_infos[child_word]
- child_single_word_match_corpus_words = \
- child_phraselet_word_match_info.single_word_match_corpus_words
- if phraselet.label in child_phraselet_word_match_info.\
- phraselet_labels_to_child_match_corpus_words:
- child_relation_match_corpus_words = child_phraselet_word_match_info.\
- phraselet_labels_to_child_match_corpus_words[phraselet.label]
- else:
- child_relation_match_corpus_words = []
- if len(child_single_word_match_corpus_words) <= \
- self.maximum_number_of_single_word_matches_for_embedding_matching:
- set_to_add_to = parent_document_labels_to_indexes_for_embedding_retry_sets
- elif len(child_single_word_match_corpus_words) <= \
- self.maximum_number_of_single_word_matches_for_relation_matching and (
- phraselet.reverse_only or
- phraselet.treat_as_reverse_only_during_initial_relation_matching):
- set_to_add_to = parent_document_labels_to_indexes_for_direct_retry_sets
- else:
- return
- linking_dependency = parent_token._.holmes.get_label_of_dependency_with_child_index(
- child_token.i)
- for corpus_word_position in child_single_word_match_corpus_words.difference(
- child_relation_match_corpus_words):
- doc = self.indexed_documents[corpus_word_position.document_label].doc
- working_index = corpus_word_position.index
- working_token = doc[working_index.token_index]
- if not working_index.is_subword() or \
- working_token._.holmes.subwords[working_index.subword_index].is_head:
- for parent_dependency in working_token._.holmes.parent_dependencies:
- if self._semantic_analyzer.dependency_labels_match(
- search_phrase_dependency_label=linking_dependency,
- document_dependency_label=parent_dependency[1]):
- self._add_to_dict_set(
- set_to_add_to,
- corpus_word_position.document_label,
- Index(parent_dependency[0], None))
- else:
- working_subword = \
- working_token._.holmes.subwords[working_index.subword_index]
- if self._semantic_analyzer.dependency_labels_match(
- search_phrase_dependency_label=linking_dependency,
- document_dependency_label=
- working_subword.governing_dependency_label):
- self._add_to_dict_set(
- set_to_add_to,
- corpus_word_position.document_label,
- Index(working_index.token_index,
- working_subword.governor_index))
-
- def rebuild_document_info_dict(matches, phraselet_labels_to_phraselet_infos):
-
- def process_word_match(match, parent): # 'True' -> parent, 'False' -> child
- word_match = self._get_word_match_from_match(match, parent)
- word = word_match.search_phrase_token._.holmes.lemma_or_derived_lemma()
- phraselet_word_match_info = get_phraselet_word_match_info(word)
- corpus_word_position = CorpusWordPosition(
- match.document_label, word_match.get_document_index())
- if parent:
- self._add_to_dict_list(
- phraselet_word_match_info.parent_match_corpus_words_to_matches,
- corpus_word_position, match)
- self._add_to_dict_list(
- phraselet_word_match_info.phraselet_labels_to_parent_match_corpus_words,
- match.search_phrase_label, corpus_word_position)
- else:
- self._add_to_dict_list(
- phraselet_word_match_info.child_match_corpus_words_to_matches,
- corpus_word_position, match)
- self._add_to_dict_list(
- phraselet_word_match_info.phraselet_labels_to_child_match_corpus_words,
- match.search_phrase_label, corpus_word_position)
-
- self._words_to_phraselet_word_match_infos = {}
- for match in matches:
- if match.from_single_word_phraselet:
- phraselet_info = phraselet_labels_to_phraselet_infos[match.search_phrase_label]
- word = phraselet_info.parent_derived_lemma
- phraselet_word_match_info = get_phraselet_word_match_info(word)
- word_match = match.word_matches[0]
- phraselet_word_match_info.single_word_match_corpus_words.add(
- CorpusWordPosition(match.document_label, word_match.get_document_index()))
- else:
- process_word_match(match, True)
- process_word_match(match, False)
-
- def filter_superfluous_matches(match):
-
- def get_other_matches_at_same_word(match, parent): # 'True' -> parent, 'False' -> child
- word_match = self._get_word_match_from_match(match, parent)
- word = word_match.search_phrase_token._.holmes.lemma_or_derived_lemma()
- phraselet_word_match_info = get_phraselet_word_match_info(word)
- corpus_word_position = CorpusWordPosition(
- match.document_label, word_match.get_document_index())
- if parent:
- match_dict = phraselet_word_match_info.parent_match_corpus_words_to_matches
- else:
- match_dict = phraselet_word_match_info.child_match_corpus_words_to_matches
- return match_dict[corpus_word_position]
-
- def check_for_sibling_match_with_higher_similarity(
- match, other_match, word_match, other_word_match):
- # We do not want the same phraselet to match multiple siblings, so choose
- # the sibling that is most similar to the search phrase token.
- if self.structural_matcher.overall_similarity_threshold == 1.0:
- return True
- if word_match.document_token.i == other_word_match.document_token.i:
- return True
- working_sibling = word_match.document_token.doc[
- word_match.document_token._.holmes.token_or_lefthand_sibling_index]
- for sibling in \
- working_sibling._.holmes.loop_token_and_righthand_siblings(
- word_match.document_token.doc):
- if match.search_phrase_label == other_match.search_phrase_label and \
- other_word_match.document_token.i == sibling.i and \
- other_word_match.similarity_measure > word_match.similarity_measure:
- return False
- return True
-
- def perform_checks_at_pole(match, parent): # pole is 'True' -> parent, 'False' -> child
- this_this_pole_word_match = self._get_word_match_from_match(match, parent)
- this_pole_index = this_this_pole_word_match.document_token.i
- this_other_pole_word_match = self._get_word_match_from_match(match, not parent)
- for other_this_pole_match in get_other_matches_at_same_word(match, parent):
- other_other_pole_word_match = \
- self._get_word_match_from_match(other_this_pole_match, not parent)
- if this_other_pole_word_match.document_subword is not None:
- this_other_pole_subword_index = this_other_pole_word_match.\
- document_subword.index
- else:
- this_other_pole_subword_index = None
- if other_other_pole_word_match.document_subword is not None:
- other_other_pole_subword_index = other_other_pole_word_match.\
- document_subword.index
- else:
- other_other_pole_subword_index = None
- if this_other_pole_word_match.document_token.i == other_other_pole_word_match.\
- document_token.i and this_other_pole_subword_index == \
- other_other_pole_subword_index and \
- other_other_pole_word_match.similarity_measure > \
- this_other_pole_word_match.similarity_measure:
- # The other match has a higher similarity measure at the other pole than
- # this match. The matched tokens are the same. The matching phraselets
- # must be different.
- return False
- if this_other_pole_word_match.document_token.i == other_other_pole_word_match.\
- document_token.i and this_other_pole_subword_index is not None \
- and other_other_pole_subword_index is None:
- # This match is with a subword where the other match has matched the entire
- # word, so this match should be removed.
- return False
- # Check unnecessary if parent==True as it has then already
- # been carried out during structural matching.
- if not parent and this_other_pole_word_match.document_token.i != \
- other_other_pole_word_match.document_token.i and \
- other_other_pole_word_match.document_token.i in \
- this_other_pole_word_match.document_token._.\
- holmes.token_and_coreference_chain_indexes and \
- match.search_phrase_label == other_this_pole_match.search_phrase_label \
- and (
- (
- abs(this_pole_index -
- this_other_pole_word_match.document_token.i) >
- abs(this_pole_index -
- other_other_pole_word_match.document_token.i)
- )
- or
- (
- abs(this_pole_index -
- this_other_pole_word_match.document_token.i) ==
- abs(this_pole_index -
- other_other_pole_word_match.document_token.i) and
- this_other_pole_word_match.document_token.i >
- other_other_pole_word_match.document_token.i
- )
- ):
- # The document tokens at the other poles corefer with each other and
- # the other match's token is closer to the second document token (the
- # one at this pole). Both matches are from the same phraselet.
- # If the tokens from the two matches are the same distance from the document
- # token at this pole but on opposite sides of it, the preceding one beats
- # the succeeding one simply because we have to choose one or the other.
- return False
-
- if not check_for_sibling_match_with_higher_similarity(
- match, other_this_pole_match, this_other_pole_word_match,
- other_other_pole_word_match):
- return False
- return True
-
- if match.from_single_word_phraselet:
- return True
- if not perform_checks_at_pole(match, True):
- return False
- if not perform_checks_at_pole(match, False):
- return False
- return True
-
- def remove_duplicates(matches):
- # Situations where the same document tokens have been matched by multiple phraselets
- matches_to_return = []
- if len(matches) == 0:
- return matches_to_return
- else:
- matches_to_return.append(matches[0])
- if len(matches) > 1:
- previous_whole_word_single_word_match = None
- for counter in range(1, len(matches)):
- this_match = matches[counter]
- previous_match = matches[counter-1]
- if this_match.index_within_document == previous_match.index_within_document:
- if previous_match.from_single_word_phraselet and \
- previous_match.get_subword_index() is None:
- previous_whole_word_single_word_match = previous_match
- if this_match.get_subword_index() is not None and \
- previous_whole_word_single_word_match is not None and \
- this_match.index_within_document == \
- previous_whole_word_single_word_match.index_within_document:
- # This match is against a subword where the whole word has also been
- # matched, so reject it
- continue
- if this_match.document_label != previous_match.document_label:
- matches_to_return.append(this_match)
- elif len(this_match.word_matches) != len(previous_match.word_matches):
- matches_to_return.append(this_match)
- else:
- this_word_matches_indexes = [
- word_match.get_document_index() for word_match in
- this_match.word_matches]
- previous_word_matches_indexes = [
- word_match.get_document_index() for word_match in
- previous_match.word_matches]
- # In some circumstances the two phraselets may have matched the same
- # tokens the opposite way round
- if sorted(this_word_matches_indexes) != \
- sorted(previous_word_matches_indexes):
- matches_to_return.append(this_match)
- return matches_to_return
-
- doc = self._semantic_analyzer.parse(text_to_match)
- phraselet_labels_to_phraselet_infos = {}
- self.structural_matcher.add_phraselets_to_dict(
- doc,
- phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
- replace_with_hypernym_ancestors=False,
- match_all_words=False,
- ignore_relation_phraselets=False,
- include_reverse_only=True,
- stop_lemmas=self._semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=
- self._semantic_analyzer.topic_matching_reverse_only_parent_lemmas)
-
- # now add the single word phraselets whose tags did not match.
- self.structural_matcher.add_phraselets_to_dict(
- doc,
- phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
- replace_with_hypernym_ancestors=False,
- match_all_words=True,
- ignore_relation_phraselets=True,
- include_reverse_only=False, # value is irrelevant with
- # ignore_relation_phraselets == True
- stop_lemmas=self._semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=
- self._semantic_analyzer.topic_matching_reverse_only_parent_lemmas)
- if len(phraselet_labels_to_phraselet_infos) == 0:
- return []
- phraselet_labels_to_search_phrases = \
- self.structural_matcher.create_search_phrases_from_phraselet_infos(
- phraselet_labels_to_phraselet_infos.values())
- # First get single-word matches
- structural_matches = self.structural_matcher.match(
- indexed_documents=self.indexed_documents,
- search_phrases=phraselet_labels_to_search_phrases.values(),
- output_document_matching_message_to_console=False,
- match_depending_on_single_words=True,
- compare_embeddings_on_root_words=False,
- compare_embeddings_on_non_root_words=False,
- document_labels_to_indexes_for_reverse_matching_sets=None,
- document_labels_to_indexes_for_embedding_reverse_matching_sets=None,
- document_label_filter=self.document_label_filter)
- if not self.structural_matcher.embedding_based_matching_on_root_words:
- rebuild_document_info_dict(structural_matches, phraselet_labels_to_phraselet_infos)
- for phraselet in (
- phraselet_labels_to_search_phrases[phraselet_info.label] for
- phraselet_info in phraselet_labels_to_phraselet_infos.values() if
- phraselet_info.child_lemma is not None):
- set_phraselet_to_reverse_only_where_too_many_single_word_matches(phraselet)
-
- # Now get normally matched relations
- structural_matches.extend(self.structural_matcher.match(
- indexed_documents=self.indexed_documents,
- search_phrases=phraselet_labels_to_search_phrases.values(),
- output_document_matching_message_to_console=False,
- match_depending_on_single_words=False,
- compare_embeddings_on_root_words=False,
- compare_embeddings_on_non_root_words=False,
- document_labels_to_indexes_for_reverse_matching_sets=None,
- document_labels_to_indexes_for_embedding_reverse_matching_sets=None,
- document_label_filter=self.document_label_filter))
-
- rebuild_document_info_dict(structural_matches, phraselet_labels_to_phraselet_infos)
- parent_document_labels_to_indexes_for_direct_retry_sets = {}
- parent_document_labels_to_indexes_for_embedding_retry_sets = {}
- child_document_labels_to_indexes_for_embedding_retry_sets = {}
- for phraselet in (
- phraselet_labels_to_search_phrases[phraselet_info.label] for
- phraselet_info in phraselet_labels_to_phraselet_infos.values() if
- phraselet_info.child_lemma is not None):
- get_indexes_for_reverse_matching(
- phraselet=phraselet,
- parent_document_labels_to_indexes_for_direct_retry_sets=
- parent_document_labels_to_indexes_for_direct_retry_sets,
- parent_document_labels_to_indexes_for_embedding_retry_sets=
- parent_document_labels_to_indexes_for_embedding_retry_sets,
- child_document_labels_to_indexes_for_embedding_retry_sets=
- child_document_labels_to_indexes_for_embedding_retry_sets)
- if len(parent_document_labels_to_indexes_for_embedding_retry_sets) > 0 or \
- len(parent_document_labels_to_indexes_for_direct_retry_sets) > 0:
-
- # Perform reverse matching at selected indexes
- structural_matches.extend(self.structural_matcher.match(
- indexed_documents=self.indexed_documents,
- search_phrases=phraselet_labels_to_search_phrases.values(),
- output_document_matching_message_to_console=False,
- match_depending_on_single_words=False,
- compare_embeddings_on_root_words=True,
- compare_embeddings_on_non_root_words=False,
- document_labels_to_indexes_for_reverse_matching_sets=
- parent_document_labels_to_indexes_for_direct_retry_sets,
- document_labels_to_indexes_for_embedding_reverse_matching_sets=
- parent_document_labels_to_indexes_for_embedding_retry_sets,
- document_label_filter=self.document_label_filter))
-
- if len(child_document_labels_to_indexes_for_embedding_retry_sets) > 0:
-
- # Retry normal matching at selected indexes with embedding-based matching on children
- structural_matches.extend(self.structural_matcher.match(
- indexed_documents=self.indexed_documents,
- search_phrases=phraselet_labels_to_search_phrases.values(),
- output_document_matching_message_to_console=False,
- match_depending_on_single_words=False,
- compare_embeddings_on_root_words=False,
- compare_embeddings_on_non_root_words=True,
- document_labels_to_indexes_for_reverse_matching_sets=None,
- document_labels_to_indexes_for_embedding_reverse_matching_sets=
- child_document_labels_to_indexes_for_embedding_retry_sets,
- document_label_filter=self.document_label_filter))
- if len(parent_document_labels_to_indexes_for_direct_retry_sets) > 0 or \
- len(parent_document_labels_to_indexes_for_embedding_retry_sets) > 0 or \
- len(child_document_labels_to_indexes_for_embedding_retry_sets) > 0:
- rebuild_document_info_dict(structural_matches, phraselet_labels_to_phraselet_infos)
- structural_matches = list(filter(filter_superfluous_matches, structural_matches))
- position_sorted_structural_matches = sorted(
- structural_matches, key=lambda match:
- (
- match.document_label, match.index_within_document,
- match.get_subword_index_for_sorting(), match.from_single_word_phraselet))
- position_sorted_structural_matches = remove_duplicates(position_sorted_structural_matches)
- # Read through the documents measuring the activation based on where
- # in the document structural matches were found
- score_sorted_structural_matches = self.perform_activation_scoring(
- position_sorted_structural_matches)
- return self.get_topic_matches(
- score_sorted_structural_matches, position_sorted_structural_matches)
-
- def perform_activation_scoring(self, position_sorted_structural_matches):
- """
- Read through the documents measuring the activation based on where
- in the document structural matches were found.
- """
- def get_set_from_dict(dictionary, key):
- if key in dictionary:
- return dictionary[key]
- else:
- return set()
-
- def get_current_activation_for_phraselet(phraselet_activation_tracker, current_index):
- distance_to_last_match = current_index - phraselet_activation_tracker.position
- tailoff_quotient = distance_to_last_match / self.maximum_activation_distance
- if tailoff_quotient > 1.0:
- tailoff_quotient = 1.0
- return (1-tailoff_quotient) * phraselet_activation_tracker.score
-
- document_labels_to_indexes_to_phraselet_labels = {}
- for match in (
- match for match in position_sorted_structural_matches if not
- match.from_single_word_phraselet):
- if match.document_label in document_labels_to_indexes_to_phraselet_labels:
- inner_dict = document_labels_to_indexes_to_phraselet_labels[match.document_label]
- else:
- inner_dict = {}
- document_labels_to_indexes_to_phraselet_labels[match.document_label] = inner_dict
- parent_word_match = self._get_word_match_from_match(match, True)
- self._add_to_dict_set(
- inner_dict, parent_word_match.get_document_index(), match.search_phrase_label)
- child_word_match = self._get_word_match_from_match(match, False)
- self._add_to_dict_set(
- inner_dict, child_word_match.get_document_index(), match.search_phrase_label)
- current_document_label = None
- for pssm_index, match in enumerate(position_sorted_structural_matches):
- match.original_index_within_list = pssm_index # store for later use after resorting
- if match.document_label != current_document_label or pssm_index == 0:
- current_document_label = match.document_label
- phraselet_labels_to_phraselet_activation_trackers = {}
- indexes_to_phraselet_labels = document_labels_to_indexes_to_phraselet_labels.get(
- current_document_label, {})
- match.is_overlapping_relation = False
- if match.from_single_word_phraselet:
- if match.from_topic_match_phraselet_created_without_matching_tags:
- this_match_score = self.single_word_any_tag_score
- else:
- this_match_score = self.single_word_score
- else:
- if match.from_reverse_only_topic_match_phraselet:
- this_match_score = self.reverse_only_relation_score
- else:
- this_match_score = self.relation_score
- this_match_parent_word_match = self._get_word_match_from_match(match, True)
- this_match_parent_index = this_match_parent_word_match.get_document_index()
- this_match_child_word_match = self._get_word_match_from_match(match, False)
- this_match_child_index = this_match_child_word_match.get_document_index()
- other_relevant_phraselet_labels = get_set_from_dict(
- indexes_to_phraselet_labels,
- this_match_parent_index) | \
- get_set_from_dict(indexes_to_phraselet_labels, this_match_child_index)
- other_relevant_phraselet_labels.remove(match.search_phrase_label)
- if len(other_relevant_phraselet_labels) > 0:
- match.is_overlapping_relation = True
- this_match_score *= self.overlapping_relation_multiplier
- overall_similarity_measure = float(match.overall_similarity_measure)
- if overall_similarity_measure < 1.0:
- this_match_score *= self.embedding_penalty * overall_similarity_measure
- for word_match in (word_match for word_match in match.word_matches \
- if word_match.type == 'ontology'):
- this_match_score *= (self.ontology_penalty ** (abs(word_match.depth) + 1))
- if match.search_phrase_label in phraselet_labels_to_phraselet_activation_trackers:
- phraselet_activation_tracker = phraselet_labels_to_phraselet_activation_trackers[
- match.search_phrase_label]
- current_score = get_current_activation_for_phraselet(
- phraselet_activation_tracker, match.index_within_document)
- if this_match_score > current_score:
- phraselet_activation_tracker.score = this_match_score
- else:
- phraselet_activation_tracker.score = current_score
- phraselet_activation_tracker.position = match.index_within_document
- else:
- phraselet_labels_to_phraselet_activation_trackers[match.search_phrase_label] =\
- PhraseletActivationTracker(match.index_within_document, this_match_score)
- match.topic_score = 0
- for phraselet_label in list(phraselet_labels_to_phraselet_activation_trackers):
- phraselet_activation_tracker = phraselet_labels_to_phraselet_activation_trackers[
- phraselet_label]
- current_activation = get_current_activation_for_phraselet(
- phraselet_activation_tracker, match.index_within_document)
- if current_activation <= 0:
- del phraselet_labels_to_phraselet_activation_trackers[phraselet_label]
- else:
- match.topic_score += current_activation
- return sorted(position_sorted_structural_matches, key=lambda match: 0-match.topic_score)
-
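# A minimal illustrative sketch of the linear tail-off computed in
# get_current_activation_for_phraselet() above. The divisor 500 is only an assumed example
# value; the real code uses self.maximum_activation_distance.
def current_activation(last_score, last_position, current_index, maximum_activation_distance=500):
    tailoff_quotient = min((current_index - last_position) / maximum_activation_distance, 1.0)
    return (1 - tailoff_quotient) * last_score

# A phraselet that scored 20.0 at token 100 still contributes 16.0 at token 200:
# current_activation(20.0, 100, 200) == (1 - 0.2) * 20.0 == 16.0
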
- def get_topic_matches(
- self, score_sorted_structural_matches, position_sorted_structural_matches):
- """Resort the matches starting with the highest (most active) and
- create topic match objects with information about the surrounding sentences.
- """
-
- def match_contained_within_existing_topic_match(topic_matches, match):
- for topic_match in topic_matches:
- if match.document_label == topic_match.document_label and \
- match.index_within_document >= topic_match.start_index and \
- match.index_within_document <= topic_match.end_index:
- return True
- return False
-
- def alter_start_and_end_indexes_for_match(start_index, end_index, match):
- for word_match in match.word_matches:
- if word_match.first_document_token.i < start_index:
- start_index = word_match.first_document_token.i
- if word_match.document_subword is not None and \
- word_match.document_subword.containing_token_index < start_index:
- start_index = word_match.document_subword.containing_token_index
- if word_match.last_document_token.i > end_index:
- end_index = word_match.last_document_token.i
- if word_match.document_subword is not None and \
- word_match.document_subword.containing_token_index > end_index:
- end_index = word_match.document_subword.containing_token_index
- return start_index, end_index
-
- if self.only_one_result_per_document:
- existing_document_labels = []
- topic_matches = []
- counter = 0
- for score_sorted_match in score_sorted_structural_matches:
- if counter >= self.number_of_results:
- break
- if match_contained_within_existing_topic_match(topic_matches, score_sorted_match):
- continue
- if self.only_one_result_per_document and score_sorted_match.document_label \
- in existing_document_labels:
- continue
- start_index, end_index = alter_start_and_end_indexes_for_match(
- score_sorted_match.index_within_document,
- score_sorted_match.index_within_document,
- score_sorted_match)
- previous_index_within_list = score_sorted_match.original_index_within_list
- while previous_index_within_list > 0 and position_sorted_structural_matches[
- previous_index_within_list-1].document_label == \
- score_sorted_match.document_label and position_sorted_structural_matches[
- previous_index_within_list].topic_score > self.single_word_score:
- # previous_index_within_list rather than previous_index_within_list -1 :
- # when a complex structure is matched, it will often begin with a single noun
- # that should be included within the topic match indexes
- if match_contained_within_existing_topic_match(
- topic_matches, position_sorted_structural_matches[
- previous_index_within_list-1]):
- break
- if score_sorted_match.index_within_document - position_sorted_structural_matches[
- previous_index_within_list-1].index_within_document > \
- self.sideways_match_extent:
- break
- previous_index_within_list -= 1
- start_index, end_index = alter_start_and_end_indexes_for_match(
- start_index, end_index,
- position_sorted_structural_matches[previous_index_within_list])
- next_index_within_list = score_sorted_match.original_index_within_list
- while next_index_within_list + 1 < len(score_sorted_structural_matches) and \
- position_sorted_structural_matches[next_index_within_list+1].document_label == \
- score_sorted_match.document_label and \
- position_sorted_structural_matches[next_index_within_list+1].topic_score >= \
- self.single_word_score:
- if match_contained_within_existing_topic_match(
- topic_matches, position_sorted_structural_matches[
- next_index_within_list+1]):
- break
- if position_sorted_structural_matches[
- next_index_within_list+1].index_within_document - \
- score_sorted_match.index_within_document > self.sideways_match_extent:
- break
- next_index_within_list += 1
- start_index, end_index = alter_start_and_end_indexes_for_match(
- start_index, end_index,
- position_sorted_structural_matches[next_index_within_list])
- working_document = self.indexed_documents[score_sorted_match.document_label].doc
- relevant_sentences = [
- sentence for sentence in working_document.sents
- if sentence.end > start_index and sentence.start <= end_index]
- sentences_start_index = relevant_sentences[0].start
- sentences_end_index = relevant_sentences[-1].end
- text = working_document[sentences_start_index: sentences_end_index].text
- topic_matches.append(
- TopicMatch(
- score_sorted_match.document_label,
- score_sorted_match.index_within_document,
- score_sorted_match.get_subword_index(),
- start_index, end_index, sentences_start_index, sentences_end_index - 1,
- score_sorted_match.topic_score, text, position_sorted_structural_matches[
- previous_index_within_list:next_index_within_list+1]))
- if self.only_one_result_per_document:
- existing_document_labels.append(score_sorted_match.document_label)
- counter += 1
- # If two matches have the same score, order them by length
- return sorted(
- topic_matches, key=lambda topic_match: (
- 0-topic_match.score, topic_match.start_index - topic_match.end_index))
-
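# An illustrative sketch of the tie-breaking rule used in the return statement above: matches
# with equal scores are returned longest-first, because start_index - end_index is more
# negative for longer spans. The values below are hypothetical.
candidates = [
    {'score': 5.0, 'start_index': 10, 'end_index': 12},   # covers 3 tokens
    {'score': 5.0, 'start_index': 40, 'end_index': 48},   # covers 9 tokens
]
ordered = sorted(candidates, key=lambda c: (0 - c['score'], c['start_index'] - c['end_index']))
# ordered[0] is the span from 40 to 48, i.e. the longer of the two tied results.
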
- def topic_match_documents_returning_dictionaries_against(
- self, text_to_match, tied_result_quotient):
- """Returns a list of dictionaries representing the results of a topic match between an
- entered text and the loaded documents. Callers of this method do not have to manage any
- further dependencies on spaCy or Holmes.
-
- Properties:
-
- text_to_match -- the text to match against the loaded documents.
- tied_result_quotient -- the quotient between a result and following results above which
- the results are interpreted as tied
- """
-
- class WordInfo:
-
- def __init__(self, relative_start_index, relative_end_index, typ, explanation):
- self.relative_start_index = relative_start_index
- self.relative_end_index = relative_end_index
- self.typ = typ
- self.explanation = explanation
- self.is_highest_activation = False
-
- def __eq__(self, other):
- return isinstance(other, WordInfo) and \
- self.relative_start_index == other.relative_start_index and \
- self.relative_end_index == other.relative_end_index
-
- def __hash__(self):
- return hash((self.relative_start_index, self.relative_end_index))
-
- def get_containing_word_info_key(word_infos_to_word_infos, this_word_info):
- for other_word_info in word_infos_to_word_infos:
- if this_word_info.relative_start_index > other_word_info.relative_start_index and \
- this_word_info.relative_end_index <= other_word_info.relative_end_index:
- return other_word_info
- if this_word_info.relative_start_index >= other_word_info.relative_start_index and\
- this_word_info.relative_end_index < other_word_info.relative_end_index:
- return other_word_info
- return None
-
- topic_matches = self.topic_match_documents_against(text_to_match)
- topic_match_dicts = []
- for topic_match_counter, topic_match in enumerate(topic_matches):
- doc = self.indexed_documents[topic_match.document_label].doc
- sentences_character_start_index_in_document = doc[topic_match.sentences_start_index].idx
- sentences_character_end_index_in_document = doc[topic_match.sentences_end_index].idx + \
- len(doc[topic_match.sentences_end_index].text)
- word_infos_to_word_infos = {}
- for match in topic_match.structural_matches:
- for word_match in match.word_matches:
- if word_match.document_subword is not None:
- subword = word_match.document_subword
- relative_start_index = doc[subword.containing_token_index].idx + \
- subword.char_start_index - \
- sentences_character_start_index_in_document
- relative_end_index = relative_start_index + len(subword.text)
- else:
- relative_start_index = word_match.first_document_token.idx - \
- sentences_character_start_index_in_document
- relative_end_index = word_match.last_document_token.idx + \
- len(word_match.last_document_token.text) - \
- sentences_character_start_index_in_document
- if match.is_overlapping_relation:
- word_info = WordInfo(
- relative_start_index, relative_end_index, 'overlapping_relation',
- word_match.explain())
- elif match.from_single_word_phraselet:
- word_info = WordInfo(
- relative_start_index, relative_end_index, 'single',
- word_match.explain())
- else:
- word_info = WordInfo(
- relative_start_index, relative_end_index, 'relation',
- word_match.explain())
- if word_info in word_infos_to_word_infos:
- existing_word_info = word_infos_to_word_infos[word_info]
- if not existing_word_info.typ == 'overlapping_relation':
- if match.is_overlapping_relation:
- existing_word_info.typ = 'overlapping_relation'
- elif not match.from_single_word_phraselet:
- existing_word_info.typ = 'relation'
- else:
- word_infos_to_word_infos[word_info] = word_info
- for word_info in list(word_infos_to_word_infos.keys()):
- if get_containing_word_info_key(word_infos_to_word_infos, word_info) is not None:
- del word_infos_to_word_infos[word_info]
- if topic_match.subword_index is not None:
- subword = doc[topic_match.index_within_document]._.holmes.subwords\
- [topic_match.subword_index]
- highest_activation_relative_start_index = \
- doc[subword.containing_token_index].idx + \
- subword.char_start_index - \
- sentences_character_start_index_in_document
- highest_activation_relative_end_index = \
- highest_activation_relative_start_index + len(subword.text)
- else:
- highest_activation_relative_start_index = \
- doc[topic_match.index_within_document].idx - \
- sentences_character_start_index_in_document
- highest_activation_relative_end_index = doc[topic_match.index_within_document].idx \
- + len(doc[topic_match.index_within_document].text) - \
- sentences_character_start_index_in_document
- highest_activation_word_info = WordInfo(
- highest_activation_relative_start_index, highest_activation_relative_end_index,
- 'temp', 'temp')
- containing_word_info = get_containing_word_info_key(
- word_infos_to_word_infos, highest_activation_word_info)
- if containing_word_info is not None:
- highest_activation_word_info = containing_word_info
- word_infos_to_word_infos[highest_activation_word_info].is_highest_activation = True
- word_infos = sorted(
- word_infos_to_word_infos.values(), key=lambda word_info: (
- word_info.relative_start_index, word_info.relative_end_index))
- topic_match_dict = {
- 'document_label': topic_match.document_label,
- 'text': topic_match.text,
- 'text_to_match': text_to_match,
- 'rank': str(topic_match_counter + 1), # ties are corrected by
- # TopicMatchDictionaryOrderer
- 'sentences_character_start_index_in_document':
- sentences_character_start_index_in_document,
- 'sentences_character_end_index_in_document':
- sentences_character_end_index_in_document,
- 'score': topic_match.score,
- 'word_infos': [
- [
- word_info.relative_start_index, word_info.relative_end_index,
- word_info.typ, word_info.is_highest_activation, word_info.explanation]
- for word_info in word_infos]
-                # Each word info is serialized as a plain array rather than a dictionary to
-                # prevent the JSON from becoming too bloated
- }
- topic_match_dicts.append(topic_match_dict)
- return TopicMatchDictionaryOrderer().order(
- topic_match_dicts, self.number_of_results, tied_result_quotient)
-
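# An illustrative sketch of the 'word_infos' serialization produced above: each entry is a
# plain five-element array rather than a dictionary. The values below are hypothetical; the
# field order follows the list comprehension in topic_match_dict.
word_info = [14, 21, 'relation', False, 'Matches PURCHASE directly.']
# positions: [relative start character index, relative end character index,
#             type ('single', 'relation' or 'overlapping_relation'),
#             is_highest_activation, explanation text from word_match.explain()]
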
-class TopicMatchDictionaryOrderer:
-    # extracted into its own class to facilitate use by MultiprocessingManager
-
- def order(self, topic_match_dicts, number_of_results, tied_result_quotient):
-
- topic_match_dicts = sorted(
- topic_match_dicts, key=lambda dict: (
- 0-dict['score'], 0-len(dict['text'].split()), dict['document_label'],
- dict['word_infos'][0][0]))
- topic_match_dicts = topic_match_dicts[0:number_of_results]
- topic_match_counter = 0
- while topic_match_counter < len(topic_match_dicts):
- topic_match_dicts[topic_match_counter]['rank'] = str(topic_match_counter + 1)
- following_topic_match_counter = topic_match_counter + 1
- while following_topic_match_counter < len(topic_match_dicts) and \
- topic_match_dicts[following_topic_match_counter]['score'] / topic_match_dicts[
- topic_match_counter]['score'] > tied_result_quotient:
- working_rank = ''.join((str(topic_match_counter + 1), '='))
- topic_match_dicts[topic_match_counter]['rank'] = working_rank
- topic_match_dicts[following_topic_match_counter]['rank'] = working_rank
- following_topic_match_counter += 1
- topic_match_counter = following_topic_match_counter
- return topic_match_dicts
-
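# An illustrative sketch of how the tied-rank loop above behaves with hypothetical scores and
# tied_result_quotient=0.9: the first two results are tied ('1=' and '1='), while the third
# starts a new rank group ('3').
scores = [10.0, 9.5, 7.0]
tied_result_quotient = 0.9
assert scores[1] / scores[0] > tied_result_quotient      # 0.95 > 0.9 -> ranks '1=' and '1='
assert not scores[2] / scores[0] > tied_result_quotient  # 0.7 is not above 0.9 -> rank '3'
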
-
-class SupervisedTopicTrainingUtils:
-
- def __init__(self, overlap_memory_size, oneshot):
- self.overlap_memory_size = overlap_memory_size
- self.oneshot = oneshot
-
- def get_labels_to_classification_frequencies_dict(
- self, *, matches, labels_to_classifications_dict):
- """ Builds a dictionary from search phrase (phraselet) labels to classification
- frequencies. Depending on the training phase, which is signalled by the parameters, the
- dictionary tracks either raw frequencies for each search phrase label or points to a
- second dictionary from classification labels to frequencies.
-
- Parameters:
-
- matches -- the structural matches from which to build the dictionary
- labels_to_classifications_dict -- a dictionary from document labels to document
- classifications, or 'None' if the target dictionary should contain raw frequencies.
- """
- def increment(search_phrase_label, document_label):
- if labels_to_classifications_dict is not None:
- if search_phrase_label not in labels_to_frequencies_dict:
- classification_frequency_dict = {}
- labels_to_frequencies_dict[search_phrase_label] = classification_frequency_dict
- else:
- classification_frequency_dict = labels_to_frequencies_dict[search_phrase_label]
- classification = labels_to_classifications_dict[document_label]
- if classification in classification_frequency_dict:
- classification_frequency_dict[classification] += 1
- else:
- classification_frequency_dict[classification] = 1
- else:
- if search_phrase_label not in labels_to_frequencies_dict:
- labels_to_frequencies_dict[search_phrase_label] = 1
- else:
- labels_to_frequencies_dict[search_phrase_label] += 1
-
- def relation_match_involves_whole_word_containing_subwords(match):
- # Where there are subwords, we suppress relation matches with the
- # entire word. The same rule is not applied to single-word matches because
- # it still makes sense to track words with more than three subwords.
- return len(match.word_matches) > 1 and \
- len(
- [
- word_match for word_match in match.word_matches if
- len(word_match.document_token._.holmes.subwords) > 0 and
- word_match.document_subword is None]
- ) > 0
-
- labels_to_frequencies_dict = {}
- matches = [
- match for match in matches if not
- relation_match_involves_whole_word_containing_subwords(match)]
- matches = sorted(
- matches, key=lambda match: (
- match.document_label, match.index_within_document,
- match.get_subword_index_for_sorting()))
- for index, match in enumerate(matches):
- if self.oneshot:
- if ('this_document_label' not in locals()) or \
- this_document_label != match.document_label:
- this_document_label = match.document_label
- search_phrases_added_for_this_document = set()
- if match.search_phrase_label not in search_phrases_added_for_this_document:
- increment(match.search_phrase_label, match.document_label)
- search_phrases_added_for_this_document.add(match.search_phrase_label)
- else:
- increment(match.search_phrase_label, match.document_label)
- if not match.from_single_word_phraselet:
- previous_match_index = index
- number_of_analyzed_matches_counter = 0
- while previous_match_index > 0 and number_of_analyzed_matches_counter \
- <= self.overlap_memory_size:
- previous_match_index -= 1
- previous_match = matches[previous_match_index]
- if previous_match.document_label != match.document_label:
- break
- if previous_match.from_single_word_phraselet:
- continue
- if previous_match.search_phrase_label == match.search_phrase_label:
- continue # otherwise coreference resolution leads to phrases being
- # combined with themselves
- number_of_analyzed_matches_counter += 1
- previous_word_match_doc_indexes = [
- word_match.get_document_index() for word_match in
- previous_match.word_matches]
- for word_match in match.word_matches:
- if word_match.get_document_index() in previous_word_match_doc_indexes:
- # the same word is involved in both matches, so combine them
- # into a new label
- label_parts = sorted((
- previous_match.search_phrase_label, match.search_phrase_label))
- combined_label = '/'.join((label_parts[0], label_parts[1]))
- if self.oneshot:
- if combined_label not in search_phrases_added_for_this_document:
- increment(combined_label, match.document_label)
- search_phrases_added_for_this_document.add(combined_label)
- else:
- increment(combined_label, match.document_label)
- return labels_to_frequencies_dict
-
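# An illustrative sketch of the combined-label mechanism above: when two relation phraselet
# matches share a document word, their labels are merged into one alphabetically sorted
# feature. The label strings below are hypothetical.
label_parts = sorted(('predicate-actor: sell-company', 'predicate-patient: sell-product'))
combined_label = '/'.join((label_parts[0], label_parts[1]))
# combined_label == 'predicate-actor: sell-company/predicate-patient: sell-product'
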
- def record_matches(
- self, *, phraselet_labels_to_search_phrases, structural_matcher,
- sorted_label_dict, doc_label, doc, matrix, row_index, verbose):
- """ Matches a document against the currently stored phraselets and records the matches
- in a matrix.
-
- Parameters:
-
- phraselet_labels_to_search_phrases -- a dictionary from search phrase (phraselet)
- labels to search phrase objects.
- structural_matcher -- the structural matcher to use for comparisons.
- sorted_label_dict -- a dictionary from search phrase (phraselet) labels to their own
- alphabetic sorting indexes.
- doc_label -- the document label, or 'None' if there is none.
- doc -- the document to be matched.
- matrix -- the matrix within which to record the matches.
- row_index -- the row number within the matrix corresponding to the document.
- verbose -- if 'True', matching information is outputted to the console.
- """
- indexed_document = structural_matcher.index_document(doc)
- indexed_documents = {doc_label:indexed_document}
- found = False
- for label, occurrences in \
- self.get_labels_to_classification_frequencies_dict(
- matches=structural_matcher.match(
- indexed_documents=indexed_documents,
- search_phrases=phraselet_labels_to_search_phrases.values(),
- output_document_matching_message_to_console=verbose,
- match_depending_on_single_words=None,
- compare_embeddings_on_root_words=False,
- compare_embeddings_on_non_root_words=True,
- document_labels_to_indexes_for_reverse_matching_sets=None,
- document_labels_to_indexes_for_embedding_reverse_matching_sets=None),
- labels_to_classifications_dict=None
- ).items():
- if self.oneshot:
- occurrences = 1
- if label in sorted_label_dict: # may not be the case for compound labels
- label_index = sorted_label_dict[label]
- matrix[row_index, label_index] = occurrences
- found = True
- return found
-
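# An illustrative sketch of the matrix layout filled by record_matches(): one row per
# document, one column per retained phraselet label, with the cell holding the occurrence
# count (or simply 1 in oneshot mode). The dimensions and values below are hypothetical.
from scipy.sparse import dok_matrix

matrix = dok_matrix((3, 5))  # 3 training documents, 5 phraselet labels
matrix[0, 2] = 4             # the phraselet with sorted index 2 occurred 4 times in document 0
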
-class SupervisedTopicTrainingBasis:
- """ Holder object for training documents and their classifications from which one or more
- 'SupervisedTopicModelTrainer' objects can be derived. This class is *NOT* threadsafe.
- """
- def __init__(
- self, *, structural_matcher, classification_ontology, overlap_memory_size,
- oneshot, match_all_words, verbose):
- """ Parameters:
-
- structural_matcher -- the structural matcher to use.
- classification_ontology -- an Ontology object incorporating relationships between
- classification labels.
- overlap_memory_size -- how many non-word phraselet matches to the left should be
- checked for words in common with a current match.
- oneshot -- whether the same word or relationship matched multiple times should be
- counted once only (value 'True') or multiple times (value 'False')
- match_all_words -- whether all single words should be taken into account
- (value 'True') or only single words with noun tags (value 'False')
- verbose -- if 'True', information about training progress is outputted to the console.
- """
- self.semantic_analyzer = structural_matcher.semantic_analyzer
- self.structural_matcher = structural_matcher
- self.classification_ontology = classification_ontology
- self._utils = SupervisedTopicTrainingUtils(overlap_memory_size, oneshot)
- self._match_all_words = match_all_words
- self.verbose = verbose
-
- self.training_documents = {}
- self.training_documents_labels_to_classifications_dict = {}
- self.additional_classification_labels = set()
- self.classification_implication_dict = {}
- self.labels_to_classification_frequencies = None
- self.phraselet_labels_to_phraselet_infos = {}
- self.classifications = None
-
- def parse_and_register_training_document(self, text, classification, label=None):
- """ Parses and registers a document to use for training.
-
- Parameters:
-
- text -- the document text
- classification -- the classification label
- label -- a label with which to identify the document in verbose training output,
- or 'None' if a random label should be assigned.
- """
- self.register_training_document(self.semantic_analyzer.parse(text), classification, label)
-
- def register_training_document(self, doc, classification, label):
- """ Registers a pre-parsed document to use for training.
-
- Parameters:
-
- doc -- the document
- classification -- the classification label
- label -- a label with which to identify the document in verbose training output,
- or 'None' if a random label should be assigned.
- """
- if self.labels_to_classification_frequencies is not None:
- raise RuntimeError(
- "register_training_document() may not be called once prepare() has been called")
- if label is None:
- label = str(uuid.uuid4())
- if label in self.training_documents:
- raise DuplicateDocumentError(label)
- if self.verbose:
- print('Registering document', label)
- indexed_document = self.structural_matcher.index_document(doc)
- self.training_documents[label] = indexed_document
- self.structural_matcher.add_phraselets_to_dict(
- doc,
- phraselet_labels_to_phraselet_infos=
- self.phraselet_labels_to_phraselet_infos,
- replace_with_hypernym_ancestors=True,
- match_all_words=self._match_all_words,
- ignore_relation_phraselets=False,
- include_reverse_only=False,
- stop_lemmas=self.semantic_analyzer.\
- supervised_document_classification_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=None)
- self.training_documents_labels_to_classifications_dict[label] = classification
-
- def register_additional_classification_label(self, label):
- """ Register an additional classification label which no training document has explicitly
- but that should be assigned to documents whose explicit labels are related to the
- additional classification label via the classification ontology.
- """
- if self.labels_to_classification_frequencies is not None:
- raise RuntimeError(
- "register_additional_classification_label() may not be called once prepare() has "\
- " been called")
- if self.classification_ontology is not None and \
- self.classification_ontology.contains(label):
- self.additional_classification_labels.add(label)
-
- def prepare(self):
- """ Matches the phraselets derived from the training documents against the training
- documents to generate frequencies that also include combined labels, and examines the
- explicit classification labels, the additional classification labels and the
- classification ontology to derive classification implications.
-
- Once this method has been called, the instance no longer accepts new training documents
- or additional classification labels.
- """
- if self.labels_to_classification_frequencies is not None:
- raise RuntimeError(
- "prepare() may only be called once")
- if self.verbose:
- print('Matching documents against all phraselets')
- search_phrases = self.structural_matcher.create_search_phrases_from_phraselet_infos(
- self.phraselet_labels_to_phraselet_infos.values()).values()
- self.labels_to_classification_frequencies = self._utils.\
- get_labels_to_classification_frequencies_dict(
- matches=self.structural_matcher.match(
- indexed_documents=self.training_documents,
- search_phrases=search_phrases,
- output_document_matching_message_to_console=self.verbose,
- match_depending_on_single_words=None,
- compare_embeddings_on_root_words=False,
- compare_embeddings_on_non_root_words=True,
- document_labels_to_indexes_for_reverse_matching_sets=None,
- document_labels_to_indexes_for_embedding_reverse_matching_sets=None),
- labels_to_classifications_dict=
- self.training_documents_labels_to_classifications_dict)
- self.classifications = sorted(set(
- self.training_documents_labels_to_classifications_dict.values()
- ).union(self.additional_classification_labels))
- if len(self.classifications) < 2:
- raise FewerThanTwoClassificationsError(len(self.classifications))
- if self.classification_ontology is not None:
- for parent in self.classifications:
- for child in self.classifications:
- if self.classification_ontology.matches(parent, child):
- if child in self.classification_implication_dict.keys():
- self.classification_implication_dict[child].append(parent)
- else:
- self.classification_implication_dict[child] = [parent]
-
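# An illustrative sketch of the implication dictionary built in prepare(): a document
# explicitly labelled with a child classification is also treated as a positive example of
# every implied parent. The classification labels below are hypothetical.
classification_implication_dict = {'hockey': ['sport'], 'football': ['sport']}
# A training document labelled 'hockey' then switches on the output neurons for both 'hockey'
# and 'sport' in _record_classifications_for_training() further down.
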
- def train(
- self, *, minimum_occurrences=4, cv_threshold=1.0, mlp_activation='relu',
- mlp_solver='adam', mlp_learning_rate='constant', mlp_learning_rate_init=0.001,
- mlp_max_iter=200, mlp_shuffle=True, mlp_random_state=42, overlap_memory_size=10,
- hidden_layer_sizes=None):
- """ Trains a model based on the prepared state.
-
- Parameters:
-
- minimum_occurrences -- the minimum number of times a word or relationship has to
- occur in the context of at least one single classification for the phraselet
- to be accepted into the final model.
-            cv_threshold -- the minimum coefficient of variation with which a word or
-                relationship has to occur across the explicit classification labels for the
-                phraselet to be accepted into the final model.
- mlp_* -- see https://scikit-learn.org/stable/modules/generated/
- sklearn.neural_network.MLPClassifier.html.
- overlap_memory_size -- No longer has any effect - the value defined in __init__()
- is used instead. Retained for backwards compatibility.
- hidden_layer_sizes -- a tuple containing the number of neurons in each hidden layer, or
- 'None' if the topology should be determined automatically.
- """
-
- if self.labels_to_classification_frequencies is None:
- raise RuntimeError("train() may only be called after prepare() has been called")
- return SupervisedTopicModelTrainer(
- training_basis=self,
- semantic_analyzer=self.semantic_analyzer,
- structural_matcher=self.structural_matcher,
- labels_to_classification_frequencies=self.labels_to_classification_frequencies,
- phraselet_infos=self.phraselet_labels_to_phraselet_infos.values(),
- minimum_occurrences=minimum_occurrences,
- cv_threshold=cv_threshold,
- mlp_activation=mlp_activation,
- mlp_solver=mlp_solver,
- mlp_learning_rate=mlp_learning_rate,
- mlp_learning_rate_init=mlp_learning_rate_init,
- mlp_max_iter=mlp_max_iter,
- mlp_shuffle=mlp_shuffle,
- mlp_random_state=mlp_random_state,
- hidden_layer_sizes=hidden_layer_sizes,
- utils=self._utils
- )
-
-class SupervisedTopicModelTrainer:
- """ Worker object used to train and generate models. This class is *NOT* threadsafe."""
-
- def __init__(
- self, *, training_basis, semantic_analyzer, structural_matcher,
- labels_to_classification_frequencies, phraselet_infos, minimum_occurrences,
- cv_threshold, mlp_activation, mlp_solver, mlp_learning_rate, mlp_learning_rate_init,
- mlp_max_iter, mlp_shuffle, mlp_random_state, hidden_layer_sizes, utils):
-
- self._utils = utils
- self._semantic_analyzer = semantic_analyzer
- self._structural_matcher = structural_matcher
- self._training_basis = training_basis
- self._minimum_occurrences = minimum_occurrences
- self._cv_threshold = cv_threshold
- self._labels_to_classification_frequencies, self._phraselet_infos = self._filter(
- labels_to_classification_frequencies, phraselet_infos)
-
- if len(self._phraselet_infos) == 0:
- raise NoPhraseletsAfterFilteringError(
- ''.join((
- 'minimum_occurrences: ', str(minimum_occurrences), '; cv_threshold: ',
- str(cv_threshold)))
- )
-
- phraselet_labels_to_search_phrases = \
- self._structural_matcher.create_search_phrases_from_phraselet_infos(
- self._phraselet_infos)
- self._sorted_label_dict = {}
- for index, label in enumerate(sorted(self._labels_to_classification_frequencies.keys())):
- self._sorted_label_dict[label] = index
- self._input_matrix = dok_matrix((
- len(self._training_basis.training_documents), len(self._sorted_label_dict)))
- self._output_matrix = dok_matrix((
- len(self._training_basis.training_documents),
- len(self._training_basis.classifications)))
-
- if self._training_basis.verbose:
- print('Matching documents against filtered phraselets')
- for index, document_label in enumerate(
- sorted(self._training_basis.training_documents.keys())):
- self._utils.record_matches(
- structural_matcher=self._structural_matcher,
- phraselet_labels_to_search_phrases=phraselet_labels_to_search_phrases,
- sorted_label_dict=self._sorted_label_dict,
- doc_label=document_label,
- doc=self._training_basis.training_documents[document_label].doc,
- matrix=self._input_matrix,
- row_index=index,
- verbose=self._training_basis.verbose)
- self._record_classifications_for_training(document_label, index)
- self._hidden_layer_sizes = hidden_layer_sizes
- if self._hidden_layer_sizes is None:
- start = len(self._sorted_label_dict)
- step = (len(self._training_basis.classifications) - len(self._sorted_label_dict)) / 3
- self._hidden_layer_sizes = (start, int(start+step), int(start+(2*step)))
- if self._training_basis.verbose:
- print('Hidden layer sizes:', self._hidden_layer_sizes)
- self._mlp = MLPClassifier(
- activation=mlp_activation,
- solver=mlp_solver,
- hidden_layer_sizes=self._hidden_layer_sizes,
- learning_rate=mlp_learning_rate,
- learning_rate_init=mlp_learning_rate_init,
- max_iter=mlp_max_iter,
- shuffle=mlp_shuffle,
- verbose=self._training_basis.verbose,
- random_state=mlp_random_state)
- self._mlp.fit(self._input_matrix, self._output_matrix)
- if self._training_basis.verbose and self._mlp.n_iter_ < mlp_max_iter:
- print('MLP neural network converged after', self._mlp.n_iter_, 'iterations.')
-
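# An illustrative sketch of the automatic topology chosen when hidden_layer_sizes is None:
# three hidden layers interpolated linearly between the input width (number of retained
# phraselet labels) and the output width (number of classifications). The sizes below are
# hypothetical.
start = 200                      # retained phraselet labels
step = (20 - 200) / 3            # 20 classifications -> step == -60.0
hidden_layer_sizes = (start, int(start + step), int(start + 2 * step))  # (200, 140, 80)
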
- def _filter(self, labels_to_classification_frequencies, phraselet_infos):
- """ Filters the phraselets in memory based on minimum_occurrences and cv_threshold. """
-
- accepted = 0
- under_minimum_occurrences = 0
- under_minimum_cv = 0
- new_labels_to_classification_frequencies = {}
- for label, classification_frequencies in labels_to_classification_frequencies.items():
- at_least_minimum = False
- working_classification_frequencies = classification_frequencies.copy()
- for classification in working_classification_frequencies:
- if working_classification_frequencies[classification] >= self._minimum_occurrences:
- at_least_minimum = True
- if not at_least_minimum:
- under_minimum_occurrences += 1
- continue
- frequency_list = list(working_classification_frequencies.values())
- # We only want to take explicit classification labels into account, i.e. ignore the
- # classification ontology.
- number_of_classification_labels = \
- len(set(
- self._training_basis.training_documents_labels_to_classifications_dict.values())
- )
- frequency_list.extend([0] * number_of_classification_labels)
- frequency_list = frequency_list[:number_of_classification_labels]
- if statistics.pstdev(frequency_list) / statistics.mean(frequency_list) >= \
- self._cv_threshold:
- accepted += 1
- new_labels_to_classification_frequencies[label] = classification_frequencies
- else:
- under_minimum_cv += 1
- if self._training_basis.verbose:
- print(
- 'Filtered: accepted', accepted, '; removed minimum occurrences',
- under_minimum_occurrences, '; removed cv threshold',
- under_minimum_cv)
- new_phraselet_infos = [
- phraselet_info for phraselet_info in phraselet_infos if
- phraselet_info.label in new_labels_to_classification_frequencies.keys()]
- return new_labels_to_classification_frequencies, new_phraselet_infos
-
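# An illustrative sketch of the coefficient-of-variation test in _filter(): population
# standard deviation divided by mean over the per-classification frequencies, padded with
# zeros up to the number of explicit classification labels. The frequencies below are
# hypothetical and assume three explicit classifications and cv_threshold=1.0.
import statistics

frequencies = [6, 0, 0]  # the phraselet only ever occurs in documents of one classification
cv = statistics.pstdev(frequencies) / statistics.mean(frequencies)  # approximately 1.414
# cv >= 1.0, so this phraselet is kept; an evenly distributed phraselet such as [2, 2, 2]
# has cv == 0.0 and is filtered out because it carries no classification signal.
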
- def _record_classifications_for_training(self, document_label, index):
- classification = self._training_basis.training_documents_labels_to_classifications_dict[
- document_label]
- classification_index = self._training_basis.classifications.index(classification)
- self._output_matrix[index, classification_index] = 1
- if classification in self._training_basis.classification_implication_dict:
- for implied_classification in \
- self._training_basis.classification_implication_dict[classification]:
- implied_classification_index = self._training_basis.classifications.index(
- implied_classification)
- self._output_matrix[index, implied_classification_index] = 1
-
- def classifier(self):
- """ Returns a supervised topic classifier which contains no explicit references to the
-            training data and which can be serialized.
- """
- self._mlp.verbose = False # we no longer require output once we are using the model
- # to classify new documents
- model = SupervisedTopicClassifierModel(
- semantic_analyzer_model=self._semantic_analyzer.model,
- structural_matcher_ontology=self._structural_matcher.ontology,
- phraselet_infos=self._phraselet_infos,
- mlp=self._mlp,
- sorted_label_dict=self._sorted_label_dict,
- classifications=self._training_basis.classifications,
- overlap_memory_size=self._utils.overlap_memory_size,
- oneshot=self._utils.oneshot,
- analyze_derivational_morphology=
- self._structural_matcher.analyze_derivational_morphology)
- return SupervisedTopicClassifier(
- self._semantic_analyzer, self._structural_matcher, model, self._training_basis.verbose)
-
-class SupervisedTopicClassifierModel:
- """ A serializable classifier model.
-
- Parameters:
-
- semantic_analyzer_model -- a string specifying the spaCy model with which this instance
- was generated and with which it must be used.
- structural_matcher_ontology -- the ontology used for matching documents against this model
- (not the classification ontology!)
- phraselet_infos -- the phraselets used for structural matching
- mlp -- the neural network
- sorted_label_dict -- a dictionary from search phrase (phraselet) labels to their own
- alphabetic sorting indexes.
- classifications -- an ordered list of classification labels corresponding to the
- neural network outputs
- overlap_memory_size -- how many non-word phraselet matches to the left should be
- checked for words in common with a current match.
- oneshot -- whether the same word or relationship matched multiple times should be
- counted once only (value 'True') or multiple times (value 'False')
- analyze_derivational_morphology -- the value of this manager parameter that was in force
- when the model was built. The same value has to be in force when the model is
- deserialized and reused.
- """
-
- def __init__(
- self, semantic_analyzer_model, structural_matcher_ontology,
- phraselet_infos, mlp, sorted_label_dict, classifications, overlap_memory_size,
- oneshot, analyze_derivational_morphology):
- self.semantic_analyzer_model = semantic_analyzer_model
- self.structural_matcher_ontology = structural_matcher_ontology
- self.phraselet_infos = phraselet_infos
- self.mlp = mlp
- self.sorted_label_dict = sorted_label_dict
- self.classifications = classifications
- self.overlap_memory_size = overlap_memory_size
- self.oneshot = oneshot
- self.analyze_derivational_morphology = analyze_derivational_morphology
-
-class SupervisedTopicClassifier:
- """Classifies new documents based on a pre-trained model."""
-
- def __init__(self, semantic_analyzer, structural_matcher, model, verbose):
- self._semantic_analyzer = semantic_analyzer
- self._structural_matcher = structural_matcher
- self._model = model
- self._verbose = verbose
- self._utils = SupervisedTopicTrainingUtils(model.overlap_memory_size, model.oneshot)
- if self._semantic_analyzer.model != model.semantic_analyzer_model:
- raise WrongModelDeserializationError(model.semantic_analyzer_model)
- if hasattr(model, 'analyze_derivational_morphology'): # backwards compatibility
- analyze_derivational_morphology = model.analyze_derivational_morphology
- else:
- analyze_derivational_morphology = False
- if self._structural_matcher.analyze_derivational_morphology != \
- analyze_derivational_morphology:
- print(
- ''.join((
- 'manager: ', str(self._structural_matcher.analyze_derivational_morphology),
- '; model: ', str(analyze_derivational_morphology))))
- raise IncompatibleAnalyzeDerivationalMorphologyDeserializationError(
- ''.join((
- 'manager: ', str(self._structural_matcher.analyze_derivational_morphology),
- '; model: ', str(analyze_derivational_morphology))))
- self._structural_matcher.ontology = model.structural_matcher_ontology
- self._structural_matcher.populate_ontology_reverse_derivational_dict()
- self._phraselet_labels_to_search_phrases = \
- self._structural_matcher.create_search_phrases_from_phraselet_infos(
- model.phraselet_infos)
-
- def parse_and_classify(self, text):
- """ Returns a list containing zero, one or many document classifications. Where more
-            than one classification is returned, the labels are ordered by decreasing
- probability.
-
- Parameter:
-
- text -- the text to parse and classify.
- """
- return self.classify(self._semantic_analyzer.parse(text))
-
- def classify(self, doc):
- """ Returns a list containing zero, one or many document classifications. Where more
-            than one classification is returned, the labels are ordered by decreasing
- probability.
-
- Parameter:
-
- doc -- the pre-parsed document to classify.
- """
-
- if self._model is None:
- raise RuntimeError('No model defined')
- new_document_matrix = dok_matrix((1, len(self._model.sorted_label_dict)))
- if not self._utils.record_matches(
- structural_matcher=self._structural_matcher,
- phraselet_labels_to_search_phrases=self._phraselet_labels_to_search_phrases,
- sorted_label_dict=self._model.sorted_label_dict,
- doc=doc,
- doc_label='',
- matrix=new_document_matrix,
- row_index=0,
- verbose=self._verbose):
- return []
- else:
- classification_indexes = self._model.mlp.predict(new_document_matrix).nonzero()[1]
- if len(classification_indexes) > 1:
- probabilities = self._model.mlp.predict_proba(new_document_matrix)
- classification_indexes = sorted(
- classification_indexes, key=lambda index: 1-probabilities[0, index])
- return list(map(
- lambda index: self._model.classifications[index], classification_indexes))
-
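# An illustrative sketch of the probability ordering in classify() when several output
# neurons fire: the labels are returned in order of decreasing probability. The values below
# are hypothetical and shaped like the output of MLPClassifier.predict_proba().
classification_indexes = [0, 2]
probabilities = [[0.3, 0.1, 0.8]]
ordered = sorted(classification_indexes, key=lambda index: 1 - probabilities[0][index])
# ordered == [2, 0]: the classification at index 2 (probability 0.8) is returned first.
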
- def serialize_model(self):
- return jsonpickle.encode(self._model)
diff --git a/holmes_extractor/lang/__init__.py b/holmes_extractor/lang/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/holmes_extractor/lang/de/__init__.py b/holmes_extractor/lang/de/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/holmes_extractor/lang/de/data/__init__.py b/holmes_extractor/lang/de/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/holmes_extractor/data/derivation_de.csv b/holmes_extractor/lang/de/data/derivation.csv
similarity index 94%
rename from holmes_extractor/data/derivation_de.csv
rename to holmes_extractor/lang/de/data/derivation.csv
index 1e88c1b..bf3a47d 100644
--- a/holmes_extractor/data/derivation_de.csv
+++ b/holmes_extractor/lang/de/data/derivation.csv
@@ -1,433 +1,433 @@
-abbau,abbauen
-abbonement,abbonnieren
-abbruch,abbrechen
-abfahrt,abfahren
-abflug,abfliegen
-abgabe,abgeben
-ablauf,ablaufen
-abnahme,abnehmen
-abreise,abreisen
-absage,absagen
-abschluss,abschließen
-abschrift,abschreiben
-absicht,beabsichtigen
-abstieg,absteigen
-abwehr,abwehren
-adoption,adoptieren
-akzeptanz,akzeptieren
-amputation,amputieren
-analyse,analysieren
-anbau,anbauen
-anfang,anfangen
-angabe,angeben
-angebot,anbieten
-angriff,angreifen
-ankunft,ankommen
-anlage,anlegen
-annahme,annehmen
-anprobe,anprobieren
-anreise,anreisen
-anruf,anrufen
-anschluss,anschließen
-ansporn,anspornen
-anstieg,ansteigen
-anstoß,anstoßen
-anstrich,anstreichen
-antrieb,antreiben
-antwort,antworten
-anzeige,anzeigen
-arbeit,arbeiten
-arrangement,arrangieren
-assimilation,assimilieren
-attacke,attackieren
-ärger,ärgern
-audit,auditieren,auditierung
-aufbau,aufbauen
-aufbruch,aufbrechen
-aufgabe,aufgeben
-aufnahme,aufnehmen
-aufsicht,beaufsichtigen
-aufstieg,aufsteigen
-auftrag,beauftragen
-aufwand,aufwenden
-ausbau,ausbauen
-ausdruck,ausdrücken
-ausfall,ausfallen
-ausgabe,ausgeben
-ausgang,ausgehen
-ausgleich,ausgleichen
-ausleihe,ausleihen
-ausschluss,ausschließen
-aussprache,aussprechen
-ausstieg,aussteigen
-austausch,austauschen
-auswahl,auswählen
-bau,bauen
-bedrängen,bedrängnis
-befehl,befehlen
-beginn,beginnen
-beichte,beichten
-beistand,beistehen
-beitrag,beitragen
-beitritt,beitreten
-bekennen,bekenntnis
-beleg,belegen
-bericht,berichten
-beschluss,beschließen
-beschwerde,beschweren
-besitz,besitzen
-besuch,besuchen
-beten,gebet
-betrieb,betreiben
-betrug,betrügen
-beweis,beweisen
-biss,beißen
-bitte,bitten
-blamage,blamieren
-blick,blicken
-blitz,blitzen
-blockade,blockieren
-blüte,blühen
-boykott,boykottieren
-brand,brennen
-brüllen,gebrüll
-bummel,bummeln
-dank,bedanken
-dank,danken
-dauer,dauern
-debatte,debattieren
-deklaration,deklarieren
-dekoration,dekorieren
-dementi,dementieren
-demonstration,demonstrieren
-demontage,demontieren
-denken,gedanke
-deportation,deportieren
-desertation,desertieren
-desinfektion,desinfizieren
-destillation,destillieren
-diagnose,diagnostizieren
-dienen,dienst
-diskussion,diskutieren
-dokumentation,dokumentieren
-donner,donnern
-dopen,doping
-druck,drucken
-duft,duften
-dusche,duschen
-ehre,ehren
-eile,eilen
-einfall,einfallen
-eingabe,eingeben
-eingriff,eingreifen
-einkauf,einkaufen
-einnahme,einnehmen
-einsatz,einsetzen
-einsehen,einsicht
-einstieg,einsteigen
-einsturz,einstürzen
-einwurf,einwerfen
-einzug,einziehen
-emigration,emigrieren
-empfang,empfangen
-ende,enden
-engagement,engagieren
-entnahme,entnehmen
-entschluss,entschließen
-entwurf,entwerfen
-ereignen,ereignis
-erhalt,erhalten
-erkennen,erkenntnis
-erlass,erlassen
-erlauben,erlaubnis
-erleben,erlebnis
-ernte,ernten
-erschweren,erschwernis
-erwerb,erwerben
-existenz,existieren
-experiment,experimentieren
-explosion,explodieren
-export,exportieren
-extraktion,extrahieren
-fahrt,fahren
-fall,fallen
-fang,fangen
-faszination,faszinieren
-feier,feiern
-festnahme,festnehmen
-flirt,flirten
-flucht,fliehen
-flucht,flüchten
-flug,fliegen
-folge,folgen
-fortschritt,fortschreiten
-frage,fragen
-freigabe,freigeben
-freude,freuen
-frost,frieren
-frustration,frustrieren
-frühstück,frühstücken
-fund,finden
-furcht,fürchten
-fühlen,gefühl
-gabe,geben
-garantie,garantieren
-geruch,riechen
-gesang,singen
-geschmack,schmecken
-glanz,glänzen
-glaube,glauben
-glückwunsch,beglückwünschen
-gratulation,gratulieren
-griff,greifen
-gruß,grüßen
-guss,gießen
-hagel,hageln
-halt,halten
-harmonie,harmonieren
-hass,hassen
-hauch,hauchen
-heirat,heiraten
-herrschen,herrschaft
-hetze,hetzen
-hilfe,helfen
-hinweis,hinweisen
-identifikation,identifizieren
-ignoranz,ignorieren
-illustration,illustrieren
-immigration,immigrieren
-import,importieren
-infektion,infizieren
-information,informieren
-inhalt,beinhalten
-inspiration,inspirieren
-installation,installieren
-integration,integrieren
-interesse,interessieren
-interpretation,interpretieren
-interview,interviewen
-investieren,investition
-irritation,irritieren
-jagd,jagen
-joggen,jogging
-jubel,jubeln
-kampf,kämpfen
-kauf,kaufen
-kennen,kenntnis
-klage,klagen,beklagen
-klang,klingen
-kollision,kollidieren
-kombination,kombinieren
-kommunikation,kommunizieren
-komponieren,komposition
-konfrontation,konfrontieren
-konstruieren,konstruktion
-kontraktion,kontrahieren
-kontrolle,kontrollieren
-konzentration,konzentrieren
-kopie,kopieren
-korrektur,korrigieren
-korrespondenz,korrespondieren
-kritik,kritisieren
-kummer,bekümmern
-kuss,küssen
-langeweile,langweilen
-lauf,laufen
-lehre,lehren
-leihen,verleih,ausleihe
-liebe,lieben
-lob,loben
-lüge,lügen
-managen,management
-mangel,mangeln
-marsch,marschieren
-massage,massieren
-miete,mieten
-mitarbeit,mitarbeiten
-mitfühlen,mitgefühl
-mitschrift,mitschreiben
-montage,montieren
-müde,müdigkeit
-nachfolge,nachfolgen
-nachfrage,nachfragen
-nachlass,nachlassen
-nachweis,nachweisen
-neid,beneiden
-notiz,notieren
-operation,operieren
-opfer,opfern
-patrouille,patrouillieren
-pflege,pflegen
-picknick,picknicken
-plädoyer,plädieren
-politur,polieren
-pose,posieren
-predigt,predigen
-privileg,privilegieren
-probe,proben,probieren
-produktion,produzieren
-protest,protestieren
-protokoll,protokollieren
-provokation,provozieren
-qual,quälen
-quatschen,gequatsche
-rache,rächen
-rat,raten
-raub,rauben
-reaktion,reagieren
-rebellion,rebellieren
-rede,reden
-reduktion,reduzieren
-reform,reformieren
-regen,regnen
-regeneration,regenerieren
-reise,reisen
-reiz,reizen
-reklamation,reklamieren
-reparatur,reparieren
-respekt,respektieren
-restauration,restaurieren
-reue,bereuen
-revision,revidieren
-risiko,riskieren
-riss,reißen
-ritt,reiten
-rotation,rotieren
-ruf,rufen
-ruhe,ruhen
-ruin,ruinieren
-rückgabe,zurückgeben
-rückgriff,zurückgreifen
-rückkehr,zurückkehren
-rücktritt,zurücktreten
-rückzug,zurückziehen
-sabotage,sabotieren
-schau,schauen
-schauder,schaudern
-schein,scheinen
-scherz,scherzen
-schikane,schikanieren
-schimmer,schimmern
-schimpfen,geschimpfe
-schlaf,schlafen
-schlag,schlagen
-schmerz,schmerzen
-schmuggel,schmuggeln
-schnee,schneien
-schrei,schreien
-schrift,schreiben
-schritt,schreiten
-schuss,schießen
-schutz,schützen,beschützen
-schwatz,schwatzen
-schweiß,schwitzen
-schwindel,schwindeln
-schwur,schwüren
-schwung,schwingen
-sehen,sicht
-seufzen,seufzer
-sieg,siegen,besiegen
-sorge,sorgen
-spazieren,spaziergang
-spekulation,spekulieren
-spende,spenden
-spiel,spielen
-spionage,spionieren
-spott,spotten
-sprung,springen
-stagnation,stagnieren
-start,starten
-stau,stauen
-stimulation,stimulieren
-stopp,stoppen
-stoß,stoßen
-streik,streiken
-streit,streiten
-studium,studieren
-sturm,stürmen
-sturz,stürzen
-suche,suchen
-sünde,sündigen
-süß,sußigkeit
-tanz,tanzen
-tat,tun
-taufe,taufen
-tausch,tauschen
-teilnahme,teilnehmen
-telefonat,telefonieren
-test,testen
-training,trainieren
-transport,transportieren
-trauer,trauern
-traum,träumen
-tritt,treten
-triumph,triumphieren
-trost,trösten
-überfall,überfallen
-übergabe,übergeben
-umbau,umbauen
-umgang,umgehen
-umkehr,umkehren
-umstieg,umsteigen
-umtausch,umtauschen
-umzug,umziehen
-unterricht,unterrichten
-unterschrift,unterschreiben
-urteil,urteilen
-variation,variieren
-verbot,verbieten
-verbrauch,verbrauchen
-verbund,verbinden
-verdienen,verdienst
-vergabe,vergeben
-vergleich,vergleichen
-verhör,verhören
-verkauf,verkaufen
-verlauf,verlaufen
-verlust,verlieren
-verrat,verraten
-versand,versenden
-verschleiß,verschleißen
-verschluss,verschließen
-versteck,verstecken
-verstehen,verständnis
-versuch,versuchen
-versäumen,versäumnis
-verzehr,verzehren
-verzicht,verzichten
-voraussage,voraussagen
-vorgabe,vorgeben
-vorhersage,vorhersagen
-vorkommen,vorkommnis
-vorschlag,vorschlagen
-vorschrift,vorschreiben
-vortrag,vortragen
-wachsen,wachstum
-wagen,wagnis
-wahl,wählen
-wandel,wandeln
-wechsel,wechseln
-weggang,weggehen
-wegnahme,wegnehmen
-weiterfahrt,weiterfahren
-weitergabe,weitergeben
-wende,wenden
-wette,wetten
-widerruf,widerrufen
-widerspruch,widersprechen
-widerstand,widerstehen
-wiegen,gewicht
-wille,wollen
-wunsch,wünschen
-wurf,werfen
-wäsche,waschen
-zensur,zensieren
-zitation,zitieren
-zug,ziehen
-zunahme,zunehmen
-zusammenarbeit,zusammenarbeiten
-zusammenbau,zusammenbauen
-zusammenstoß,zusammenstoßen
-zwang,zwingen
-zweifel,bezweifeln
-zweifel,zweifeln
+abbau,abbauen
+abbonement,abbonnieren
+abbruch,abbrechen
+abfahrt,abfahren
+abflug,abfliegen
+abgabe,abgeben
+ablauf,ablaufen
+abnahme,abnehmen
+abreise,abreisen
+absage,absagen
+abschluss,abschließen
+abschrift,abschreiben
+absicht,beabsichtigen
+abstieg,absteigen
+abwehr,abwehren
+adoption,adoptieren,adoptiert
+akzeptanz,akzeptieren
+amputation,amputieren
+analyse,analysieren
+anbau,anbauen
+anfang,anfangen
+angabe,angeben
+angebot,anbieten
+angriff,angreifen
+ankunft,ankommen
+anlage,anlegen
+annahme,annehmen
+anprobe,anprobieren
+anreise,anreisen
+anruf,anrufen
+anschluss,anschließen
+ansporn,anspornen
+anstieg,ansteigen
+anstoß,anstoßen
+anstrich,anstreichen
+antrieb,antreiben
+antwort,antworten
+anzeige,anzeigen
+arbeit,arbeiten
+arrangement,arrangieren
+assimilation,assimilieren
+attacke,attackieren
+ärger,ärgern
+audit,auditieren,auditierung
+aufbau,aufbauen
+aufbruch,aufbrechen
+aufgabe,aufgeben
+aufnahme,aufnehmen
+aufsicht,beaufsichtigen
+aufstieg,aufsteigen
+auftrag,beauftragen
+aufwand,aufwenden
+ausbau,ausbauen
+ausdruck,ausdrücken
+ausfall,ausfallen
+ausgabe,ausgeben
+ausgang,ausgehen
+ausgleich,ausgleichen
+ausleihe,ausleihen
+ausschluss,ausschließen
+aussprache,aussprechen
+ausstieg,aussteigen
+austausch,austauschen
+auswahl,auswählen
+bau,bauen
+bedrängen,bedrängnis
+befehl,befehlen
+beginn,beginnen
+beichte,beichten
+beistand,beistehen
+beitrag,beitragen
+beitritt,beitreten
+bekennen,bekenntnis
+beleg,belegen
+bericht,berichten
+beschluss,beschließen
+beschwerde,beschweren
+besitz,besitzen
+besuch,besuchen
+beten,gebet
+betrieb,betreiben
+betrug,betrügen
+beweis,beweisen
+biss,beißen
+bitte,bitten
+blamage,blamieren
+blick,blicken
+blitz,blitzen
+blockade,blockieren
+blüte,blühen
+boykott,boykottieren
+brand,brennen
+brüllen,gebrüll
+bummel,bummeln
+dank,bedanken
+dank,danken
+dauer,dauern
+debatte,debattieren
+deklaration,deklarieren
+dekoration,dekorieren
+dementi,dementieren
+demonstration,demonstrieren
+demontage,demontieren
+denken,gedanke
+deportation,deportieren
+desertation,desertieren
+desinfektion,desinfizieren
+destillation,destillieren
+diagnose,diagnostizieren
+dienen,dienst
+diskussion,diskutieren
+dokumentation,dokumentieren
+donner,donnern
+dopen,doping
+druck,drucken
+duft,duften
+dusche,duschen
+ehre,ehren
+eile,eilen
+einfall,einfallen
+eingabe,eingeben
+eingriff,eingreifen
+einkauf,einkaufen
+einnahme,einnehmen
+einsatz,einsetzen
+einsehen,einsicht
+einstieg,einsteigen
+einsturz,einstürzen
+einwurf,einwerfen
+einzug,einziehen
+emigration,emigrieren
+empfang,empfangen
+ende,enden
+engagement,engagieren
+entnahme,entnehmen
+entschluss,entschließen
+entwurf,entwerfen
+ereignen,ereignis
+erhalt,erhalten
+erkennen,erkenntnis
+erlass,erlassen
+erlauben,erlaubnis
+erleben,erlebnis
+ernte,ernten
+erschweren,erschwernis
+erwerb,erwerben
+existenz,existieren
+experiment,experimentieren
+explosion,explodieren
+export,exportieren
+extraktion,extrahieren
+fahrt,fahren
+fall,fallen
+fang,fangen
+faszination,faszinieren
+feier,feiern
+festnahme,festnehmen
+flirt,flirten
+flucht,fliehen
+flucht,flüchten
+flug,fliegen
+folge,folgen
+fortschritt,fortschreiten
+frage,fragen
+freigabe,freigeben
+freude,freuen
+frost,frieren
+frustration,frustrieren
+frühstück,frühstücken
+fund,finden
+furcht,fürchten
+fühlen,gefühl
+gabe,geben
+garantie,garantieren
+geruch,riechen
+gesang,singen
+geschmack,schmecken
+glanz,glänzen
+glaube,glauben
+glückwunsch,beglückwünschen
+gratulation,gratulieren
+griff,greifen
+gruß,grüßen
+guss,gießen
+hagel,hageln
+halt,halten
+harmonie,harmonieren
+hass,hassen
+hauch,hauchen
+heirat,heiraten
+herrschen,herrschaft
+hetze,hetzen
+hilfe,helfen
+hinweis,hinweisen
+identifikation,identifizieren
+ignoranz,ignorieren
+illustration,illustrieren
+immigration,immigrieren
+import,importieren
+infektion,infizieren
+information,informieren
+inhalt,beinhalten
+inspiration,inspirieren
+installation,installieren
+integration,integrieren
+interesse,interessieren
+interpretation,interpretieren
+interview,interviewen
+investieren,investition
+irritation,irritieren
+jagd,jagen
+joggen,jogging
+jubel,jubeln
+kampf,kämpfen
+kauf,kaufen
+kennen,kenntnis
+klage,klagen,beklagen
+klang,klingen
+kollision,kollidieren
+kombination,kombinieren
+kommunikation,kommunizieren
+komponieren,komposition
+konfrontation,konfrontieren
+konstruieren,konstruktion
+kontraktion,kontrahieren
+kontrolle,kontrollieren
+konzentration,konzentrieren
+kopie,kopieren
+korrektur,korrigieren
+korrespondenz,korrespondieren
+kritik,kritisieren
+kummer,bekümmern
+kuss,küssen
+langeweile,langweilen
+lauf,laufen
+lehre,lehren
+leihen,verleih,ausleihe
+liebe,lieben
+lob,loben
+lüge,lügen
+managen,management
+mangel,mangeln
+marsch,marschieren
+massage,massieren
+miete,mieten
+mitarbeit,mitarbeiten
+mitfühlen,mitgefühl
+mitschrift,mitschreiben
+montage,montieren
+müde,müdigkeit
+nachfolge,nachfolgen
+nachfrage,nachfragen
+nachlass,nachlassen
+nachweis,nachweisen
+neid,beneiden
+notiz,notieren
+operation,operieren
+opfer,opfern
+patrouille,patrouillieren
+pflege,pflegen
+picknick,picknicken
+plädoyer,plädieren
+politur,polieren
+pose,posieren
+predigt,predigen
+privileg,privilegieren
+probe,proben,probieren
+produktion,produzieren
+protest,protestieren
+protokoll,protokollieren
+provokation,provozieren
+qual,quälen
+quatschen,gequatsche
+rache,rächen
+rat,raten
+raub,rauben
+reaktion,reagieren
+rebellion,rebellieren
+rede,reden
+reduktion,reduzieren
+reform,reformieren
+regen,regnen
+regeneration,regenerieren
+reise,reisen
+reiz,reizen
+reklamation,reklamieren
+reparatur,reparieren
+respekt,respektieren
+restauration,restaurieren
+reue,bereuen
+revision,revidieren
+risiko,riskieren
+riss,reißen
+ritt,reiten
+rotation,rotieren
+ruf,rufen
+ruhe,ruhen
+ruin,ruinieren
+rückgabe,zurückgeben
+rückgriff,zurückgreifen
+rückkehr,zurückkehren
+rücktritt,zurücktreten
+rückzug,zurückziehen
+sabotage,sabotieren
+schau,schauen
+schauder,schaudern
+schein,scheinen
+scherz,scherzen
+schikane,schikanieren
+schimmer,schimmern
+schimpfen,geschimpfe
+schlaf,schlafen
+schlag,schlagen
+schmerz,schmerzen
+schmuggel,schmuggeln
+schnee,schneien
+schrei,schreien
+schrift,schreiben
+schritt,schreiten
+schuss,schießen
+schutz,schützen,beschützen
+schwatz,schwatzen
+schweiß,schwitzen
+schwindel,schwindeln
+schwur,schwören
+schwung,schwingen
+sehen,sicht
+seufzen,seufzer
+sieg,siegen,besiegen
+sorge,sorgen
+spazieren,spaziergang
+spekulation,spekulieren
+spende,spenden
+spiel,spielen
+spionage,spionieren
+spott,spotten
+sprung,springen
+stagnation,stagnieren
+start,starten
+stau,stauen
+stimulation,stimulieren
+stopp,stoppen
+stoß,stoßen
+streik,streiken
+streit,streiten
+studium,studieren
+sturm,stürmen
+sturz,stürzen
+suche,suchen
+sünde,sündigen
+süß,süßigkeit
+tanz,tanzen
+tat,tun
+taufe,taufen
+tausch,tauschen
+teilnahme,teilnehmen
+telefonat,telefonieren
+test,testen
+training,trainieren
+transport,transportieren
+trauer,trauern
+traum,träumen
+tritt,treten
+triumph,triumphieren
+trost,trösten
+überfall,überfallen
+übergabe,übergeben
+umbau,umbauen
+umgang,umgehen
+umkehr,umkehren
+umstieg,umsteigen
+umtausch,umtauschen
+umzug,umziehen
+unterricht,unterrichten
+unterschrift,unterschreiben
+urteil,urteilen
+variation,variieren
+verbot,verbieten
+verbrauch,verbrauchen
+verbund,verbinden
+verdienen,verdienst
+vergabe,vergeben
+vergleich,vergleichen
+verhör,verhören
+verkauf,verkaufen
+verlauf,verlaufen
+verlust,verlieren
+verrat,verraten
+versand,versenden
+verschleiß,verschleißen
+verschluss,verschließen
+versteck,verstecken
+verstehen,verständnis
+versuch,versuchen
+versäumen,versäumnis
+verzehr,verzehren
+verzicht,verzichten
+voraussage,voraussagen
+vorgabe,vorgeben
+vorhersage,vorhersagen
+vorkommen,vorkommnis
+vorschlag,vorschlagen
+vorschrift,vorschreiben
+vortrag,vortragen
+wachsen,wachstum
+wagen,wagnis
+wahl,wählen
+wandel,wandeln
+wechsel,wechseln
+weggang,weggehen
+wegnahme,wegnehmen
+weiterfahrt,weiterfahren
+weitergabe,weitergeben
+wende,wenden
+wette,wetten
+widerruf,widerrufen
+widerspruch,widersprechen
+widerstand,widerstehen
+wiegen,gewicht
+wille,wollen
+wunsch,wünschen
+wurf,werfen
+wäsche,waschen
+zensur,zensieren
+zitation,zitieren
+zug,ziehen
+zunahme,zunehmen
+zusammenarbeit,zusammenarbeiten
+zusammenbau,zusammenbauen
+zusammenstoß,zusammenstoßen
+zwang,zwingen
+zweifel,bezweifeln
+zweifel,zweifeln
diff --git a/holmes_extractor/lang/de/language_specific_rules.py b/holmes_extractor/lang/de/language_specific_rules.py
new file mode 100644
index 0000000..f0dd8a1
--- /dev/null
+++ b/holmes_extractor/lang/de/language_specific_rules.py
@@ -0,0 +1,1155 @@
+from string import punctuation
+from spacy.tokens import Token
+from ...parsing import SemanticAnalyzer, SemanticMatchingHelper, MatchImplication,\
+ PhraseletTemplate, SemanticDependency, Subword
+
+class LanguageSpecificSemanticAnalyzer(SemanticAnalyzer):
+
+ language_name = 'German'
+
+ noun_pos = ('NOUN', 'PROPN', 'ADJ')
+
+ matchable_pos = ('ADJ', 'ADP', 'ADV', 'NOUN', 'NUM', 'PROPN', 'VERB', 'AUX', 'X', 'INTJ')
+
+ predicate_head_pos = ('VERB', 'AUX')
+
+ adjectival_predicate_head_pos = ('AUX')
+
+ adjectival_predicate_subject_pos = ('NOUN', 'PROPN', 'PRON')
+
+ adjectival_predicate_subject_dep = 'sb'
+
+ adjectival_predicate_predicate_pos = 'ADV'
+
+ adjectival_predicate_predicate_dep = 'pd'
+
+ modifier_dep = 'nk'
+
+ spacy_noun_to_preposition_dep = 'mnr'
+
+ spacy_verb_to_preposition_dep = 'mo'
+
+ holmes_noun_to_preposition_dep = 'mnrposs'
+
+ holmes_verb_to_preposition_dep = 'moposs'
+
+ conjunction_deps = ('cj', 'cd', 'punct', 'app')
+
+ interrogative_pronoun_tags = ('PWAT', 'PWAV', 'PWS')
+
+ semantic_dependency_excluded_tags = ('ART')
+
+ generic_pronoun_lemmas = ('jemand', 'etwas')
+
+ or_lemma = 'oder'
+
+ mark_child_dependencies_copied_to_siblings_as_uncertain = False
+
+ maximum_mentions_in_coreference_chain = 3
+
+ maximum_word_distance_in_coreference_chain = 300
+
+ sibling_marker_deps = ('cj', 'app')
+
+ entity_labels_to_corresponding_lexemes = {
+ 'PER': 'person',
+ 'LOC': 'ort',
+ 'ORG': 'organisation',
+ 'MISC': 'sache'
+ }
+
+ whose_lemma = 'wessen'
+
+ # Only words at least this long are examined for possible subwords
+ minimum_length_for_subword_search = 10
+
+    # Part-of-speech values examined for subwords (in addition to the part-of-speech tags below)
+ pos_for_subword_search = ('X')
+
+ # Part-of-speech tags examined for subwords
+    # Verbs are not examined because the separable prefixes that would typically appear as
+    # subwords are too short to be detected.
+ tag_for_subword_search = ('NE', 'NNE', 'NN', 'TRUNC', 'ADJA', 'ADJD', '$(')
+
+ # Absolute minimum length of a subword.
+ minimum_subword_length = 3
+
+ # Subwords at least this long are preferred.
+ minimum_normal_subword_length = 6
+
+ # Subwords longer than this are likely not to be atomic and solutions that split them up are
+ # preferred
+ maximum_realistic_subword_length = 12
+
+ # Scoring bonus where a Fugen-S follows a whitelisted ending
+ # (one where a Fugen-S is normally expected)
+ fugen_s_after_whitelisted_ending_bonus = 5
+
+ # Scoring bonus where a Fugen-S follows an ending where it is neither expected nor disallowed
+ fugen_s_after_non_whitelisted_non_blacklisted_ending_bonus = 3
+
+ # Both words around a Fugen-S have to be at least this long for the scoring bonus to be applied
+ fugen_s_whitelist_bonus_surrounding_word_minimum_length = 5
+
+ # Endings after which a Fugen-S is normally expected
+ fugen_s_ending_whitelist = (
+ 'tum', 'ling', 'ion', 'tät', 'heit', 'keit', 'schaft', 'sicht', 'ung')
+
+ # Endings after which a Fugen-S is normally disallowed
+ fugen_s_ending_blacklist = (
+ 'a', 'ä', 'e', 'i', 'o', 'ö', 'u', 'ü', 'nt', 'sch', 's', 'ß', 'st', 'tz', 'z')
+
+ # Whitelisted subwords
+ subword_whitelist = (
+ 'haltig')
+
+ # Blacklisted subwords
+ subword_blacklist = (
+ 'igkeit', 'igkeiten', 'digkeit', 'digkeiten', 'schaft', 'schaften',
+ 'keit', 'keiten', 'lichkeit', 'lichkeiten', 'tigten', 'tigung', 'tigungen', 'barkeit',
+ 'barkeiten', 'heit', 'heiten', 'ung', 'ungen', 'aften', 'erung', 'erungen', 'mungen', 'tig')
+
+ # Bigraphs of two consonants that can occur at the start of a subword.
+ subword_start_consonant_bigraph_whitelist = (
+ 'bl', 'br', 'ch', 'cl', 'cr', 'dr', 'fl', 'fr', 'gl', 'gm', 'gn', 'gr', 'kl', 'kn', 'kr',
+ 'kw', 'pf', 'ph', 'pl', 'pn', 'pr', 'ps', 'rh', 'sc', 'sh', 'sk', 'sl', 'sm', 'sp', 'st',
+ 'sw', 'sz', 'th', 'tr', 'vl', 'vr', 'wr', 'zw')
+
+ # Bigraphs of two consonants that can occur at the end of a subword.
+ # Bigraphs where the second consonant is 's' are always allowed.
+ subword_end_consonant_bigraph_whitelist = (
+ 'bb', 'bs', 'bt', 'ch', 'ck', 'ct', 'dd', 'ds', 'dt', 'ff', 'fs', 'ft', 'gd', 'gg', 'gn',
+ 'gs', 'gt', 'hb', 'hd', 'hf', 'hg', 'hk', 'hl', 'hm', 'hn', 'hp', 'hr', 'hs', 'ht', 'ks',
+ 'kt', 'lb', 'lc', 'ld', 'lf', 'lg', 'lk', 'll', 'lm', 'ln', 'lp', 'ls', 'lt', 'lx', 'lz',
+ 'mb', 'md', 'mk', 'mm', 'mp', 'ms', 'mt', 'mx', 'nb', 'nd', 'nf', 'ng', 'nk', 'nn', 'np',
+ 'ns', 'nt', 'nx', 'nz', 'pf', 'ph', 'pp', 'ps', 'pt', 'rb', 'rc', 'rd', 'rf', 'rg', 'rk',
+ 'rl', 'rm', 'rn', 'rp', 'rr', 'rs', 'rt', 'rx', 'rz', 'sk', 'sl', 'sp', 'ss', 'st', 'th',
+ 'ts', 'tt', 'tz', 'xt', 'zt', 'ßt')
+
+ # Letters that can represent vowel sounds
+ vowels = ('a', 'e', 'i', 'o', 'u', 'ä', 'ö', 'ü', 'y')
+
+ # Subwords used in analysis but not recorded on the Holmes dictionary instances. At present
+ # the code only supports these in word-final position; word-initial position would require
+ # a code change.
+ non_recorded_subword_list = ('lein', 'chen')
+
+ # Subword solutions that scored higher than this are regarded as probably wrong and so are
+ # not recorded.
+ maximum_acceptable_subword_score = 8
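+    # Illustrative example (assuming 'information' and 'extraktion' are both in the vectors
+    # vocabulary): splitting 'informationsextraktion' into 'information' + Fugen-S +
+    # 'extraktion' incurs no length penalties and, because 'information' ends in the
+    # whitelisted 'ion', earns the Fugen-S bonus of 5, giving a score of -5, well within
+    # maximum_acceptable_subword_score.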
+
+ def is_oov(self, word):
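+        # A word counts as in-vocabulary if either its lower-case form or its capitalized
+        # form (German nouns are capitalized) is known to the vectors model.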
+ working_word = word.lower()
+ if not self.vectors_nlp.vocab[working_word].is_oov:
+ return False
+ if len(word) == 1:
+ return True
+ working_word = ''.join((working_word[0].upper(), working_word[1:]))
+ return self.vectors_nlp.vocab[working_word].is_oov
+
+ def is_separable_prefix(self, token:Token):
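+        # A separable verb prefix is either labelled 'svp' by the parser or is a childless
+        # adposition attached to the verb as a modifier ('mo').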
+ return token.dep_ == 'svp' or (token.dep_ == 'mo' and token.pos_ == 'ADP' and
+ len(list(token.children)) == 0)
+
+ def add_subwords(self, token, subword_cache):
+ """ Adds any subwords to *token._.holmes*. """
+
+ class PossibleSubword:
+ """ A subword within a possible solution.
+
+ text -- the text
+ char_start_index -- the character start index of the subword within the word.
+ fugen_s_status --
+                '2' if the preceding subword has an ending after which a Fugen-S is
+                normally expected,
+                '1' if the preceding subword has an ending that neither expects nor
+                precludes a Fugen-S,
+                '0' otherwise (including where the preceding ending precludes a Fugen-S).
+ """
+
+ def __init__(self, text, char_start_index, fugen_s_status):
+ self.text = text
+ self.char_start_index = char_start_index
+ self.fugen_s_status = fugen_s_status
+
+ def get_subword(lemma, initial_index, length):
+            # find the shortest subword starting at initial_index that has at least *length*
+            # characters, is in-vocabulary (or whitelisted) and starts and ends plausibly
+ for end_index in range(initial_index + length, len(lemma) + 1):
+ possible_word = lemma[initial_index: end_index]
+ if (not self.is_oov(possible_word) or possible_word in self.subword_whitelist) \
+ and len(possible_word) >= 2 and \
+ (
+ possible_word[0] in self.vowels or possible_word[1] in self.vowels
+ or
+ possible_word[:2] in self.subword_start_consonant_bigraph_whitelist) \
+ and (
+ possible_word[-1] in self.vowels or possible_word[-2] in self.vowels
+ or
+ possible_word[-2:] in self.subword_end_consonant_bigraph_whitelist):
+ return possible_word
+ return None
+
+ def score(possible_solution):
+ # Lower scores are better.
+ number = 0
+ for subword in possible_solution:
+                # subwords shorter than minimum_normal_subword_length:
+                # penalty of 2 per missing character
+ if len(subword.text) < self.minimum_normal_subword_length:
+ number += 2 * (self.minimum_normal_subword_length - len(subword.text))
+                # subwords longer than maximum_realistic_subword_length:
+                # penalty of 2 per extra character
+ elif len(subword.text) > self.maximum_realistic_subword_length:
+ number += 2 * (len(subword.text) - self.maximum_realistic_subword_length)
+ # fugen-s after a whitelist ending
+ if subword.fugen_s_status == 2:
+ number -= self.fugen_s_after_whitelisted_ending_bonus
+ # fugen-s after an ending that is neither whitelist nor blacklist
+ elif subword.fugen_s_status == 1:
+ number -= self.fugen_s_after_non_whitelisted_non_blacklisted_ending_bonus
+ return number
+
+ def scan_recursively_for_subwords(lemma, initial_index=0):
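+            # Recursively split *lemma* from *initial_index* into candidate subwords, allowing
+            # for Fugen-S infixes and hyphens, and return the solution with the lowest
+            # (best) score, or None if no solution can be found.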
+
+ if initial_index == 0: # only need to check on the initial (outermost) call
+ for char in lemma:
+ if not char.isalpha() and char != '-':
+ return None
+ if initial_index + 1 < len(lemma) and lemma[initial_index] == '-':
+ return scan_recursively_for_subwords(lemma, initial_index + 1)
+ lengths = list(range(self.minimum_subword_length, 1 + len(lemma) - initial_index))
+ possible_solutions = []
+ working_subword = None
+ for length in lengths:
+ if working_subword is not None and len(working_subword) >= length:
+ # we are catching up with the length already returned by get_subword
+ continue
+ working_subword = get_subword(lemma, initial_index, length)
+ if working_subword is None or working_subword in self.subword_blacklist or \
+ '-' in working_subword:
+ continue
+ possible_solution = [PossibleSubword(working_subword, initial_index, 0)]
+ if \
+ (
+ initial_index + len(working_subword) == len(lemma)) or (
+ initial_index + len(working_subword)
+ + 1 == len(lemma) and lemma[-1] == '-') \
+ or (
+ initial_index + len(working_subword) + 2 == len(lemma) and lemma[-2:] ==
+ 's-'):
+ # we have reached the end of the word
+ possible_solutions.append(possible_solution)
+ break
+ following_subwords = scan_recursively_for_subwords(
+ lemma, initial_index + len(working_subword))
+ if following_subwords is not None:
+ possible_solution.extend(following_subwords)
+ possible_solutions.append(possible_solution)
+ if initial_index + len(working_subword) + 2 < len(lemma) and lemma[
+ initial_index + len(working_subword): initial_index +
+ len(working_subword) + 2] == 's-':
+ following_initial_index = initial_index + len(working_subword) + 2
+ elif initial_index + len(working_subword) + 1 < len(lemma) and \
+ lemma[initial_index + len(working_subword)] == 's':
+ following_initial_index = initial_index + len(working_subword) + 1
+ else:
+ continue
+ possible_solution = [PossibleSubword(working_subword, initial_index, 0)]
+ following_subwords = scan_recursively_for_subwords(lemma, following_initial_index)
+ if following_subwords is not None:
+ for ending in self.fugen_s_ending_whitelist:
+ if working_subword.endswith(ending):
+ following_subwords[0].fugen_s_status = 2
+ if following_subwords[0].fugen_s_status == 0 and len(working_subword) >= \
+ self.fugen_s_whitelist_bonus_surrounding_word_minimum_length and \
+ len(following_subwords[0].text) >= \
+ self.fugen_s_whitelist_bonus_surrounding_word_minimum_length:
+ # if the first does not have a whitelist ending and one of the words is
+ # short, do not give the score bonus
+ following_subwords[0].fugen_s_status = 1
+ for ending in self.fugen_s_ending_blacklist:
+ # blacklist ending: take the bonus away again
+ if working_subword.endswith(ending):
+ following_subwords[0].fugen_s_status = 0
+ possible_solution.extend(following_subwords)
+ possible_solutions.append(possible_solution)
+ if len(possible_solutions) > 0:
+ possible_solutions = sorted(
+ possible_solutions, key=score)
+ return possible_solutions[0]
+
+ def get_lemmatization_doc(possible_subwords, pos):
+ # We retrieve the lemma for each subword by calling spaCy. To reduce the
+ # overhead, we concatenate the subwords in the form:
+ # Subword1. Subword2. Subword3
+ entry_words = []
+ for counter, _ in enumerate(possible_subwords):
+ if counter + 1 == len(possible_subwords) and pos == 'ADJ':
+ entry_words.append(possible_subwords[counter].text)
+ else:
+ entry_words.append(possible_subwords[counter].text.capitalize())
+ subword_lemmatization_string = ' . '.join(entry_words)
+ return self.spacy_parse(subword_lemmatization_string)
+
+ if not (token.tag_ in self.tag_for_subword_search or token.pos_ in
+ self.pos_for_subword_search) or (
+ len(token._.holmes.lemma) < self.minimum_length_for_subword_search and
+ '-' not in token._.holmes.lemma) or token._.holmes.lemma in punctuation:
+ return
+ if token.text in subword_cache:
+ cached_subwords = subword_cache[token.text]
+ for cached_subword in cached_subwords:
+ token._.holmes.subwords.append(Subword(
+ token.i, cached_subword.index, cached_subword.text, cached_subword.lemma,
+ cached_subword.derived_lemma, self.get_vector(cached_subword.lemma),
+ cached_subword.char_start_index, cached_subword.dependent_index,
+ cached_subword.dependency_label, cached_subword.governor_index,
+ cached_subword.governing_dependency_label))
+ else:
+ working_subwords = []
+ possible_subwords = scan_recursively_for_subwords(token._.holmes.lemma)
+ if possible_subwords is None or score(possible_subwords) > \
+ self.maximum_acceptable_subword_score:
+ return
+ if len(possible_subwords) == 1 and token._.holmes.lemma.isalpha():
+                # a lemma that fails isalpha() contains a hyphen; hyphenated words are not cached
+ subword_cache[token.text] = []
+ else:
+ index = 0
+ if token._.holmes.lemma[0] == '-':
+ # with truncated nouns, the righthand siblings may actually occur to the left
+ # of the head noun
+ head_sibling = token.doc[token._.holmes.token_or_lefthand_sibling_index]
+ if len(head_sibling._.holmes.righthand_siblings) > 0:
+ indexes = token._.holmes.get_sibling_indexes(token.doc)
+ first_sibling = token.doc[indexes[0]]
+ first_sibling_possible_subwords = \
+ scan_recursively_for_subwords(first_sibling._.holmes.lemma)
+ if first_sibling_possible_subwords is not None:
+ first_sibling_lemmatization_doc = get_lemmatization_doc(
+ first_sibling_possible_subwords, token.pos_)
+ final_subword_counter = len(first_sibling_possible_subwords) - 1
+ if final_subword_counter > 0 and \
+ first_sibling_possible_subwords[
+ final_subword_counter].text \
+ in self.non_recorded_subword_list:
+ final_subword_counter -= 1
+ for counter in range(final_subword_counter):
+ first_sibling_possible_subword = \
+ first_sibling_possible_subwords[counter]
+ if first_sibling_possible_subword.text in \
+ self.non_recorded_subword_list:
+ continue
+ text = first_sibling.text[
+ first_sibling_possible_subword.char_start_index:
+ first_sibling_possible_subword.char_start_index +
+ len(first_sibling_possible_subword.text)]
+ lemma = first_sibling_lemmatization_doc[counter*2].lemma_.lower()
+ derived_lemma = self.derived_holmes_lemma(None, lemma)
+ working_subwords.append(Subword(
+ first_sibling.i, index, text, lemma, derived_lemma,
+ self.get_vector(lemma),
+ first_sibling_possible_subword.char_start_index,
+ None, None, None, None))
+ index += 1
+ lemmatization_doc = get_lemmatization_doc(possible_subwords, token.pos_)
+ for counter, possible_subword in enumerate(possible_subwords):
+ possible_subword = possible_subwords[counter]
+ if possible_subword.text in self.non_recorded_subword_list:
+ continue
+ text = token.text[
+ possible_subword.char_start_index:
+ possible_subword.char_start_index + len(possible_subword.text)]
+ lemma = lemmatization_doc[counter*2].lemma_.lower()
+ derived_lemma = self.derived_holmes_lemma(None, lemma)
+ working_subwords.append(Subword(
+ token.i, index, text, lemma, derived_lemma, self.get_vector(lemma),
+ possible_subword.char_start_index, None, None, None, None))
+ index += 1
+ if token._.holmes.lemma[-1] == '-':
+ # with truncated nouns, the righthand siblings may actually occur to the left
+ # of the head noun
+ head_sibling = token.doc[token._.holmes.token_or_lefthand_sibling_index]
+ if len(head_sibling._.holmes.righthand_siblings) > 0:
+ indexes = token._.holmes.get_sibling_indexes(token.doc)
+ last_sibling_index = indexes[-1]
+ if token.i != last_sibling_index:
+ last_sibling = token.doc[last_sibling_index]
+ last_sibling_possible_subwords = \
+ scan_recursively_for_subwords(last_sibling._.holmes.lemma)
+ if last_sibling_possible_subwords is not None:
+ last_sibling_lemmatization_doc = get_lemmatization_doc(
+ last_sibling_possible_subwords, token.pos_)
+ for counter in range(1, len(last_sibling_possible_subwords)):
+ last_sibling_possible_subword = \
+ last_sibling_possible_subwords[counter]
+ if last_sibling_possible_subword.text in \
+ self.non_recorded_subword_list:
+ continue
+ text = last_sibling.text[
+ last_sibling_possible_subword.char_start_index:
+ last_sibling_possible_subword.char_start_index +
+ len(last_sibling_possible_subword.text)]
+ lemma = last_sibling_lemmatization_doc[counter*2].lemma_.lower()
+ derived_lemma = self.derived_holmes_lemma(None, lemma)
+ working_subwords.append(Subword(
+ last_sibling.i, index, text, lemma, derived_lemma,
+ self.get_vector(lemma),
+ last_sibling_possible_subword.char_start_index,
+ None, None, None, None))
+ index += 1
+
+ if index > 1: # if only one subword was found, no need to record it on ._.holmes
+ for counter, working_subword in enumerate(working_subwords):
+ if counter > 0:
+ dependency_label = 'intcompound'
+ dependent_index = counter - 1
+ else:
+ dependency_label = None
+ dependent_index = None
+ if counter + 1 < len(working_subwords):
+ governing_dependency_label = 'intcompound'
+ governor_index = counter + 1
+ else:
+ governing_dependency_label = None
+ governor_index = None
+ working_subword = working_subwords[counter]
+ token._.holmes.subwords.append(Subword(
+ working_subword.containing_token_index,
+ working_subword.index, working_subword.text, working_subword.lemma,
+ working_subword.derived_lemma, self.get_vector(working_subword.lemma),
+ working_subword.char_start_index,
+ dependent_index, dependency_label, governor_index,
+ governing_dependency_label))
+ if token._.holmes.lemma.isalpha(): # caching only where no hyphenation
+ subword_cache[token.text] = token._.holmes.subwords
+ if len(token._.holmes.subwords) > 1 and 'nicht' in (
+ subword.lemma for subword in token._.holmes.subwords):
+ token._.holmes.is_negated = True
+
+ def set_negation(self, token):
+ """Marks any negation on *token*. A token is negative if it or one of its ancestors
+ has a negation word as a syntactic (not semantic!) child.
+ """
+ if token._.holmes.is_negated is not None:
+ return
+ for child in token.children:
+ if child._.holmes.lemma in ('nicht', 'kein', 'keine', 'nie') or \
+ child._.holmes.lemma.startswith('nirgend'):
+ token._.holmes.is_negated = True
+ return
+ if token.dep_ == 'ROOT':
+ token._.holmes.is_negated = False
+ return
+ self.set_negation(token.head)
+ token._.holmes.is_negated = token.head._.holmes.is_negated
+
+ def correct_auxiliaries_and_passives(self, token):
+ """Wherever auxiliaries and passives are found, derive the semantic information
+ from the syntactic information supplied by spaCy.
+ """
+
+ def correct_auxiliaries_and_passives_recursively(token, processed_auxiliary_indexes):
+ if token.i not in processed_auxiliary_indexes:
+ processed_auxiliary_indexes.append(token.i)
+ if (token.pos_ == 'AUX' or token.tag_.startswith('VM') or
+ token.tag_.startswith('VA')) and len([
+ dependency for dependency in token._.holmes.children if
+ dependency.child_index >= 0 and self.is_separable_prefix(
+ token.doc[dependency.child_index])]) == 0: # 'vorhaben'
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if token.doc[dependency.child_index].pos_ in ('VERB', 'AUX') and
+ token.doc[dependency.child_index].dep_ in ('oc', 'pd')):
+ token._.holmes.is_matchable = False
+ child = token.doc[dependency.child_index]
+ self.move_information_between_tokens(token, child)
+ # VM indicates a modal verb, which has to be marked as uncertain
+ if token.tag_.startswith('VM') or dependency.is_uncertain:
+ for child_dependency in child._.holmes.children:
+ child_dependency.is_uncertain = True
+ # 'er ist froh zu kommen' / 'er ist schwer zu erreichen'
+ # set dependency label to 'arg' because semantic role could be either
+ # subject or object
+ if token._.holmes.lemma == 'sein' and (
+ len([
+ child_dependency for child_dependency in
+ child._.holmes.children if child_dependency.label == 'pm' and
+ child_dependency.child_token(token.doc).tag_ == 'PTKZU']) > 0
+ or child.tag_ == 'VVIZU'):
+ for new_dependency in (
+ new_dependency for new_dependency in
+ child._.holmes.children if new_dependency.label == 'sb'):
+ new_dependency.label = 'arg'
+ new_dependency.is_uncertain = True
+ # passive construction
+ if (token._.holmes.lemma == 'werden' and child.tag_ not in (
+ 'VVINF', 'VAINF', 'VAFIN', 'VAINF')) and len(
+ [c for c in token.children if
+ c.dep_ == 'oc' and c.lemma_ == 'haben']) == 0:
+ for child_or_sib in \
+ child._.holmes.loop_token_and_righthand_siblings(token.doc):
+ #mark syntactic subject as semantic object
+ for grandchild_dependency in [
+ grandchild_dependency for
+ grandchild_dependency in child_or_sib._.holmes.children
+ if grandchild_dependency.label == 'sb']:
+ grandchild_dependency.label = 'oa'
+                                #mark the agent of the 'von'/'durch' phrase as semantic
+                                #subject, removing the preposition from the construction
+                                #and marking it as non-matchable
+ for grandchild_dependency in (
+ gd for gd in
+ child_or_sib._.holmes.children if gd.child_index >= 0):
+ grandchild = token.doc[grandchild_dependency.child_index]
+ if (
+ grandchild_dependency.label == 'sbp' and
+ grandchild._.holmes.lemma in ('von', 'vom')) or (
+ grandchild_dependency.label == 'mo' and
+ grandchild._.holmes.lemma in (
+ 'von', 'vom', 'durch')):
+ grandchild._.holmes.is_matchable = False
+ for great_grandchild_dependency in \
+ grandchild._.holmes.children:
+ if child_or_sib.i != \
+ great_grandchild_dependency.child_index:
+ child_or_sib._.holmes.children.append(
+ SemanticDependency(
+ child_or_sib.i,
+ great_grandchild_dependency.child_index,
+ 'sb', dependency.is_uncertain))
+ child_or_sib._.holmes.remove_dependency_with_child_index(
+ grandchild_dependency.child_index)
+ for syntactic_child in token.children:
+ correct_auxiliaries_and_passives_recursively(
+ syntactic_child, processed_auxiliary_indexes)
+
+ if token.dep_ == 'ROOT':
+ correct_auxiliaries_and_passives_recursively(token, [])
+
+ def handle_relative_constructions(self, token):
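+        # Resolve relative clauses: the token governing the relative pronoun receives a
+        # dependency on the antecedent noun (and its conjoined siblings), while the pronoun
+        # itself is turned into a grammatical token pointing at its direct antecedent.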
+ for dependency in (
+ dependency for dependency in token._.holmes.children if
+ dependency.child_index >= 0 and
+ dependency.child_token(token.doc).tag_ in ('PRELS', 'PRELAT') and
+ dependency.child_token(token.doc).dep_ != 'par'):
+ counter = dependency.child_index
+ while counter > token.sent.start:
+ # find the antecedent
+ counter -= 1
+ working_token = token.doc[counter]
+ if working_token.pos_ in ('NOUN', 'PROPN') and working_token.dep_ not in \
+ self.sibling_marker_deps:
+ working_dependency = None
+ for antecedent in (
+ antecedent for antecedent in
+ working_token._.holmes.loop_token_and_righthand_siblings(token.doc)
+ if antecedent.i != token.i):
+ # add new dependency from the verb to the antecedent
+ working_dependency = SemanticDependency(
+ token.i, antecedent.i, dependency.label, True)
+ token._.holmes.children.append(working_dependency)
+ # the last antecedent before the pronoun is not uncertain, so reclassify it
+ if working_dependency is not None:
+ working_dependency.is_uncertain = False
+ # remove the dependency from the verb to the relative pronoun
+ token._.holmes.remove_dependency_with_child_index(
+ dependency.child_index)
+ # label the relative pronoun as a grammatical token pointing to its
+ # direct antecedent
+ dependency.child_token(token.doc)._.holmes.children = [SemanticDependency(
+ dependency.child_index, 0 - (working_dependency.child_index + 1),
+ None)]
+
+ def holmes_lemma(self, token):
+        """Relabel the lemmas of separable verbs in sentences like 'er steht auf' to incorporate
+        the entire separable verb to facilitate matching. Additionally normalize contracted
+        prepositions (e.g. 'im' -> 'in') and strip inflectional endings that adjective lemmas
+        sometimes retain.
+        """
+ if token.pos_ in ('VERB', 'AUX') and token.tag_ not in ('VAINF', 'VMINF', 'VVINF', 'VVIZU'):
+ for child in token.children:
+ if self.is_separable_prefix(child):
+ child_lemma = child.lemma_.lower()
+ if child_lemma == 'einen':
+ child_lemma = 'ein'
+ return ''.join([child_lemma, token.lemma_.lower()])
+ if token.tag_ == 'APPRART':
+ if token.lemma_.lower() in ('im', 'ins'):
+ return 'in'
+ if token.lemma_.lower() in ('am', 'ans'):
+ return 'an'
+ if token.lemma_.lower() == 'beim':
+ return 'bei'
+ if token.lemma_.lower() == 'vom':
+ return 'von'
+ if token.lemma_.lower() in ('zum', 'zur'):
+ return 'zu'
+ # sometimes adjectives retain their inflectional endings
+ if token.tag_ in ('ADJA', 'ADJD') and len(token.lemma_) > 5:
+ if token.lemma_.lower().endswith('ten'):
+ working_lemma = token.lemma_.lower()[:-2]
+ elif token.lemma_.lower().endswith('tes'):
+ working_lemma = token.lemma_.lower()[:-2]
+ elif token.lemma_.lower().endswith('ter'):
+ working_lemma = token.lemma_.lower()[:-2]
+ elif token.lemma_.lower().endswith('te'):
+ working_lemma = token.lemma_.lower()[:-1]
+ else:
+ working_lemma = token.lemma_.lower()
+ # see if the adjective is a participle
+ participle_test_doc = self.spacy_parse(' '.join(('Jemand hat', working_lemma)))
+ return participle_test_doc[2].lemma_.lower()
+ return token.lemma_.lower()
+
+ _ung_ending_blacklist = ('sprung', 'schwung', 'nibelung')
+
+ def language_specific_derived_holmes_lemma(self, token, lemma):
+ """ token is None where *lemma* belongs to a subword """
+
+ # verbs with 'ieren' -> 'ation'
+ if (token is None or token.pos_ == 'VERB') and len(lemma) > 9 and \
+ lemma.endswith('ieren'):
+ working_lemma = ''.join((lemma[:-5], 'ation'))
+ if not self.is_oov(working_lemma):
+ return working_lemma
+ # nouns with 'ierung' -> 'ation'
+ if (token is None or token.pos_ == 'NOUN') and len(lemma) > 10 and \
+ lemma.endswith('ierung'):
+ working_lemma = ''.join((lemma[:-6], 'ation'))
+ if not self.is_oov(working_lemma):
+ return working_lemma
+ # nominalization with 'ung'
+ if (token is None or token.tag_ == 'NN') and lemma.endswith('ung'):
+ for word in self._ung_ending_blacklist:
+ if lemma.endswith(word):
+ return None
+ if (lemma.endswith('erung') and not lemma.endswith('ierung')) or \
+ lemma.endswith('elung'):
+ return ''.join((lemma[:-3], 'n'))
+ elif lemma.endswith('lung') and len(lemma) >= 5 and \
+ lemma[-5] not in ('a', 'e', 'i', 'o', 'u', 'ä', 'ö', 'ü', 'h'):
+ return ''.join((lemma[:-4], 'eln'))
+ return ''.join((lemma[:-3], 'en'))
+ # nominalization with 'heit', 'keit'
+ if (token is None or token.tag_ == 'NN') and (
+ lemma.endswith('keit') or lemma.endswith('heit')):
+ return lemma[:-4]
+ if (token is None or token.pos_ in ('NOUN', 'PROPN')) and len(lemma) > 6 and \
+ (lemma.endswith('chen') or lemma.endswith('lein')):
+ # len > 6: because e.g. Dach and Loch have lemmas 'dachen' and 'lochen'
+ working_lemma = lemma[-12:-4]
+ # replace umlauts in the last 8 characters of the derived lemma
+ working_lemma = working_lemma.replace('ä', 'a').replace('ö', 'o').replace('ü', 'u')
+ working_lemma = ''.join((lemma[:-12], working_lemma))
+ if not self.is_oov(working_lemma):
+ return working_lemma
+ if lemma[-4] == 'l': # 'lein' where original word ends in 'l'
+ second_working_lemma = ''.join((working_lemma, 'l'))
+                if not self.is_oov(second_working_lemma):
+ return second_working_lemma
+ second_working_lemma = lemma[:-4] # 'Löffelchen'
+ if not self.is_oov(second_working_lemma):
+ return second_working_lemma
+ if lemma[-4] == 'l': # 'Schlüsselein'
+ second_working_lemma = ''.join((second_working_lemma, 'l'))
+ if not self.is_oov(second_working_lemma):
+ return second_working_lemma
+ return working_lemma
+ if (token is None or token.tag_ == 'NN') and lemma.endswith('e') and len(lemma) > 1 and \
+ not lemma[-2] in self.vowels:
+ # for comparability with diminutive forms, e.g. äuglein <-> auge
+ return lemma[:-1]
+ return None
+
+ def perform_language_specific_tasks(self, token):
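+        # German-specific post-processing of the dependency graph: separable verb prefixes,
+        # prepositional phrases ('pobjo'/'pobjb'/'pobjp'), 'zu'-infinitive constructions and
+        # subject/object disambiguation.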
+
+ # Because separable verbs are conflated into a single lemma, remove the dependency
+ # from the verb to the preposition
+ if self.is_separable_prefix(token) and token.head.pos_ in ('VERB', 'AUX') and \
+ token.head.tag_ not in ('VAINF', 'VMINF', 'VVINF', 'VVIZU'):
+ token.head._.holmes.remove_dependency_with_child_index(token.i)
+ token._.holmes.is_matchable = False
+
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label in ('mo', 'mnr', 'pg', 'op')):
+ child = dependency.child_token(token.doc)
+ for child_dependency in (
+ child_dependency for child_dependency in
+ child._.holmes.children if child_dependency.label == 'nk' and
+ token.i != child_dependency.child_index and child.pos_ == 'ADP'):
+ if dependency.label in ('mnr', 'pg', 'op') and \
+ dependency.child_token(token.doc)._.holmes.lemma in ('von', 'vom'):
+ token._.holmes.children.append(SemanticDependency(
+ token.i, child_dependency.child_index, 'pobjo'))
+ # pobjO from English 'of'
+ child._.holmes.is_matchable = False
+ elif dependency.label in ('mnr') and \
+ dependency.child_token(token.doc)._.holmes.lemma in ('durch'):
+ token._.holmes.children.append(SemanticDependency(
+ token.i, child_dependency.child_index, 'pobjb'))
+ # pobjB from English 'by'
+ else:
+ token._.holmes.children.append(SemanticDependency(
+ token.i, child_dependency.child_index, 'pobjp',
+ dependency.is_uncertain or child_dependency.is_uncertain))
+
+        # where a 'moposs' or 'mnrposs' dependency has been added and the preposition is not
+        # 'von' or 'vom', add a corresponding uncertain 'pobjp'
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label in ['moposs', 'mnrposs']):
+ child = dependency.child_token(token.doc)
+ for child_dependency in (
+ child_dependency for child_dependency in
+ child._.holmes.children if child_dependency.label == 'nk' and
+ token.i != child_dependency.child_index and child._.holmes.is_matchable):
+ token._.holmes.children.append(SemanticDependency(
+ token.i, child_dependency.child_index, 'pobjp', True))
+
+ # Loop through the structure around a dependent verb to find the lexical token at which
+ # to add new dependencies, and find out whether it is active or passive so we know
+ # whether to add an 'sb' or an 'oa'.
+ def find_target_tokens_and_dependency_recursively(token, visited=None):
+ if visited is None:
+ visited = []
+ visited.append(token.i)
+ tokens_to_return = []
+ target_dependency = 'sb'
+            # Loop through grammatical tokens. 'dependency.child_index + token.i == -1' would
+            # mean a grammatical token pointing to itself (should never happen!), so such
+            # dependencies are excluded below.
+ if len([
+ dependency for dependency in token._.holmes.children
+ if dependency.child_index < 0 and dependency.child_index + token.i != -1]) > 0:
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.child_index < 0 and dependency.child_index + token.i != -1):
+ # resolve the grammatical token pointer
+ child_token = token.doc[0 - (dependency.child_index + 1)]
+ # passive construction
+ if (token._.holmes.lemma == 'werden' and child_token.tag_ not in
+ ('VVINF', 'VAINF', 'VAFIN', 'VAINF')):
+ target_dependency = 'oa'
+ if child_token.i not in visited:
+ new_tokens, new_target_dependency = \
+ find_target_tokens_and_dependency_recursively(child_token, visited)
+ tokens_to_return.extend(new_tokens)
+ if new_target_dependency == 'oa':
+ target_dependency = 'oa'
+ else:
+ tokens_to_return.append(token)
+ else:
+ # we have reached the target token
+ tokens_to_return.append(token)
+ return tokens_to_return, target_dependency
+
+ def has_morphs(token, *morphs):
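+            # Return True only if every specified morphological feature is present on *token*.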
+ if len(morphs) == 0:
+ raise ValueError('At least one morph is required.')
+ for morph in morphs:
+ if not morph in token.morph:
+ return False
+ return True
+
+ # 'Der Mann hat xxx, es zu yyy' and similar structures
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label in ('oc', 'oa', 'mo', 're') and
+ token.pos_ in ('VERB', 'AUX') and dependency.child_token(token.doc).pos_ in \
+ ('VERB', 'AUX')):
+ dependencies_to_add = []
+ target_tokens, target_dependency = find_target_tokens_and_dependency_recursively(
+ dependency.child_token(token.doc))
+ # with um ... zu structures the antecedent subject is always the subject of the
+ # dependent clause, unlike with 'zu' structures without the 'um'
+ if len([other_dependency for other_dependency in target_tokens[0]._.holmes.children
+ if other_dependency.child_token(token.doc)._.holmes.lemma == 'um' and
+ other_dependency.child_token(token.doc).tag_ == 'KOUI']) == 0:
+ # er hat ihm vorgeschlagen, etwas zu tun
+ for other_dependency in (
+ other_dependency for other_dependency
+ in token._.holmes.children if other_dependency.label == 'da'):
+ dependencies_to_add.append(other_dependency)
+ if len(dependencies_to_add) == 0:
+ # er hat ihn gezwungen, etwas zu tun
+ # We have to distinguish this type of 'oa' relationship from dependent
+ # clauses and reflexive pronouns ('er entschied sich, ...')
+ for other_dependency in (
+ other_dependency for other_dependency
+ in token._.holmes.children if other_dependency.label == 'oa' and
+ other_dependency.child_token(token.doc).pos_ not in ('VERB', 'AUX') and
+ other_dependency.child_token(token.doc).tag_ != 'PRF'):
+ dependencies_to_add.append(other_dependency)
+ if len(dependencies_to_add) == 0:
+ # We haven't found any object dependencies, so take the subject dependency
+ for other_dependency in (
+ other_dependency for other_dependency
+ in token._.holmes.children if other_dependency.label == 'sb'):
+ dependencies_to_add.append(other_dependency)
+ for target_token in target_tokens:
+ for other_dependency in (
+ other_dependency for other_dependency in
+ dependencies_to_add if target_token.i != other_dependency.child_index and
+ len([dep for dep in target_token._.holmes.children if dep.label ==
+ target_dependency and not dep.is_uncertain]) == 0):
+ # these dependencies are always uncertain
+ target_token._.holmes.children.append(SemanticDependency(
+ target_token.i, other_dependency.child_index, target_dependency, True))
+
+ # 'Der Löwe bat den Hund, die Katze zu jagen' and similar structures
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label == 'oc' and token.pos_ == 'NOUN' and
+ dependency.child_token(token.doc).pos_ in ('VERB', 'AUX')):
+ target_tokens, target_dependency = find_target_tokens_and_dependency_recursively(
+ dependency.child_token(token.doc))
+ for target_token in (target_token for target_token in target_tokens
+ if target_token.i != token.i and len([dep for dep in
+ target_token._.holmes.children if dep.label ==
+ target_dependency and not dep.is_uncertain]) == 0):
+ target_token._.holmes.children.append(SemanticDependency(
+ target_token.i, token.i, target_dependency, True))
+
+ # 'er dachte darüber nach, es zu tun' and similar structures
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label == 'op' and
+ dependency.child_token(token.doc).tag_ == 'PROAV'):
+ child_token = dependency.child_token(token.doc)
+ for child_dependency in (
+ child_dependency for child_dependency in
+ child_token._.holmes.children if child_dependency.label == 're' and
+ child_dependency.child_token(token.doc).pos_ in ('VERB', 'AUX')):
+ target_tokens, target_dependency = find_target_tokens_and_dependency_recursively(
+ child_dependency.child_token(token.doc))
+ for other_dependency in (
+ other_dependency for other_dependency
+ in token._.holmes.children if other_dependency.label == 'sb'):
+ for target_token in (target_token for target_token in target_tokens if
+ len([dep for dep in target_token._.holmes.children if dep.label ==
+ target_dependency and not dep.is_uncertain]) == 0):
+ target_token._.holmes.children.append(SemanticDependency(
+ target_token.i, other_dependency.child_index, target_dependency, True))
+
+ # 'er war froh, etwas zu tun'
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label == 'nk' and token.pos_ in ('NOUN', 'PROPN')
+ and token.dep_ == 'sb' and dependency.child_token(token.doc).pos_ ==
+ self.adjectival_predicate_predicate_pos):
+ child_token = dependency.child_token(token.doc)
+ relevant_dependencies = [child_dependency for child_dependency in
+ child_token._.holmes.children if child_dependency.label in ('oc', 're') and
+ child_dependency.child_token(token.doc).pos_ in ('VERB', 'AUX')]
+ for grandchild_token in (gd.child_token(token.doc) for gd in
+ child_token._.holmes.children if gd.label == 'mo' and
+ gd.child_token(token.doc).tag_ == 'PROAV'):
+ relevant_dependencies.extend([grandchild_dependency for grandchild_dependency in
+ grandchild_token._.holmes.children if
+ grandchild_dependency.label in ('oc', 're') and
+ grandchild_dependency.child_token(token.doc).pos_ in ('VERB', 'AUX')])
+ for relevant_dependency in relevant_dependencies:
+ target_tokens, target_dependency = find_target_tokens_and_dependency_recursively(
+ relevant_dependency.child_token(token.doc))
+ for target_token in (
+ target_token for target_token in target_tokens
+ if target_token.i != dependency.parent_index and len([dep for dep in
+ target_token._.holmes.children if dep.label ==
+ target_dependency and not dep.is_uncertain]) == 0):
+ # these dependencies are always uncertain
+ target_token._.holmes.children.append(SemanticDependency(
+ target_token.i, dependency.parent_index, target_dependency, True))
+
+        # sometimes two verb arguments are interpreted as both subjects or both objects;
+        # if this occurs, reinterpret them
+
+ def get_first_real_subject(dependencies):
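+            # Heuristic: prefer a child that is unambiguously nominative ('der'-marked);
+            # otherwise return the first child that is not accusative, since a 'den'-marked
+            # noun can never be the subject.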
+ potential_first_real_subject = None
+ for working_dep in dependencies:
+ potential_first_real_subject = working_dep.child_token(token.doc)
+ if has_morphs(potential_first_real_subject, 'Case=Nom', 'Gender=Masc',
+ 'Number=Sing'):
+ # if there is a 'der' later on in the dependencies it must be the subject
+ return potential_first_real_subject
+ for working_dep in dependencies:
+ potential_first_real_subject = working_dep.child_token(token.doc)
+ if not has_morphs(potential_first_real_subject, 'Case=Acc'):
+ # a 'den' can never be the subject
+ return potential_first_real_subject
+ return None
+
+ # find first 'sb' dependency for verb
+ dependencies = [
+ dependency for dependency in token._.holmes.children
+ if token.pos_ == 'VERB' and dependency.label == 'sb' and not
+ dependency.is_uncertain and (dependency.child_token(token.doc).i == 0 or
+ token.doc[dependency.child_token(token.doc).i-1].dep_ not in self.conjunction_deps)]
+ if len(dependencies) > 0 and len([
+ object_dependency for object_dependency
+ in dependencies if object_dependency.label == 'oa' and not
+ dependency.is_uncertain]) == 0:
+ dependencies.sort(key=lambda dependency: dependency.child_index)
+ first_real_subject = get_first_real_subject(dependencies)
+ if first_real_subject is not None:
+ for real_subject_index in \
+ first_real_subject._.holmes.get_sibling_indexes(token.doc):
+ for dependency in dependencies:
+ if dependency.child_index == real_subject_index:
+ dependencies.remove(dependency)
+                for dependency in dependencies:
+ dependency.label = 'oa'
+
+ dependencies = [
+ dependency for dependency in token._.holmes.children
+ if token.pos_ == 'VERB' and dependency.label == 'oa' and not
+ dependency.is_uncertain and (dependency.child_token(token.doc).i == 0 or
+ token.doc[dependency.child_token(token.doc).i-1].dep_ not in self.conjunction_deps)]
+ if len(dependencies) > 0 and len([
+ object_dependency for object_dependency
+ in dependencies if object_dependency.label == 'sb' and not
+ dependency.is_uncertain]) == 0:
+ dependencies.sort(key=lambda dependency: dependency.child_index)
+ first_real_subject = get_first_real_subject(dependencies)
+ if first_real_subject is not None:
+ real_subject_indexes = \
+ first_real_subject._.holmes.get_sibling_indexes(token.doc)
+ if len(dependencies) > len(real_subject_indexes):
+ for dependency in (
+ dependency for dependency in dependencies if
+ dependency.child_index in real_subject_indexes):
+ dependency.label = 'sb'
+
+class LanguageSpecificSemanticMatchingHelper(SemanticMatchingHelper):
+
+ noun_pos = ('NOUN', 'PROPN', 'ADJ')
+
+ preposition_deps = ('prep')
+
+ permissible_embedding_pos = ('NOUN', 'PROPN', 'ADJ', 'ADV')
+
+ noun_kernel_dep = ('nk', 'pnc')
+
+ minimum_embedding_match_word_length = 4
+
+ topic_matching_phraselet_stop_lemmas = ('dann', 'danach', 'so', 'ich', 'mein')
+
+ topic_matching_reverse_only_parent_lemmas = (
+ ('sein', 'AUX'), ('werden', 'AUX'), ('haben', 'AUX'), ('sagen', 'VERB'),
+ ('machen', 'VERB'), ('tun', 'VERB'), ('haben', 'VERB'), ('werden', 'VERB'))
+
+ topic_matching_phraselet_stop_tags = ('PPER', 'PDS', 'PRF')
+
+ supervised_document_classification_phraselet_stop_lemmas = ('sein', 'haben')
+
+ preferred_phraselet_pos = ('NOUN', 'PROPN')
+
+ entity_defined_multiword_pos = ('NOUN', 'PROPN')
+
+ entity_defined_multiword_entity_types = ('PER', 'LOC')
+
+ sibling_marker_deps = ('cj', 'app')
+
+ question_answer_blacklist_deps = ('cj', 'cd', 'punct', 'app', 'punct')
+
+ question_answer_final_blacklist_deps = ()
+
+ match_implication_dict = {
+ 'sb': MatchImplication(search_phrase_dependency='sb',
+ document_dependencies=['pobjb', 'ag', 'arg', 'intcompound'],
+ reverse_document_dependencies=['nk']),
+ 'ag': MatchImplication(search_phrase_dependency='ag',
+ document_dependencies=['nk', 'pobjo', 'intcompound'],
+ reverse_document_dependencies=['nk']),
+ 'oa': MatchImplication(search_phrase_dependency='oa',
+ document_dependencies=['pobjo', 'ag', 'arg', 'intcompound', 'og', 'oc'],
+ reverse_document_dependencies=['nk']),
+ 'arg': MatchImplication(search_phrase_dependency='arg',
+ document_dependencies=['sb', 'oa', 'ag', 'intcompound', 'pobjb', 'pobjo'],
+ reverse_document_dependencies=['nk']),
+ 'mo': MatchImplication(search_phrase_dependency='mo',
+ document_dependencies=['moposs', 'mnr', 'mnrposs', 'nk', 'oc', 'pd']),
+ 'mnr': MatchImplication(search_phrase_dependency='mnr',
+ document_dependencies=['mnrposs', 'mo', 'moposs', 'nk', 'oc']),
+ 'nk': MatchImplication(search_phrase_dependency='nk',
+ document_dependencies=['ag', 'pobjo', 'intcompound', 'oc', 'mo'],
+ reverse_document_dependencies=['sb', 'ag', 'oa', 'arg', 'pobjo',
+ 'intcompound']),
+ 'oc': MatchImplication(search_phrase_dependency='oc',
+ document_dependencies=['pobjo', 'ag', 'arg', 'intcompound', 'og', 'oa'],
+ reverse_document_dependencies=['nk']),
+ 'pd': MatchImplication(search_phrase_dependency='pd',
+ document_dependencies=['moposs', 'mo']),
+ 'pobjo': MatchImplication(search_phrase_dependency='pobjo',
+ document_dependencies=['ag', 'intcompound'],
+ reverse_document_dependencies=['nk']),
+ 'pobjp': MatchImplication(search_phrase_dependency='pobjp',
+ document_dependencies=['intcompound']),
+ 'wh_sb': MatchImplication(search_phrase_dependency='wh_sb',
+ document_dependencies=['pobjb', 'ag', 'arg', 'intcompound', 'sb', 'pd'],
+ reverse_document_dependencies=['nk']),
+ 'wh_wildcard': MatchImplication(search_phrase_dependency='wh_wildcard',
+ document_dependencies=['mo', 'oc', 'mnr', 'mnrposs', 'prep']),
+ # intcompound is only used within extensive matching because it is not assigned
+ # in the context of registering search phrases.
+ 'intcompound': MatchImplication(search_phrase_dependency='intcompound',
+ document_dependencies=['sb', 'oa', 'ag', 'og', 'nk', 'mo', 'pobjo', 'pobjp'],
+ reverse_document_dependencies=['nk']),
+ }
+
+ phraselet_templates = [
+ PhraseletTemplate(
+ "verb-nom", "Eine Sache tut", 2, 1,
+ ['sb', 'pobjb'],
+ [
+ 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
+ 'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'FM', 'NE', 'NNE', 'NN'],
+ ['FM', 'NE', 'NNE', 'NN'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "verb-acc", "Jemand tut eine Sache", 1, 3,
+ ['oa', 'pobjo', 'ag', 'og', 'oc'],
+ [
+ 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
+ 'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'FM', 'NE', 'NNE', 'NN'],
+ ['FM', 'NE', 'NNE', 'NN'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "verb-dat", "Jemand gibt einer Sache etwas", 1, 3,
+ ['da'],
+ [
+ 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
+ 'VAFIN', 'VAIMP', 'VAINF', 'VAPP'],
+ ['FM', 'NE', 'NNE', 'NN'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "verb-pd", "Jemand ist eine Sache", 1, 3,
+ ['pd'],
+ [
+ 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
+ 'VAFIN', 'VAIMP', 'VAINF', 'VAPP'],
+ ['FM', 'NE', 'NNE', 'NN'], reverse_only=True, question=False),
+ PhraseletTemplate(
+ "noun-dependent", "Eine große Sache", 2, 1,
+ ['nk'],
+ ['FM', 'NE', 'NNE', 'NN'],
+ ['FM', 'NE', 'NNE', 'NN', 'ADJA', 'ADJD', 'ADV', 'CARD'], reverse_only=False,
+ question=False),
+ PhraseletTemplate(
+ "verb-adverb", "schnell machen", 1, 0,
+ ['mo', 'moposs', 'oc'],
+ [
+ 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
+ 'VAFIN', 'VAIMP', 'VAINF', 'VAPP'],
+ ['ADJA', 'ADJD', 'ADV'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "prepgovernor-noun", "Eine Sache in einer Sache", 1, 4,
+ ['pobjp'],
+ [
+ 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
+ 'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'FM', 'NE', 'NNE', 'NN'],
+ ['FM', 'NE', 'NNE', 'NN'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "prep-noun", "in einer Sache", 0, 2,
+ ['nk'],
+ ['APPO', 'APPR', 'APPRART', 'APZR'],
+ ['FM', 'NE', 'NNE', 'NN'], reverse_only=True, question=False),
+ PhraseletTemplate(
+ "verb-toughmovedargument", "Eine Sache ist schwer zu tun", 5, 1,
+ ['arg'],
+ [
+ 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
+ 'VAFIN', 'VAIMP', 'VAINF', 'VAPP'],
+ ['FM', 'NE', 'NNE', 'NN'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "intcompound", "Eine Sache in einer Sache", 1, 4,
+ ['intcompound'],
+ ['NE', 'NNE', 'NN', 'TRUNC', 'ADJA', 'ADJD', 'TRUNC'],
+ ['NE', 'NNE', 'NN', 'TRUNC', 'ADJA', 'ADJD', 'TRUNC'], reverse_only=False,
+ question=False, assigned_dependency_label='intcompound'),
+ PhraseletTemplate(
+ "head-WHnom", "wer kam?", 1, 0,
+ ['sb'],
+ ['VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP', 'VAFIN',
+ 'VAINF'],
+ ['PWS'], reverse_only=False, question=True,
+ assigned_dependency_label='wh_sb'),
+ PhraseletTemplate(
+ "head-WHacc", "wen sahst du?", 1, 0,
+ ['oa'],
+ ['VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP'],
+ ['PWS'], reverse_only=False, question=True),
+ PhraseletTemplate(
+ "head-WHdat", "wem hilfst du?", 1, 0,
+ ['da'],
+ ['VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP'],
+ ['PWS'], reverse_only=False, question=True),
+ PhraseletTemplate(
+ "head-WHadv", "womit machst du es?", 1, 0,
+ ['mo'],
+ ['VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP'],
+ ['PWAV'], reverse_only=False, question=True,
+ assigned_dependency_label='wh_wildcard'),
+ PhraseletTemplate(
+ "word", "Sache", 0, None,
+ None,
+ ['FM', 'NE', 'NNE', 'NN'],
+ None, reverse_only=False, question=False)]
+
+ def question_word_matches(self, search_phrase_label:str, search_phrase_token:Token,
+ document_token:Token, document_vector, entity_label_to_vector_dict:dict,
+ initial_question_word_embedding_match_threshold:float) -> bool:
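+        # Decide whether *document_token* can plausibly answer the question word at the head
+        # of the search phrase, e.g. 'wer'/'wen'/'wem' require a person or organisation,
+        # while 'wo', 'wohin' and 'wann' require appropriate prepositions or adverbials.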
+ if search_phrase_token._.holmes.lemma in ('wer', 'wen', 'wem'):
+ ent_types = ('PER', 'ORG')
+ return document_token.ent_type_ in ent_types or \
+ self.token_matches_ent_type(document_vector,
+ entity_label_to_vector_dict, ent_types,
+ initial_question_word_embedding_match_threshold) > 0
+ if search_phrase_token._.holmes.lemma in ('was'):
+ return True
+ if search_phrase_token._.holmes.lemma == 'wo':
+ # spaCy model does not handle postpositions
+ return document_token.tag_ in ('APPR', 'APPRART') and document_token._.holmes.lemma in (
+ 'an', 'auf', 'aus', 'bei', 'gegenüber', 'hinter', 'in', 'neben', 'über',
+ 'unter', 'vor', 'zu', 'zwischen') and \
+ len([1 for c in document_token._.holmes.children
+ if 'Case=Dat' in c.child_token(document_token.doc).morph]) > 0
+ if search_phrase_token._.holmes.lemma == 'wohin':
+ return document_token.tag_ in ('APPR', 'APPRART') and document_token._.holmes.lemma in (
+ 'an', 'auf', 'hinter', 'in', 'neben', 'über',
+ 'unter', 'vor', 'zwischen') and \
+ len([1 for c in document_token._.holmes.children
+ if 'Case=Acc' in c.child_token(document_token.doc).morph]) > 0
+ if search_phrase_token._.holmes.lemma == 'wann':
+ if document_token.tag_ in ('APPR', 'APPRART'):
+ return document_token._.holmes.lemma in (
+ 'ab', 'an', 'bis', 'für', 'in', 'nach', 'seit', 'vor', 'um')
+ return document_token.dep_ == 'mo' and document_token.pos_ in (
+ 'NOUN', 'PROPN', 'ADV', 'VERB', 'AUX')
+ if search_phrase_token._.holmes.lemma == 'wie':
+ if document_token.tag_ in ('APPR', 'APPRART'):
+ return document_token._.holmes.lemma in (
+ 'mit', 'mittels')
+ if document_token.dep_ == ('mo') and document_token.tag_ in ('ADJD'):
+ return True
+ return document_token.dep_ in ('mo', 'oc') and len([1 for c in
+ document_token._.holmes.children if
+ c.child_token(document_token.doc)._.holmes.lemma == 'indem']) > 0
+ if search_phrase_token._.holmes.lemma == 'woher':
+ if document_token.tag_ in ('APPR', 'APPRART'):
+ return document_token._.holmes.lemma in ('aus', 'von', 'wegen')
+ return document_token.dep_ in ('mo', 'oc') and len([1 for c in
+ document_token._.holmes.children if
+ c.child_token(document_token.doc)._.holmes.lemma == 'weil']) > 0
+ if search_phrase_token._.holmes.lemma in ('warum', 'wieso', 'weshalb'):
+ if document_token.tag_ in ('APPR', 'APPRART'):
+ return document_token._.holmes.lemma in ('wegen')
+ return document_token.dep_ in ('mo', 'oc', 'cd') and len([1 for c in
+ document_token.children if c._.holmes.lemma in ('weil', 'damit')]) > 0
+ # syntactic not semantic children to handle subject-predicate phrases correctly
+ if search_phrase_token._.holmes.lemma.startswith('wo') and document_token.tag_ == 'APPR' \
+ and search_phrase_token._.holmes.lemma.endswith(document_token._.holmes.lemma):
+ return True
+ # 'wessen' is not correctly recognized by the current _lg model
+ return False
+
+ def normalize_hyphens(self, word):
+ """ Normalizes hyphens in a multiword for ontology matching. Depending on the language,
+ this may involve replacing them with spaces (English) or deleting them entirely
+ (German).
+ """
+ if word.strip().startswith('-') or word.endswith('-'):
+ return word
+ else:
+ return word.replace('-', '')
diff --git a/holmes_extractor/lang/en/__init__.py b/holmes_extractor/lang/en/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/holmes_extractor/lang/en/data/__init__.py b/holmes_extractor/lang/en/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/holmes_extractor/data/derivation_en.csv b/holmes_extractor/lang/en/data/derivation.csv
similarity index 95%
rename from holmes_extractor/data/derivation_en.csv
rename to holmes_extractor/lang/en/data/derivation.csv
index 7ba911e..b319613 100644
--- a/holmes_extractor/data/derivation_en.csv
+++ b/holmes_extractor/lang/en/data/derivation.csv
@@ -1,414 +1,415 @@
-abandon,abandonment
-able,ability
-abort,abortion
-abstract,abstraction
-abuse,abusive
-accept,acceptance
-accident,accidental
-accompany,accompaniment
-accomplish,accomplishment
-accountable,accountability
-accuracy,accurate
-accuse,accusation
-achieve,achievement
-acknowledge,acknowledgement
-act,action,activity
-adapt,adaptation
-add,addition,additional
-adjust,adjustment
-admire,admiration
-adopt,adoption
-advertise,advertize,advertisement
-advice,advise
-affect,effect
-agree,agreement
-alcohol,alcoholic
-allow,allowance
-alter,alteration
-amaze,amazing,amazement
-ambiguity,ambiguous,ambiguousness
-amuse,amusing,amusement
-analyse,analyze,analysis
-anger,angry
-announce,announcement
-anonymity,anonymous
-apology,apologize,apologetic
-appear,appearance
-applaud,applause
-appoint,appointment
-approve,approval
-argue,argument
-arrange,arrangement
-assert,assertion
-assess,assessment
-assure,assurance
-astonish,astonishing,astonishment
-attach,attachment
-attain,attainment
-attract,attraction
-attribute,attribution
-avoid,avoidance
-base,basic,basis
-beast,beastly
-behave,behavior,behaviour,behavioral
-belief,believe
-breath,breathe
-bury,burial
-capable,capability
-cease,cessation
-ceremony,ceremonial
-certain,certainty
-charm,charming
-cite,citation
-clean,cleanliness
-clear,clarity
-clinic,clinical
-collaboration,collaborative
-collect,collection
-combine,combination
-commerce,commercial
-commit,commitment
-compare,comparison
-compete,competition
-compile,compilation
-complete,completion
-compliant,compliance
-compose,composition
-comprehend,comprehension
-conclude,conclusion
-confirm,confirmation
-conform,conformity
-confront,confrontation
-confuse,confusion
-connect,connection
-consequent,consequence
-conservatism,conservative
-consider,consideration
-consistent,consistency
-constrain,constraint
-construct,construction
-consult,consultation
-continue,continuation
-contradict,contradiction
-contribute,contribution
-controversy,controversial
-convene,convention
-convenient,convenience
-cooperate,cooperative
-correct,correction
-correlate,correlative
-correspond,correspondence
-cover,coverage
-critical,criticise,criticism,criticize
-cruel,cruelty
-day,daily
-deceit,deceive,deception,deceptive
-decide,decision
-declare,declaration,declarative
-deep,depth
-defend,defence,defense,defensive
-define,definition
-deny,denial
-depend,dependence
-depress,depression,depressive
-describe,description
-despair,desperation
-destroy,destruction,destructive
-detach,detachment
-detect,detection
-deter,deterrent,deterrence
-determine,determination
-develop,development,developmental
-devote,devotion
-diagnose,diagnosis
-dictator,dictatorial
-die,dead,death
-differ,different,difference
-digest,digestion,digestive
-dimension,dimensional
-disagree,disagreement
-disappoint,disappointing,disappointment
-disaster,disastrous
-discourage,discouragement
-dishonest,dishonesty
-dismiss,dismissal
-disobey,disobedient,disobedience
-dispose,disposal
-disrespect,disrespectful
-dissatisfy,dissatisfaction
-distant,distance
-distinct,distinction,distinctive
-distort,distortion
-distract,distracting,distraction
-disturb,disturbing,disturbance
-diverse,diversity
-divide,division
-domestic,domesticate
-dominant,dominate,dominance
-doubt,doubtful
-ease,easy
-edit,edition
-efficient,efficiency
-elect,election
-embarrass,embarrassment
-emerge,emergence
-emit,emission
-emphasis,emphatic,emphasise,emphasize
-employ,employment
-enclose,enclosure
-encourage,encouragement
-endure,endurance
-energy,energize,energetic
-enforce,enforcement
-engage,engagement
-enhance,enhancement
-enjoy,enjoyment
-enlarge,enlargement
-enormity,enormous
-enter,entrance
-entertain,entertainment
-entitle,entitlement
-envy,envious
-equal,equality
-equip,equipment
-evolve,evolution
-examine,examination
-excel,excellent,excellence
-excess,excessive
-excite,excitement
-execute,execution
-exhibit,exhibition
-exist,existence
-expand,expansion
-expanse,expansive
-expect,expectation
-expend,expenditure
-expense,expensive
-expire,expiry,expiration
-explain,explanation
-explode,explosion,explosive
-exploit,exploitation
-explore,exploration
-express,expression
-expel,expulsion
-extract,extraction
-fail,failure
-familiar,familiarise,familiarity,familiarize
-fear,fearful
-feasible,feasibility
-fiction,fictional
-finance,financial
-fly,flight
-forgive,forgiveness
-frequent,frequency
-fur,furry
-generous,generosity
-glass,glassy
-govern,government
-grand,grandeur
-grateful,gratitude
-guilt,guilty
-hard,hardship
-haste,hasty
-hierarchy,hierarchical
-high,height
-hinder,hindrance
-history,historical
-honest,honesty
-hope,hopeful
-hostile,hostility
-humid,humidity
-hunger,hungry
-hypothesis,hypothetical
-ice,icy
-identify,identity,identification
-ideology,ideological
-imagine,imagination
-impatient,impatience
-important,importance
-impress,impression
-imprison,imprisonment
-improbable,improbability
-improve,improvement
-impure,impurity
-incapable,incapability
-incident,incidence,incidental
-include,inclusion
-inconsistent,inconsistency
-independent,independence
-indifferent,indifference
-infeasible,infeasibility
-infect,infection
-infinite,infinity
-inform,information
-inhibit,inhibition
-injure,injury
-innocent,innocence
-insist,insistent,insistence
-inspect,inspection
-instant,instance
-institution,institutional
-instruct,instruction
-integral,integrate
-intelligent,intelligence
-intend,intention
-intense,intensity
-interrupt,interruption
-intervene,intervention
-introduce,introduction
-invade,invasion
-invent,invention
-invite,invitation
-involve,involvement
-liable,liability
-logic,logical
-loose,loosen
-lose,loss
-loyal,loyalty
-magic,magical
-maintain,maintenance
-manage,management
-manipulate,manipulative
-marry,marriage
-mass,massive
-maximal,maximum
-measure,measurement
-minimal,minimum
-mix,mixture
-modern,modernity
-modest,modesty
-month,monthly
-music,musical
-necessary,necessity,necessitate
-neglect,negligent,negligence
-nerve,nervous
-noble,nobility
-norm,normal,normality
-obey,obedient,obedience
-oblige,obligation,obligatory
-offend,offence,offense
-omit,omission
-option,optional
-package,packaging
-patient,patience
-patriot,patriotic,patriotism
-peace,peaceful
-peculiar,peculiarity
-perfect,perfection
-perform,performance
-permit,permission
-persist,persistent,persistence
-persuade,persuasion
-poem,poetic
-poor,poverty
-possess,possession,possessive
-possible,possibility
-post,postal
-practical,practicality
-practice,practise
-precise,precision
-prefer,preference
-prejudice,prejudicial
-prepare,preparation
-present,presence
-preserve,preservation,preservative
-presume,presumption
-presuppose,presupposition
-pretend,pretence
-prevalent,prevalence
-prevent,prevention
-probable,probability
-produce,production
-progress,progression
-prohibit,prohibition,prohibitory
-project,projection
-promote,promotion
-proof,prove
-propose,proposal
-protect,protection,protective
-publicise,publicize,publication
-punish,punishment
-pure,purity
-rare,rarity
-react,reaction
-reappear,reappearance
-reassure,reassurance
-rebel,rebellious
-receipt,receive
-recognise,recognize,recognition
-reconcile,reconciliation
-reconsider,reconsideration
-recruit,recruitment
-refer,referral
-refresh,refreshment
-refuse,refusal
-reinforce,reinforcement
-relax,relaxation
-relief,relieve
-reluctant,reluctance
-rely,reliance
-represent,representation
-reproduce,reproduction
-require,requirement
-reside,residence,residential
-resign,resignation
-resist,resistance
-resolve,resolution
-respect,respectful
-responsible,responsibility
-restrain,restraint
-restrict,restriction,restrictive
-reverse,reversal
-rigor,rigour,rigorous
-rival,rivalry
-rose,rosy
-satisfy,satisfaction
-secret,secrecy
-sector,sectoral
-sequence,sequential
-serve,service
-settle,settlement
-sex,sexual
-sign,signature
-sincere,sincerity
-solve,solution
-speak,speech
-sphere,spherical
-spite,spiteful
-spontaneity,spontaneous
-strong,strength
-stupid,stupidity
-substance,substantial
-succeed,success
-suggest,suggestion
-summer,summery
-superior,superiority
-suppose,supposition
-survive,survival
-suspend,suspension
-talent,talented
-tempt,temptation
-tense,tension
-thirst,thirsty
-threat,threaten
-transmit,transmission
-treat,treatment
-true,truth
-trivia,trivial
-unable,inability
-uncertain,uncertainty
-unimportant,unimportance
-unite,unity
-use,usage
-vary,variation
-virtue,virtuous
-warm,warmth
-waste,wastage
-week,weekly
-weigh,weight
-wide,width
-winter,wintery
-wood,wooden
-wool,wooly,woolen,woolly,woollen
-year,yearly
-young,youth
+abandon,abandonment
+able,ability
+abort,abortion
+abstract,abstraction
+abuse,abusive
+accept,acceptance
+accident,accidental
+accompany,accompaniment
+accomplish,accomplishment
+accountable,accountability
+accuracy,accurate
+accuse,accusation
+achieve,achievement
+acknowledge,acknowledgement
+act,action,activity
+adapt,adaptation
+add,addition,additional
+adjust,adjustment
+admire,admiration
+adopt,adoption
+advertise,advertize,advertisement
+advice,advise
+affect,effect
+agree,agreement
+alcohol,alcoholic
+allow,allowance
+alter,alteration
+amaze,amazing,amazement
+ambiguity,ambiguous,ambiguousness
+amuse,amusing,amusement
+analyse,analyze,analysis
+anger,angry
+announce,announcement
+anonymity,anonymous
+apology,apologize,apologetic
+appear,appearance
+applaud,applause
+appoint,appointment
+approve,approval
+argue,argument
+arrange,arrangement
+assert,assertion
+assess,assessment
+assure,assurance
+astonish,astonishing,astonishment
+attach,attachment
+attain,attainment
+attract,attraction
+attribute,attribution
+avoid,avoidance
+base,basic,basis
+beast,beastly
+behave,behavior,behaviour,behavioral
+belief,believe
+breath,breathe
+bury,burial
+capable,capability
+cease,cessation
+ceremony,ceremonial
+certain,certainty
+charm,charming
+cite,citation
+clean,cleanliness
+clear,clarity
+clinic,clinical
+collaboration,collaborative
+collect,collection
+combine,combination
+commerce,commercial
+commit,commitment
+compare,comparison
+compete,competition
+compile,compilation
+complete,completion
+compliant,compliance
+compose,composition
+comprehend,comprehension
+conclude,conclusion
+confirm,confirmation
+conform,conformity
+confront,confrontation
+confuse,confusion
+connect,connection
+consequent,consequence
+conservatism,conservative
+consider,consideration
+consistent,consistency
+constrain,constraint
+construct,construction
+consult,consultation
+continue,continuation
+contradict,contradiction
+contribute,contribution
+controversy,controversial
+convene,convention
+convenient,convenience
+cooperate,cooperative
+correct,correction
+correlate,correlative
+correspond,correspondence
+cover,coverage
+critical,criticise,criticism,criticize
+cruel,cruelty
+day,daily
+deceit,deceive,deception,deceptive
+decide,decision
+declare,declaration,declarative
+deep,depth
+defend,defence,defense,defensive
+define,definition
+deny,denial
+depend,dependence
+depress,depression,depressive
+describe,description
+despair,desperation
+destroy,destruction,destructive
+detach,detachment
+detect,detection
+deter,deterrent,deterrence
+determine,determination
+develop,development,developmental
+devote,devotion
+diagnose,diagnosis
+dictator,dictatorial
+die,dead,death
+differ,different,difference
+digest,digestion,digestive
+dimension,dimensional
+disagree,disagreement
+disappoint,disappointing,disappointment
+disaster,disastrous
+discourage,discouragement
+dishonest,dishonesty
+dismiss,dismissal
+disobey,disobedient,disobedience
+dispose,disposal
+disrespect,disrespectful
+dissatisfy,dissatisfaction
+distant,distance
+distinct,distinction,distinctive
+distort,distortion
+distract,distracting,distraction
+disturb,disturbing,disturbance
+diverse,diversity
+divide,division
+domestic,domesticate
+dominant,dominate,dominance
+doubt,doubtful
+ease,easy
+edit,edition
+efficient,efficiency
+elect,election
+embarrass,embarrassment
+emerge,emergence
+emit,emission
+emphasis,emphatic,emphasise,emphasize
+employ,employment
+enclose,enclosure
+encourage,encouragement
+endure,endurance
+energy,energize,energetic
+enforce,enforcement
+engage,engagement
+enhance,enhancement
+enjoy,enjoyment
+enlarge,enlargement
+enormity,enormous
+enter,entrance
+entertain,entertainment
+entitle,entitlement
+envy,envious
+equal,equality
+equip,equipment
+evolve,evolution
+examine,examination
+excel,excellent,excellence
+excess,excessive
+excite,excitement
+execute,execution
+exhibit,exhibition
+exist,existence
+expand,expansion
+expanse,expansive
+expect,expectation
+expend,expenditure
+expense,expensive
+expire,expiry,expiration
+explain,explanation
+explode,explosion,explosive
+exploit,exploitation
+explore,exploration
+express,expression
+expel,expulsion
+extract,extraction
+fail,failure
+familiar,familiarise,familiarity,familiarize
+fear,fearful
+feasible,feasibility
+fiction,fictional
+finance,financial
+fly,flight
+forgive,forgiveness
+frequent,frequency
+fur,furry
+generous,generosity
+gift,give
+glass,glassy
+govern,government
+grand,grandeur
+grateful,gratitude
+guilt,guilty
+hard,hardship
+haste,hasty
+hierarchy,hierarchical
+high,height
+hinder,hindrance
+history,historical
+honest,honesty
+hope,hopeful
+hostile,hostility
+humid,humidity
+hunger,hungry
+hypothesis,hypothetical
+ice,icy
+identify,identity,identification
+ideology,ideological
+imagine,imagination
+impatient,impatience
+important,importance
+impress,impression
+imprison,imprisonment
+improbable,improbability
+improve,improvement
+impure,impurity
+incapable,incapability
+incident,incidence,incidental
+include,inclusion
+inconsistent,inconsistency
+independent,independence
+indifferent,indifference
+infeasible,infeasibility
+infect,infection
+infinite,infinity
+inform,information
+inhibit,inhibition
+injure,injury
+innocent,innocence
+insist,insistent,insistence
+inspect,inspection
+instant,instance
+institution,institutional
+instruct,instruction
+integral,integrate
+intelligent,intelligence
+intend,intention
+intense,intensity
+interrupt,interruption
+intervene,intervention
+introduce,introduction
+invade,invasion
+invent,invention
+invite,invitation
+involve,involvement
+liable,liability
+logic,logical
+loose,loosen
+lose,loss
+loyal,loyalty
+magic,magical
+maintain,maintenance
+manage,management
+manipulate,manipulative
+marry,marriage
+mass,massive
+maximal,maximum
+measure,measurement
+minimal,minimum
+mix,mixture
+modern,modernity
+modest,modesty
+month,monthly
+music,musical
+necessary,necessity,necessitate
+neglect,negligent,negligence
+nerve,nervous
+noble,nobility
+norm,normal,normality
+obey,obedient,obedience
+oblige,obligation,obligatory
+offend,offence,offense
+omit,omission
+option,optional
+package,packaging
+patient,patience
+patriot,patriotic,patriotism
+peace,peaceful
+peculiar,peculiarity
+perfect,perfection
+perform,performance
+permit,permission
+persist,persistent,persistence
+persuade,persuasion
+poem,poetic
+poor,poverty
+possess,possession,possessive
+possible,possibility
+post,postal
+practical,practicality
+practice,practise
+precise,precision
+prefer,preference
+prejudice,prejudicial
+prepare,preparation
+present,presence
+preserve,preservation,preservative
+presume,presumption
+presuppose,presupposition
+pretend,pretence
+prevalent,prevalence
+prevent,prevention
+probable,probability
+produce,production
+progress,progression
+prohibit,prohibition,prohibitory
+project,projection
+promote,promotion
+proof,prove
+propose,proposal
+protect,protection,protective
+publicise,publicize,publication
+punish,punishment
+pure,purity
+rare,rarity
+react,reaction
+reappear,reappearance
+reassure,reassurance
+rebel,rebellious
+receipt,receive
+recognise,recognize,recognition
+reconcile,reconciliation
+reconsider,reconsideration
+recruit,recruitment
+refer,referral
+refresh,refreshment
+refuse,refusal
+reinforce,reinforcement
+relax,relaxation
+relief,relieve
+reluctant,reluctance
+rely,reliance
+represent,representation
+reproduce,reproduction
+require,requirement
+reside,residence,residential
+resign,resignation
+resist,resistance
+resolve,resolution
+respect,respectful
+responsible,responsibility
+restrain,restraint
+restrict,restriction,restrictive
+reverse,reversal
+rigor,rigour,rigorous
+rival,rivalry
+rose,rosy
+satisfy,satisfaction
+secret,secrecy
+sector,sectoral
+sequence,sequential
+serve,service
+settle,settlement
+sex,sexual
+sign,signature
+sincere,sincerity
+solve,solution
+speak,speech
+sphere,spherical
+spite,spiteful
+spontaneity,spontaneous
+strong,strength
+stupid,stupidity
+substance,substantial
+succeed,success
+suggest,suggestion
+summer,summery
+superior,superiority
+suppose,supposition
+survive,survival
+suspend,suspension
+talent,talented
+tempt,temptation
+tense,tension
+thirst,thirsty
+threat,threaten
+transmit,transmission
+treat,treatment
+true,truth
+trivia,trivial
+unable,inability
+uncertain,uncertainty
+unimportant,unimportance
+unite,unity
+use,usage
+vary,variation
+virtue,virtuous
+warm,warmth
+waste,wastage
+week,weekly
+weigh,weight
+wide,width
+winter,wintery
+wood,wooden
+wool,wooly,woolen,woolly,woollen
+year,yearly
+young,youth
diff --git a/holmes_extractor/lang/en/language_specific_rules.py b/holmes_extractor/lang/en/language_specific_rules.py
new file mode 100644
index 0000000..e618f79
--- /dev/null
+++ b/holmes_extractor/lang/en/language_specific_rules.py
@@ -0,0 +1,836 @@
+from spacy.tokens import Token
+from ...parsing import SemanticAnalyzer, SemanticMatchingHelper, MatchImplication,\
+ PhraseletTemplate, SemanticDependency
+
+class LanguageSpecificSemanticAnalyzer(SemanticAnalyzer):
+
+ language_name = 'English'
+
+ # The part of speech tags that can refer to nouns
+ noun_pos = ('NOUN', 'PROPN')
+
+ # The part of speech tags that can refer to predicate heads
+ predicate_head_pos = ('VERB', 'AUX')
+
+ # The part of speech tags that require a match in the search sentence when they occur within a
+ # search_phrase
+ matchable_pos = ('ADJ', 'ADP', 'ADV', 'NOUN', 'NUM', 'PROPN', 'VERB', 'AUX', 'X', 'INTJ')
+
+ # The part of speech tags that can refer to the head of an adjectival predicate phrase
+ # ("is" in "The dog is tired")
+ adjectival_predicate_head_pos = ('VERB', 'AUX')
+
+ # The part of speech tags that can refer to the subject of an adjectival predicate
+ # ("dog" in "The dog is tired")
+ adjectival_predicate_subject_pos = ('NOUN', 'PROPN', 'PRON')
+
+ # Dependency label that marks the subject of an adjectival predicate
+ adjectival_predicate_subject_dep = 'nsubj'
+
+ # Dependency label that marks the predicate of an adjectival predicate
+ adjectival_predicate_predicate_dep = 'acomp'
+
+ # Part of speech that marks the predicate of an adjectival predicate
+ adjectival_predicate_predicate_pos = 'ADJ'
+
+ # Dependency label that marks a modifying adjective
+ modifier_dep = 'amod'
+
+ # Original dependency label from nouns to prepositions
+ spacy_noun_to_preposition_dep = 'prep'
+
+ # Original dependency label from verbs to prepositions
+ spacy_verb_to_preposition_dep = 'prep'
+
+ # Added possible dependency label from nouns to prepositions
+ holmes_noun_to_preposition_dep = 'prepposs'
+
+ # Added possible dependency label from verbs to prepositions
+ holmes_verb_to_preposition_dep = 'prepposs'
+
+ # Dependency labels that occur in a conjunction phrase (righthand siblings and conjunctions)
+ conjunction_deps = ('conj', 'appos', 'cc')
+
+ # Syntactic tags that can mark interrogative pronouns
+ interrogative_pronoun_tags = ('WDT', 'WP', 'WRB')
+
+ # Syntactic tags that exclude a token from being the child token within a semantic dependency
+ semantic_dependency_excluded_tags = ('DT',)
+
+ # Generic pronouns
+ generic_pronoun_lemmas = ('something', 'somebody', 'someone')
+
+ # The word for 'or' in this language
+ or_lemma = 'or'
+
+ # Where dependencies from a parent to a child are copied to the parent's righthand siblings,
+ # it can make sense to mark the dependency as uncertain depending on the underlying spaCy
+ # representations for the individual language
+ mark_child_dependencies_copied_to_siblings_as_uncertain = True
+
+ # Coreference chains are only processed up to this number of mentions away from the currently
+ # matched document location
+ maximum_mentions_in_coreference_chain = 3
+
+ # Coreference chains are only processed up to this number of words away from the currently
+ # matched document location
+ maximum_word_distance_in_coreference_chain = 300
+
+ # Dependency labels that can mark righthand siblings
+ sibling_marker_deps = ('conj', 'appos')
+
+ # Map from entity labels to words that correspond to their meaning
+ entity_labels_to_corresponding_lexemes = {
+ 'PERSON': 'person',
+ 'NORP': 'group',
+ 'FAC': 'building',
+ 'ORG': 'organization',
+ 'GPE': 'place',
+ 'LOC': 'place',
+ 'PRODUCT': 'product',
+ 'EVENT': 'event',
+ 'WORK_OF_ART': 'artwork',
+ 'LAW': 'law',
+ 'LANGUAGE': 'language',
+ 'DATE': 'date',
+ 'TIME': 'time',
+ 'PERCENT': 'percent',
+ 'MONEY': 'money',
+ 'QUANTITY': 'quantity',
+ 'ORDINAL': 'number',
+ 'CARDINAL': 'number'
+ }
+
+ whose_lemma = 'whose'
+
+ def add_subwords(self, token, subword_cache):
+ """ Analyses the internal structure of the word to find atomic semantic elements. Is
+ relevant for German but not implemented for English.
+ """
+ pass
+
+ def set_negation(self, token):
+ """Marks the negation on the token. A token is negative if it or one of its ancestors
+ has a negation word as a syntactic (not semantic!) child.
+ """
+ if token._.holmes.is_negated is not None:
+ return
+ for child in token.children:
+ if child._.holmes.lemma in (
+ 'nobody', 'nothing', 'nowhere', 'noone', 'neither', 'nor', 'no', 'not') \
+ or child.dep_ == 'neg':
+ token._.holmes.is_negated = True
+ return
+ if child._.holmes.lemma in ('more', 'longer'):
+ for grandchild in child.children:
+ if grandchild._.holmes.lemma == 'no':
+ token._.holmes.is_negated = True
+ return
+ if token.dep_ == 'ROOT':
+ token._.holmes.is_negated = False
+ return
+ self.set_negation(token.head)
+ token._.holmes.is_negated = token.head._.holmes.is_negated
+
+ def correct_auxiliaries_and_passives(self, token):
+ """Wherever auxiliaries and passives are found, derive the semantic information
+ from the syntactic information supplied by spaCy.
+ """
+ # 'auxpass' means an auxiliary used in a passive context. We mark its subject with
+ # a new dependency label 'nsubjpass'.
+ if len([
+ dependency for dependency in token._.holmes.children
+ if dependency.label == 'auxpass']) > 0:
+ for dependency in token._.holmes.children:
+ if dependency.label == 'nsubj':
+ dependency.label = 'nsubjpass'
+
+ # Structures like 'he used to' and 'he is going to'
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label == 'xcomp'):
+ child = dependency.child_token(token.doc)
+ # distinguish 'he used to ...' from 'he used it to ...'
+ if token._.holmes.lemma == 'use' and token.tag_ == 'VBD' and len([
+ element for element in token._.holmes.children
+ if element.label == 'dobj']) == 0:
+ self.move_information_between_tokens(token, child)
+ elif token._.holmes.lemma == 'go':
+ # 'was going to' is marked as uncertain, 'is going to' is not marked as uncertain
+ uncertainty_flag = False
+ for other_dependency in (
+ other_dependency for other_dependency in
+ token._.holmes.children if other_dependency.label == 'aux'):
+ other_dependency_token = other_dependency.child_token(token.doc)
+ if other_dependency_token._.holmes.lemma == 'be' and \
+ other_dependency_token.tag_ == 'VBD': # 'was going to'
+ uncertainty_flag = True
+ self.move_information_between_tokens(token, child)
+ if uncertainty_flag:
+ for child_dependency in child._.holmes.children:
+ child_dependency.is_uncertain = True
+ else:
+ # constructions like:
+ #
+ #'she told him to close the contract'
+ #'he decided to close the contract'
+ for other_dependency in token._.holmes.children:
+ if other_dependency.label in ('dobj', 'nsubjpass') or (
+ other_dependency.label == 'nsubj' and \
+ len([
+ element for element in token._.holmes.children
+ if element.label == 'dobj'])
+ == 0):
+ if len([
+ element for element in child._.holmes.children
+ if element.label == 'auxpass']) > 0:
+ if not child._.holmes.has_dependency_with_child_index(
+ other_dependency.child_index) and \
+ dependency.child_index > other_dependency.child_index:
+ child._.holmes.children.append(SemanticDependency(
+ dependency.child_index, other_dependency.child_index,
+ 'nsubjpass', True))
+ else:
+ if not child._.holmes.has_dependency_with_child_index(
+ other_dependency.child_index) and \
+ dependency.child_index > other_dependency.child_index:
+ child._.holmes.children.append(SemanticDependency(
+ dependency.child_index, other_dependency.child_index,
+ 'nsubj', True))
+
+ def handle_relative_constructions(self, token):
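+ # Derives semantic dependencies for relative clauses. Illustrative example: in 'the dog
+ # that chased the cat', 'chased' gains a subject dependency pointing directly at 'dog',
+ # while the relative pronoun 'that' is reduced to a pointer to its antecedent.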
+ if token.dep_ == 'relcl':
+ for dependency in token._.holmes.children:
+ child = dependency.child_token(token.doc)
+ # handle 'whose' clauses
+ for child_dependency in (
+ child_dependency for child_dependency in
+ child._.holmes.children if child_dependency.child_index >= 0
+ and child_dependency.label == 'poss' and
+ child_dependency.child_token(token.doc).tag_ == 'WP$'):
+ whose_pronoun_token = child_dependency.child_token(
+ token.doc)
+ working_index = whose_pronoun_token.i
+ while working_index >= token.sent.start:
+ # find the antecedent (possessed entity)
+ if len([1 for working_dependency in
+ whose_pronoun_token.doc[working_index]._.holmes.children
+ if working_dependency.label == 'relcl']) > 0:
+ working_token = child.doc[working_index]
+ working_token = working_token.doc[
+ working_token._.holmes.token_or_lefthand_sibling_index]
+ for lefthand_sibling_of_antecedent in \
+ working_token._.holmes.loop_token_and_righthand_siblings(
+ token.doc):
+ # find the possessing noun
+ for possessing_noun in (
+ possessing_noun for possessing_noun in
+ child._.holmes.loop_token_and_righthand_siblings(token.doc)
+ if possessing_noun.i != lefthand_sibling_of_antecedent.i):
+ # add the semantic dependency
+ possessing_noun._.holmes.children.append(
+ SemanticDependency(
+ possessing_noun.i,
+ lefthand_sibling_of_antecedent.i, 'poss',
+ lefthand_sibling_of_antecedent.i != working_index))
+ # remove the syntactic dependency
+ possessing_noun._.holmes.remove_dependency_with_child_index(
+ whose_pronoun_token.i)
+ whose_pronoun_token._.holmes.children = [SemanticDependency(
+ whose_pronoun_token.i, 0 - (working_index + 1), None)]
+ return
+ working_index -= 1
+ return
+ if child.tag_ in ('WP', 'WRB', 'WDT'): # 'that' or 'which'
+ working_dependency_label = dependency.label
+ child._.holmes.children = [SemanticDependency(
+ child.i, 0 - (token.head.i + 1), None)]
+ else:
+ # relative antecedent, new dependency tag, 'the man I saw yesterday'
+ working_dependency_label = 'relant'
+ last_righthand_sibling_of_predicate = list(
+ token._.holmes.loop_token_and_righthand_siblings(token.doc))[-1]
+ for preposition_dependency in (
+ dep for dep in last_righthand_sibling_of_predicate._.holmes.children
+ if dep.label == 'prep' and
+ dep.child_token(token.doc)._.holmes.is_matchable):
+ preposition = preposition_dependency.child_token(token.doc)
+ for grandchild_dependency in (
+ dep for dep in preposition._.holmes.children if
+ dep.child_token(token.doc).tag_ in ('WP', 'WRB', 'WDT')
+ and dep.child_token(token.doc).i >= 0):
+ # 'that' or 'which'
+ complementizer = grandchild_dependency.child_token(token.doc)
+ preposition._.holmes.remove_dependency_with_child_index(
+ grandchild_dependency.child_index)
+ # a new relation pointing directly to the antecedent noun
+ # will be added in the section below
+ complementizer._.holmes.children = [SemanticDependency(
+ grandchild_dependency.child_index, 0 - (token.head.i + 1), None)]
+ displaced_preposition_dependencies = [
+ dep for dep in
+ last_righthand_sibling_of_predicate._.holmes.children
+ if dep.label == 'prep'
+ and len(dep.child_token(token.doc)._.holmes.children) == 0
+ and dep.child_token(token.doc)._.holmes.is_matchable]
+ antecedent = token.doc[token.head._.holmes.token_or_lefthand_sibling_index]
+ if len(displaced_preposition_dependencies) > 0:
+ displaced_preposition = \
+ displaced_preposition_dependencies[0].child_token(token.doc)
+ for lefthand_sibling_of_antecedent in (
+ lefthand_sibling_of_antecedent for lefthand_sibling_of_antecedent in
+ antecedent._.holmes.loop_token_and_righthand_siblings(token.doc)
+ if displaced_preposition.i != lefthand_sibling_of_antecedent.i):
+ displaced_preposition._.holmes.children.append(SemanticDependency(
+ displaced_preposition.i, lefthand_sibling_of_antecedent.i,
+ 'pobj', lefthand_sibling_of_antecedent.i != token.head.i))
+ #Where the antecedent is not the final one before the relative
+ #clause, mark the dependency as uncertain
+ for sibling_of_pred in \
+ token._.holmes.loop_token_and_righthand_siblings(token.doc):
+ if not sibling_of_pred._.holmes.has_dependency_with_child_index(
+ displaced_preposition.i) and \
+ sibling_of_pred.i != displaced_preposition.i:
+ sibling_of_pred._.holmes.children.append(SemanticDependency(
+ sibling_of_pred.i, displaced_preposition.i, 'prep', True))
+ if working_dependency_label != 'relant':
+ # if 'that' or 'which', remove it
+ sibling_of_pred._.holmes.remove_dependency_with_child_index(
+ child.i)
+ else:
+ for lefthand_sibling_of_antecedent in \
+ antecedent._.holmes.loop_token_and_righthand_siblings(token.doc):
+ for sibling_of_predicate in (
+ sibling_of_predicate for sibling_of_predicate
+ in token._.holmes.loop_token_and_righthand_siblings(token.doc)
+ if sibling_of_predicate.i != lefthand_sibling_of_antecedent.i):
+ sibling_of_predicate._.holmes.children.append(SemanticDependency(
+ sibling_of_predicate.i, lefthand_sibling_of_antecedent.i,
+ working_dependency_label,
+ lefthand_sibling_of_antecedent.i != token.head.i))
+ #Where the antecedent is not the final one before the relative
+ #clause, mark the dependency as uncertain
+ if working_dependency_label != 'relant':
+ sibling_of_predicate._.holmes.remove_dependency_with_child_index(
+ child.i)
+ break
+
+ def holmes_lemma(self, token):
+ """Relabel the lemmas of phrasal verbs in sentences like 'he gets up' to incorporate
+ the entire phrasal verb to facilitate matching.
+ """
+ if token.pos_ == 'VERB':
+ for child in token.children:
+ if child.tag_ == 'RP':
+ return ' '.join([token.lemma_.lower(), child.lemma_.lower()])
+ if token.pos_ == 'ADJ':
+ # see if the adjective is a participle
+ participle_test_doc = self.spacy_parse(' '.join(('Somebody has', token.lemma_.lower())))
+ return participle_test_doc[2].lemma_.lower()
+ return token.lemma_.lower()
+
+ def language_specific_derived_holmes_lemma(self, token, lemma):
+ """Generates and returns a derived lemma where appropriate, otherwise returns *None*.
+ """
+ if (token is None or token.pos_ == 'NOUN') and len(lemma) >= 10:
+ possible_lemma = None
+ if lemma.endswith('isation') or lemma.endswith('ization'):
+ possible_lemma = ''.join((lemma[:-5], 'e')) # 'isation', 'ization' -> 'ise', 'ize'
+ if possible_lemma.endswith('ise'):
+ lemma_to_test_in_vocab = ''.join((possible_lemma[:-3], 'ize'))
+ # only American spellings in vocab
+ else:
+ lemma_to_test_in_vocab = possible_lemma
+ elif lemma.endswith('ication'):
+ possible_lemma = ''.join((lemma[:-7], 'y')) # implication -> imply
+ lemma_to_test_in_vocab = possible_lemma
+ if (possible_lemma is None or self.vectors_nlp.vocab[lemma_to_test_in_vocab].is_oov) \
+ and lemma.endswith('ation'):
+ possible_lemma = ''.join((lemma[:-3], 'e')) # manipulation -> manipulate
+ lemma_to_test_in_vocab = possible_lemma
+ if possible_lemma is not None and not \
+ self.vectors_nlp.vocab[lemma_to_test_in_vocab].is_oov:
+ return possible_lemma
+ # deadjectival nouns in -ness
+ if (token is None or token.pos_ == 'NOUN') and len(lemma) >= 7 and lemma.endswith('ness'):
+ working_possible_lemma = lemma[:-4]
+ # 'bawdiness'
+ if working_possible_lemma[-1] == 'i':
+ working_possible_lemma = ''.join((working_possible_lemma[:-1], 'y'))
+ if not self.vectors_nlp.vocab[working_possible_lemma].is_oov:
+ return working_possible_lemma
+ else:
+ return None
+ # adverb with 'ly' -> adjective without 'ly'
+ if token is None or token.tag_ == 'RB':
+ # domestically -> domestic
+ if lemma.endswith('ically'):
+ return lemma[:-4]
+ # 'regrettably', 'horribly' -> 'regrettable', 'horrible'
+ if lemma.endswith('ably') or lemma.endswith('ibly'):
+ return ''.join((lemma[:-1], 'e'))
+ if lemma.endswith('ly'):
+ derived_lemma = lemma[:-2]
+ # 'happily' -> 'happy'
+ if derived_lemma[-1] == 'i':
+ derived_lemma = ''.join((derived_lemma[:-1], 'y'))
+ return derived_lemma
+ # singing -> sing
+ if (token is None or token.tag_ == 'NN') and lemma.endswith('ing'):
+ lemmatization_sentence = ' '.join(('it is', lemma))
+ lemmatization_doc = self.spacy_parse(lemmatization_sentence)
+ return lemmatization_doc[2].lemma_.lower()
+ return None
+
+ def perform_language_specific_tasks(self, token):
+
+ # Because phrasal verbs are conflated into a single lemma, remove the dependency
+ # from the verb to the preposition and mark the preposition as unmatchable
+ if token.tag_ == 'RP':
+ token.head._.holmes.remove_dependency_with_child_index(token.i)
+ token._.holmes.is_matchable = False
+
+ # mark modal verb dependencies as uncertain
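+ # (illustrative example: in 'the dog could chase a cat', the subject and object
+ # dependencies of 'chase' are marked as uncertain)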
+ if token.pos_ == 'VERB':
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label == 'aux'):
+ child = dependency.child_token(token.doc)
+ if child.pos_ in ('VERB', 'AUX') and child._.holmes.lemma not in \
+ ('be', 'have', 'do', 'go', 'use', 'will', 'shall'):
+ for other_dependency in (
+ other_dependency for other_dependency in
+ token._.holmes.children if other_dependency.label != 'aux'):
+ other_dependency.is_uncertain = True
+
+ # set auxiliaries as not matchable
+ if token.dep_ in ('aux', 'auxpass'):
+ token._.holmes.is_matchable = False
+
+ # Add new dependencies to phrases with 'by', 'of' and 'to' to enable the matching
+ # of deverbal nominal phrases with verb phrases; add 'dative' dependency to
+ # nouns within dative 'to' phrases; add new dependency spanning other prepositions
+ # to facilitate topic matching and supervised document classification
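+ # (illustrative example: 'the destruction of the city by the enemy' gains
+ # pobjo(destruction, city) and pobjb(destruction, enemy), so that it can match
+ # 'the enemy destroys the city')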
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label in ('prep', 'agent', 'dative')):
+ child = dependency.child_token(token.doc)
+ if child._.holmes.lemma == 'by':
+ working_dependency_label = 'pobjb'
+ elif child._.holmes.lemma == 'of':
+ working_dependency_label = 'pobjo'
+ elif child._.holmes.lemma == 'to':
+ if dependency.label == 'dative':
+ working_dependency_label = 'dative'
+ else:
+ working_dependency_label = 'pobjt'
+ else:
+ working_dependency_label = 'pobjp'
+ # for 'by', 'of' and 'to' the preposition is marked as not matchable
+ if working_dependency_label != 'pobjp':
+ child._.holmes.is_matchable = False
+ for child_dependency in (
+ child_dependency for child_dependency in child._.holmes.children
+ if child_dependency.label == 'pobj' and token.i !=
+ child_dependency.child_index):
+ token._.holmes.children.append(SemanticDependency(
+ token.i, child_dependency.child_index, working_dependency_label,
+ dependency.is_uncertain or child_dependency.is_uncertain))
+
+ # where a 'prepposs' dependency has been added and the preposition is not 'by', 'of' or
+ #'to', add a corresponding uncertain 'pobjp'
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label == 'prepposs'):
+ child = dependency.child_token(token.doc)
+ for child_dependency in (
+ child_dependency for child_dependency in
+ child._.holmes.children if child_dependency.label == 'pobj' and token.i !=
+ child_dependency.child_index and child._.holmes.is_matchable):
+ token._.holmes.children.append(
+ SemanticDependency(token.i, child_dependency.child_index, 'pobjp', True))
+
+ # handle present active participles
+ if token.dep_ == 'acl' and token.tag_ == 'VBG':
+ lefthand_sibling = token.doc[token.head._.holmes.token_or_lefthand_sibling_index]
+ for antecedent in \
+ lefthand_sibling._.holmes.loop_token_and_righthand_siblings(token.doc):
+ if token.i != antecedent.i:
+ token._.holmes.children.append(
+ SemanticDependency(token.i, antecedent.i, 'nsubj'))
+
+ # handle past passive participles
+ if token.dep_ == 'acl' and token.tag_ == 'VBN':
+ lefthand_sibling = token.doc[token.head._.holmes.token_or_lefthand_sibling_index]
+ for antecedent in \
+ lefthand_sibling._.holmes.loop_token_and_righthand_siblings(token.doc):
+ if token.i != antecedent.i:
+ token._.holmes.children.append(
+ SemanticDependency(token.i, antecedent.i, 'dobj'))
+
+ # handle phrases like 'cat-eating dog' and 'dog-eaten cat', adding new dependencies
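+ # (illustrative example: for 'cat-eating dog', 'eating' receives advmodobj -> 'cat' and
+ # advmodsubj -> 'dog', mirroring the clause 'the dog eats the cat')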
+ if token.dep_ == 'amod' and token.pos_ == 'VERB':
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.label == 'npadvmod'):
+ if token.tag_ == 'VBG':
+ dependency.label = 'advmodobj'
+ noun_dependency = 'advmodsubj'
+ elif token.tag_ == 'VBN':
+ dependency.label = 'advmodsubj'
+ noun_dependency = 'advmodobj'
+ else:
+ break
+ for noun in token.head._.holmes.loop_token_and_righthand_siblings(token.doc):
+ if token.i != noun.i:
+ token._.holmes.children.append(SemanticDependency(
+ token.i, noun.i, noun_dependency, noun.i != token.head.i))
+ break # we only handle one antecedent, spaCy never seems to produce more anyway
+
+ # handle phrases like 'he is thinking about singing', 'he keeps on singing'
+ # find governed verb
+ if token.pos_ == 'VERB' and token.dep_ == 'pcomp':
+ # choose correct noun dependency for passive or active structure
+ if len([
+ dependency for dependency in token._.holmes.children
+ if dependency.label == 'auxpass']) > 0:
+ new_dependency_label = 'nsubjpass'
+ else:
+ new_dependency_label = 'nsubj'
+ # check that governed verb does not already have a dependency with the same label
+ if len([
+ target_token_dependency for target_token_dependency in token._.holmes.children
+ if target_token_dependency.label == new_dependency_label]) == 0:
+ # Go back in the sentence to find the first subject phrase
+ counter = token.i
+ while True:
+ counter -= 1
+ if counter < token.sent.start:
+ return
+ if token.doc[counter].dep_ in ('nsubj', 'nsubjpass'):
+ break
+ # From the subject phrase loop up through the syntactic parents
+ # to handle relative constructions
+ working_token = token.doc[counter]
+ while True:
+ if working_token.tag_.startswith('NN') or \
+ working_token._.holmes.is_involved_in_coreference():
+ for source_token in \
+ working_token._.holmes.loop_token_and_righthand_siblings(token.doc):
+ for target_token in \
+ token._.holmes.loop_token_and_righthand_siblings(token.doc):
+ if target_token.i != source_token.i:
+ # such dependencies are always uncertain
+ target_token._.holmes.children.append(SemanticDependency(
+ target_token.i, source_token.i, new_dependency_label, True))
+ return
+ if working_token.dep_ != 'ROOT':
+ working_token = working_token.head
+ else:
+ return
+
+ # handle phrases like 'he is easy to find', 'he is ready to go'
+ # There is no way of knowing from the syntax whether the noun is a semantic
+ # subject or object of the verb, so the new dependency label 'arg' is added.
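+ # (illustrative example: 'he is easy to find' yields an uncertain arg dependency from
+ # 'find' to 'he', which can later match either subject-like or object-like dependencies)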
+ if token.tag_.startswith('NN') or token._.holmes.is_involved_in_coreference():
+ for adjective_dep in (
+ dep for dep in token._.holmes.children if
+ dep.label == self.modifier_dep and dep.child_token(token.doc).pos_ ==
+ self.adjectival_predicate_predicate_pos):
+ adj_token = adjective_dep.child_token(token.doc)
+ for verb_dep in (
+ dep for dep in adj_token._.holmes.children if
+ dep.label == 'xcomp' and dep.child_token(token.doc).pos_ == 'VERB'):
+ verb_token = verb_dep.child_token(token.doc)
+ verb_token._.holmes.children.append(SemanticDependency(
+ verb_token.i, token.i, 'arg', True))
+
+class LanguageSpecificSemanticMatchingHelper(SemanticMatchingHelper):
+
+ # The part of speech tags that can refer to nouns
+ noun_pos = ('NOUN', 'PROPN')
+
+ # Dependency labels between a head and a preposition
+ preposition_deps = ('prep',)
+
+ # Parts of speech for which embedding matching is attempted
+ permissible_embedding_pos = ('NOUN', 'PROPN', 'ADJ', 'ADV')
+
+ # Dependency labels that mark noun kernel elements that are not the head noun
+ noun_kernel_dep = ('nmod', 'compound', 'appos', 'nummod')
+
+ # Minimum length of a word taking part in an embedding-based match.
+ # Necessary because of the proliferation of short nonsense strings in the vocabularies.
+ minimum_embedding_match_word_length = 3
+
+ # Lemmas that should be suppressed within relation phraselets or as words of
+ # single-word phraselets during topic matching.
+ topic_matching_phraselet_stop_lemmas = ('then', 'therefore', 'so')
+
+ # Parent lemma / part-of-speech combinations that should lead to phraselets being
+ # reverse-matched only during topic matching.
+ topic_matching_reverse_only_parent_lemmas = (
+ ('be', 'VERB'), ('be', 'AUX'), ('have', 'VERB'), ('have', 'AUX'), ('do', 'VERB'),
+ ('say', 'VERB'), ('go', 'VERB'), ('get', 'VERB'), ('make', 'VERB'))
+
+ # Tags of tokens that should be ignored during topic matching (normally pronouns).
+ topic_matching_phraselet_stop_tags = ('PRP', 'PRP$')
+
+ # Lemmas that should be suppressed within relation phraselets or as words of
+ # single-word phraselets during supervised document classification.
+ supervised_document_classification_phraselet_stop_lemmas = ('be', 'have')
+
+ # Parts of speech that are preferred as lemmas within phraselets
+ preferred_phraselet_pos = ('NOUN', 'PROPN')
+
+ # The part-of-speech labels permitted for elements of an entity-defined multiword.
+ entity_defined_multiword_pos = ('NOUN', 'PROPN')
+
+ # The entity labels permitted for elements of an entity-defined multiword.
+ entity_defined_multiword_entity_types = ('PERSON', 'ORG', 'GPE', 'WORK_OF_ART')
+
+ # Dependency labels that can mark righthand siblings
+ sibling_marker_deps = ('conj', 'appos')
+
+ # Dependency labels from a token's subtree that are not included in a question answer
+ question_answer_blacklist_deps = ('conj', 'appos', 'cc', 'punct')
+
+ # Dependency labels from a token's subtree that are not included in a question answer if in
+ # final position.
+ question_answer_final_blacklist_deps = ('case',)
+
+ # Maps from dependency tags as occurring within search phrases to corresponding implication
+ # definitions. This is the main source of the asymmetry in matching from search phrases to
+ # documents versus from documents to search phrases.
+ match_implication_dict = {
+ 'nsubj': MatchImplication(search_phrase_dependency='nsubj',
+ document_dependencies=['csubj', 'poss', 'pobjb', 'pobjo', 'advmodsubj', 'arg'],
+ reverse_document_dependencies=['acomp', 'amod']),
+ 'acomp': MatchImplication(search_phrase_dependency='acomp',
+ document_dependencies=['amod', 'advmod', 'npmod', 'advcl'],
+ reverse_document_dependencies=['nsubj', 'csubj', 'poss', 'pobjb', 'advmodsubj', 'dobj',
+ 'pobjo', 'relant', 'nsubjpass', 'csubjpass', 'compound', 'advmodobj', 'dative',
+ 'arg']),
+ 'advcl': MatchImplication(search_phrase_dependency='advcl',
+ document_dependencies=['pobjo', 'poss', 'relant', 'nsubjpass', 'csubjpass',
+ 'compound', 'advmodobj', 'arg', 'dobj', 'xcomp']),
+ 'amod': MatchImplication(search_phrase_dependency='amod',
+ document_dependencies=['acomp', 'advmod', 'npmod', 'advcl', 'compound'],
+ reverse_document_dependencies=['nsubj', 'csubj', 'poss', 'pobjb', 'advmodsubj', 'dobj',
+ 'pobjo', 'relant', 'nsubjpass', 'csubjpass', 'compound', 'advmodobj', 'dative',
+ 'arg']),
+ 'advmod': MatchImplication(search_phrase_dependency='advmod',
+ document_dependencies=['acomp', 'amod', 'npmod', 'advcl']),
+ 'arg': MatchImplication(search_phrase_dependency='arg',
+ document_dependencies=['nsubj', 'csubj', 'poss', 'pobjb', 'advmodsubj', 'dobj',
+ 'pobjo', 'relant', 'nsubjpass', 'csubjpass', 'compound', 'advmodobj', 'dative',
+ 'pobjp'],
+ reverse_document_dependencies=['acomp', 'amod']),
+ 'compound': MatchImplication(search_phrase_dependency='compound',
+ document_dependencies=['nmod', 'appos', 'nounmod', 'nsubj', 'csubj', 'poss', 'pobjb',
+ 'advmodsubj', 'dobj', 'pobjo', 'relant', 'pobjp',
+ 'nsubjpass', 'csubjpass', 'arg', 'advmodobj', 'dative', 'amod'],
+ reverse_document_dependencies=['acomp', 'amod']),
+ 'dative': MatchImplication(search_phrase_dependency='dative',
+ document_dependencies=['pobjt', 'relant', 'nsubjpass'],
+ reverse_document_dependencies=['acomp', 'amod']),
+ 'pobjt': MatchImplication(search_phrase_dependency='pobjt',
+ document_dependencies=['dative', 'relant'],
+ reverse_document_dependencies=['acomp', 'amod']),
+ 'nsubjpass': MatchImplication(search_phrase_dependency='nsubjpass',
+ document_dependencies=['dobj', 'pobjo', 'poss', 'relant', 'csubjpass',
+ 'compound', 'advmodobj', 'arg', 'dative'],
+ reverse_document_dependencies=['acomp', 'amod']),
+ 'dobj': MatchImplication(search_phrase_dependency='dobj',
+ document_dependencies=['pobjo', 'poss', 'relant', 'nsubjpass', 'csubjpass',
+ 'compound', 'advmodobj', 'arg', 'xcomp', 'advcl'],
+ reverse_document_dependencies=['acomp', 'amod']),
+ 'nmod': MatchImplication(search_phrase_dependency='nmod',
+ document_dependencies=['appos', 'compound', 'nummod']),
+ 'poss': MatchImplication(search_phrase_dependency='poss',
+ document_dependencies=['pobjo', 'nsubj', 'csubj', 'pobjb', 'advmodsubj', 'arg',
+ 'relant', 'nsubjpass', 'csubjpass', 'compound', 'advmodobj', 'det'],
+ reverse_document_dependencies=['acomp', 'amod']),
+ 'pobjo': MatchImplication(search_phrase_dependency='pobjo',
+ document_dependencies=['poss', 'dobj', 'relant', 'nsubjpass', 'csubjpass',
+ 'compound', 'advmodobj', 'arg', 'xcomp', 'nsubj', 'csubj', 'advmodsubj'],
+ reverse_document_dependencies=['acomp', 'amod']),
+ 'pobjb': MatchImplication(search_phrase_dependency='pobjb',
+ document_dependencies=['nsubj', 'csubj', 'poss', 'advmodsubj', 'arg'],
+ reverse_document_dependencies=['acomp', 'amod']),
+ 'pobjp': MatchImplication(search_phrase_dependency='pobjp',
+ document_dependencies=['compound']),
+ 'pobj': MatchImplication(search_phrase_dependency='pobj',
+ document_dependencies=['pcomp']),
+ 'pcomp': MatchImplication(search_phrase_dependency='pcomp',
+ document_dependencies=['pobj']),
+ 'prep': MatchImplication(search_phrase_dependency='prep',
+ document_dependencies=['prepposs']),
+ 'wh_wildcard': MatchImplication(search_phrase_dependency='wh_wildcard',
+ document_dependencies=['advmod', 'advcl', 'npadvmod', 'prep', 'pobjp']),
+ 'xcomp': MatchImplication(search_phrase_dependency='xcomp',
+ document_dependencies=['pobjo', 'poss', 'relant', 'nsubjpass', 'csubjpass',
+ 'compound', 'advmodobj', 'arg', 'dobj', 'advcl'])}
+
+ # The templates used to generate topic matching phraselets.
+ phraselet_templates = [
+ PhraseletTemplate(
+ "predicate-actor", "A thing does", 2, 1,
+ ['nsubj', 'csubj', 'pobjb', 'advmodsubj'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "predicate-patient", "Somebody does a thing", 1, 3,
+ ['dobj', 'relant', 'advmodobj', 'xcomp'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ reverse_only=False, question=False),
+ PhraseletTemplate(
+ "word-ofword", "A thing of a thing", 1, 4,
+ ['pobjo', 'poss'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ reverse_only=False, question=False),
+ PhraseletTemplate(
+ "predicate-toughmovedargument", "A thing is easy to do", 5, 1,
+ ['arg'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "predicate-passivesubject", "A thing is done", 3, 1,
+ ['nsubjpass', 'csubjpass'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "be-attribute", "Something is a thing", 1, 3,
+ ['attr'],
+ ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=True, question=False),
+ PhraseletTemplate(
+ "predicate-recipient", "Somebody gives a thing something", 1, 3,
+ ['dative', 'pobjt'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "governor-adjective", "A big thing", 2, 1,
+ ['acomp', 'amod', 'advmod', 'npmod', 'advcl', 'dobj'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['JJ', 'JJR', 'JJS', 'VBN', 'RB', 'RBR', 'RBS'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "noun-noun", "A thing thing", 2, 1,
+ ['nmod', 'appos', 'compound', 'nounmod'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "number-noun", "Seven things", 1, 0,
+ ['nummod'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'],
+ ['CD'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "prepgovernor-noun", "A thing in a thing", 1, 4,
+ ['pobjp'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False, question=False),
+ PhraseletTemplate(
+ "prep-noun", "in a thing", 0, 2,
+ ['pobj', 'pcomp'],
+ ['IN'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=True, question=False),
+ PhraseletTemplate(
+ "head-WHattr", "what is this?", 1, 0,
+ ['attr'],
+ ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['WP'], reverse_only=False, question=True),
+ PhraseletTemplate(
+ "head-WHsubj", "who came?", 1, 0,
+ ['nsubj', 'nsubjpass', 'pobjb'],
+ ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['WP'], reverse_only=False, question=True),
+ PhraseletTemplate(
+ "head-WHobj", "who did you see?", 3, 0,
+ ['dobj', 'pobjo'],
+ ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['WP'], reverse_only=False, question=True),
+ PhraseletTemplate(
+ "head-WHadv", "where did you go?", 3, 0,
+ ['advmod'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['WRB'], reverse_only=False, question=True,
+ assigned_dependency_label='wh_wildcard'),
+ PhraseletTemplate(
+ "headprep-WH", "what did you put it in?", 3, 0,
+ ['pobjp'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['WP'], reverse_only=False, question=True),
+ PhraseletTemplate(
+ "headprepto-WH", "who did you say it to?", 3, 0,
+ ['pobjt'],
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
+ ['WP'], reverse_only=False, question=True),
+ PhraseletTemplate(
+ "word", "thing", 0, None,
+ None,
+ ['FW', 'NN', 'NNP', 'NNPS', 'NNS'],
+ None, reverse_only=False, question=False)
+ ]
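+ # Illustrative example: the 'predicate-actor' template above pairs a verbal parent with its
+ # subject, so a sentence like 'the dog chased the cat' gives rise to a phraselet linking
+ # 'chase' and 'dog'.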
+
+ def question_word_matches(self, search_phrase_label:str,
+ search_phrase_token:Token, document_token:Token,
+ document_vector, entity_label_to_vector_dict:dict,
+ initial_question_word_embedding_match_threshold:float) -> bool:
+ """ Checks whether *search_phrase_token* is a question word matching *document_token*. """
+
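+ # Illustrative behaviour: for the question word 'who', a document token with entity type
+ # PERSON, NORP, ORG or GPE (e.g. 'Peter') counts as a match.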
+ if search_phrase_token._.holmes.lemma.startswith('who'):
+ ent_types = ('PERSON', 'NORP', 'ORG', 'GPE')
+ if document_token.ent_type_ in ent_types:
+ return True
+ if self.token_matches_ent_type(document_vector,
+ entity_label_to_vector_dict, ent_types,
+ initial_question_word_embedding_match_threshold) > 0:
+ return True
+ return len([1 for i in document_token._.holmes.token_and_coreference_chain_indexes if
+ len(document_token.doc[i].morph.get('Gender')) > 0 and
+ document_token.doc[i].morph.get('Gender')[0] in ('Masc', 'Fem')]) > 0
+ if search_phrase_token._.holmes.lemma == 'what':
+ return True
+ if search_phrase_token._.holmes.lemma == 'where':
+ return document_token.ent_type_ not in ('DATE', 'TIME') and \
+ len([1 for c in document_token._.holmes.children
+ if c.child_token(document_token.doc).ent_type_ in ('DATE', 'TIME')]) == 0 \
+ and document_token.tag_ == 'IN' and document_token._.holmes.lemma in (
+ 'above', 'across', 'against', 'along', 'among', 'amongst', 'around', 'at',
+ 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', 'by', 'close', 'down',
+ 'in', 'into', 'near', 'next', 'off', 'on', 'onto', 'opposite', 'out',
+ 'outside', 'round', 'through', 'under', 'underneath', 'up')
+ if search_phrase_token._.holmes.lemma == 'when':
+ if document_token.tag_ == 'IN':
+ return document_token._.holmes.lemma in ('after', 'before', 'by', 'for',
+ 'since', 'till', 'until')
+ return document_token.ent_type_ in ('DATE', 'TIME')
+ if search_phrase_token._.holmes.lemma == 'how':
+ return document_token.tag_ == 'IN' and document_token._.holmes.lemma in ('by', 'with')
+ if search_phrase_token._.holmes.lemma == 'why':
+ if document_token.tag_ == 'IN':
+ if document_token._.holmes.lemma == 'in':
+ return len([1 for c in document_token._.holmes.children if
+ c.child_token(document_token.doc)._.holmes.lemma == 'order']) > 0
+ return document_token._.holmes.lemma in ('because',)
+ if document_token.dep_ in ('advcl', 'prep') and document_token.text.lower() == 'owing':
+ return True
+ if document_token.dep_ == 'npadvmod' and document_token.text.lower() == 'thanks':
+ return True
+
+ # syntactic rather than semantic children are examined so that subject-predicate
+ # phrases are handled correctly
+ return document_token.dep_ in ('advmod', 'advcl', 'acomp') and len([1 for c in
+ document_token.children if c._.holmes.lemma in ('because',) or c.tag_ == 'TO']) > 0
+ return False
+
+ def normalize_hyphens(self, word):
+ """ Normalizes hyphens for ontology matching. Depending on the language,
+ this may involve replacing them with spaces (English) or deleting them entirely
+ (German).
+ """
+ if word.strip().startswith('-') or word.endswith('-'):
+ return word
+ else:
+ return word.replace('-', ' ')
diff --git a/holmes_extractor/manager.py b/holmes_extractor/manager.py
index 4857d7b..3b54c98 100644
--- a/holmes_extractor/manager.py
+++ b/holmes_extractor/manager.py
@@ -1,129 +1,266 @@
-from multiprocessing import Process, Queue, Manager as Multiprocessing_manager, cpu_count
+from multiprocessing import Process, Queue, Manager as MultiprocessingManager, cpu_count
from threading import Lock
+from string import punctuation
+from math import sqrt
import traceback
import sys
+import os
import jsonpickle
+import pkg_resources
+import spacy
+import coreferee
+from spacy import Language
+from spacy.tokens import Doc, Token
+from thinc.api import Config
from .errors import *
-from .structural_matching import StructuralMatcher, ThreadsafeContainer
-from .semantics import SemanticAnalyzerFactory
-from .extensive_matching import *
+from .matching import StructuralMatcher
+from .ontology import Ontology
+from .parsing import SemanticAnalyzerFactory, SemanticAnalyzer, SemanticMatchingHelperFactory,\
+ LinguisticObjectFactory, SearchPhrase, SERIALIZED_DOCUMENT_VERSION
+from .classification import SupervisedTopicTrainingBasis, SupervisedTopicClassifier,\
+ SupervisedTopicClassifierModel
+from .topic_matching import TopicMatcher, TopicMatchDictionaryOrderer
from .consoles import HolmesConsoles
-def validate_options(
- semantic_analyzer, overall_similarity_threshold,
- embedding_based_matching_on_root_words, perform_coreference_resolution):
- if overall_similarity_threshold < 0.0 or overall_similarity_threshold > 1.0:
- raise ValueError(
- 'overall_similarity_threshold must be between 0 and 1')
- if overall_similarity_threshold != 1.0 and not semantic_analyzer.model_supports_embeddings():
- raise ValueError(
- 'Model has no embeddings: overall_similarity_threshold must be 1.')
- if overall_similarity_threshold == 1.0 and embedding_based_matching_on_root_words:
- raise ValueError(
- 'overall_similarity_threshold is 1; embedding_based_matching_on_root_words must be '\
- 'False')
- if perform_coreference_resolution and not \
- semantic_analyzer.model_supports_coreference_resolution():
- raise ValueError(
- 'Model does not support coreference resolution: perform_coreference_resolution may '\
- 'not be True')
-
+TIMEOUT_SECONDS = 180
+
+absolute_config_filename = pkg_resources.resource_filename(__name__, 'config.cfg')
+config = Config().from_disk(absolute_config_filename)
+vector_nlps_config_dict = config['vector_nlps']
+model_names_to_nlps = {}
+MODEL_NAMES_TO_SEMANTIC_ANALYZERS = {}
+nlp_lock = Lock()
+pipeline_components_lock = Lock()
+
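+# spaCy pipelines and semantic analyzers are cached per model name and shared between
+# Manager instances; nlp_lock guards these caches against concurrent initialization.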
+def get_nlp(model_name:str) -> Language:
+ with nlp_lock:
+ if model_name not in model_names_to_nlps:
+ if model_name.endswith('_trf'):
+ model_names_to_nlps[model_name] = spacy.load(model_name,
+ config={'components.transformer.model.tokenizer_config.use_fast': False})
+ else:
+ model_names_to_nlps[model_name] = spacy.load(model_name)
+ return model_names_to_nlps[model_name]
+
+def get_semantic_analyzer(nlp:Language) -> SemanticAnalyzer:
+ global MODEL_NAMES_TO_SEMANTIC_ANALYZERS
+ model_name = '_'.join((nlp.meta['lang'], nlp.meta['name']))
+ vectors_nlp = get_nlp(vector_nlps_config_dict[model_name]) \
+ if model_name in vector_nlps_config_dict else nlp
+ with nlp_lock:
+ if model_name not in MODEL_NAMES_TO_SEMANTIC_ANALYZERS:
+ MODEL_NAMES_TO_SEMANTIC_ANALYZERS[model_name] = \
+ SemanticAnalyzerFactory().semantic_analyzer(nlp=nlp, vectors_nlp=vectors_nlp)
+ return MODEL_NAMES_TO_SEMANTIC_ANALYZERS[model_name]
class Manager:
"""The facade class for the Holmes library.
Parameters:
- model -- the name of the spaCy model, e.g. *en_core_web_lg*
+ model -- the name of the spaCy model, e.g. *en_core_web_trf*
overall_similarity_threshold -- the overall similarity threshold for embedding-based
- matching. Defaults to *1.0*, which deactivates embedding-based matching.
+ matching. Defaults to *1.0*, which deactivates embedding-based matching. Note that this
+ parameter is not relevant for topic matching, where the thresholds for embedding-based
+ matching are set on the call to *topic_match_documents_against*.
embedding_based_matching_on_root_words -- determines whether or not embedding-based
matching should be attempted on search-phrase root tokens, which has a considerable
- performance hit. Defaults to *False*.
+ performance hit. Defaults to *False*. Note that this parameter is not relevant for topic
+ matching.
ontology -- an *Ontology* object. Defaults to *None* (no ontology).
analyze_derivational_morphology -- *True* if matching should be attempted between different
words from the same word family. Defaults to *True*.
- perform_coreference_resolution -- *True*, *False*, or *None* if coreference resolution
- should be performed depending on whether the model supports it. Defaults to *None*.
- debug -- a boolean value specifying whether debug representations should be outputted
- for parsed sentences. Defaults to *False*.
+ perform_coreference_resolution -- *True* if coreference resolution should be taken into account
+ when matching. Defaults to *True*.
+ use_reverse_dependency_matching -- *True* if appropriate dependencies in documents can be
+ matched to dependencies in search phrases where the two dependencies point in opposite
+ directions. Defaults to *True*.
+ number_of_workers -- the number of worker processes to use, or *None* if the number of worker
+ processes should depend on the number of available cores. Defaults to *None*.
+ verbose -- a boolean value specifying whether multiprocessing messages should be outputted to
+ the console. Defaults to *False*.
"""
def __init__(
- self, model, *, overall_similarity_threshold=1.0,
- embedding_based_matching_on_root_words=False, ontology=None,
- analyze_derivational_morphology=True, perform_coreference_resolution=None, debug=False):
- self.semantic_analyzer = SemanticAnalyzerFactory().semantic_analyzer(
- model=model,
- perform_coreference_resolution=perform_coreference_resolution,
- debug=debug)
- if perform_coreference_resolution is None:
- perform_coreference_resolution = \
- self.semantic_analyzer.model_supports_coreference_resolution()
- validate_options(
- self.semantic_analyzer, overall_similarity_threshold,
- embedding_based_matching_on_root_words, perform_coreference_resolution)
+ self, model:str, *, overall_similarity_threshold:float=1.0,
+ embedding_based_matching_on_root_words:bool=False, ontology:Ontology=None,
+ analyze_derivational_morphology:bool=True, perform_coreference_resolution:bool=True,
+ use_reverse_dependency_matching:bool=True, number_of_workers:int=None,
+ verbose:bool=False):
+ self.verbose = verbose
+ self.nlp = get_nlp(model)
+ with pipeline_components_lock:
+ if not self.nlp.has_pipe('coreferee'):
+ self.nlp.add_pipe('coreferee')
+ if not self.nlp.has_pipe('holmes'):
+ self.nlp.add_pipe('holmes')
+ self.semantic_analyzer = get_semantic_analyzer(self.nlp)
+ if not self.semantic_analyzer.model_supports_embeddings():
+ overall_similarity_threshold = 1.0
+ if overall_similarity_threshold < 0.0 or overall_similarity_threshold > 1.0:
+ raise ValueError(
+ 'overall_similarity_threshold must be between 0.0 and 1.0')
+ if overall_similarity_threshold == 1.0 and embedding_based_matching_on_root_words:
+ raise ValueError(
+ 'overall_similarity_threshold is 1.0; embedding_based_matching_on_root_words must '\
+ 'be False')
self.ontology = ontology
- self.debug = debug
+ self.analyze_derivational_morphology = analyze_derivational_morphology
+ self.semantic_matching_helper = SemanticMatchingHelperFactory().semantic_matching_helper(
+ language=self.nlp.meta['lang'], ontology=ontology,
+ analyze_derivational_morphology=analyze_derivational_morphology)
self.overall_similarity_threshold = overall_similarity_threshold
- self.embedding_based_matching_on_root_words = embedding_based_matching_on_root_words
self.perform_coreference_resolution = perform_coreference_resolution
- self.structural_matcher = StructuralMatcher(
- self.semantic_analyzer, ontology, overall_similarity_threshold,
- embedding_based_matching_on_root_words,
+ self.use_reverse_dependency_matching = use_reverse_dependency_matching
+ self.linguistic_object_factory = LinguisticObjectFactory(
+ self.semantic_analyzer, self.semantic_matching_helper, ontology,
+ overall_similarity_threshold, embedding_based_matching_on_root_words,
analyze_derivational_morphology, perform_coreference_resolution)
- self.threadsafe_container = ThreadsafeContainer()
+ self.semantic_matching_helper.ontology_reverse_derivational_dict = \
+ self.linguistic_object_factory.get_ontology_reverse_derivational_dict()
+ self.structural_matcher = StructuralMatcher(
+ self.semantic_matching_helper, ontology, embedding_based_matching_on_root_words,
+ analyze_derivational_morphology, perform_coreference_resolution,
+ use_reverse_dependency_matching,
+ self.semantic_analyzer.get_entity_label_to_vector_dict() if
+ self.semantic_analyzer.model_supports_embeddings() else {})
+ self.document_labels_to_worker_queues = {}
+ self.search_phrases = []
+ HolmesBroker.set_extensions()
+ for phraselet_template in self.semantic_matching_helper.phraselet_templates:
+ phraselet_template.template_doc = self.semantic_analyzer.parse(
+ phraselet_template.template_sentence)
+ if number_of_workers is None:
+ number_of_workers = cpu_count()
+ elif number_of_workers <= 0:
+ raise ValueError('number_of_workers must be a positive integer.')
+ self.number_of_workers = number_of_workers
+ self.next_worker_to_use = 0
+ self.multiprocessing_manager = MultiprocessingManager()
+ self.worker = Worker() # will be copied to worker processes by value (Windows) or
+ # by reference (Linux)
+ self.workers = []
+ self.input_queues = []
+ self.word_dictionaries_need_rebuilding = False
+ self.words_to_corpus_frequencies = {}
+ self.maximum_corpus_frequency = 0
- def parse_and_register_document(self, document_text, label=''):
+ for counter in range(0, self.number_of_workers):
+ input_queue = Queue()
+ self.input_queues.append(input_queue)
+ worker_label = ' '.join(('Worker', str(counter)))
+ this_worker = Process(
+ target=self.worker.listen, args=(
+ self.structural_matcher, self.overall_similarity_threshold, self.nlp.vocab, model,
+ SERIALIZED_DOCUMENT_VERSION, input_queue, worker_label),
+ daemon=True)
+ self.workers.append(this_worker)
+ this_worker.start()
+ self.lock = Lock()
+
+ def next_worker_queue_number(self):
+ self.next_worker_to_use += 1
+ if self.next_worker_to_use == self.number_of_workers:
+ self.next_worker_to_use = 0
+ return self.next_worker_to_use
+
+ def handle_response(self, reply_queue, number_of_messages, method_name):
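+        # Each worker reply is a (worker_label, return_value, return_info) tuple, where
+        # return_info is either a status message or the exception the worker raised.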
+ return_values = []
+ exception_worker_label = None
+ for _ in range(number_of_messages):
+ worker_label, return_value, return_info = reply_queue.get(timeout=TIMEOUT_SECONDS)
+ if isinstance(return_info, (WrongModelDeserializationError,
+ WrongVersionDeserializationError)):
+ raise return_info
+ elif isinstance(return_info, Exception):
+ if exception_worker_label is None:
+ exception_worker_label = worker_label
+ else:
+ return_values.append(return_value)
+ if self.verbose:
+ with self.lock:
+ print(return_info)
+ if exception_worker_label is not None:
+ with self.lock:
+ print(''.join(('ERROR executing ', method_name, '() on ',
+ exception_worker_label,
+ '. Please examine the output from the worker processes to identify the problem.')))
+ return return_values
+
+    def register_serialized_documents(self, document_dictionary:dict[str, bytes]) -> None:
"""Parameters:
- document_text -- the raw document text.
- label -- a label for the document which must be unique. Defaults to the empty string,
- which is intended for use cases involving single documents (typically user entries).
+ document_dictionary -- a dictionary from labels to serialized documents.
"""
+ reply_queue = self.multiprocessing_manager.Queue()
+ with self.lock:
+ for label, serialized_doc in document_dictionary.items():
+ if label in self.document_labels_to_worker_queues:
+ raise DuplicateDocumentError(label)
+ else:
+ worker_queue_number = self.next_worker_queue_number()
+ self.document_labels_to_worker_queues[label] = worker_queue_number
+ self.word_dictionaries_need_rebuilding = True
+ self.input_queues[worker_queue_number].put((
+ self.worker.register_serialized_document,
+                        (serialized_doc, label), reply_queue), timeout=TIMEOUT_SECONDS)
+ self.handle_response(reply_queue, len(document_dictionary), 'register_serialized_documents')
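+
+    # A minimal usage sketch, assuming a Manager instance 'manager'; the label and text
+    # are illustrative only:
+    #
+    #     doc = manager.nlp('A dog chased a cat.')
+    #     manager.register_serialized_documents({'pets': doc.to_bytes()})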
+
+ def register_serialized_document(self, serialized_document:bytes, label:str) -> None:
+ """Note that this function is the most efficient way of loading documents.
- doc = self.semantic_analyzer.parse(document_text)
- self.register_parsed_document(doc, label)
-
- def register_parsed_document(self, doc, label=''):
- """Parameters:
+ Parameters:
-        document -- a preparsed Holmes document.
+        serialized_document -- a Holmes document serialized using the *serialize_document()*
+            method.
label -- a label for the document which must be unique. Defaults to the empty string,
which is intended for use cases involving single documents (typically user entries).
"""
- indexed_document = self.structural_matcher.index_document(doc)
- self.threadsafe_container.register_document(indexed_document, label)
+ self.register_serialized_documents({label: serialized_document})
- def deserialize_and_register_document(self, document, label=''):
+ def parse_and_register_document(self, document_text:str, label:str='') -> None:
"""Parameters:
- document -- a Holmes document serialized using the *serialize_document()* function.
+ document_text -- the raw document text.
label -- a label for the document which must be unique. Defaults to the empty string,
which is intended for use cases involving single documents (typically user entries).
"""
- if self.perform_coreference_resolution:
- raise SerializationNotSupportedError(self.semantic_analyzer.model)
- doc = self.semantic_analyzer.from_serialized_string(document)
- self.semantic_analyzer.debug_structures(doc) # only has effect when debug=True
- indexed_document = self.structural_matcher.index_document(doc)
- self.threadsafe_container.register_document(indexed_document, label)
-
- def remove_document(self, label):
+
+ doc = self.nlp(document_text)
+ self.register_serialized_document(doc.to_bytes(), label)
+
+ def remove_document(self, label:str) -> None:
"""Parameters:
label -- the label of the document to be removed.
"""
- self.threadsafe_container.remove_document(label)
-
- def remove_all_documents(self):
- self.threadsafe_container.remove_all_documents()
-
- def document_labels(self):
+ reply_queue = self.multiprocessing_manager.Queue()
+ with self.lock:
+ if label in self.document_labels_to_worker_queues:
+ self.input_queues[self.document_labels_to_worker_queues[label]].put((
+ self.worker.remove_document, (label,), reply_queue), timeout=TIMEOUT_SECONDS)
+ del self.document_labels_to_worker_queues[label]
+ self.word_dictionaries_need_rebuilding = True
+ else:
+ return
+ self.handle_response(reply_queue, 1, 'remove_document')
+
+ def remove_all_documents(self) -> None:
+ reply_queue = self.multiprocessing_manager.Queue()
+ with self.lock:
+ for worker_index in range(self.number_of_workers):
+ self.input_queues[worker_index].put((
+ self.worker.remove_all_documents, None, reply_queue), timeout=TIMEOUT_SECONDS)
+ self.word_dictionaries_need_rebuilding = True
+ self.document_labels_to_worker_queues = {}
+ self.handle_response(reply_queue, self.number_of_workers, 'remove_all_documents')
+
+ def document_labels(self) -> list[str]:
"""Returns a list of the labels of the currently registered documents."""
- return self.threadsafe_container.document_labels()
+ with self.lock:
+ unsorted_labels = self.document_labels_to_worker_queues.keys()
+ return sorted(unsorted_labels)
- def serialize_document(self, label):
+ def serialize_document(self, label:str) -> bytes:
"""Returns a serialized representation of a Holmes document that can be persisted to
a file. If *label* is not the label of a registered document, *None* is returned
instead.
@@ -132,239 +269,228 @@ def serialize_document(self, label):
label -- the label of the document to be serialized.
"""
+ reply_queue = self.multiprocessing_manager.Queue()
+ with self.lock:
+ if label in self.document_labels_to_worker_queues:
+ self.input_queues[self.document_labels_to_worker_queues[label]].put((
+                    self.worker.get_serialized_document, (label,), reply_queue), timeout=TIMEOUT_SECONDS)
+ else:
+ return None
+ return self.handle_response(reply_queue, 1, 'serialize_document')[0]
- if self.perform_coreference_resolution:
- raise SerializationNotSupportedError(self.semantic_analyzer.model)
- doc = self.threadsafe_container.get_document(label)
- if doc is not None:
- return self.semantic_analyzer.to_serialized_string(doc)
+ def get_document(self, label:str='') -> Doc:
+ """Returns a Holmes document. If *label* is not the label of a registered document, *None*
+ is returned instead.
+
+ Parameters:
+
+        label -- the label of the document to be returned.
+ """
+ serialized_document = self.serialize_document(label)
+ return None if serialized_document is None else \
+ Doc(self.nlp.vocab).from_bytes(serialized_document)
+
+ def debug_document(self, label:str='') -> None:
+ """Outputs a debug representation for a loaded document.
+ """
+ serialized_document = self.serialize_document(label)
+ if serialized_document is not None:
+ doc = Doc(self.nlp.vocab).from_bytes(serialized_document)
+ self.semantic_analyzer.debug_structures(doc)
else:
- return None
+ print('No document with label', label)
- def register_search_phrase(self, search_phrase_text, label=None):
- """Parameters:
+ def internal_get_search_phrase(self, search_phrase_text, label):
+ if label is None:
+ label = search_phrase_text
+ search_phrase_doc = self.nlp(search_phrase_text)
+ search_phrase = self.linguistic_object_factory.create_search_phrase(
+ search_phrase_text, search_phrase_doc, label, None, False, False, False, False)
+ return search_phrase
+
+ def register_search_phrase(self, search_phrase_text:str, label:str=None) -> SearchPhrase:
+ """Registers and returns a new search phrase.
+
+ Parameters:
search_phrase_text -- the raw search phrase text.
label -- a label for the search phrase which need *not* be unique. Defaults to the raw
search phrase text.
"""
- if label is None:
- label = search_phrase_text
- search_phrase_doc = self.semantic_analyzer.parse(search_phrase_text)
- search_phrase = self.structural_matcher.create_search_phrase(
- search_phrase_text, search_phrase_doc, label, None, False)
- self.threadsafe_container.register_search_phrase(search_phrase)
-
- def remove_all_search_phrases(self):
- self.threadsafe_container.remove_all_search_phrases()
-
- def remove_all_search_phrases_with_label(self, label):
- self.threadsafe_container.remove_all_search_phrases_with_label(label)
-
- def match(self):
- """Matches the registered search phrases to the registered documents. Returns a list
- of *Match* objects sorted by their overall similarity measures in descending order.
- Should be called by applications wishing to retain references to the spaCy and
- Holmes information that was used to derive the matches.
- """
- indexed_documents = self.threadsafe_container.get_indexed_documents()
- search_phrases = self.threadsafe_container.get_search_phrases()
- return self.structural_matcher.match(
- indexed_documents=indexed_documents,
- search_phrases=search_phrases,
- output_document_matching_message_to_console=False,
- match_depending_on_single_words=None,
- compare_embeddings_on_root_words=False,
- compare_embeddings_on_non_root_words=True,
- document_labels_to_indexes_for_reverse_matching_sets=None,
- document_labels_to_indexes_for_embedding_reverse_matching_sets=None)
-
- def _build_match_dictionaries(self, matches):
- """Builds and returns a list of dictionaries describing matches."""
- match_dicts = []
- for match in matches:
- earliest_sentence_index = sys.maxsize
- latest_sentence_index = -1
- for word_match in match.word_matches:
- sentence_index = word_match.document_token.sent.start
- if sentence_index < earliest_sentence_index:
- earliest_sentence_index = sentence_index
- if sentence_index > latest_sentence_index:
- latest_sentence_index = sentence_index
- sentences_string = ' '.join(
- sentence.text.strip() for sentence in
- match.word_matches[0].document_token.doc.sents if sentence.start >=
- earliest_sentence_index and sentence.start <= latest_sentence_index)
-
- match_dict = {
- 'search_phrase': match.search_phrase_label,
- 'document': match.document_label,
- 'index_within_document': match.index_within_document,
- 'sentences_within_document': sentences_string,
- 'negated': match.is_negated,
- 'uncertain': match.is_uncertain,
- 'involves_coreference': match.involves_coreference,
- 'overall_similarity_measure': match.overall_similarity_measure}
- text_word_matches = []
- for word_match in match.word_matches:
- text_word_matches.append({
- 'search_phrase_word': word_match.search_phrase_word,
- 'document_word': word_match.document_word,
- 'document_phrase': self.semantic_analyzer.get_dependent_phrase(
- word_match.document_token, word_match.document_subword),
- 'match_type': word_match.type,
- 'similarity_measure': str(word_match.similarity_measure),
- 'involves_coreference': word_match.involves_coreference,
- 'extracted_word': word_match.extracted_word,
- 'explanation': word_match.explain()})
- match_dict['word_matches'] = text_word_matches
- match_dicts.append(match_dict)
- return match_dicts
-
- def match_returning_dictionaries(self):
- """Matches the registered search phrases to the registered documents. Returns a list
- of dictionaries describing any matches, sorted by their overall similarity measures in
- descending order. Callers of this method do not have to manage any further
- dependencies on spaCy or Holmes.
- """
- return self._build_match_dictionaries(self.match())
+ search_phrase = self.internal_get_search_phrase(search_phrase_text, label)
+ search_phrase.pack()
+ reply_queue = self.multiprocessing_manager.Queue()
+ with self.lock:
+ for worker_index in range(self.number_of_workers):
+ self.input_queues[worker_index].put((
+ self.worker.register_search_phrase,
+ (search_phrase,), reply_queue), timeout=TIMEOUT_SECONDS)
+ self.search_phrases.append(search_phrase)
+ self.handle_response(reply_queue, self.number_of_workers, 'register_search_phrase')
+ return search_phrase
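+
+    # A minimal usage sketch, assuming a Manager instance 'manager'; the phrase and label
+    # are illustrative only:
+    #
+    #     manager.register_search_phrase('A dog chases a cat', label='chasing')
+    #     manager.list_search_phrase_labels()  # ['chasing'], if no other phrases are registered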
+
+ def remove_all_search_phrases_with_label(self, label:str) -> None:
+ reply_queue = self.multiprocessing_manager.Queue()
+ with self.lock:
+ for worker_index in range(self.number_of_workers):
+ self.input_queues[worker_index].put((
+ self.worker.remove_all_search_phrases_with_label,
+ (label,), reply_queue), timeout=TIMEOUT_SECONDS)
+ self.search_phrases = [search_phrase for search_phrase in self.search_phrases
+ if search_phrase.label != label]
+ self.handle_response(reply_queue, self.number_of_workers,
+ 'remove_all_search_phrases_with_label')
+
+ def remove_all_search_phrases(self) -> None:
+ reply_queue = self.multiprocessing_manager.Queue()
+ with self.lock:
+ for worker_index in range(self.number_of_workers):
+ self.input_queues[worker_index].put((
+ self.worker.remove_all_search_phrases, None, reply_queue),
+ timeout=TIMEOUT_SECONDS)
+ self.search_phrases = []
+ self.handle_response(reply_queue, self.number_of_workers, 'remove_all_search_phrases')
+
+    def list_search_phrase_labels(self) -> list[str]:
+        """Returns a list of the labels of the currently registered search phrases."""
+        with self.lock:
+            return sorted(list({search_phrase.label for search_phrase in self.search_phrases}))
+
+ def match(self, search_phrase_text:str=None, document_text:str=None) -> list[dict]:
+ """ Matches search phrases to documents and returns the result as match dictionaries.
- def match_search_phrases_against(self, entry):
- """Matches the registered search phrases against a single document
- supplied to the method and returns dictionaries describing any matches.
- """
- search_phrases = self.threadsafe_container.get_search_phrases()
- doc = self.semantic_analyzer.parse(entry)
- indexed_documents = {'':self.structural_matcher.index_document(doc)}
- matches = self.structural_matcher.match(
- indexed_documents=indexed_documents,
- search_phrases=search_phrases,
- output_document_matching_message_to_console=False,
- match_depending_on_single_words=None,
- compare_embeddings_on_root_words=False,
- compare_embeddings_on_non_root_words=True,
- document_labels_to_indexes_for_reverse_matching_sets=None,
- document_labels_to_indexes_for_embedding_reverse_matching_sets=None)
- return self._build_match_dictionaries(matches)
-
- def match_documents_against(self, search_phrase_text):
- """Matches the registered documents against a single search phrase
- supplied to the method and returns dictionaries describing any matches.
+ Parameters:
+
+ search_phrase_text -- a text from which to generate a search phrase, or *None* if the
+ preloaded search phrases should be used for matching.
+ document_text -- a text from which to generate a document, or *None* if the preloaded
+ documents should be used for matching.
"""
- indexed_documents = self.threadsafe_container.get_indexed_documents()
- search_phrase_doc = self.semantic_analyzer.parse(search_phrase_text)
- search_phrases = [self.structural_matcher.create_search_phrase(
- search_phrase_text, search_phrase_doc, search_phrase_text, None, False)]
- matches = self.structural_matcher.match(
- indexed_documents=indexed_documents,
- search_phrases=search_phrases,
- output_document_matching_message_to_console=False,
- match_depending_on_single_words=None,
- compare_embeddings_on_root_words=False,
- compare_embeddings_on_non_root_words=True,
- document_labels_to_indexes_for_reverse_matching_sets=None,
- document_labels_to_indexes_for_embedding_reverse_matching_sets=None)
- return self._build_match_dictionaries(matches)
- def topic_match_documents_against(
- self, text_to_match, *, maximum_activation_distance=75,
- relation_score=30, reverse_only_relation_score=20,
- single_word_score=5, single_word_any_tag_score=2,
- overlapping_relation_multiplier=1.5, embedding_penalty=0.6,
- ontology_penalty=0.9,
- maximum_number_of_single_word_matches_for_relation_matching=500,
- maximum_number_of_single_word_matches_for_embedding_matching=100,
- sideways_match_extent=100, only_one_result_per_document=False, number_of_results=10,
- document_label_filter=None):
- """Returns the results of a topic match between an entered text and the loaded documents.
+ if search_phrase_text is not None:
+ search_phrase = self.internal_get_search_phrase(search_phrase_text, '')
+ elif len(self.list_search_phrase_labels()) == 0:
+ raise NoSearchPhraseError('At least one search phrase is required for matching.')
+ else:
+ search_phrase = None
+ if document_text is not None:
+ serialized_document = self.nlp(document_text).to_bytes()
+ with self.lock:
+ worker_queue_number = self.next_worker_queue_number()
+ worker_range = range(worker_queue_number, worker_queue_number + 1)
+ number_of_workers = 1
+ else:
+ with self.lock:
+ if len(self.document_labels_to_worker_queues) == 0:
+ raise NoDocumentError('At least one document is required for matching.')
+ serialized_document = None
+ number_of_workers = self.number_of_workers
+ worker_range = range(number_of_workers)
+ reply_queue = self.multiprocessing_manager.Queue()
+ for worker_index in worker_range:
+ self.input_queues[worker_index].put((
+ self.worker.match, (serialized_document, search_phrase), reply_queue),
+ timeout=TIMEOUT_SECONDS)
+ worker_match_dictss = self.handle_response(reply_queue, number_of_workers,
+ 'match')
+ match_dicts = []
+ for worker_match_dicts in worker_match_dictss:
+ match_dicts.extend(worker_match_dicts)
+ return self.structural_matcher.sort_match_dictionaries(match_dicts)
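+
+    # A minimal usage sketch for ad-hoc matching, assuming a Manager instance 'manager';
+    # the texts are illustrative only:
+    #
+    #     match_dicts = manager.match(
+    #         search_phrase_text='A dog chases a cat',
+    #         document_text='Yesterday the dog chased the cat.')
+    #     for match_dict in match_dicts:
+    #         print(match_dict)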
+
+ def get_corpus_frequency_information(self):
+
+ def merge_dicts_adding_common_values(dict1, dict2):
+ dict_to_return = {**dict1, **dict2}
+ for key in dict_to_return:
+ if key in dict1 and key in dict2:
+ dict_to_return[key] = dict1[key] + dict2[key]
+ return dict_to_return
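+
+        # For example, merging {'dog': 2, 'cat': 1} with {'dog': 3} yields
+        # {'dog': 5, 'cat': 1}: values for keys present in both dictionaries are added.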
+
+ with self.lock:
+ if self.word_dictionaries_need_rebuilding:
+ reply_queue = self.multiprocessing_manager.Queue()
+ worker_frequency_dict = {}
+ for worker_index in range(self.number_of_workers):
+ self.input_queues[worker_index].put((
+ self.worker.get_words_to_corpus_frequencies, None, reply_queue),
+ timeout=TIMEOUT_SECONDS)
+ exception_worker_label = None
+ for _ in range(self.number_of_workers):
+ worker_label, return_value, return_info = reply_queue.get(
+ timeout=TIMEOUT_SECONDS)
+ if isinstance(return_info, Exception):
+ if exception_worker_label is None:
+ exception_worker_label = worker_label
+ else:
+ worker_frequency_dict = merge_dicts_adding_common_values(
+ worker_frequency_dict, return_value)
+ if self.verbose:
+ print(return_info)
+ if exception_worker_label is not None:
+                    print(''.join(('ERROR executing get_words_to_corpus_frequencies() on ',
+                        exception_worker_label,
+                        '. Please examine the output from the worker processes to identify the problem.')))
+ self.words_to_corpus_frequencies = {}
+ for word in worker_frequency_dict:
+ if word in self.words_to_corpus_frequencies:
+ self.words_to_corpus_frequencies[word] += \
+ worker_frequency_dict[word]
+ else:
+ self.words_to_corpus_frequencies[word] = \
+ worker_frequency_dict[word]
+ self.maximum_corpus_frequency = max(self.words_to_corpus_frequencies.values())
+ self.word_dictionaries_need_rebuilding = False
+ return self.words_to_corpus_frequencies, self.maximum_corpus_frequency
- Properties:
+ def topic_match_documents_against(
+ self, text_to_match:str, *, use_frequency_factor:bool=True,
+ maximum_activation_distance:int=75,
+ word_embedding_match_threshold:float=0.8,
+ initial_question_word_embedding_match_threshold:float=0.7,
+ relation_score:int=300, reverse_only_relation_score:int=200,
+ single_word_score:int=50, single_word_any_tag_score:int=20,
+ initial_question_word_answer_score:int=600,
+ initial_question_word_behaviour:str='process', different_match_cutoff_score:int=15,
+ overlapping_relation_multiplier:float=1.5, embedding_penalty:float=0.6,
+ ontology_penalty:float=0.9,
+ relation_matching_frequency_threshold:float=0.25,
+ embedding_matching_frequency_threshold:float=0.5,
+ sideways_match_extent:int=100, only_one_result_per_document:bool=False,
+ number_of_results:int=10, document_label_filter:str=None,
+ tied_result_quotient:float=0.9) -> list[dict]:
- text_to_match -- the text to match against the loaded documents.
- maximum_activation_distance -- the number of words it takes for a previous phraselet
- activation to reduce to zero when the library is reading through a document.
- relation_score -- the activation score added when a normal two-word
- relation is matched.
- reverse_only_relation_score -- the activation score added when a two-word relation
- is matched using a search phrase that can only be reverse-matched.
- single_word_score -- the activation score added when a normal single
- word is matched.
- single_word_any_tag_score -- the activation score added when a single word is matched
- whose tag did not correspond to the template specification.
- overlapping_relation_multiplier -- the value by which the activation score is multiplied
- when two relations were matched and the matches involved a common document word.
- embedding_penalty -- a value between 0 and 1 with which scores are multiplied when the
- match involved an embedding. The result is additionally multiplied by the overall
- similarity measure of the match.
- ontology_penalty -- a value between 0 and 1 with which scores are multiplied for each
- word match within a match that involved the ontology. For each such word match,
- the score is multiplied by the value (abs(depth) + 1) times, so that the penalty is
- higher for hyponyms and hypernyms than for synonyms and increases with the
- depth distance.
- maximum_number_of_single_word_matches_for_relation_matching -- the maximum number
- of single word matches that are used as the basis for matching relations. If more
- document words than this value correspond to each of the two words within a
- relation phraselet, matching on the phraselet is not attempted.
- maximum_number_of_single_word_matches_for_embedding_matching = the maximum number
- of single word matches that are used as the basis for matching with
- embeddings at the other word. If more than this value exist, matching with
- embeddings is not attempted because the performance hit would be too great.
- sideways_match_extent -- the maximum number of words that may be incorporated into a
- topic match either side of the word where the activation peaked.
- only_one_result_per_document -- if 'True', prevents multiple results from being returned
- for the same document.
- number_of_results -- the number of topic match objects to return.
- document_label_filter -- optionally, a string with which document labels must start to
- be considered for inclusion in the results.
- """
- topic_matcher = TopicMatcher(
- semantic_analyzer=self.semantic_analyzer,
- structural_matcher=self.structural_matcher,
- indexed_documents=self.threadsafe_container.get_indexed_documents(),
- maximum_activation_distance=maximum_activation_distance,
- relation_score=relation_score,
- reverse_only_relation_score=reverse_only_relation_score,
- single_word_score=single_word_score,
- single_word_any_tag_score=single_word_any_tag_score,
- overlapping_relation_multiplier=overlapping_relation_multiplier,
- embedding_penalty=embedding_penalty,
- ontology_penalty=ontology_penalty,
- maximum_number_of_single_word_matches_for_relation_matching=
- maximum_number_of_single_word_matches_for_relation_matching,
- maximum_number_of_single_word_matches_for_embedding_matching=
- maximum_number_of_single_word_matches_for_embedding_matching,
- sideways_match_extent=sideways_match_extent,
- only_one_result_per_document=only_one_result_per_document,
- number_of_results=number_of_results,
- document_label_filter=document_label_filter)
- return topic_matcher.topic_match_documents_against(text_to_match)
-
- def topic_match_documents_returning_dictionaries_against(
- self, text_to_match, *,
- maximum_activation_distance=75, relation_score=30, reverse_only_relation_score=20,
- single_word_score=5, single_word_any_tag_score=2, overlapping_relation_multiplier=1.5,
- embedding_penalty=0.6, ontology_penalty=0.9,
- maximum_number_of_single_word_matches_for_relation_matching=500,
- maximum_number_of_single_word_matches_for_embedding_matching=100,
- sideways_match_extent=100, only_one_result_per_document=False, number_of_results=10,
- document_label_filter=None, tied_result_quotient=0.9):
"""Returns a list of dictionaries representing the results of a topic match between an
- entered text and the loaded documents. Callers of this method do not have to manage any
- further dependencies on spaCy or Holmes.
+ entered text and the loaded documents.
Properties:
text_to_match -- the text to match against the loaded documents.
+ use_frequency_factor -- *True* if scores should be multiplied by a factor between 0 and 1
+ expressing how rare the words matching each phraselet are in the corpus. Note that,
+ even if set to *False*, the factors are still calculated as they are required for
+ determining which relation and embedding matches should be attempted.
maximum_activation_distance -- the number of words it takes for a previous phraselet
activation to reduce to zero when the library is reading through a document.
+ word_embedding_match_threshold -- the cosine similarity above which two words match where
+            the search phrase word does not govern an interrogative pronoun.
+ initial_question_word_embedding_match_threshold -- the cosine similarity above which two
+ words match where the search phrase word governs an interrogative pronoun.
relation_score -- the activation score added when a normal two-word
relation is matched.
reverse_only_relation_score -- the activation score added when a two-word relation
is matched using a search phrase that can only be reverse-matched.
- single_word_score -- the activation score added when a normal single
- word is matched.
+ single_word_score -- the activation score added when a normal single word is matched.
single_word_any_tag_score -- the activation score added when a single word is matched
- whose tag did not correspond to the template specification.
+ whose tag would not normally allow it to be matched by phraselets.
+ initial_question_word_answer_score -- the activation score added when a question word is
+ matched to an answering phrase.
+ initial_question_word_behaviour -- 'process' if a question word in the sentence
+ constituent at the beginning of *text_to_match* is to be matched to document phrases
+ that answer it; 'exclusive' if only topic matches that involve such question words
+ are to be permitted; 'ignore' if question words are to be ignored.
+ different_match_cutoff_score -- the activation threshold under which topic matches are
+ separated from one another. Note that the default value will probably be too low if
+ *use_frequency_factor* is set to *False*.
overlapping_relation_multiplier -- the value by which the activation score is multiplied
when two relations were matched and the matches involved a common document word.
embedding_penalty -- a value between 0 and 1 with which scores are multiplied when the
@@ -375,52 +501,105 @@ def topic_match_documents_returning_dictionaries_against(
the score is multiplied by the value (abs(depth) + 1) times, so that the penalty is
higher for hyponyms and hypernyms than for synonyms and increases with the
depth distance.
- maximum_number_of_single_word_matches_for_relation_matching -- the maximum number
- of single word matches that are used as the basis for matching relations. If more
- document words than this value correspond to each of the two words within a
- relation phraselet, matching on the phraselet is not attempted.
- maximum_number_of_single_word_matches_for_embedding_matching = the maximum number
- of single word matches that are used as the basis for matching with
- embeddings at the other word. If more than this value exist, matching with
- embeddings is not attempted because the performance hit would be too great.
+ relation_matching_frequency_threshold -- the frequency threshold above which single
+ word matches are used as the basis for attempting relation matches.
+ embedding_matching_frequency_threshold -- the frequency threshold above which single
+ word matches are used as the basis for attempting relation matches with
+ embedding-based matching on the second word.
sideways_match_extent -- the maximum number of words that may be incorporated into a
topic match either side of the word where the activation peaked.
only_one_result_per_document -- if 'True', prevents multiple results from being returned
for the same document.
number_of_results -- the number of topic match objects to return.
- tied_result_quotient -- the quotient between a result and following results above which
- the results are interpreted as tied.
document_label_filter -- optionally, a string with which document labels must start to
be considered for inclusion in the results.
+ tied_result_quotient -- the quotient between a result and following results above which
+ the results are interpreted as tied.
"""
-
- topic_matcher = TopicMatcher(
- semantic_analyzer=self.semantic_analyzer,
- structural_matcher=self.structural_matcher,
- indexed_documents=self.threadsafe_container.get_indexed_documents(),
- maximum_activation_distance=maximum_activation_distance,
- relation_score=relation_score,
- reverse_only_relation_score=reverse_only_relation_score,
- single_word_score=single_word_score,
- single_word_any_tag_score=single_word_any_tag_score,
- overlapping_relation_multiplier=overlapping_relation_multiplier,
- embedding_penalty=embedding_penalty,
- ontology_penalty=ontology_penalty,
- maximum_number_of_single_word_matches_for_relation_matching=
- maximum_number_of_single_word_matches_for_relation_matching,
- maximum_number_of_single_word_matches_for_embedding_matching=
- maximum_number_of_single_word_matches_for_embedding_matching,
- sideways_match_extent=sideways_match_extent,
- only_one_result_per_document=only_one_result_per_document,
- number_of_results=number_of_results,
- document_label_filter=document_label_filter)
- return topic_matcher.topic_match_documents_returning_dictionaries_against(
- text_to_match, tied_result_quotient=tied_result_quotient)
+ if word_embedding_match_threshold < 0.0 or word_embedding_match_threshold > 1.0:
+ raise ValueError(
+ 'word_embedding_match_threshold must be between 0 and 1')
+ if initial_question_word_embedding_match_threshold < 0.0 or \
+ initial_question_word_embedding_match_threshold > 1.0:
+ raise ValueError(
+ 'initial_question_word_embedding_match_threshold must be between 0 and 1')
+
+ if not self.semantic_analyzer.model_supports_embeddings():
+ word_embedding_match_threshold = initial_question_word_embedding_match_threshold = 1.0
+
+ overall_similarity_threshold = sqrt(word_embedding_match_threshold)
+ initial_question_word_overall_similarity_threshold = sqrt(
+ initial_question_word_embedding_match_threshold)
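+        # The square roots are taken because the overall similarity measure of a matched
+        # two-word phraselet is the geometric mean of its word similarities: if one word
+        # matches directly (similarity 1.0) and the other only via an embedding with
+        # similarity t, the overall measure is sqrt(1.0 * t) = sqrt(t).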
+
+ if initial_question_word_behaviour not in ('process', 'exclusive', 'ignore'):
+ raise ValueError(': '.join(('initial_question_word_behaviour',
+ initial_question_word_behaviour)))
+ if embedding_matching_frequency_threshold < 0.0 or \
+ embedding_matching_frequency_threshold > 1.0:
+ raise ValueError(': '.join(('embedding_matching_frequency_threshold',
+ str(embedding_matching_frequency_threshold))))
+ if relation_matching_frequency_threshold < 0.0 or \
+ relation_matching_frequency_threshold > 1.0:
+ raise ValueError(': '.join(('relation_matching_frequency_threshold',
+ str(relation_matching_frequency_threshold))))
+ if embedding_matching_frequency_threshold < relation_matching_frequency_threshold:
+ raise EmbeddingThresholdLessThanRelationThresholdError(' '.join((
+ 'embedding',
+ str(embedding_matching_frequency_threshold),
+ 'relation',
+ str(relation_matching_frequency_threshold))))
+ with self.lock:
+ if len(self.document_labels_to_worker_queues) == 0:
+ raise NoDocumentError('At least one document is required for matching.')
+        words_to_corpus_frequencies, maximum_corpus_frequency = \
+            self.get_corpus_frequency_information()
+
+ reply_queue = self.multiprocessing_manager.Queue()
+ text_to_match_doc = self.semantic_analyzer.parse(text_to_match)
+ phraselet_labels_to_phraselet_infos = \
+ self.linguistic_object_factory.get_phraselet_labels_to_phraselet_infos(
+ text_to_match_doc=text_to_match_doc,
+ words_to_corpus_frequencies=words_to_corpus_frequencies,
+ maximum_corpus_frequency=maximum_corpus_frequency,
+ process_initial_question_words=initial_question_word_behaviour in ('process',
+ 'exclusive'))
+ if len(phraselet_labels_to_phraselet_infos) == 0:
+ return []
+ phraselet_labels_to_search_phrases = \
+ self.linguistic_object_factory.create_search_phrases_from_phraselet_infos(
+ phraselet_labels_to_phraselet_infos.values(), relation_matching_frequency_threshold)
+ for search_phrase in phraselet_labels_to_search_phrases.values():
+ search_phrase.pack()
+
+ for worker_index in range(self.number_of_workers):
+ self.input_queues[worker_index].put((
+ self.worker.get_topic_matches,
+ (text_to_match, phraselet_labels_to_phraselet_infos,
+ phraselet_labels_to_search_phrases, maximum_activation_distance,
+ overall_similarity_threshold, initial_question_word_overall_similarity_threshold,
+ relation_score, reverse_only_relation_score, single_word_score,
+ single_word_any_tag_score, initial_question_word_answer_score,
+ initial_question_word_behaviour, different_match_cutoff_score,
+ overlapping_relation_multiplier, embedding_penalty,
+ ontology_penalty, relation_matching_frequency_threshold,
+ embedding_matching_frequency_threshold, sideways_match_extent,
+ only_one_result_per_document, number_of_results, document_label_filter,
+ use_frequency_factor), reply_queue), timeout=TIMEOUT_SECONDS)
+ worker_topic_match_dictss = self.handle_response(reply_queue,
+            self.number_of_workers, 'topic_match_documents_against')
+ topic_match_dicts = []
+ for worker_topic_match_dicts in worker_topic_match_dictss:
+ if worker_topic_match_dicts is not None:
+ topic_match_dicts.extend(worker_topic_match_dicts)
+ return TopicMatchDictionaryOrderer().order(
+ topic_match_dicts, number_of_results, tied_result_quotient)
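+
+    # A minimal usage sketch, assuming a Manager instance 'manager' with documents already
+    # registered; the query text is illustrative only:
+    #
+    #     topic_match_dicts = manager.topic_match_documents_against(
+    #         'Which dog chased the cat?', number_of_results=5)
+    #     for topic_match_dict in topic_match_dicts:
+    #         print(topic_match_dict)
+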
def get_supervised_topic_training_basis(
- self, *, classification_ontology=None,
- overlap_memory_size=10, oneshot=True, match_all_words=False, verbose=True):
- """ Returns an object that is used to train and generate a document model.
+ self, *, classification_ontology:Ontology=None,
+ overlap_memory_size:int=10, oneshot:bool=True, match_all_words:bool=False,
+ verbose:bool=True) -> SupervisedTopicTrainingBasis:
+ """ Returns an object that is used to train and generate a model for the supervised
+ document classification use case.
Parameters:
@@ -435,13 +614,17 @@ def get_supervised_topic_training_basis(
verbose -- if 'True', information about training progress is outputted to the console.
"""
return SupervisedTopicTrainingBasis(
+ linguistic_object_factory=self.linguistic_object_factory,
structural_matcher=self.structural_matcher,
classification_ontology=classification_ontology,
overlap_memory_size=overlap_memory_size, oneshot=oneshot,
- match_all_words=match_all_words, verbose=verbose)
+ match_all_words=match_all_words,
+ overall_similarity_threshold=self.overall_similarity_threshold, verbose=verbose)
- def deserialize_supervised_topic_classifier(self, serialized_model, verbose=False):
- """ Returns a document classifier that will use a pre-trained model.
+ def deserialize_supervised_topic_classifier(self,
+ serialized_model:str, verbose:bool=False) -> SupervisedTopicClassifier:
+ """ Returns a classifier for the supervised document classification use case
+ that will use a supplied pre-trained model.
Parameters:
@@ -451,373 +634,268 @@ def deserialize_supervised_topic_classifier(self, serialized_model, verbose=Fals
"""
model = jsonpickle.decode(serialized_model)
return SupervisedTopicClassifier(
- self.semantic_analyzer, self.structural_matcher, model, verbose)
+ self.semantic_analyzer, self.linguistic_object_factory, self.structural_matcher,
+ model, self.overall_similarity_threshold, verbose)
def start_chatbot_mode_console(self):
- """Starts a chatbot mode console enabling the matching of pre-registered search phrases
- to documents (chatbot entries) entered ad-hoc by the user.
+ """Starts a chatbot mode console enabling the matching of pre-registered
+        search phrases to documents (chatbot entries) entered ad-hoc by the user.
"""
holmes_consoles = HolmesConsoles(self)
holmes_consoles.start_chatbot_mode()
- def start_structural_search_mode_console(self):
- """Starts a structural search mode console enabling the matching of pre-registered documents
- to search phrases entered ad-hoc by the user.
- """
- holmes_consoles = HolmesConsoles(self)
- holmes_consoles.start_structural_search_mode()
-
- def start_topic_matching_search_mode_console(
- self, only_one_result_per_document=False,
- maximum_number_of_single_word_matches_for_relation_matching=500,
- maximum_number_of_single_word_matches_for_embedding_matching=100):
- """Starts a topic matching search mode console enabling the matching of pre-registered
- documents to search texts entered ad-hoc by the user.
-
- Parameters:
-
- only_one_result_per_document -- if 'True', prevents multiple topic match
- results from being returned for the same document.
- maximum_number_of_single_word_matches_for_relation_matching -- the maximum number
- of single word matches that are used as the basis for matching relations. If more
- document words than this value correspond to each of the two words within a
- relation phraselet, matching on the phraselet is not attempted.
- maximum_number_of_single_word_matches_for_embedding_matching = the maximum
- number of single word matches that are used as the basis for reverse matching with
- embeddings at the parent word. If more than this value exist, reverse matching with
- embeddings is not attempted because the performance hit would be too great.
+ def start_structural_extraction_mode_console(self):
+ """Starts a structural extraction mode console enabling the matching of pre-registered
+ documents to search phrases entered ad-hoc by the user.
"""
holmes_consoles = HolmesConsoles(self)
- holmes_consoles.start_topic_matching_search_mode(
- only_one_result_per_document,
- maximum_number_of_single_word_matches_for_relation_matching=
- maximum_number_of_single_word_matches_for_relation_matching,
- maximum_number_of_single_word_matches_for_embedding_matching=
- maximum_number_of_single_word_matches_for_embedding_matching)
-
-class MultiprocessingManager:
- """The facade class for the Holmes library used in a multiprocessing environment.
- This class is threadsafe.
-
- Parameters:
-
- model -- the name of the spaCy model, e.g. *en_core_web_lg*
- overall_similarity_threshold -- the overall similarity threshold for embedding-based
- matching. Defaults to *1.0*, which deactivates embedding-based matching.
- embedding_based_matching_on_root_words -- determines whether or not embedding-based
- matching should be attempted on root (parent) tokens, which has a considerable
- performance hit. Defaults to *False*.
- ontology -- an *Ontology* object. Defaults to *None* (no ontology).
- analyze_derivational_morphology -- *True* if matching should be attempted between different
- words from the same word family. Defaults to *True*.
- perform_coreference_resolution -- *True*, *False* or *None* if coreference resolution
- should be performed depending on whether the model supports it. Defaults to *None*.
- debug -- a boolean value specifying whether debug representations should be outputted
- for parsed sentences. Defaults to *False*.
- verbose -- a boolean value specifying whether status messages should be outputted to the
- console. Defaults to *True*
- number_of_workers -- the number of worker processes to use, or *None* if the number of worker
- processes should depend on the number of available cores. Defaults to *None*
- """
- def __init__(
- self, model, *, overall_similarity_threshold=1.0,
- embedding_based_matching_on_root_words=False, ontology=None,
- analyze_derivational_morphology=True, perform_coreference_resolution=None,
- debug=False, verbose=True, number_of_workers=None):
- self.semantic_analyzer = SemanticAnalyzerFactory().semantic_analyzer(
- model=model, perform_coreference_resolution=perform_coreference_resolution, debug=debug)
- if perform_coreference_resolution is None:
- perform_coreference_resolution = \
- self.semantic_analyzer.model_supports_coreference_resolution()
- validate_options(
- self.semantic_analyzer, overall_similarity_threshold,
- embedding_based_matching_on_root_words, perform_coreference_resolution)
- self.structural_matcher = StructuralMatcher(
- self.semantic_analyzer, ontology, overall_similarity_threshold,
- embedding_based_matching_on_root_words, analyze_derivational_morphology,
- perform_coreference_resolution)
- self._perform_coreference_resolution = perform_coreference_resolution
-
- self._verbose = verbose
- self._document_labels = []
- self._input_queues = []
- if number_of_workers is None:
- number_of_workers = cpu_count()
- self._number_of_workers = number_of_workers
- self._next_worker_to_use = 0
- self._multiprocessor_manager = Multiprocessing_manager()
- self._worker = Worker() # will be copied to worker processes by value (Windows) or
- # by reference (Linux)
- self._workers = []
- for counter in range(0, self._number_of_workers):
- input_queue = Queue()
- self._input_queues.append(input_queue)
- worker_label = ' '.join(('Worker', str(counter)))
- this_worker = Process(
- target=self._worker.listen, args=(
- self.semantic_analyzer, self.structural_matcher, input_queue, worker_label),
- daemon=True)
- self._workers.append(this_worker)
- this_worker.start()
- self._lock = Lock()
-
- def _add_document_label(self, label):
- with self._lock:
- if label in self._document_labels:
- raise DuplicateDocumentError(label)
- else:
- self._document_labels.append(label)
-
- def _handle_reply(self, worker_label, return_value):
- """ If 'return_value' is an exception, return it, otherwise return 'None'. """
- if isinstance(return_value, Exception):
- return return_value
- elif self._verbose:
- if not isinstance(return_value, list):
- with self._lock:
- print(': '.join((worker_label, return_value)))
- return None
-
- def _internal_register_documents(self, dictionary, worker_method):
- reply_queue = self._multiprocessor_manager.Queue()
- for label, value in dictionary.items():
- self._add_document_label(label)
- with self._lock:
- self._input_queues[
- self._next_worker_to_use].put((worker_method, (value, label), reply_queue))
- self._next_worker_to_use += 1
- if self._next_worker_to_use == self._number_of_workers:
- self._next_worker_to_use = 0
- recorded_exception = None
- for _ in range(0, len(dictionary)):
- possible_exception = self._handle_reply(*reply_queue.get())
- if possible_exception is not None and recorded_exception is None:
- recorded_exception = possible_exception
- if recorded_exception is not None:
- with self._lock:
- print('ERROR: not all documents were registered successfully. Please examine the '\
- ' above output from the worker processes to identify the problem.')
-
- def parse_and_register_documents(self, document_dictionary):
- """Parameters:
-
- document_dictionary -- a dictionary from unique document labels to raw document texts.
- """
- self._internal_register_documents(
- document_dictionary, self._worker.worker_parse_and_register_document)
-
- def deserialize_and_register_documents(self, serialized_document_dictionary):
- """Parameters:
-
- serialized_document_dictionary -- a dictionary from unique document labels to
- documents serialized using the *Manager.serialize_document()* method.
- """
- if self._perform_coreference_resolution:
- raise SerializationNotSupportedError(self.semantic_analyzer.model)
- self._internal_register_documents(
- serialized_document_dictionary, self._worker.worker_deserialize_and_register_document)
-
- def document_labels(self):
- with self._lock:
- document_labels = self._document_labels
- return sorted(document_labels)
-
- def topic_match_documents_returning_dictionaries_against(
- self, text_to_match, *, maximum_activation_distance=75, relation_score=30,
- reverse_only_relation_score=20, single_word_score=5, single_word_any_tag_score=2,
- overlapping_relation_multiplier=1.5, embedding_penalty=0.6, ontology_penalty=0.9,
- maximum_number_of_single_word_matches_for_relation_matching=500,
- maximum_number_of_single_word_matches_for_embedding_matching=100,
- sideways_match_extent=100, only_one_result_per_document=False, number_of_results=10,
- document_label_filter=None, tied_result_quotient=0.9):
- """Returns the results of a topic match between an entered text and the loaded documents.
-
- Properties:
-
- text_to_match -- the text to match against the loaded documents.
- maximum_activation_distance -- the number of words it takes for a previous phraselet
- activation to reduce to zero when the library is reading through a document.
- relation_score -- the activation score added when a normal two-word
- relation is matched.
- reverse_only_relation_score -- the activation score added when a two-word relation
- is matched using a search phrase that can only be reverse-matched.
- single_word_score -- the activation score added when a normal single
- word is matched.
- single_word_any_tag_score -- the activation score added when a single word is matched
- whose tag did not correspond to the template specification.
- overlapping_relation_multiplier -- the value by which the activation score is multiplied
- when two relations were matched and the matches involved a common document word.
- embedding_penalty -- a value between 0 and 1 with which scores are multiplied when the
- match involved an embedding. The result is additionally multiplied by the overall
- similarity measure of the match.
- ontology_penalty -- a value between 0 and 1 with which scores are multiplied for each
- word match within a match that involved the ontology. For each such word match,
- the score is multiplied by the value (abs(depth) + 1) times, so that the penalty is
- higher for hyponyms and hypernyms than for synonyms and increases with the
- depth distance.
- maximum_number_of_single_word_matches_for_relation_matching -- the maximum number
- of single word matches that are used as the basis for matching relations. If more
- document words than this value correspond to each of the two words within a
- relation phraselet, matching on the phraselet is not attempted.
- maximum_number_of_single_word_matches_for_embedding_matching = the maximum number
- of single word matches that are used as the basis for reverse matching with
- embeddings at the parent word. If more than this value exist, reverse matching with
- embeddings is not attempted because the performance hit would be too great.
- sideways_match_extent -- the maximum number of words that may be incorporated into a
- topic match either side of the word where the activation peaked.
- only_one_result_per_document -- if 'True', prevents multiple results from being returned
- for the same document.
- number_of_results -- the number of topic match objects to return.
- document_label_filter -- optionally, a string with which document labels must start to
- be considered for inclusion in the results.
- tied_result_quotient -- the quotient between a result and following results above which
- the results are interpreted as tied.
- """
- if maximum_number_of_single_word_matches_for_embedding_matching > \
- maximum_number_of_single_word_matches_for_relation_matching:
- raise EmbeddingThresholdGreaterThanRelationThresholdError(' '.join((
- 'embedding',
- str(maximum_number_of_single_word_matches_for_embedding_matching),
- 'relation',
- str(maximum_number_of_single_word_matches_for_relation_matching))))
- reply_queue = self._multiprocessor_manager.Queue()
- for counter in range(0, self._number_of_workers):
- self._input_queues[counter].put((
- self._worker.worker_topic_match_documents_returning_dictionaries_against,
- (
- text_to_match, maximum_activation_distance, relation_score,
- reverse_only_relation_score, single_word_score, single_word_any_tag_score,
- overlapping_relation_multiplier, embedding_penalty, ontology_penalty,
- maximum_number_of_single_word_matches_for_relation_matching,
- maximum_number_of_single_word_matches_for_embedding_matching,
- sideways_match_extent, only_one_result_per_document, number_of_results,
- document_label_filter, tied_result_quotient), reply_queue))
- topic_match_dicts = []
- recorded_exception = None
- for _ in range(0, self._number_of_workers):
- worker_label, worker_topic_match_dicts = reply_queue.get()
- if recorded_exception is None:
- recorded_exception = self._handle_reply(worker_label, worker_topic_match_dicts)
- if not isinstance(worker_topic_match_dicts, Exception):
- topic_match_dicts.extend(worker_topic_match_dicts)
- if recorded_exception is not None:
- with self._lock:
- print('ERROR: not all workers returned results. Please examine the above output '\
- ' from the worker processes to identify the problem.')
- return TopicMatchDictionaryOrderer().order(
- topic_match_dicts, number_of_results, tied_result_quotient)
+ holmes_consoles.start_structural_extraction_mode()
def start_topic_matching_search_mode_console(
- self, only_one_result_per_document=False,
- maximum_number_of_single_word_matches_for_relation_matching=500,
- maximum_number_of_single_word_matches_for_embedding_matching=100):
+ self, only_one_result_per_document:bool=False,
+ word_embedding_match_threshold:float=0.8,
+ initial_question_word_embedding_match_threshold:float=0.7):
"""Starts a topic matching search mode console enabling the matching of pre-registered
- documents to search texts entered ad-hoc by the user.
+ documents to query phrases entered ad-hoc by the user.
Parameters:
only_one_result_per_document -- if 'True', prevents multiple topic match
results from being returned for the same document.
- maximum_number_of_single_word_matches_for_relation_matching -- the maximum number
- of single word matches that are used as the basis for matching relations. If more
- document words than this value correspond to each of the two words within a
- relation phraselet, matching on the phraselet is not attempted.
- maximum_number_of_single_word_matches_for_embedding_matching = the maximum number
- of single word matches that are used as the basis for matching with
- embeddings at the other word. If more than this value exist, matching with
- embeddings is not attempted because the performance hit would be too great.
+ word_embedding_match_threshold -- the cosine similarity above which two words match
+ where the search phrase word does not govern an interrogative pronoun.
+ initial_question_word_embedding_match_threshold -- the cosine similarity above which two
+ words match where the search phrase word governs an interrogative pronoun.
"""
holmes_consoles = HolmesConsoles(self)
holmes_consoles.start_topic_matching_search_mode(
only_one_result_per_document,
- maximum_number_of_single_word_matches_for_relation_matching=
- maximum_number_of_single_word_matches_for_relation_matching,
- maximum_number_of_single_word_matches_for_embedding_matching=
- maximum_number_of_single_word_matches_for_embedding_matching)
+ word_embedding_match_threshold,
+ initial_question_word_embedding_match_threshold)
- def close(self):
- for worker in self._workers:
+ def close(self) -> None:
+ """ Terminates the worker processes. """
+ for worker in self.workers:
worker.terminate()
class Worker:
- """Worker implementation used by *MultiprocessingManager*.
+ """Worker implementation used by *Manager*.
"""
- def _error_header(self, method, args, worker_label):
- if method.__name__.endswith('register_document'):
- return ''.join((
- worker_label, ' - error registering document ', args[1],
- '. Please submit a Github issue including the following stack trace for analysis:'))
- else:
- return ''.join((
- worker_label,
- ' - error. Please submit a Github issue including the following stack trace for '\
- 'analysis:'))
-
- def listen(self, semantic_analyzer, structural_matcher, input_queue, worker_label):
- semantic_analyzer.reload_model() # necessary to avoid neuralcoref MemoryError on Linux
- indexed_documents = {}
+ def error_header(self, method, args, worker_label):
+ return ''.join((
+ worker_label,
+ ' - error:'))
+
+ def listen(self, structural_matcher, overall_similarity_threshold, vocab, model_name,
+ serialized_document_version, input_queue, worker_label):
+ state = {
+ 'structural_matcher': structural_matcher,
+ 'overall_similarity_threshold': overall_similarity_threshold,
+ 'vocab': vocab,
+ 'model_name': model_name,
+ 'serialized_document_version': serialized_document_version,
+ 'document_labels_to_documents': {},
+ 'corpus_index_dict': {},
+ 'search_phrases': [],
+ }
+ HolmesBroker.set_extensions()
while True:
method, args, reply_queue = input_queue.get()
try:
- reply = method(semantic_analyzer, structural_matcher, indexed_documents, *args)
- reply_queue.put((worker_label, reply))
+ if args is not None:
+ return_value, return_info = method(state, *args)
+ else:
+ return_value, return_info = method(state)
+ reply_queue.put((worker_label, return_value, return_info), timeout=TIMEOUT_SECONDS)
except Exception as err:
- print(self._error_header(method, args, worker_label))
+ print(self.error_header(method, args, worker_label))
print(traceback.format_exc())
- reply_queue.put((worker_label, err))
+ reply_queue.put((worker_label, None, err), timeout=TIMEOUT_SECONDS)
except:
- print(self._error_header(method, args, worker_label))
+ print(self.error_header(method, args, worker_label))
print(traceback.format_exc())
err_identifier = str(sys.exc_info()[0])
- reply_queue.put((worker_label, err_identifier))
-
- def worker_parse_and_register_document(
- self, semantic_analyzer, structural_matcher, indexed_documents, document_text, label):
- doc = semantic_analyzer.parse(document_text)
- indexed_document = structural_matcher.index_document(doc)
- indexed_documents[label] = indexed_document
- return ' '.join(('Parsed and registered document', label))
-
- def worker_deserialize_and_register_document(
- self, semantic_analyzer, structural_matcher, indexed_documents, document, label):
- doc = semantic_analyzer.from_serialized_string(document)
- indexed_document = structural_matcher.index_document(doc)
- indexed_documents[label] = indexed_document
- return ' '.join(('Deserialized and registered document', label))
-
- def worker_topic_match_documents_returning_dictionaries_against(
- self, semantic_analyzer, structural_matcher, indexed_documents, text_to_match,
- maximum_activation_distance, relation_score, reverse_only_relation_score,
- single_word_score, single_word_any_tag_score, overlapping_relation_multiplier,
- embedding_penalty, ontology_penalty,
- maximum_number_of_single_word_matches_for_relation_matching,
- maximum_number_of_single_word_matches_for_embedding_matching,
+ reply_queue.put((worker_label, None, err_identifier), timeout=TIMEOUT_SECONDS)
+
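+    # The request/reply protocol visible in *listen()* above: the parent process puts
+    # *(method, args, reply_queue)* tuples onto *input_queue*, where *method* is one of the
+    # *Worker* methods below and *args* is a tuple of positional arguments or *None*. Each
+    # method returns a *(return_value, return_info)* pair and the worker replies with
+    # *(worker_label, return_value, return_info)*, or *(worker_label, None, error)* if an
+    # exception was raised.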
+ def load_document(self, state, serialized_doc, document_label, corpus_index_dict):
+ doc = Doc(state['vocab']).from_bytes(serialized_doc)
+ if doc._.holmes_document_info.model != state['model_name']:
+ raise WrongModelDeserializationError('; '.join((
+ state['model_name'], doc._.holmes_document_info.model)))
+ if doc._.holmes_document_info.serialized_document_version != \
+ state['serialized_document_version']:
+ raise WrongVersionDeserializationError('; '.join((
+ str(state['serialized_document_version']),
+ str(doc._.holmes_document_info.serialized_document_version))))
+ state['document_labels_to_documents'][document_label] = doc
+ state['structural_matcher'].semantic_matching_helper.add_to_corpus_index(
+ corpus_index_dict, doc, document_label)
+ return doc
+
+ def register_serialized_document(self, state, serialized_doc, document_label):
+ self.load_document(state, serialized_doc, document_label, state['corpus_index_dict'])
+ return None, ' '.join(('Registered document', document_label))
+
+ def remove_document(self, state, document_label):
+ state['document_labels_to_documents'].pop(document_label)
+ state['corpus_index_dict'] = \
+ state['structural_matcher'].semantic_matching_helper.get_corpus_index_removing_document(
+ state['corpus_index_dict'], document_label)
+ return None, ' '.join(('Removed document', document_label))
+
+ def remove_all_documents(self, state):
+ state['document_labels_to_documents'] = {}
+ state['corpus_index_dict'] = {}
+ return None, 'Removed all documents'
+
+ def get_serialized_document(self, state, label):
+ if label in state['document_labels_to_documents']:
+ return state['document_labels_to_documents'][label].to_bytes(), \
+ ' '.join(('Returned serialized document with label', label))
+ else:
+ return None, ' '.join(('No document found with label', label))
+
+ def register_search_phrase(self, state, search_phrase):
+ search_phrase.unpack(state['vocab'])
+ state['search_phrases'].append(search_phrase)
+ return None, ' '.join(('Registered search phrase with label', search_phrase.label))
+
+ def remove_all_search_phrases_with_label(self, state, label):
+ state['search_phrases'] = [search_phrase for search_phrase in state['search_phrases']
+ if search_phrase.label != label]
+ return None, ' '.join(("Removed all search phrases with label '", label, "'"))
+
+ def remove_all_search_phrases(self, state):
+ state['search_phrases'] = []
+ return None, 'Removed all search phrases'
+
+ def get_words_to_corpus_frequencies(self, state):
+ words_to_corpus_frequencies = {}
+ for word, token_info_tuples in state['corpus_index_dict'].items():
+ if word in punctuation:
+ continue
+ cwps = [corpus_word_position for corpus_word_position, _, _ in token_info_tuples]
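+            # *corpus_index_dict* maps each word to a list of *(corpus_word_position, ...)*
+            # tuples; a word's corpus frequency is the number of distinct corpus word
+            # positions at which it occurs, so duplicates are removed via *set()* below.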
+ if word in words_to_corpus_frequencies:
+ words_to_corpus_frequencies[word] += len(set(cwps))
+ else:
+ words_to_corpus_frequencies[word] = len(set(cwps))
+ return words_to_corpus_frequencies, 'Retrieved words to corpus frequencies'
+
+ def match(self, state, serialized_doc, search_phrase):
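+        # Two modes, depending on the arguments: if *serialized_doc* is supplied, matching
+        # is performed against that single ad-hoc document only; otherwise the documents
+        # registered with this worker are used. Likewise, a supplied *search_phrase* is
+        # matched on its own, otherwise all registered search phrases are matched.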
+ if serialized_doc is not None:
+ corpus_index_dict = {}
+ doc = self.load_document(state, serialized_doc, '', corpus_index_dict)
+ document_labels_to_documents = {'': doc}
+ else:
+ corpus_index_dict = state['corpus_index_dict']
+ document_labels_to_documents = state['document_labels_to_documents']
+ search_phrases = [search_phrase] if search_phrase is not None \
+ else state['search_phrases']
+ if len(document_labels_to_documents) > 0 and len(search_phrases) > 0:
+ matches = state['structural_matcher'].match(
+ document_labels_to_documents=document_labels_to_documents,
+ corpus_index_dict=corpus_index_dict,
+ search_phrases=search_phrases,
+ match_depending_on_single_words=None,
+ compare_embeddings_on_root_words=state['structural_matcher'].\
+ embedding_based_matching_on_root_words,
+ compare_embeddings_on_non_root_words=True,
+ reverse_matching_corpus_word_positions=None,
+ embedding_reverse_matching_corpus_word_positions=None,
+ process_initial_question_words=False,
+ overall_similarity_threshold=state['overall_similarity_threshold'],
+ initial_question_word_overall_similarity_threshold=1.0)
+ return state['structural_matcher'].build_match_dictionaries(matches), \
+ 'Returned matches'
+ else:
+ return [], 'No stored objects to match against'
+
+ def get_topic_matches(self, state, text_to_match,
+ phraselet_labels_to_phraselet_infos, phraselet_labels_to_search_phrases,
+ maximum_activation_distance, overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold, relation_score,
+ reverse_only_relation_score, single_word_score, single_word_any_tag_score,
+ initial_question_word_answer_score, initial_question_word_behaviour,
+ different_match_cutoff_score, overlapping_relation_multiplier, embedding_penalty,
+ ontology_penalty, relation_matching_frequency_threshold,
+ embedding_matching_frequency_threshold,
sideways_match_extent, only_one_result_per_document, number_of_results,
- document_label_filter, tied_result_quotient):
- if len(indexed_documents) == 0:
- return []
+ document_label_filter, use_frequency_factor):
+ if len(state['document_labels_to_documents']) == 0:
+ return [], 'No stored documents to match against'
+ for search_phrase in phraselet_labels_to_search_phrases.values():
+ search_phrase.unpack(state['vocab'])
topic_matcher = TopicMatcher(
- semantic_analyzer=semantic_analyzer,
- structural_matcher=structural_matcher,
- indexed_documents=indexed_documents,
+ structural_matcher=state['structural_matcher'],
+ document_labels_to_documents=state['document_labels_to_documents'],
+ corpus_index_dict=state['corpus_index_dict'],
+ text_to_match=text_to_match,
+ phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
+ phraselet_labels_to_search_phrases=phraselet_labels_to_search_phrases,
maximum_activation_distance=maximum_activation_distance,
+ overall_similarity_threshold=overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=
+ initial_question_word_overall_similarity_threshold,
relation_score=relation_score,
reverse_only_relation_score=reverse_only_relation_score,
single_word_score=single_word_score,
single_word_any_tag_score=single_word_any_tag_score,
+ initial_question_word_answer_score=initial_question_word_answer_score,
+ initial_question_word_behaviour=initial_question_word_behaviour,
+ different_match_cutoff_score=different_match_cutoff_score,
overlapping_relation_multiplier=overlapping_relation_multiplier,
embedding_penalty=embedding_penalty,
ontology_penalty=ontology_penalty,
- maximum_number_of_single_word_matches_for_relation_matching=
- maximum_number_of_single_word_matches_for_relation_matching,
- maximum_number_of_single_word_matches_for_embedding_matching=
- maximum_number_of_single_word_matches_for_embedding_matching,
+ relation_matching_frequency_threshold=relation_matching_frequency_threshold,
+ embedding_matching_frequency_threshold=embedding_matching_frequency_threshold,
sideways_match_extent=sideways_match_extent,
only_one_result_per_document=only_one_result_per_document,
number_of_results=number_of_results,
- document_label_filter=document_label_filter)
- topic_match_dicts = \
- topic_matcher.topic_match_documents_returning_dictionaries_against(
- text_to_match, tied_result_quotient=tied_result_quotient)
- return topic_match_dicts
+ document_label_filter=document_label_filter,
+ use_frequency_factor=use_frequency_factor)
+ return topic_matcher.get_topic_match_dictionaries(), \
+ 'Returned topic match dictionaries'
+
+@Language.factory("holmes")
+class HolmesBroker:
+ def __init__(self, nlp:Language, name:str):
+ self.nlp = nlp
+ self.pid = os.getpid()
+ self.semantic_analyzer = get_semantic_analyzer(nlp)
+ self.set_extensions()
+
+ def __call__(self, doc:Doc) -> Doc:
+ if os.getpid() != self.pid:
+ raise MultiprocessingParsingNotSupportedError(
+ 'Unfortunately at present parsing cannot be shared between forked processes.')
+ try:
+ self.semantic_analyzer.holmes_parse(doc)
+ except:
+ print('Unexpected error annotating document, skipping ....')
+ exception_info_parts = sys.exc_info()
+ print(exception_info_parts[0])
+ print(exception_info_parts[1])
+ traceback.print_tb(exception_info_parts[2])
+ return doc
+
+ def __getstate__(self):
+ return self.nlp.meta
+
+ def __setstate__(self, meta):
+ nlp_name = '_'.join((meta['lang'], meta['name']))
+ self.nlp = spacy.load(nlp_name)
+ self.semantic_analyzer = get_semantic_analyzer(self.nlp)
+ self.pid = os.getpid()
+ HolmesBroker.set_extensions()
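+        # Only *nlp.meta* is pickled (see *__getstate__* above), so an unpickled
+        # *HolmesBroker* reloads the spaCy model by its language and name and rebuilds the
+        # semantic analyzer here; this presumably keeps pickled payloads small at the cost
+        # of a model load in the receiving process.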
+
+ @staticmethod
+ def set_extensions():
+ if not Doc.has_extension('coref_chains'):
+ Doc.set_extension('coref_chains', default=None)
+ if not Token.has_extension('coref_chains'):
+ Token.set_extension('coref_chains', default=None)
+ if not Doc.has_extension('holmes_document_info'):
+ Doc.set_extension('holmes_document_info', default=None)
+ if not Token.has_extension('holmes'):
+ Token.set_extension('holmes', default=None)
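+
+# Illustrative usage sketch (an assumption based on the factory registration above, not a
+# statement about the documented API): because *HolmesBroker* is registered as the "holmes"
+# component factory, it could be added to a loaded spaCy pipeline in the usual spaCy way:
+#
+#     import spacy
+#     nlp = spacy.load('en_core_web_lg')  # model name assumed purely for illustration
+#     nlp.add_pipe('holmes')              # appends the HolmesBroker component
+#     doc = nlp('The dog chased the cat.')
+#
+# Any component that supplies the *coref_chains* extension (see *set_extensions()* above)
+# would presumably also need to run earlier in the pipeline.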
diff --git a/holmes_extractor/matching.py b/holmes_extractor/matching.py
new file mode 100644
index 0000000..b4d6352
--- /dev/null
+++ b/holmes_extractor/matching.py
@@ -0,0 +1,1400 @@
+import copy
+import sys
+from spacy.tokens import Token
+from .errors import DuplicateDocumentError, NoSearchPhraseError, NoDocumentError
+from .parsing import Subword, Index
+
+ONTOLOGY_DEPTHS_TO_NAMES = {
+ -4: 'an ancestor', -3: 'a great-grandparent', -2: 'a grandparent', -1: 'a parent',
+ 0: 'a synonym', 1: 'a child', 2: 'a grandchild', 3: 'a great-grandchild', 4: 'a descendant'}
+
+class WordMatch:
+ """A match between a searched phrase word and a document word.
+
+ Properties:
+
+ search_phrase_token -- the spaCy token from the search phrase.
+ search_phrase_word -- the word that matched from the search phrase.
+ document_token -- the spaCy token from the document.
+ first_document_token -- the first token that matched from the document, which will equal
+ *document_token* except with multiword matches.
+    last_document_token -- the last token that matched from the document, which will equal
+ *document_token* except with multiword matches.
+ document_subword -- the subword from the token that matched, or *None* if the match was
+ with the whole token.
+ document_word -- the word or subword that matched structurally from the document.
+    word_match_type -- *direct*, *derivation*, *entity*, *embedding*, *entity_embedding*,
+        *ontology* or *question*.
+ similarity_measure -- for type *embedding*, the similarity between the two tokens,
+ otherwise 1.0.
+ is_negated -- *True* if this word match leads to a match of which it
+ is a part being negated.
+ is_uncertain -- *True* if this word match leads to a match of which it
+ is a part being uncertain.
+ structurally_matched_document_token -- the spaCy token from the document that matched
+ the dependency structure, which may be different from *document_token* if coreference
+ resolution is active.
+ involves_coreference -- *True* if *document_token* and *structurally_matched_document_token*
+ are different.
+ extracted_word -- the most specific term that corresponded to *document_word* within the
+ coreference chain.
+ depth -- the number of hyponym relationships linking *search_phrase_word* and
+ *document_word*, or *0* if ontology-based matching is not active.
+ search_phrase_initial_question_word -- *True* if *search_phrase_token* is an initial question
+ word or governs an initial question word.
+ """
+
+ def __init__(
+ self, search_phrase_token, search_phrase_word, document_token,
+ first_document_token, last_document_token, document_subword, document_word,
+ word_match_type, similarity_measure, is_negated, is_uncertain,
+ structurally_matched_document_token, extracted_word, depth,
+ search_phrase_initial_question_word):
+
+ self.search_phrase_token = search_phrase_token
+ self.search_phrase_word = search_phrase_word
+ self.document_token = document_token
+ self.first_document_token = first_document_token
+ self.last_document_token = last_document_token
+ self.document_subword = document_subword
+ self.document_word = document_word
+ self.word_match_type = word_match_type
+ self.similarity_measure = similarity_measure
+ self.is_negated = is_negated
+ self.is_uncertain = is_uncertain
+ self.structurally_matched_document_token = structurally_matched_document_token
+ self.extracted_word = extracted_word
+ self.depth = depth
+ self.search_phrase_initial_question_word = search_phrase_initial_question_word
+
+ @property
+ def involves_coreference(self):
+ return self.document_token != self.structurally_matched_document_token
+
+ def get_document_index(self):
+ if self.document_subword is not None:
+ subword_index = self.document_subword.index
+ else:
+ subword_index = None
+ return Index(self.document_token.i, subword_index)
+
+ def explain(self):
+ """ Creates a human-readable explanation of the word match from the perspective of the
+ document word (e.g. to be used as a tooltip over it)."""
+ search_phrase_display_word = self.search_phrase_token._.holmes.lemma.upper()
+ if self.word_match_type == 'direct':
+ return ''.join(("Matches ", search_phrase_display_word, " directly."))
+ elif self.word_match_type == 'derivation':
+ return ''.join(("Has a common stem with ", search_phrase_display_word, "."))
+ elif self.word_match_type == 'entity':
+ return ''.join(("Has an entity label matching ", search_phrase_display_word, "."))
+ elif self.word_match_type == 'question':
+ return ''.join(("Matches the question word ", search_phrase_display_word, "."))
+ elif self.word_match_type == 'embedding':
+ printable_similarity = str(int(self.similarity_measure * 100))
+ return ''.join((
+ "Has a word embedding that is ", printable_similarity,
+ "% similar to ", search_phrase_display_word, "."))
+ elif self.word_match_type == 'entity_embedding':
+ printable_similarity = str(int(self.similarity_measure * 100))
+ return ''.join((
+ "Has an entity label that is ", printable_similarity,
+ "% similar to the word embedding corresponding to ", search_phrase_display_word,
+ "."))
+ elif self.word_match_type == 'ontology':
+ working_depth = self.depth
+ if working_depth > 4:
+ working_depth = 4
+ elif working_depth < -4:
+ working_depth = -4
+ return ''.join((
+ "Is ", ONTOLOGY_DEPTHS_TO_NAMES[working_depth], " of ",
+ search_phrase_display_word, " in the ontology."))
+ else:
+ raise RuntimeError(' '.join(('Unrecognized type', self.word_match_type)))
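+    # For example, an *embedding* word match with *similarity_measure* 0.72 against a
+    # search phrase token whose Holmes lemma is 'dog' would be explained as
+    # "Has a word embedding that is 72% similar to DOG."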
+
+class Match:
+ """A match between a search phrase and a document.
+
+ Properties:
+
+ word_matches -- a list of *WordMatch* objects.
+ is_negated -- *True* if this match is negated.
+ is_uncertain -- *True* if this match is uncertain.
+ involves_coreference -- *True* if this match was found using coreference resolution.
+ search_phrase_label -- the label of the search phrase that matched.
+ search_phrase_text -- the text of the search phrase that matched.
+ document_label -- the label of the document that matched.
+ from_single_word_phraselet -- *True* if this is a match against a single-word
+ phraselet.
+    from_topic_match_phraselet_created_without_matching_tags -- *True* or *False*
+    from_reverse_only_topic_match_phraselet -- *True* or *False*
+ overall_similarity_measure -- the overall similarity of the match, or *1.0* if the embedding
+ strategy was not involved in the match.
+ index_within_document -- the index of the document token that matched the search phrase
+ root token.
+ """
+
+ def __init__(
+ self, search_phrase_label, search_phrase_text, document_label,
+ from_single_word_phraselet, from_topic_match_phraselet_created_without_matching_tags,
+ from_reverse_only_topic_match_phraselet):
+ self.word_matches = []
+ self.is_negated = False
+ self.is_uncertain = False
+ self.search_phrase_label = search_phrase_label
+ self.search_phrase_text = search_phrase_text
+ self.document_label = document_label
+ self.from_single_word_phraselet = from_single_word_phraselet
+ self.from_topic_match_phraselet_created_without_matching_tags = \
+ from_topic_match_phraselet_created_without_matching_tags
+ self.from_reverse_only_topic_match_phraselet = from_reverse_only_topic_match_phraselet
+ self.index_within_document = None
+ self.overall_similarity_measure = '1.0'
+
+ @property
+ def involves_coreference(self):
+ for word_match in self.word_matches:
+ if word_match.involves_coreference:
+ return True
+ return False
+
+ def __copy__(self):
+ match_to_return = Match(
+ self.search_phrase_label, self.search_phrase_text,
+ self.document_label, self.from_single_word_phraselet,
+ self.from_topic_match_phraselet_created_without_matching_tags,
+ self.from_reverse_only_topic_match_phraselet)
+ match_to_return.word_matches = self.word_matches.copy()
+ match_to_return.is_negated = self.is_negated
+ match_to_return.is_uncertain = self.is_uncertain
+ match_to_return.index_within_document = self.index_within_document
+ match_to_return.overall_similarity_measure = self.overall_similarity_measure
+ return match_to_return
+
+ def get_subword_index(self):
+ for word_match in self.word_matches:
+ if word_match.search_phrase_token.dep_ == 'ROOT':
+ if word_match.document_subword is None:
+ return None
+ return word_match.document_subword.index
+ raise RuntimeError('No word match with search phrase token with root dependency')
+
+ def get_subword_index_for_sorting(self):
+ # returns *-1* rather than *None* in the absence of a subword
+ subword_index = self.get_subword_index()
+ return subword_index if subword_index is not None else -1
+
+class StructuralMatcher:
+ """The class responsible for matching search phrases with documents."""
+
+ def __init__(
+ self, semantic_matching_helper, ontology,
+ embedding_based_matching_on_root_words, analyze_derivational_morphology,
+ perform_coreference_resolution, use_reverse_dependency_matching,
+ entity_label_to_vector_dict):
+ """Args:
+
+ semantic_matching_helper -- the *SemanticMatchingHelper* object to use
+ ontology -- optionally, an *Ontology* object to use in matching
+ embedding_based_matching_on_root_words -- *True* if embedding-based matching should be
+ attempted on search-phrase root tokens
+ analyze_derivational_morphology -- *True* if matching should be attempted between different
+ words from the same word family. Defaults to *True*.
+ perform_coreference_resolution -- *True* if coreference resolution should be taken into
+ account when matching.
+ use_reverse_dependency_matching -- *True* if appropriate dependencies in documents can be
+ matched to dependencies in search phrases where the two dependencies point in opposite
+ directions.
+ entity_label_to_vector_dict -- a dictionary from entity labels to vectors generated from
+ words that mean roughly the same as the label. """
+ self.semantic_matching_helper = semantic_matching_helper
+ self.ontology = ontology
+ self.embedding_based_matching_on_root_words = embedding_based_matching_on_root_words
+ self.analyze_derivational_morphology = analyze_derivational_morphology
+ self.perform_coreference_resolution = perform_coreference_resolution
+ self.use_reverse_dependency_matching = use_reverse_dependency_matching
+ self.entity_label_to_vector_dict = entity_label_to_vector_dict
+
+ def match_type(self, search_phrase_and_document_derived_lemmas_identical, *match_types):
+ """ Selects the most salient match type out of a list of relevant match types. """
+
+ if 'ontology' in match_types and search_phrase_and_document_derived_lemmas_identical:
+ # an ontology entry happens to have created a derivation word match before the
+ # derivation match itself was processed, so mark the type as 'derivation'.
+ return 'derivation'
+ elif 'ontology' in match_types:
+ return 'ontology'
+ elif 'derivation' in match_types:
+ return 'derivation'
+ else:
+ return 'direct'
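+        # For example, *match_type(True, 'ontology', 'derivation')* returns 'derivation',
+        # whereas *match_type(False, 'ontology', 'derivation')* returns 'ontology': an
+        # ontology hit is only reported as such when the derived lemmas differ.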
+
+ def match_recursively(
+ self, *, search_phrase, search_phrase_token, document, document_token,
+ document_subword_index, search_phrase_tokens_to_word_matches,
+ search_phrase_and_document_visited_table, is_uncertain,
+ structurally_matched_document_token, compare_embeddings_on_non_root_words,
+ process_initial_question_words, overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold):
+ """Called whenever matching is attempted between a search phrase token and a document
+ token."""
+
+ def handle_match(
+ search_phrase_word, document_word, match_type, depth,
+ *, similarity_measure=1.0, first_document_token=document_token,
+ last_document_token=document_token, search_phrase_initial_question_word=False):
+ """Most of the variables are set from the outer call.
+
+ Args:
+
+ search_phrase_word -- the textual representation of the search phrase word that matched.
+ document_word -- the textual representation of the document word that matched.
+ match_type -- *direct*, *derivation*, *entity*, *embedding*, *entity_embedding*,
+ *ontology* or *question*
+ similarity_measure -- the similarity between the two tokens. Defaults to 1.0 if the
+ match did not involve embeddings.
+ search_phrase_initial_question_word -- *True* if *search_phrase_word* is an initial
+ question word or governs an initial question word.
+ """
+ for dependency in (
+ dependency for dependency in search_phrase_token._.holmes.children
+ if dependency.child_token(search_phrase_token.doc)._.holmes.is_matchable or
+ (process_initial_question_words and
+ dependency.child_token(
+ search_phrase_token.doc)._.holmes.is_initial_question_word)):
+ at_least_one_document_dependency_tried = False
+ at_least_one_document_dependency_matched = False
+ # Loop through this token and any tokens linked to it by coreference
+ parents = [Index(document_token.i, document_subword_index)]
+ if self.perform_coreference_resolution and (document_subword_index is None or
+ document_token._.holmes.subwords[document_subword_index].is_head):
+ parents.extend([
+ Index(token_index, None) for token_index in
+ document_token._.holmes.token_and_coreference_chain_indexes
+ if token_index != document_token.i])
+ for working_document_parent_index in parents:
+ working_document_child_indexes = []
+ document_parent_token = document_token.doc[
+ working_document_parent_index.token_index]
+ if not working_document_parent_index.is_subword() or \
+ document_parent_token._.holmes.subwords[
+ working_document_parent_index.subword_index].is_head:
+ # is_head: e.g. 'Polizeiinformation über Kriminelle' should match
+ # 'Information über Kriminelle'
+
+ # inverse_polarity_boolean: *True* in the special case where the
+ # dependency has been matched backwards
+ document_dependencies_to_inverse_polarity_booleans = {
+ document_dependency: False for document_dependency in
+ document_parent_token._.holmes.children if
+ self.semantic_matching_helper.dependency_labels_match(
+ search_phrase_dependency_label=dependency.label,
+ document_dependency_label=document_dependency.label,
+ inverse_polarity=False)}
+ document_dependencies_to_inverse_polarity_booleans.update({
+ document_dependency: True for document_dependency in
+ document_parent_token._.holmes.parents if
+ self.use_reverse_dependency_matching and
+ self.semantic_matching_helper.dependency_labels_match(
+ search_phrase_dependency_label=dependency.label,
+ document_dependency_label=document_dependency.label,
+ inverse_polarity=True)})
+ for document_dependency, inverse_polarity in \
+ document_dependencies_to_inverse_polarity_booleans.items():
+ if not inverse_polarity:
+ document_child = document_dependency.child_token(document_token.doc)
+ else:
+ document_child = \
+ document_dependency.parent_token(document_token.doc)
+ if self.perform_coreference_resolution:
+ # wherever a dependency is found, loop through any tokens linked
+ # to the child by coreference
+ working_document_child_indexes = [
+ Index(token_index, None) for token_index in
+ document_child._.holmes.token_and_coreference_chain_indexes
+ if document_token.doc[token_index].pos_ != 'PRON' or not
+ document_token.doc[token_index]._.holmes.\
+ is_involved_in_coreference()]
+ # otherwise where matching starts with a noun and there is
+ # a dependency pointing back to the noun, matching will be
+ # attempted against the pronoun only and will then fail.
+ elif not inverse_polarity:
+ working_document_child_indexes = \
+ [Index(document_dependency.child_index, None)]
+ else:
+ working_document_child_indexes = \
+ [Index(document_dependency.parent_index, None)]
+ # Where a dependency points to an entire word that has subwords, check
+ # the head subword as well as the entire word
+ for working_document_child_index in \
+ working_document_child_indexes.copy():
+ working_document_child = \
+ document_token.doc[working_document_child_index.token_index]
+ for subword in (
+ subword for subword in
+ working_document_child._.holmes.subwords
+ if subword.is_head):
+ working_document_child_indexes.append(Index(
+ working_document_child.i, subword.index))
+ # Loop through the dependencies from each token
+ for working_document_child_index in (
+ working_index for working_index
+ in working_document_child_indexes):
+ at_least_one_document_dependency_tried = True
+ if search_phrase.question_phraselet and \
+ document[
+ working_document_parent_index.token_index] in \
+ self.semantic_matching_helper.\
+ get_subtree_list_for_question_answer(
+ document[
+ working_document_child_index.token_index]):
+ continue
+ if working_document_child_index in \
+ search_phrase_and_document_visited_table[
+ dependency.child_index] or \
+ self.match_recursively(
+ search_phrase=search_phrase,
+ search_phrase_token=dependency.child_token(
+ search_phrase_token.doc),
+ document=document,
+ document_token=document[
+ working_document_child_index.token_index],
+ document_subword_index=
+ working_document_child_index.subword_index,
+ search_phrase_tokens_to_word_matches=
+ search_phrase_tokens_to_word_matches,
+ search_phrase_and_document_visited_table=
+ search_phrase_and_document_visited_table,
+ is_uncertain=(
+ (document_dependency.is_uncertain and not
+ dependency.is_uncertain) or inverse_polarity),
+ structurally_matched_document_token=document_child,
+ compare_embeddings_on_non_root_words=
+ compare_embeddings_on_non_root_words,
+ process_initial_question_words=
+ process_initial_question_words,
+ overall_similarity_threshold=
+ overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=
+ initial_question_word_overall_similarity_threshold):
+ at_least_one_document_dependency_matched = True
+ if working_document_parent_index.is_subword():
+ # examine relationship to dependent subword in the same word
+ document_parent_subword = document_token.doc[
+ working_document_parent_index.token_index]._.holmes.\
+ subwords[working_document_parent_index.subword_index]
+ if document_parent_subword.dependent_index is not None and \
+ self.semantic_matching_helper.dependency_labels_match(
+ search_phrase_dependency_label=dependency.label,
+ document_dependency_label=
+ document_parent_subword.dependency_label,
+ inverse_polarity=False):
+ at_least_one_document_dependency_tried = True
+ if self.match_recursively(
+ search_phrase=search_phrase,
+ search_phrase_token=dependency.child_token(
+ search_phrase_token.doc),
+ document=document,
+ document_token=document_token,
+ document_subword_index=
+ document_parent_subword.dependent_index,
+ search_phrase_tokens_to_word_matches=
+ search_phrase_tokens_to_word_matches,
+ search_phrase_and_document_visited_table=
+ search_phrase_and_document_visited_table,
+ is_uncertain=False,
+ structurally_matched_document_token=document_token,
+ compare_embeddings_on_non_root_words=
+ compare_embeddings_on_non_root_words,
+ process_initial_question_words=
+ process_initial_question_words,
+ overall_similarity_threshold=
+ overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=
+ initial_question_word_overall_similarity_threshold):
+ at_least_one_document_dependency_matched = True
+ # examine relationship to governing subword in the same word
+ document_child_subword = document_token.doc[
+ working_document_parent_index.token_index]._.holmes.\
+ subwords[working_document_parent_index.subword_index]
+ if document_child_subword.governor_index is not None and \
+ self.use_reverse_dependency_matching and \
+ self.semantic_matching_helper.dependency_labels_match(
+ search_phrase_dependency_label=dependency.label,
+ document_dependency_label=
+ document_parent_subword.governing_dependency_label,
+ inverse_polarity=True):
+ at_least_one_document_dependency_tried = True
+ if self.match_recursively(
+ search_phrase=search_phrase,
+ search_phrase_token=dependency.child_token(
+ search_phrase_token.doc),
+ document=document,
+ document_token=document_token,
+ document_subword_index=
+ document_parent_subword.governor_index,
+ search_phrase_tokens_to_word_matches=
+ search_phrase_tokens_to_word_matches,
+ search_phrase_and_document_visited_table=
+ search_phrase_and_document_visited_table,
+ is_uncertain=False,
+ structurally_matched_document_token=document_token,
+ compare_embeddings_on_non_root_words=
+ compare_embeddings_on_non_root_words,
+ process_initial_question_words=
+ process_initial_question_words,
+ overall_similarity_threshold=
+ overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=
+ initial_question_word_overall_similarity_threshold):
+ at_least_one_document_dependency_matched = True
+ if at_least_one_document_dependency_tried and not \
+ at_least_one_document_dependency_matched:
+ # it is already clear that the search phrase has not matched, so
+ # there is no point in pursuing things any further
+ return
+ # store the word match
+ if document_subword_index is None:
+ document_subword = None
+ else:
+ document_subword = document_token._.holmes.subwords[document_subword_index]
+ search_phrase_tokens_to_word_matches[search_phrase_token.i].append(WordMatch(
+ search_phrase_token, search_phrase_word, document_token,
+ first_document_token, last_document_token, document_subword,
+ document_word, match_type, similarity_measure, is_negated, is_uncertain,
+ structurally_matched_document_token, document_word, depth,
+ search_phrase_initial_question_word))
+
+ def loop_search_phrase_word_representations():
+ yield search_phrase_token._.holmes.lemma, 'direct', \
+ search_phrase_token._.holmes.lemma_or_derived_lemma()
+ hyphen_normalized_word = self.semantic_matching_helper.normalize_hyphens(
+ search_phrase_token._.holmes.lemma)
+ if hyphen_normalized_word != search_phrase_token._.holmes.lemma:
+ yield hyphen_normalized_word, 'direct', \
+ search_phrase_token._.holmes.lemma_or_derived_lemma()
+ if self.analyze_derivational_morphology and \
+ search_phrase_token._.holmes.derived_lemma is not None:
+ yield search_phrase_token._.holmes.derived_lemma, 'derivation', \
+ search_phrase_token._.holmes.lemma_or_derived_lemma()
+ if not search_phrase.topic_match_phraselet and \
+ search_phrase_token._.holmes.lemma == search_phrase_token.lemma_ and \
+ search_phrase_token._.holmes.lemma != search_phrase_token.text:
+ # search phrase word is not multiword, phrasal or separable verb, so we can match
+ # against its text as well as its lemma
+ yield search_phrase_token.text, 'direct', \
+ search_phrase_token._.holmes.lemma_or_derived_lemma()
+ if self.analyze_derivational_morphology and self.ontology is not None:
+ for reverse_lemma in self.semantic_matching_helper.\
+ reverse_derived_lemmas_in_ontology(search_phrase_token):
+ yield reverse_lemma, 'ontology', \
+ search_phrase_token._.holmes.lemma_or_derived_lemma()
+
+ def document_word_representations():
+ list_to_return = []
+ if document_subword_index is not None:
+ working_document_subword = document_token._.holmes.subwords[document_subword_index]
+ list_to_return.append((
+ working_document_subword.text, 'direct',
+ working_document_subword.lemma_or_derived_lemma()))
+ hyphen_normalized_word = self.semantic_matching_helper.normalize_hyphens(
+ working_document_subword.text)
+ if hyphen_normalized_word != working_document_subword.text:
+ list_to_return.append((
+ hyphen_normalized_word, 'direct',
+ working_document_subword.lemma_or_derived_lemma()))
+ if working_document_subword.lemma != working_document_subword.text:
+ list_to_return.append((
+ working_document_subword.lemma, 'direct',
+ working_document_subword.lemma_or_derived_lemma()))
+ if self.analyze_derivational_morphology and \
+ working_document_subword.derived_lemma is not None:
+ list_to_return.append((
+ working_document_subword.derived_lemma,
+ 'derivation', working_document_subword.lemma_or_derived_lemma()))
+ if self.analyze_derivational_morphology and self.ontology is not None:
+ for reverse_lemma in self.semantic_matching_helper.\
+ reverse_derived_lemmas_in_ontology(working_document_subword):
+ list_to_return.append((
+ reverse_lemma, 'ontology',
+ working_document_subword.lemma_or_derived_lemma()))
+ else:
+ list_to_return.append((
+ document_token.text, 'direct',
+ document_token._.holmes.lemma_or_derived_lemma()))
+ hyphen_normalized_word = self.semantic_matching_helper.normalize_hyphens(
+ document_token.text)
+ if hyphen_normalized_word != document_token.text:
+ list_to_return.append((
+ hyphen_normalized_word, 'direct',
+ document_token._.holmes.lemma_or_derived_lemma()))
+ if document_token._.holmes.lemma != document_token.text:
+ list_to_return.append((
+ document_token._.holmes.lemma, 'direct',
+ document_token._.holmes.lemma_or_derived_lemma()))
+ if self.analyze_derivational_morphology:
+ if document_token._.holmes.derived_lemma is not None:
+ list_to_return.append((
+ document_token._.holmes.derived_lemma,
+ 'derivation', document_token._.holmes.lemma_or_derived_lemma()))
+ if self.analyze_derivational_morphology and self.ontology is not None:
+ for reverse_lemma in self.semantic_matching_helper.\
+ reverse_derived_lemmas_in_ontology(document_token):
+ list_to_return.append((
+ reverse_lemma, 'ontology',
+ document_token._.holmes.lemma_or_derived_lemma()))
+ return list_to_return
+
+ def loop_document_multiword_representations(multiword_span):
+ yield multiword_span.text, 'direct', multiword_span.derived_lemma
+ hyphen_normalized_word = \
+ self.semantic_matching_helper.normalize_hyphens(multiword_span.text)
+ if hyphen_normalized_word != multiword_span.text:
+ yield hyphen_normalized_word, 'direct', multiword_span.derived_lemma
+ if multiword_span.text != multiword_span.lemma:
+ yield multiword_span.lemma, 'direct', multiword_span.derived_lemma
+ if multiword_span.derived_lemma != multiword_span.lemma:
+ yield multiword_span.derived_lemma, 'derivation', multiword_span.derived_lemma
+ if self.analyze_derivational_morphology and self.ontology is not None:
+ for reverse_lemma in self.semantic_matching_helper.\
+ reverse_derived_lemmas_in_ontology(multiword_span):
+ yield reverse_lemma, 'ontology', multiword_span.derived_lemma
+
+ index = Index(document_token.i, document_subword_index)
+ search_phrase_and_document_visited_table[search_phrase_token.i].add(index)
+ is_negated = document_token._.holmes.is_negated
+ if document_token._.holmes.is_uncertain:
+ is_uncertain = True
+
+ search_phrase_initial_question_word = process_initial_question_words and \
+ search_phrase_token._.holmes.has_initial_question_word_in_phrase
+ if self.semantic_matching_helper.is_entity_search_phrase_token(
+ search_phrase_token, search_phrase.topic_match_phraselet) and \
+ document_subword_index is None:
+ if self.semantic_matching_helper.entity_search_phrase_token_matches(
+ search_phrase_token, search_phrase.topic_match_phraselet, document_token):
+ for multiword_span in \
+ self.semantic_matching_helper.multiword_spans_with_head_token(
+ document_token):
+ for working_token in multiword_span.tokens:
+ if not self.semantic_matching_helper.entity_search_phrase_token_matches(
+ search_phrase_token, search_phrase.topic_match_phraselet,
+ document_token):
+ continue
+ for working_token in multiword_span.tokens:
+ search_phrase_and_document_visited_table[search_phrase_token.i].add(
+ working_token.i)
+ handle_match(
+ search_phrase_token.text, multiword_span.text, 'entity', 0,
+ first_document_token=multiword_span.tokens[0],
+ last_document_token=multiword_span.tokens[-1],
+ search_phrase_initial_question_word=search_phrase_initial_question_word)
+ return True
+ search_phrase_and_document_visited_table[search_phrase_token.i].add(
+ document_token.i)
+ handle_match(search_phrase_token.text, document_token.text, 'entity', 0,
+ search_phrase_initial_question_word=
+ search_phrase_initial_question_word)
+ return True
+ return False
+
+ document_word_representations = document_word_representations()
+ for search_phrase_word_representation, search_phrase_match_type, \
+ search_phrase_derived_lemma in loop_search_phrase_word_representations():
+ # multiword matches
+ if document_subword_index is None:
+ for multiword_span in \
+ self.semantic_matching_helper.multiword_spans_with_head_token(
+ document_token):
+ for multiword_span_representation, document_match_type, \
+ multispan_derived_lemma in \
+ loop_document_multiword_representations(multiword_span):
+ if search_phrase_word_representation.lower() == \
+ multiword_span_representation.lower():
+ for working_token in multiword_span.tokens:
+ search_phrase_and_document_visited_table[search_phrase_token.i].add(
+ working_token.i)
+ handle_match(
+ search_phrase_token._.holmes.lemma,
+ multiword_span_representation,
+ self.match_type(
+ search_phrase_derived_lemma == multispan_derived_lemma,
+ search_phrase_match_type, document_match_type),
+ 0, first_document_token=multiword_span.tokens[0],
+ last_document_token=multiword_span.tokens[-1],
+ search_phrase_initial_question_word=
+ search_phrase_initial_question_word)
+ return True
+ if self.ontology is not None:
+ entry = self.ontology.matches(
+ search_phrase_word_representation.lower(),
+ multiword_span_representation.lower())
+ if entry is not None:
+ for working_token in multiword_span.tokens:
+ search_phrase_and_document_visited_table[
+ search_phrase_token.i].add(working_token.i)
+ handle_match(
+ search_phrase_word_representation, entry.word,
+ 'ontology', entry.depth,
+ first_document_token=multiword_span.tokens[0],
+ last_document_token=multiword_span.tokens[-1],
+ search_phrase_initial_question_word=
+ search_phrase_initial_question_word)
+ return True
+ for document_word_representation, document_match_type, document_derived_lemma in \
+ document_word_representations:
+ if search_phrase_word_representation.lower() == \
+ document_word_representation.lower():
+ handle_match(
+ search_phrase_word_representation, document_word_representation,
+ self.match_type(
+ search_phrase_derived_lemma == document_derived_lemma,
+ search_phrase_match_type, document_match_type)
+ , 0,
+ search_phrase_initial_question_word=search_phrase_initial_question_word)
+ return True
+ if self.ontology is not None:
+ entry = self.ontology.matches(
+ search_phrase_word_representation.lower(),
+ document_word_representation.lower())
+ if entry is not None:
+ handle_match(
+ search_phrase_word_representation, entry.word, 'ontology',
+ entry.depth,
+ search_phrase_initial_question_word=
+ search_phrase_initial_question_word)
+ return True
+
+ if document_subword_index is not None:
+ document_word_to_use = document_token._.holmes.subwords[document_subword_index].lemma
+ document_vector = document_token._.holmes.subwords[document_subword_index].vector if \
+ self.embedding_matching_permitted(
+ document_token._.holmes.subwords[document_subword_index]) else None
+ else:
+ document_word_to_use = document_token.lemma_
+            document_vector = document_token._.holmes.vector if \
+ self.embedding_matching_permitted(document_token) else None
+
+ if (overall_similarity_threshold < 1.0 or (search_phrase_initial_question_word and
+ initial_question_word_overall_similarity_threshold < 1.0)) and (
+ compare_embeddings_on_non_root_words or search_phrase.root_token.i ==
+ search_phrase_token.i) and search_phrase_token.i in \
+ search_phrase.matchable_non_entity_tokens_to_vectors.keys() and \
+ self.embedding_matching_permitted(search_phrase_token):
+ search_phrase_vector = search_phrase.matchable_non_entity_tokens_to_vectors[
+ search_phrase_token.i]
+ if document_subword_index is not None:
+ if not self.embedding_matching_permitted(
+ document_token._.holmes.subwords[document_subword_index]):
+ return False
+ else:
+ if not self.embedding_matching_permitted(document_token):
+ return False
+ single_token_similarity_threshold = \
+ (initial_question_word_overall_similarity_threshold if
+ search_phrase_initial_question_word else overall_similarity_threshold) ** len(
+ search_phrase.matchable_non_entity_tokens_to_vectors)
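+            # Rationale (mirroring the normalization in *build_matches()*): the overall
+            # similarity is effectively the geometric mean of the per-token similarities
+            # over the n matchable non-entity tokens, and each individual similarity is at
+            # most 1.0, so a match can only reach an overall threshold T if every
+            # individual similarity is at least T ** n. Anything below that cutoff is
+            # discarded here.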
+ if search_phrase_vector is not None and document_vector is not None:
+ similarity_measure = \
+ self.semantic_matching_helper.cosine_similarity(search_phrase_vector,
+ document_vector)
+ if similarity_measure > single_token_similarity_threshold:
+ if not search_phrase.topic_match_phraselet and \
+ len(search_phrase_token._.holmes.lemma.split()) > 1:
+ search_phrase_word_to_use = search_phrase_token.lemma_
+ else:
+ search_phrase_word_to_use = search_phrase_token._.holmes.lemma
+ handle_match(
+ search_phrase_word_to_use, document_word_to_use, 'embedding', 0,
+ similarity_measure=similarity_measure,
+ search_phrase_initial_question_word=search_phrase_initial_question_word)
+ return True
+ if document_token.ent_type_ != '':
+ cosine_similarity = self.semantic_matching_helper.token_matches_ent_type(
+ search_phrase_vector, self.entity_label_to_vector_dict,
+ (document_token.ent_type_,), single_token_similarity_threshold)
+ if cosine_similarity > 0:
+ for multiword_span in \
+ self.semantic_matching_helper.multiword_spans_with_head_token(
+ document_token):
+ for working_token in multiword_span.tokens:
+ if not working_token.ent_type == document_token.ent_type:
+ continue
+ for working_token in multiword_span.tokens:
+ search_phrase_and_document_visited_table[search_phrase_token.i].add(
+ working_token.i)
+ handle_match(search_phrase_token.text, document_token.text,
+ 'entity_embedding', 0, similarity_measure=cosine_similarity,
+ first_document_token=multiword_span.tokens[0],
+ last_document_token=multiword_span.tokens[-1],
+ search_phrase_initial_question_word=search_phrase_initial_question_word)
+ return True
+ handle_match(search_phrase_token.text, document_token.text, 'entity_embedding',
+ 0, similarity_measure=cosine_similarity,
+ search_phrase_initial_question_word=search_phrase_initial_question_word)
+ return True
+
+ if process_initial_question_words and search_phrase_token._.holmes.is_initial_question_word:
+ if document_vector is not None:
+ question_word_matches = self.semantic_matching_helper.question_word_matches(
+ search_phrase.label, search_phrase_token, document_token, document_vector,
+ self.entity_label_to_vector_dict,
+ initial_question_word_overall_similarity_threshold ** 2)
+ else:
+ question_word_matches = self.semantic_matching_helper.question_word_matches(
+ search_phrase.label, search_phrase_token, document_token, None, None, None)
+ if question_word_matches:
+ first_document_token_index = last_document_token_index = document_token.i
+ if document_token.pos_ in self.semantic_matching_helper.noun_pos and \
+ len(document_token.ent_type_) > 0:
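+                    # A noun carrying an entity label is widened to the contiguous run of
+                    # noun tokens around it, so that e.g. a multi-token name is reported
+                    # as the matched span rather than a single token.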
+ while first_document_token_index >= 1:
+ if document_token.doc[first_document_token_index - 1].pos_ in \
+ self.semantic_matching_helper.noun_pos:
+ first_document_token_index = first_document_token_index - 1
+ else:
+ break
+ while last_document_token_index + 1 < len(document_token.doc):
+ if document_token.doc[last_document_token_index + 1].pos_ in \
+ self.semantic_matching_helper.noun_pos:
+ last_document_token_index = last_document_token_index + 1
+ else:
+ break
+ handle_match(search_phrase_token._.holmes.lemma, document_word_to_use, 'question',
+ 0, first_document_token=document_token.doc[first_document_token_index],
+ last_document_token=document_token.doc[last_document_token_index],
+ search_phrase_initial_question_word=True)
+ return True
+ return False
+
+ def embedding_matching_permitted(self, obj):
+ """ Embedding matching is suppressed for some parts of speech as well as for very short
+ words. """
+ if isinstance(obj, Token):
+ if len(obj._.holmes.lemma.split()) > 1:
+ working_lemma = obj.lemma_
+ else:
+ working_lemma = obj._.holmes.lemma
+ return obj.pos_ in self.semantic_matching_helper.permissible_embedding_pos and \
+ len(working_lemma) >= \
+ self.semantic_matching_helper.minimum_embedding_match_word_length
+ elif isinstance(obj, Subword):
+ return len(obj.lemma) >= \
+ self.semantic_matching_helper.minimum_embedding_match_word_length
+ else:
+ raise RuntimeError("'obj' must be either a Token or a Subword")
+
+ def build_matches(
+ self, *, search_phrase, search_phrase_tokens_to_word_matches, document_label,
+ overall_similarity_threshold, initial_question_word_overall_similarity_threshold):
+ """Investigate possible matches when recursion is complete."""
+
+ def mention_root_or_token_index(token):
+ if len(token._.coref_chains) == 0:
+ return token.i
+ for mention in (m for m in token._.coref_chains[0].mentions if token.i in
+ m.token_indexes):
+ return mention.root_index
+
+ def filter_word_matches_based_on_coreference_resolution(word_matches):
+ """ When coreference resolution is active, additional matches are sometimes
+ returned that are filtered out again using this method.
+ """
+ structural_indexes_to_word_matches = {}
+ # Find the structurally matching document tokens for this list of word matches
+ for word_match in word_matches:
+ structural_index = \
+ mention_root_or_token_index(word_match.structurally_matched_document_token)
+ if structural_index in structural_indexes_to_word_matches.keys():
+ structural_indexes_to_word_matches[structural_index].append(word_match)
+ else:
+ structural_indexes_to_word_matches[structural_index] = [word_match]
+ new_word_matches = []
+ for structural_index in structural_indexes_to_word_matches:
+ # For each structural token, find the best matching coreference mention
+ relevant_word_matches = structural_indexes_to_word_matches[structural_index]
+ structurally_matched_document_token = \
+ relevant_word_matches[0].document_token.doc[structural_index]
+ already_added_document_token_indexes = set()
+ if structurally_matched_document_token._.holmes.is_involved_in_coreference():
+ working_index = -1
+ for relevant_word_match in relevant_word_matches:
+ this_index = mention_root_or_token_index(relevant_word_match.document_token)
+ # The best mention should be as close to the structural
+ # index as possible; if they are the same distance, the preceding mention
+ # wins.
+ if working_index == -1 or (
+ abs(structural_index - this_index) <
+ abs(structural_index - working_index)) or \
+ ((abs(structural_index - this_index) ==
+ abs(structural_index - working_index)) and
+ this_index < working_index):
+ working_index = this_index
+ # Filter out any matches from mentions other than the best mention
+ for relevant_word_match in relevant_word_matches:
+ if working_index == \
+ mention_root_or_token_index(relevant_word_match.document_token) \
+ and relevant_word_match.document_token.i not in \
+ already_added_document_token_indexes:
+ already_added_document_token_indexes.add(
+ relevant_word_match.document_token.i)
+ new_word_matches.append(relevant_word_match)
+ else:
+ new_word_matches.extend(relevant_word_matches)
+ return new_word_matches
+
+ def revise_extracted_words_based_on_coreference_resolution(word_matches):
+ """ When coreference resolution is active, there may be a more specific piece of
+ information elsewhere in the coreference chain of a token that has been matched, in
+ which case this piece of information should be recorded in *word_match.extracted_word*.
+ """
+
+ for word_match in (
+ word_match for word_match in word_matches
+ if word_match.word_match_type in ('direct', 'derivation', 'ontology')
+ and word_match.document_subword is None and
+ word_match.document_token._.holmes.most_specific_coreferring_term_index
+ is not None):
+ most_specific_document_token = word_match.document_token.doc[
+ word_match.document_token._.holmes.most_specific_coreferring_term_index]
+ if word_match.document_token._.holmes.lemma != \
+ most_specific_document_token._.holmes.lemma:
+ for multiword_span in \
+ self.semantic_matching_helper.multiword_spans_with_head_token(
+ word_match.document_token.doc[
+ word_match.document_token._.holmes.
+ most_specific_coreferring_term_index]):
+ word_match.extracted_word = multiword_span.text
+ break
+ else:
+ word_match.extracted_word = most_specific_document_token.text
+
+ return word_matches
+
+ def match_already_contains_structurally_matched_document_token(
+ match, document_token, document_subword_index):
+ """Ensure that the same document token or subword does not match multiple search phrase
+ tokens.
+ """
+ for word_match in match.word_matches:
+ if document_token.i == word_match.structurally_matched_document_token.i:
+ if word_match.document_subword is not None and document_subword_index == \
+ word_match.document_subword.index:
+ return True
+ if word_match.document_subword is None and document_subword_index is None:
+ return True
+ return False
+
+ def check_document_tokens_are_linked_by_dependency(
+ parent_token, parent_subword, child_token, child_subword):
+ """ The recursive nature of the main matching algorithm can mean that all the tokens
+ in the search phrase have matched but that two of them are linked by a
+ search-phrase dependency that is absent from the document, which invalidates the
+ match.
+ """
+ if parent_subword is not None:
+ if child_subword is not None and parent_subword.dependent_index == \
+ child_subword.index and parent_token.i == child_token.i:
+ return True
+ elif parent_subword.is_head and (child_subword is None or (
+ child_subword.is_head and parent_subword.containing_token_index !=
+ child_subword.containing_token_index)):
+ return True
+ else:
+ return False
+ if child_subword is not None and not child_subword.is_head:
+ return False
+ if self.perform_coreference_resolution and (parent_subword is None
+ or parent_subword.is_head):
+ parents = parent_token._.holmes.token_and_coreference_chain_indexes
+ children = child_token._.holmes.token_and_coreference_chain_indexes
+ else:
+ parents = [parent_token.i]
+ children = [child_token.i]
+ for parent in parents:
+ for child in children:
+ if parent_token.doc[parent]._.holmes.has_dependency_with_child_index(child):
+ return True
+ if child_token.doc[child]._.holmes.has_dependency_with_child_index(parent):
+ return True
+ return False
+
+ def match_with_subwords_involves_all_containing_document_tokens(word_matches):
+ """ Where a match involves subwords and the subwords are involved in conjunction,
+ we need to make sure there are no tokens involved in the match merely because they
+ supply subwords to another token, as this would lead to double matching. An example
+ is search phrase 'Extraktion der Information' and document
+ 'Informationsextraktionsüberlegungen und -probleme'.
+ """
+ token_indexes = []
+ containing_subword_token_indexes = []
+ for word_match in word_matches:
+ if word_match.document_subword is not None:
+ token_indexes.append(word_match.document_token.i)
+ containing_subword_token_indexes.append(
+ word_match.document_subword.containing_token_index)
+ return len([
+ token_index for token_index in token_indexes if not token_index in
+ containing_subword_token_indexes]) == 0
+
+ matches = [Match(
+ search_phrase.label, search_phrase.doc_text, document_label,
+ search_phrase.topic_match_phraselet and search_phrase.has_single_matchable_word,
+ search_phrase.topic_match_phraselet_created_without_matching_tags,
+ search_phrase.reverse_only)]
+ for search_phrase_token in search_phrase.matchable_tokens:
+ word_matches = search_phrase_tokens_to_word_matches[search_phrase_token.i]
+ if len(word_matches) == 0:
+ # if there is any search phrase token without a matching document token,
+ # we have no match and can return
+ return []
+ if self.perform_coreference_resolution:
+ word_matches = filter_word_matches_based_on_coreference_resolution(word_matches)
+ if self.ontology is not None:
+ word_matches = revise_extracted_words_based_on_coreference_resolution(
+ word_matches)
+ # handle any conjunction by distributing the matches amongst separate match objects
+ working_matches = []
+ for word_match in word_matches:
+ for match in matches:
+ working_match = copy.copy(match)
+ if word_match.document_subword is None:
+ subword_index = None
+ else:
+ subword_index = word_match.document_subword.index
+ if not match_already_contains_structurally_matched_document_token(
+ working_match, word_match.structurally_matched_document_token,
+ subword_index):
+ working_match.word_matches.append(word_match)
+ if word_match.is_negated:
+ working_match.is_negated = True
+ if word_match.is_uncertain:
+ working_match.is_uncertain = True
+ if search_phrase_token.i == search_phrase.root_token.i:
+ working_match.index_within_document = word_match.document_token.i
+ working_matches.append(working_match)
+ matches = working_matches
+
+ matches_to_return = []
+ for match in matches:
+ failed = False
+ not_normalized_overall_similarity_measure = 1.0
+            # now carry out the coherence check; if there are two or fewer word matches (which
+            # is the case during topic matching), no check is necessary
+ if len(match.word_matches) > 2:
+ for parent_word_match in match.word_matches:
+ for search_phrase_dependency in \
+ parent_word_match.search_phrase_token._.holmes.children:
+ for child_word_match in (
+ cwm for cwm in match.word_matches if cwm.search_phrase_token.i ==
+ search_phrase_dependency.child_index):
+ if not check_document_tokens_are_linked_by_dependency(
+ parent_word_match.document_token,
+ parent_word_match.document_subword,
+ child_word_match.document_token,
+ child_word_match.document_subword) and \
+ not check_document_tokens_are_linked_by_dependency(
+ child_word_match.document_token,
+ child_word_match.document_subword,
+ parent_word_match.document_token,
+ parent_word_match.document_subword):
+ failed = True
+ if failed:
+ break
+ if failed:
+ break
+ if failed:
+ continue
+
+ if not match_with_subwords_involves_all_containing_document_tokens(match.word_matches):
+ continue
+
+ for word_match in match.word_matches:
+ not_normalized_overall_similarity_measure *= word_match.similarity_measure
+ if not_normalized_overall_similarity_measure < 1.0:
+ overall_similarity_measure = \
+ round(not_normalized_overall_similarity_measure ** \
+ (1 / len(search_phrase.matchable_non_entity_tokens_to_vectors)), 8)
+ else:
+ overall_similarity_measure = 1.0
+ if overall_similarity_measure == 1.0 or \
+ overall_similarity_measure >= overall_similarity_threshold or \
+ overall_similarity_measure >= \
+ initial_question_word_overall_similarity_threshold:
+ match.overall_similarity_measure = str(
+ overall_similarity_measure)
+ matches_to_return.append(match)
+ return matches_to_return
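+
+ # Worked example of the similarity normalization above (values are hypothetical):
+ # with two matchable non-entity tokens and word-match similarities 0.9 and 0.8, the
+ # overall similarity is the geometric mean (0.9 * 0.8) ** (1 / 2), roughly 0.85,
+ # which is then compared against the relevant overall similarity threshold.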
+
+ def get_matches_starting_at_root_word_match(
+ self, search_phrase, document, document_token, document_subword_index, document_label,
+ compare_embeddings_on_non_root_words, process_initial_question_words,
+ overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold):
+ """Begin recursive matching where a search phrase root token has matched a document
+ token.
+ """
+ matches_to_return = []
+ # array of arrays where each entry corresponds to a search_phrase token and is itself an
+ # array of WordMatch instances
+ search_phrase_tokens_to_word_matches = [[] for token in search_phrase.doc]
+ # array of sets to guard against endless looping during recursion. Each set
+ # corresponds to the search phrase token at the same index and contains the Index objects
+ # of the document words for which a match to that search phrase token has already been
+ # attempted.
+ search_phrase_and_document_visited_table = [set() for token in search_phrase.doc]
+ self.match_recursively(
+ search_phrase=search_phrase,
+ search_phrase_token=search_phrase.root_token,
+ document=document,
+ document_token=document_token,
+ document_subword_index=document_subword_index,
+ search_phrase_tokens_to_word_matches=search_phrase_tokens_to_word_matches,
+ search_phrase_and_document_visited_table=search_phrase_and_document_visited_table,
+ is_uncertain=document_token._.holmes.is_uncertain,
+ structurally_matched_document_token=document_token,
+ compare_embeddings_on_non_root_words=compare_embeddings_on_non_root_words,
+ process_initial_question_words=process_initial_question_words,
+ overall_similarity_threshold=overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=
+ initial_question_word_overall_similarity_threshold)
+ working_matches = self.build_matches(
+ search_phrase=search_phrase,
+ search_phrase_tokens_to_word_matches=search_phrase_tokens_to_word_matches,
+ document_label=document_label,
+ overall_similarity_threshold=overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=
+ initial_question_word_overall_similarity_threshold)
+ matches_to_return.extend(working_matches)
+ return matches_to_return
+
+ def match(
+ self, *, document_labels_to_documents,
+ corpus_index_dict,
+ search_phrases,
+ match_depending_on_single_words,
+ compare_embeddings_on_root_words,
+ compare_embeddings_on_non_root_words,
+ reverse_matching_corpus_word_positions,
+ embedding_reverse_matching_corpus_word_positions,
+ process_initial_question_words,
+ overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold,
+ document_label_filter=None):
+ """Finds and returns matches between search phrases and documents.
+ match_depending_on_single_words -- 'True' to match only single-word search phrases,
+ 'False' to match only non-single-word search phrases, and 'None' to match both.
+ compare_embeddings_on_root_words -- if 'True', embeddings on root words are compared.
+ compare_embeddings_on_non_root_words -- if 'True', embeddings on non-root words are
+ compared.
+ reverse_matching_corpus_word_positions -- corpus word positions for non-embedding
+ reverse matching only.
+ embedding_reverse_matching_corpus_word_positions -- corpus word positions for embedding
+ and non-embedding reverse matching.
+ process_initial_question_words -- 'True' if interrogative pronouns in search phrases should
+ be matched to answering phrases in documents. Only used with topic matching.
+ overall_similarity_threshold -- the overall similarity threshold for embedding-based
+ matching.
+ initial_question_word_overall_similarity_threshold -- the overall similarity threshold for
+ embedding-based matching where the search phrase word has a dependent initial question
+ word.
+ document_label_filter -- a string with which the label of a document must begin for that
+ document to be considered for matching, or 'None' if no filter is in use.
+ """
+
+ def filter_out(document_label):
+ return document_label_filter is not None and document_label is not None and \
+ not document_label.startswith(str(document_label_filter))
+
+ if overall_similarity_threshold == 1.0 and \
+ initial_question_word_overall_similarity_threshold == 1.0:
+ compare_embeddings_on_root_words = False
+ compare_embeddings_on_non_root_words = False
+ match_specific_indexes = reverse_matching_corpus_word_positions is not None \
+ or embedding_reverse_matching_corpus_word_positions is not None
+ if reverse_matching_corpus_word_positions is None:
+ reverse_matching_corpus_word_positions = set()
+ if embedding_reverse_matching_corpus_word_positions is None:
+ embedding_reverse_matching_corpus_word_positions = set()
+
+ matches = []
+ # Dictionary used to improve performance when embedding-based matching for root tokens
+ # is active and there are multiple search phrases with the same root token word: the
+ # same corpus word positions will then match all the search phrase root tokens.
+ root_lexeme_to_cwps_to_match_dict = {}
+
+ for search_phrase in search_phrases:
+ if not search_phrase.has_single_matchable_word and match_depending_on_single_words:
+ continue
+ if search_phrase.has_single_matchable_word and \
+ match_depending_on_single_words is False:
+ continue
+ if not match_specific_indexes and (search_phrase.reverse_only or \
+ search_phrase.treat_as_reverse_only_during_initial_relation_matching):
+ continue
+ if search_phrase.has_single_matchable_word and \
+ not compare_embeddings_on_root_words and \
+ not self.semantic_matching_helper.is_entity_search_phrase_token(
+ search_phrase.root_token, search_phrase.topic_match_phraselet):
+ # We are only matching a single word without embedding, so to improve
+ # performance we avoid entering the subgraph matching code.
+ search_phrase_token = [
+ token for token in search_phrase.doc if token._.holmes.is_matchable][0]
+ existing_minimal_match_cwps = []
+ for word_matching_root_token in search_phrase.words_matching_root_token:
+ if word_matching_root_token in corpus_index_dict:
+ search_phrase_match_type, depth = \
+ search_phrase.root_word_to_match_info_dict[
+ word_matching_root_token]
+ for corpus_word_position, document_word_representation, \
+ document_match_type_is_derivation in \
+ corpus_index_dict[word_matching_root_token]:
+ if filter_out(corpus_word_position.document_label):
+ continue
+ if corpus_word_position in existing_minimal_match_cwps:
+ continue
+ document_label = corpus_word_position.document_label
+ index = corpus_word_position.index
+ doc = document_labels_to_documents[document_label]
+ if document_match_type_is_derivation:
+ document_match_type = 'derivation'
+ else:
+ document_match_type = 'direct'
+ match_type = self.match_type(
+ False, search_phrase_match_type, document_match_type)
+ minimal_match = Match(
+ search_phrase.label, search_phrase.doc_text, document_label,
+ True, search_phrase.
+ topic_match_phraselet_created_without_matching_tags,
+ search_phrase.reverse_only)
+ minimal_match.index_within_document = index.token_index
+ matched = False
+ if len(word_matching_root_token.split()) > 1:
+ for multiword_span in \
+ self.semantic_matching_helper.\
+ multiword_spans_with_head_token(
+ doc[index.token_index]):
+ for textual_representation, _ in \
+ self.semantic_matching_helper.\
+ loop_textual_representations(multiword_span):
+ if textual_representation == \
+ word_matching_root_token:
+ matched = True
+ minimal_match.word_matches.append(WordMatch(
+ search_phrase_token,
+ search_phrase_token._.holmes.lemma,
+ doc[index.token_index],
+ multiword_span.tokens[0],
+ multiword_span.tokens[-1],
+ None,
+ document_word_representation,
+ match_type,
+ 1.0, False, False, doc[index.token_index],
+ document_word_representation, depth, False))
+ break
+ if matched:
+ break
+ if not matched:
+ token = doc[index.token_index]
+ if index.is_subword():
+ subword = token._.holmes.subwords[index.subword_index]
+ else:
+ subword = None
+ minimal_match.word_matches.append(WordMatch(
+ search_phrase_token,
+ search_phrase_token._.holmes.lemma,
+ token,
+ token,
+ token,
+ subword,
+ document_word_representation,
+ match_type,
+ 1.0, token._.holmes.is_negated, False, token,
+ document_word_representation, depth, False))
+ if token._.holmes.is_negated:
+ minimal_match.is_negated = True
+ existing_minimal_match_cwps.append(corpus_word_position)
+ matches.append(minimal_match)
+ continue
+ direct_matching_corpus_word_positions = []
+ if self.semantic_matching_helper.is_entitynoun_search_phrase_token(
+ search_phrase.root_token): # phraselets are not generated for
+ # ENTITYNOUN roots, so not relevant to topic matching
+ for document_label, doc in document_labels_to_documents.items():
+ for token in doc:
+ if token.pos_ in self.semantic_matching_helper.noun_pos:
+ matches.extend(
+ self.get_matches_starting_at_root_word_match(
+ search_phrase, doc, token, None, document_label,
+ compare_embeddings_on_non_root_words,
+ process_initial_question_words,
+ overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold))
+ continue
+ matched_corpus_word_positions = set()
+ if self.semantic_matching_helper.is_entity_search_phrase_token(
+ search_phrase.root_token, search_phrase.topic_match_phraselet):
+ if search_phrase.topic_match_phraselet:
+ entity_label = search_phrase.root_token._.holmes.lemma
+ else:
+ entity_label = search_phrase.root_token.text
+ if entity_label in corpus_index_dict.keys():
+ entity_matching_corpus_word_positions = [
+ cwp for cwp, _, _ in corpus_index_dict[entity_label]]
+ if match_specific_indexes:
+ entity_matching_corpus_word_positions = [
+ cwp for cwp in entity_matching_corpus_word_positions
+ if cwp in reverse_matching_corpus_word_positions
+ or cwp in embedding_reverse_matching_corpus_word_positions
+ and not cwp.index.is_subword()]
+ matched_corpus_word_positions.update(
+ entity_matching_corpus_word_positions)
+ else:
+ for word_matching_root_token in search_phrase.words_matching_root_token:
+ if word_matching_root_token in corpus_index_dict.keys():
+ direct_matching_corpus_word_positions = [
+ cwp for cwp, _, _ in corpus_index_dict[
+ word_matching_root_token]]
+ if match_specific_indexes:
+ direct_matching_corpus_word_positions = [
+ cwp for cwp in direct_matching_corpus_word_positions
+ if cwp in reverse_matching_corpus_word_positions
+ or cwp in embedding_reverse_matching_corpus_word_positions]
+ matched_corpus_word_positions.update(
+ direct_matching_corpus_word_positions)
+ if compare_embeddings_on_root_words and not \
+ self.semantic_matching_helper.is_entity_search_phrase_token(
+ search_phrase.root_token, search_phrase.topic_match_phraselet) \
+ and not search_phrase.reverse_only and \
+ self.embedding_matching_permitted(search_phrase.root_token):
+ if not search_phrase.topic_match_phraselet and \
+ len(search_phrase.root_token._.holmes.lemma.split()) > 1:
+ root_token_lemma_to_use = search_phrase.root_token.lemma_
+ else:
+ root_token_lemma_to_use = search_phrase.root_token._.holmes.lemma
+ if root_token_lemma_to_use in root_lexeme_to_cwps_to_match_dict:
+ matched_corpus_word_positions.update(
+ root_lexeme_to_cwps_to_match_dict[root_token_lemma_to_use])
+ else:
+ working_cwps_to_match_for_cache = set()
+ for document_word in corpus_index_dict:
+ corpus_word_positions_to_match = [
+ cwp for cwp, _, _ in corpus_index_dict[document_word]]
+ if match_specific_indexes:
+ corpus_word_positions_to_match = [
+ cwp for cwp in corpus_word_positions_to_match
+ if cwp in embedding_reverse_matching_corpus_word_positions
+ and cwp not in direct_matching_corpus_word_positions]
+ if len(corpus_word_positions_to_match) == 0:
+ continue
+ search_phrase_vector = \
+ search_phrase.matchable_non_entity_tokens_to_vectors[
+ search_phrase.root_token.i]
+ example_cwp = corpus_word_positions_to_match[0]
+ example_doc = document_labels_to_documents[example_cwp.document_label]
+ example_index = example_cwp.index
+ example_document_token = example_doc[example_index.token_index]
+ if example_index.is_subword():
+ if not self.embedding_matching_permitted(
+ example_document_token._.holmes.subwords[
+ example_index.subword_index]):
+ continue
+ document_vector = example_document_token._.holmes.subwords[
+ example_index.subword_index].vector
+ else:
+ if not self.embedding_matching_permitted(example_document_token):
+ continue
+ document_vector = example_document_token._.holmes.vector
+ if search_phrase_vector is not None and document_vector is not None:
+ similarity_measure = \
+ self.semantic_matching_helper.cosine_similarity(
+ search_phrase_vector,
+ document_vector)
+ search_phrase_initial_question_word = process_initial_question_words \
+ and search_phrase.root_token._.holmes.\
+ has_initial_question_word_in_phrase
+ single_token_similarity_threshold = \
+ (initial_question_word_overall_similarity_threshold if
+ search_phrase_initial_question_word else
+ overall_similarity_threshold) ** len(
+ search_phrase.matchable_non_entity_tokens_to_vectors)
+ if similarity_measure >= single_token_similarity_threshold:
+ matched_corpus_word_positions.update(
+ corpus_word_positions_to_match)
+ working_cwps_to_match_for_cache.update(
+ corpus_word_positions_to_match)
+ root_lexeme_to_cwps_to_match_dict[root_token_lemma_to_use] = \
+ working_cwps_to_match_for_cache
+ for corpus_word_position in matched_corpus_word_positions:
+ if filter_out(corpus_word_position.document_label):
+ continue
+ doc = document_labels_to_documents[corpus_word_position.document_label]
+ matches.extend(self.get_matches_starting_at_root_word_match(
+ search_phrase, doc, doc[corpus_word_position.index.token_index],
+ corpus_word_position.index.subword_index, corpus_word_position.document_label,
+ compare_embeddings_on_non_root_words, process_initial_question_words,
+ overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold))
+ return sorted(matches, key=lambda match: (1 - float(match.overall_similarity_measure),
+ match.document_label, match.index_within_document))
+
+ def build_match_dictionaries(self, matches):
+ """Builds and returns a sorted list of match dictionaries."""
+ match_dicts = []
+ for match in matches:
+ earliest_sentence_index = sys.maxsize
+ latest_sentence_index = -1
+ for word_match in match.word_matches:
+ sentence_index = word_match.document_token.sent.start
+ if sentence_index < earliest_sentence_index:
+ earliest_sentence_index = sentence_index
+ if sentence_index > latest_sentence_index:
+ latest_sentence_index = sentence_index
+ sentences_string = ' '.join(
+ sentence.text.strip() for sentence in
+ match.word_matches[0].document_token.doc.sents if sentence.start >=
+ earliest_sentence_index and sentence.start <= latest_sentence_index)
+
+ match_dict = {
+ 'search_phrase_label': match.search_phrase_label,
+ 'search_phrase_text': match.search_phrase_text,
+ 'document': match.document_label,
+ 'index_within_document': match.index_within_document,
+ 'sentences_within_document': sentences_string,
+ 'negated': match.is_negated,
+ 'uncertain': match.is_uncertain,
+ 'involves_coreference': match.involves_coreference,
+ 'overall_similarity_measure': match.overall_similarity_measure}
+ text_word_matches = []
+ for word_match in match.word_matches:
+ text_word_matches.append({
+ 'search_phrase_token_index': word_match.search_phrase_token.i,
+ 'search_phrase_word': word_match.search_phrase_word,
+ 'document_token_index': word_match.document_token.i,
+ 'first_document_token_index': word_match.first_document_token.i,
+ 'last_document_token_index': word_match.last_document_token.i,
+ 'structurally_matched_document_token_index':
+ word_match.structurally_matched_document_token.i,
+ 'document_subword_index':
+ word_match.document_subword.index
+ if word_match.document_subword is not None else None,
+ 'document_subword_containing_token_index':
+ word_match.document_subword.containing_token_index
+ if word_match.document_subword is not None else None,
+ 'document_word': word_match.document_word,
+ 'document_phrase': self.semantic_matching_helper.get_dependent_phrase(
+ word_match.document_token, word_match.document_subword),
+ 'match_type': word_match.word_match_type,
+ 'negated': word_match.is_negated,
+ 'uncertain': word_match.is_uncertain,
+ 'similarity_measure': str(word_match.similarity_measure),
+ 'involves_coreference': word_match.involves_coreference,
+ 'extracted_word': word_match.extracted_word,
+ 'depth': word_match.depth,
+ 'explanation': word_match.explain()})
+ match_dict['word_matches'] = text_word_matches
+ match_dicts.append(match_dict)
+ return match_dicts
+
+ def sort_match_dictionaries(self, match_dictionaries):
+ return sorted(match_dictionaries,
+ key=lambda match_dict: (1 - float(match_dict['overall_similarity_measure']),
+ match_dict['document']))
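+
+ # Illustrative ordering (hypothetical values): because the key is
+ # (1 - overall_similarity_measure, document label), dictionaries with measures
+ # '1.0', '0.82' and '0.82' in documents 'b', 'c' and 'a' are returned in the order
+ # ('1.0', 'b'), ('0.82', 'a'), ('0.82', 'c'): highest similarity first, with ties
+ # broken by document label.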
diff --git a/holmes_extractor/ontology.py b/holmes_extractor/ontology.py
index 0aa044c..e4edea9 100644
--- a/holmes_extractor/ontology.py
+++ b/holmes_extractor/ontology.py
@@ -43,15 +43,15 @@ def __init__(
self._graph.load(entry)
else:
self._graph.load(ontology_path)
- self._owl_class_type = owl_class_type
- self._owl_individual_type = owl_individual_type
- self._owl_type_link = owl_type_link
- self._owl_synonym_type = owl_synonym_type
- self._owl_hyponym_type = owl_hyponym_type
- self.words, self._multiwords = self._get_words()
- self._match_dict = {}
+ self.owl_class_type = owl_class_type
+ self.owl_individual_type = owl_individual_type
+ self.owl_type_link = owl_type_link
+ self.owl_synonym_type = owl_synonym_type
+ self.owl_hyponym_type = owl_hyponym_type
+ self.words, self._multiwords = self.get_words()
+ self.match_dict = {}
self.symmetric_matching = symmetric_matching
- self._populate_dictionary()
+ self.populate_dictionary()
class Entry:
"""Args:
@@ -67,25 +67,25 @@ def __init__(self, word, depth, is_individual):
self.depth = depth
self.is_individual = is_individual
- def _populate_dictionary(self):
+ def populate_dictionary(self):
"""Generates the dictionary from search phrase words to matching document words."""
- for class_id, _, _ in self._get_classes():
- entry_word = self._get_entry_word(class_id).lower()
- if entry_word in self._match_dict:
- entry_set = self._match_dict[entry_word]
+ for class_id, _, _ in self.get_classes():
+ entry_word = self.get_entry_word(class_id).lower()
+ if entry_word in self.match_dict:
+ entry_set = self.match_dict[entry_word]
else:
- self._match_dict[entry_word] = entry_set = set()
- self._recursive_add_to_dict(
+ self.match_dict[entry_word] = entry_set = set()
+ self.recursive_add_to_dict(
entry_set, entry_word, class_id, set(), 0, False, False,
self.symmetric_matching)
- for class_id, _, _ in self._get_individuals():
- entry_word = self._get_entry_word(class_id).lower()
- if entry_word in self._match_dict:
- entry_set = self._match_dict[entry_word]
+ for class_id, _, _ in self.get_individuals():
+ entry_word = self.get_entry_word(class_id).lower()
+ if entry_word in self.match_dict:
+ entry_set = self.match_dict[entry_word]
else:
- self._match_dict[entry_word] = entry_set = set()
- self._recursive_add_to_dict(
+ self.match_dict[entry_word] = entry_set = set()
+ self.recursive_add_to_dict(
entry_set, entry_word, class_id, set(), 0, True, False,
self.symmetric_matching)
@@ -103,8 +103,8 @@ def matches(self, search_phrase_word, candidate_word):
Matching is defined as *candidate_word* being a hyponym, synonym or individual instance
of *search_phrase_word*. Where *symmetric_matching==True*, matching also encompasses
*search_phrase_word* being a hyponym of *candidate_word*."""
- if search_phrase_word.lower() in self._match_dict.keys():
- for entry in self._match_dict[search_phrase_word.lower()]:
+ if search_phrase_word.lower() in self.match_dict.keys():
+ for entry in self.match_dict[search_phrase_word.lower()]:
if entry.word.lower() == candidate_word.lower():
return entry
return None
@@ -114,9 +114,9 @@ def get_words_matching(self, search_phrase_word):
as well as the hypernyms where *symmetric_matching==True*.
All words are set to lower case.
"""
- if search_phrase_word.lower() in self._match_dict.keys():
+ if search_phrase_word.lower() in self.match_dict.keys():
return set(map(
- lambda entry: entry.word.lower(), self._match_dict[search_phrase_word.lower()]))
+ lambda entry: entry.word.lower(), self.match_dict[search_phrase_word.lower()]))
else:
return []
@@ -126,38 +126,38 @@ def get_words_matching_and_depths(self, search_phrase_word):
the corresponding depths.
All words are set to lower case.
"""
- if search_phrase_word.lower() in self._match_dict.keys():
+ if search_phrase_word.lower() in self.match_dict.keys():
return set(map(
lambda entry: (entry.word.lower(), entry.depth),
- self._match_dict[search_phrase_word.lower()]))
+ self.match_dict[search_phrase_word.lower()]))
else:
return []
- def _get_classes(self):
+ def get_classes(self):
"""Returns all classes from the loaded ontology."""
return self._graph.triples((
- None, rdflib.term.URIRef(self._owl_type_link),
- rdflib.term.URIRef(self._owl_class_type)))
+ None, rdflib.term.URIRef(self.owl_type_link),
+ rdflib.term.URIRef(self.owl_class_type)))
- def _get_individuals(self):
+ def get_individuals(self):
"""Returns all classes from the loaded ontology."""
return self._graph.triples((
- None, rdflib.term.URIRef(self._owl_type_link),
- rdflib.term.URIRef(self._owl_individual_type)))
+ None, rdflib.term.URIRef(self.owl_type_link),
+ rdflib.term.URIRef(self.owl_individual_type)))
- def _get_words(self):
+ def get_words(self):
"""Finds all words in the loaded ontology and returns multiwords in a separate list."""
words = []
multiwords = []
for class_id, _, _ in chain(
- self._get_classes(), self._get_individuals()):
- entry_word = self._get_entry_word(class_id)
+ self.get_classes(), self.get_individuals()):
+ entry_word = self.get_entry_word(class_id)
words.append(entry_word.lower())
if ' ' in entry_word:
multiwords.append(entry_word.lower())
return words, multiwords
- def _recursive_add_to_dict(
+ def recursive_add_to_dict(
self, entry_set, word, working_entry_url, visited, depth, is_individual, is_hypernym,
symmetric):
"""Adds synonyms and hyponyms of a search phrase word to its dictionary.
@@ -179,41 +179,41 @@ def _recursive_add_to_dict(
"""
if working_entry_url not in visited:
visited.add(working_entry_url)
- working_entry_word = self._get_entry_word(working_entry_url)
+ working_entry_word = self.get_entry_word(working_entry_url)
if word.lower() != working_entry_word.lower():
entry_set.add(self.Entry(working_entry_word, depth, is_individual))
if not is_hypernym: # prevent recursive traversal of adjacent branches
for entry, _, _ in self._graph.triples((
- None, rdflib.term.URIRef(self._owl_hyponym_type), working_entry_url)):
- self._recursive_add_to_dict(
+ None, rdflib.term.URIRef(self.owl_hyponym_type), working_entry_url)):
+ self.recursive_add_to_dict(
entry_set, word, entry, visited, depth+1, False, False, symmetric)
for entry, _, _ in self._graph.triples((
- None, rdflib.term.URIRef(self._owl_type_link), working_entry_url)):
- self._recursive_add_to_dict(
+ None, rdflib.term.URIRef(self.owl_type_link), working_entry_url)):
+ self.recursive_add_to_dict(
entry_set, word, entry, visited, depth+1, True, False, symmetric)
for entry, _, _ in self._graph.triples((
- None, rdflib.term.URIRef(self._owl_synonym_type), working_entry_url)):
- self._recursive_add_to_dict(
+ None, rdflib.term.URIRef(self.owl_synonym_type), working_entry_url)):
+ self.recursive_add_to_dict(
entry_set, word, entry, visited, depth, False, False, symmetric)
for _, _, entry in self._graph.triples((
- working_entry_url, rdflib.term.URIRef(self._owl_synonym_type), None)):
- self._recursive_add_to_dict(
+ working_entry_url, rdflib.term.URIRef(self.owl_synonym_type), None)):
+ self.recursive_add_to_dict(
entry_set, word, entry, visited, depth, False, False, symmetric)
if symmetric and depth <= 0:
for _, _, entry in self._graph.triples((
- working_entry_url, rdflib.term.URIRef(self._owl_hyponym_type), None)):
- self._recursive_add_to_dict(
+ working_entry_url, rdflib.term.URIRef(self.owl_hyponym_type), None)):
+ self.recursive_add_to_dict(
entry_set, word, entry, visited, depth-1, False, True, symmetric)
if is_individual:
for _, _, entry in self._graph.triples((
- working_entry_url, rdflib.term.URIRef(self._owl_type_link), None)):
- if entry != rdflib.term.URIRef(self._owl_individual_type):
- self._recursive_add_to_dict(
+ working_entry_url, rdflib.term.URIRef(self.owl_type_link), None)):
+ if entry != rdflib.term.URIRef(self.owl_individual_type):
+ self.recursive_add_to_dict(
entry_set, word, entry, visited, depth-1, False, True, symmetric)
# setting depth to a negative value ensures the hypernym
# can never qualify as being equally or more specific than the original match.
- def _get_entry_word(self, class_id):
+ def get_entry_word(self, class_id):
"""Converts an OWL URL into an entry word
The fragment is retrieved from the URL and underscores are replaced with spaces.
@@ -228,17 +228,17 @@ def get_most_general_hypernym_ancestor(self, word):
"""
matching_set = set()
for clazz in (
- clazz for clazz, t, m in self._get_classes() if
- self._get_entry_word(clazz).lower() == word.lower()):
+ clazz for clazz, t, m in self.get_classes() if
+ self.get_entry_word(clazz).lower() == word.lower()):
this_class_set = set()
- self._recursive_add_to_dict(
+ self.recursive_add_to_dict(
this_class_set, word, clazz, set(), 0, False, False, True)
matching_set |= this_class_set
for individual in (
- individual for individual, t, m in self._get_individuals() if
- self._get_entry_word(individual).lower() == word.lower()):
+ individual for individual, t, m in self.get_individuals() if
+ self.get_entry_word(individual).lower() == word.lower()):
this_individual_set = set()
- self._recursive_add_to_dict(
+ self.recursive_add_to_dict(
this_individual_set, word, individual, set(), 0, True, False, True)
matching_set |= this_individual_set
matching_list = sorted(matching_set, key=lambda entry: (entry.depth, entry.word))
diff --git a/holmes_extractor/parsing.py b/holmes_extractor/parsing.py
new file mode 100644
index 0000000..278ba9e
--- /dev/null
+++ b/holmes_extractor/parsing.py
@@ -0,0 +1,2236 @@
+import math
+import pickle
+import importlib
+from abc import ABC, abstractmethod
+from functools import total_ordering
+import srsly
+import pkg_resources
+from numpy import dot
+from numpy.linalg import norm
+from spacy.tokens import Token, Doc
+from .errors import WrongModelDeserializationError, WrongVersionDeserializationError,\
+ DocumentTooBigError, SearchPhraseContainsNegationError,\
+ SearchPhraseContainsConjunctionError, SearchPhraseWithoutMatchableWordsError,\
+ SearchPhraseContainsMultipleClausesError, SearchPhraseContainsCoreferringPronounError
+
+SERIALIZED_DOCUMENT_VERSION = '3.1'
+
+class SemanticDependency:
+ """A labelled semantic dependency between two tokens."""
+
+ def __init__(self, parent_index, child_index, label=None, is_uncertain=False):
+ """Args:
+
+ parent_index -- the index of the parent token within the document.
+ child_index -- the index of the child token within the document or, for a grammatical
+ dependency, -1 minus the index of the child token within the document. A
+ grammatical dependency is always in a non-final position within a chain of dependencies
+ ending in one or more non-grammatical (lexical / normal) dependencies. When creating
+ both Holmes semantic structures and search phrases, grammatical dependencies are
+ sometimes replaced by the lexical dependencies at the end of their chains.
+ label -- the label of the semantic dependency, which must be *None* for grammatical
+ dependencies.
+ is_uncertain -- if *True*, any match involving this dependency will itself be uncertain.
+ """
+ if child_index < 0 and label is not None:
+ raise RuntimeError(
+ 'Semantic dependency with negative child index may not have a label.')
+ if parent_index == child_index:
+ raise RuntimeError(' '.join((
+ 'Attempt to create self-referring semantic dependency with index',
+ str(parent_index))))
+ self.parent_index = parent_index
+ self.child_index = child_index
+ self.label = label
+ self.is_uncertain = is_uncertain
+
+ def parent_token(self, doc):
+ """Convenience method to return the parent token of this dependency.
+
+ doc -- the document containing the token.
+ """
+ index = self.parent_index
+ if index < 0:
+ index = -1 - index
+ return doc[index]
+
+ def child_token(self, doc):
+ """Convenience method to return the child token of this dependency.
+
+ doc -- the document containing the token.
+ """
+ index = self.child_index
+ if index < 0:
+ index = -1 - index
+ return doc[index]
+
+ def __str__(self):
+ """e.g. *2:nsubj* or *2:nsubj(U)* to represent uncertainty."""
+ working_label = str(self.label)
+ if self.is_uncertain:
+ working_label = ''.join((working_label, '(U)'))
+ return ':'.join((str(self.child_index), working_label))
+
+ def __eq__(self, other):
+ return isinstance(other, SemanticDependency) and \
+ self.parent_index == other.parent_index and self.child_index == other.child_index \
+ and self.label == other.label and self.is_uncertain == other.is_uncertain
+
+ def __hash__(self):
+ return hash((self.parent_index, self.child_index, self.label, self.is_uncertain))
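+
+# Illustrative sketch (hypothetical indices and a hypothetical parsed 'doc'): a
+# grammatical dependency on the token at index 5 is encoded as child_index == -1 - 5,
+# and child_token() reverses the encoding:
+#
+# >>> dep = SemanticDependency(parent_index=3, child_index=-1 - 5)
+# >>> dep.child_token(doc).i
+# 5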
+
+class Mention:
+ """ Simplified information about a coreference mention with respect to a specific token. """
+
+ def __init__(self, root_index, indexes):
+ """
+ root_index -- the index of the member of *indexes* that forms the syntactic head of any
+ coordinated phrase, or *indexes[0]* if *len(indexes) == 1*.
+ indexes -- the indexes of the tokens that make up the mention. If there is more than one
+ token, they must form a coordinated phrase.
+ """
+ self.root_index = root_index
+ self.indexes = indexes
+
+ def __str__(self):
+ return ''.join(('[', str(self.root_index), '; ', str(self.indexes), ']'))
+
+class Subword:
+ """A semantically atomic part of a word. Currently only used for German.
+
+ containing_token_index -- the index of the containing token within the document.
+ index -- the index of the subword within the word.
+ text -- the original subword string.
+ lemma -- the model-normalized representation of the subword string.
+ derived_lemma -- where relevant, another lemma with which *lemma* is derivationally related
+ and which can also be useful for matching in some use cases; otherwise *None*
+ vector -- the vector representation of *lemma*, or *None* if there is none available.
+ char_start_index -- the character index of the subword within the containing word.
+ dependent_index -- the index of a subword that is dependent on this subword, or *None*
+ if there is no such subword.
+ dependency_label -- the label of the dependency between this subword and its dependent,
+ or *None* if it has no dependent.
+ governor_index -- the index of a subword on which this subword is dependent, or *None*
+ if there is no such subword.
+ governing_dependency_label -- the label of the dependency between this subword and its
+ governor, or *None* if it has no governor.
+ """
+ def __init__(
+ self, containing_token_index, index, text, lemma, derived_lemma, vector,
+ char_start_index, dependent_index, dependency_label, governor_index,
+ governing_dependency_label):
+ self.containing_token_index = containing_token_index
+ self.index = index
+ self.text = text
+ self.lemma = lemma
+ self.derived_lemma = derived_lemma
+ self.vector = vector
+ self.char_start_index = char_start_index
+ self.dependent_index = dependent_index
+ self.dependency_label = dependency_label
+ self.governor_index = governor_index
+ self.governing_dependency_label = governing_dependency_label
+
+ def lemma_or_derived_lemma(self):
+ if self.derived_lemma is not None:
+ return self.derived_lemma
+ else:
+ return self.lemma
+
+ @property
+ def is_head(self):
+ return self.governor_index is None
+
+ def __str__(self):
+ if self.derived_lemma is not None:
+ lemma_string = ''.join((self.lemma, '(', self.derived_lemma, ')'))
+ else:
+ lemma_string = self.lemma
+ return '/'.join((self.text, lemma_string))
+
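+# Illustrative sketch (hypothetical values): a German subword whose text is
+# 'extraktions', whose lemma is 'extraktion' and whose derived lemma is 'extrahieren'
+# would be rendered by __str__ as 'extraktions/extraktion(extrahieren)'; without a
+# derived lemma the rendering would simply be 'extraktions/extraktion'.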
+
+@total_ordering
+class Index:
+ """ The position of a word or subword within a document. """
+
+ def __init__(self, token_index, subword_index):
+ self.token_index = token_index
+ self.subword_index = subword_index
+
+ def is_subword(self):
+ return self.subword_index is not None
+
+ def __eq__(self, other):
+ return isinstance(other, Index) and \
+ self.token_index == other.token_index and self.subword_index == other.subword_index
+
+ def __lt__(self, other):
+ if not isinstance(other, Index):
+ raise RuntimeError('Comparison between Index and another type.')
+ if self.token_index < other.token_index:
+ return True
+ if not self.is_subword() and other.is_subword():
+ return True
+ if self.is_subword() and other.is_subword() and self.subword_index < other.subword_index:
+ return True
+ return False
+
+ def __hash__(self):
+ return hash((self.token_index, self.subword_index))
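+
+# Illustrative sketch of the ordering: a whole token sorts before its own subwords,
+# and lower token indexes sort first regardless of subwords, e.g.
+#
+# >>> Index(3, None) < Index(3, 0)
+# True
+# >>> Index(2, 1) < Index(3, None)
+# True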
+
+class CorpusWordPosition:
+ """ A reference to a word or subword within a corpus of one or more documents. """
+ def __init__(self, document_label, index):
+ if document_label is None:
+ raise RuntimeError('CorpusWordPosition.document_label must have a value.')
+ self.document_label = document_label
+ self.index = index
+
+ def __eq__(self, other):
+ return isinstance(other, CorpusWordPosition) and self.document_label == \
+ other.document_label and self.index == other.index
+
+ def __hash__(self):
+ return hash((self.document_label, self.index))
+
+ def __str__(self):
+ return ':'.join((self.document_label, str(self.index)))
+
+class MultiwordSpan:
+
+ def __init__(self, text, lemma, derived_lemma, tokens):
+ """Args:
+
+ text -- the raw text representation of the multiword span
+ lemma - the lemma representation of the multiword span
+ derived_lemma - the lemma representation with individual words that have derived
+ lemmas replaced by those derived lemmas
+ tokens -- a list of tokens that make up the multiword span
+ """
+ self.text = text
+ self.lemma = lemma
+ self.derived_lemma = derived_lemma
+ self.tokens = tokens
+
+class MatchImplication:
+ """Entry describing which document dependencies match a given search phrase dependency.
+
+ Parameters:
+
+ search_phrase_dependency -- the search phrase dependency.
+ document_dependencies -- the matching document dependencies.
+ reverse_document_dependencies -- document dependencies that match when the polarity is
+ opposite to the polarity of *search_phrase_dependency*.
+ """
+
+ def __init__(
+ self, *, search_phrase_dependency, document_dependencies,
+ reverse_document_dependencies=[]):
+ self.search_phrase_dependency = search_phrase_dependency
+ self.document_dependencies = document_dependencies
+ self.reverse_document_dependencies = reverse_document_dependencies
+
+class HolmesDocumentInfo:
+ def __init__(self, semantic_analyzer):
+ self.model = semantic_analyzer.model
+ self.serialized_document_version = SERIALIZED_DOCUMENT_VERSION
+
+ @srsly.msgpack_encoders("holmes_document_info_holder")
+ def serialize_obj(obj, chain=None):
+ if isinstance(obj, HolmesDocumentInfo):
+ return {'__holmes_document_info_holder__': pickle.dumps(obj)}
+ return obj if chain is None else chain(obj)
+
+ @srsly.msgpack_decoders("holmes_document_info_holder")
+ def deserialize_obj(obj, chain=None):
+ if '__holmes_document_info_holder__' in obj:
+ return pickle.loads(obj['__holmes_document_info_holder__'])
+ return obj if chain is None else chain(obj)
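+
+# Illustrative sketch (hypothetical 'semantic_analyzer'): the encoder/decoder hooks
+# registered above let srsly round-trip the object within msgpack payloads such as
+# serialized spaCy user data:
+#
+# >>> payload = srsly.msgpack_dumps({'info': HolmesDocumentInfo(semantic_analyzer)})
+# >>> srsly.msgpack_loads(payload)['info'].serialized_document_version
+# '3.1'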
+
+class HolmesDictionary:
+ """The holder object for token-level semantic information managed by Holmes
+
+ Holmes dictionaries are accessed using the syntax *token._.holmes*.
+
+ index -- the index of the token
+ lemma -- the value returned from *._.holmes.lemma* for the token.
+ derived_lemma -- the value returned from *._.holmes.derived_lemma* for the token; where relevant,
+ another lemma with which *lemma* is derivationally related and which can also be useful for
+ matching in some use cases; otherwise *None*.
+ vector -- the vector representation of *lemma*, unless *lemma* is a multiword, in which case
+ the vector representation of *token.lemma_* is used instead. *None* where there is no
+ vector for the lexeme.
+ """
+
+ def __init__(self, index, lemma, derived_lemma, vector):
+ self.index = index
+ self.lemma = lemma
+ self._derived_lemma = derived_lemma
+ self.vector = vector
+ self.children = [] # list of *SemanticDependency* objects where this token is the parent.
+ self.parents = [] # list of *SemanticDependency* objects where this token is the child.
+ self.righthand_siblings = [] # list of tokens to the right of this token that stand in a
+ # conjunction relationship to this token and that share its semantic parents.
+ self.token_or_lefthand_sibling_index = None # the index of this token's lefthand sibling,
+ # or this token's own index if this token has no lefthand sibling.
+ self.is_involved_in_or_conjunction = False
+ self.is_negated = None
+ self.is_matchable = None
+ self.is_initial_question_word = False
+ self.has_initial_question_word_in_phrase = False
+ self.coreference_linked_child_dependencies = [] # list of [index, label] specifications of
+ # dependencies where this token is the parent, taking any coreference resolution into
+ # account. Used in topic matching.
+ self.coreference_linked_parent_dependencies = [] # list of [index, label] specifications of
+ # dependencies where this token is the child, taking any coreference resolution into
+ # account. Used in topic matching.
+ self.token_and_coreference_chain_indexes = None # where no coreference, only the token
+ # index; where coreference, the token index followed by the indexes of coreferring tokens
+ self.mentions = []
+ self.subwords = []
+
+ @property
+ def derived_lemma(self):
+ if self.lemma == self._derived_lemma: # can occur with phraselets
+ return None
+ else:
+ return self._derived_lemma
+
+ @derived_lemma.setter
+ def derived_lemma(self, derived_lemma):
+ self._derived_lemma = derived_lemma
+
+ def lemma_or_derived_lemma(self):
+ if self._derived_lemma is not None:
+ return self._derived_lemma
+ else:
+ return self.lemma
+
+ @property
+ def is_uncertain(self):
+ """if *True*, a match involving this token will itself be uncertain."""
+ return self.is_involved_in_or_conjunction
+
+ def loop_token_and_righthand_siblings(self, doc):
+ """Convenience generator to loop through this token and any righthand siblings."""
+ indexes = [self.index]
+ indexes.extend(self.righthand_siblings)
+ indexes = sorted(indexes) # in rare cases involving truncated nouns in German, righthand
+ # siblings can actually end up to the left of the head word.
+ for index in indexes:
+ yield doc[index]
+
+ def get_sibling_indexes(self, doc):
+ """ Returns the indexes of this token and any siblings, ordered from left to right. """
+ # with truncated nouns in German, the righthand siblings may occasionally occur to the left
+ # of the head noun
+ head_sibling = doc[self.token_or_lefthand_sibling_index]
+ indexes = [self.token_or_lefthand_sibling_index]
+ indexes.extend(head_sibling._.holmes.righthand_siblings)
+ return sorted(indexes)
+
+ def has_dependency_with_child_index(self, index):
+ for dependency in self.children:
+ if dependency.child_index == index:
+ return True
+ return False
+
+ def get_label_of_dependency_with_child_index(self, index):
+ for dependency in self.children:
+ if dependency.child_index == index:
+ return dependency.label
+ return None
+
+ def has_dependency_with_label(self, label):
+ for dependency in self.children:
+ if dependency.label == label:
+ return True
+ return False
+
+ def has_dependency_with_child_index_and_label(self, index, label):
+ for dependency in self.children:
+ if dependency.child_index == index and dependency.label == label:
+ return True
+ return False
+
+ def remove_dependency_with_child_index(self, index):
+ self.children = [dep for dep in self.children if dep.child_index != index]
+
+ def string_representation_of_children(self):
+ children = sorted(
+ self.children, key=lambda dependency: dependency.child_index)
+ return '; '.join(str(child) for child in children)
+
+ def string_representation_of_parents(self):
+ parents = sorted(
+ self.parents, key=lambda dependency: dependency.parent_index)
+ return '; '.join(':'.join((str(parent.parent_index), parent.label)) for parent in parents)
+
+ def is_involved_in_coreference(self):
+ return len(self.mentions) > 0
+
+ @srsly.msgpack_encoders("holmes_dictionary_holder")
+ def serialize_obj(obj, chain=None):
+ if isinstance(obj, HolmesDictionary):
+ return {'__holmes_dictionary_holder__': pickle.dumps(obj)}
+ return obj if chain is None else chain(obj)
+
+ @srsly.msgpack_decoders("holmes_dictionary_holder")
+ def deserialize_obj(obj, chain=None):
+ if '__holmes_dictionary_holder__' in obj:
+ return pickle.loads(obj['__holmes_dictionary_holder__'])
+ return obj if chain is None else chain(obj)
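+
+# Illustrative sketch (hypothetical values): the derived_lemma property hides the
+# derived lemma when it is identical to the lemma itself, as can happen with phraselets:
+#
+# >>> HolmesDictionary(0, 'assessment', 'assess', None).derived_lemma
+# 'assess'
+# >>> HolmesDictionary(0, 'assess', 'assess', None).derived_lemma is None
+# True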
+
+class PhraseletTemplate:
+ """A template for a phraselet used in topic matching.
+
+ Properties:
+
+ label -- a label for the relation which will be used to form part of the labels of phraselets
+ derived from this template.
+ template_sentence -- a sentence with the target grammatical structure for phraselets derived
+ from this template.
+ template_doc -- a spacy Doc representing *template_sentence* (set by the *Manager* object)
+ parent_index -- the index within *template_sentence* of the parent participant in the dependency
+ (for relation phraselets) or of the word (for single-word phraselets).
+ child_index -- the index within *template_sentence* of the child participant in the dependency
+ (for relation phraselets) or 'None' for single-word phraselets.
+ dependency_labels -- the labels of dependencies that match the template
+ (for relation phraselets) or 'None' for single-word phraselets.
+ parent_tags -- the tag_ values of parent participants in the dependency (for relation
+ phraselets) or of the word (for single-word phraselets) that match the template.
+ child_tags -- the tag_ values of child participants in the dependency (for relation
+ phraselets) that match the template, or 'None' for single-word phraselets.
+ reverse_only -- specifies that relation phraselets derived from this template should only be
+ reverse-matched, e.g. matching should only be attempted during topic matching when the
+ possible child token has already been matched to a single-word phraselet. This
+ is used for performance reasons when the parent tag belongs to a closed word class like
+ prepositions. Reverse-only phraselets are ignored in supervised document classification.
+ question -- a question template involves an interrogative pronoun in the child position and is
+ always matched regardless of corpus frequencies.
+ assigned_dependency_label -- if a value other than 'None', specifies a dependency label that
+ should be used to relabel the relationship between the parent and child participants.
+ Has no effect if child_index is None.
+ """
+
+ def __init__(
+ self, label, template_sentence, parent_index, child_index,
+ dependency_labels, parent_tags, child_tags, *, reverse_only,
+ question, assigned_dependency_label=None):
+ self.label = label
+ self.template_sentence = template_sentence
+ self.parent_index = parent_index
+ self.child_index = child_index
+ self.dependency_labels = dependency_labels
+ self.parent_tags = parent_tags
+ self.child_tags = child_tags
+ self.reverse_only = reverse_only
+ self.question = question
+ self.assigned_dependency_label = assigned_dependency_label
+
+ def single_word(self):
+ """ 'True' if this is a template for single-word phraselets, otherwise 'False'. """
+ return self.child_index is None
+
+class PhraseletInfo:
+ """Information describing a topic matching phraselet.
+
+ Parameters:
+
+ label -- the phraselet label, e.g. 'predicate-patient: open-door'
+ template_label -- the value of 'PhraseletTemplate.label', e.g. 'predicate-patient'
+ parent_lemma -- the parent lemma, or the lemma for single-word phraselets.
+ parent_derived_lemma -- the parent derived lemma, or the derived lemma for single-word
+ phraselets.
+ parent_pos -- the part of speech tag of the token that supplied the parent word.
+ parent_ent_type -- the parent entity label, or the entity label for single-word
+ phraselets. '' if there is none.
+ parent_is_initial_question_word -- 'True' or 'False'
+ parent_has_initial_question_word_in_phrase -- 'True' or 'False'
+ child_lemma -- the child lemma, or 'None' for single-word phraselets.
+ child_derived_lemma -- the child derived lemma, or 'None' for single-word phraselets.
+ child_pos -- the part of speech tag of the token that supplied the child word, or 'None'
+ for single-word phraselets.
+ child_ent_type -- the child entity label. '' if there is none; 'None' for single-word
+ phraselets.
+ child_is_initial_question_word -- 'True' or 'False'
+ child_has_initial_question_word_in_phrase -- 'True' or 'False'
+ created_without_matching_tags -- 'True' if created without matching tags.
+ reverse_only_parent_lemma -- 'True' if the parent lemma is in the reverse matching list.
+ frequency_factor -- a multiplication factor between 0.0 and 1.0 which is lower the more
+ frequently words occur in the corpus, relating to the whole phraselet.
+ parent_frequency_factor -- a multiplication factor between 0.0 and 1.0 which is lower the
+ more frequently words occur in the corpus, relating to the parent token.
+ child_frequency_factor -- a multiplication factor between 0.0 and 1.0 which is lower the
+ more frequently words occur in the corpus, relating to the child token.
+ """
+
+ def __init__(
+ self, label, template_label, parent_lemma, parent_derived_lemma,
+ parent_pos, parent_ent_type, parent_is_initial_question_word,
+ parent_has_initial_question_word_in_phrase, child_lemma, child_derived_lemma,
+ child_pos, child_ent_type, child_is_initial_question_word,
+ child_has_initial_question_word_in_phrase, created_without_matching_tags,
+ reverse_only_parent_lemma, frequency_factor, parent_frequency_factor,
+ child_frequency_factor):
+ self.label = label
+ self.template_label = template_label
+
+ self.parent_lemma = parent_lemma
+ self.parent_derived_lemma = parent_derived_lemma
+ self.parent_pos = parent_pos
+ self.parent_ent_type = parent_ent_type
+ self.parent_is_initial_question_word = parent_is_initial_question_word
+ self.parent_has_initial_question_word_in_phrase = parent_has_initial_question_word_in_phrase
+ self.child_lemma = child_lemma
+ self.child_derived_lemma = child_derived_lemma
+ self.child_pos = child_pos
+ self.child_ent_type = child_ent_type
+ self.child_is_initial_question_word = child_is_initial_question_word
+ self.child_has_initial_question_word_in_phrase = child_has_initial_question_word_in_phrase
+ self.created_without_matching_tags = created_without_matching_tags
+ self.reverse_only_parent_lemma = reverse_only_parent_lemma
+ self.frequency_factor = frequency_factor
+ self.parent_frequency_factor = parent_frequency_factor
+ self.child_frequency_factor = child_frequency_factor
+
+ def __eq__(self, other):
+ return isinstance(other, PhraseletInfo) and \
+ self.label == other.label and \
+ self.template_label == other.template_label and \
+ self.parent_lemma == other.parent_lemma and \
+ self.parent_derived_lemma == other.parent_derived_lemma and \
+ self.parent_pos == other.parent_pos and \
+ self.parent_ent_type == other.parent_ent_type and \
+ self.parent_is_initial_question_word == other.parent_is_initial_question_word and \
+ self.parent_has_initial_question_word_in_phrase == \
+ other.parent_has_initial_question_word_in_phrase and \
+ self.child_lemma == other.child_lemma and \
+ self.child_derived_lemma == other.child_derived_lemma and \
+ self.child_pos == other.child_pos and \
+ self.child_ent_type == other.child_ent_type and \
+ self.child_is_initial_question_word == other.child_is_initial_question_word and \
+ self.child_has_initial_question_word_in_phrase == \
+ other.child_has_initial_question_word_in_phrase and \
+ self.created_without_matching_tags == other.created_without_matching_tags and \
+ self.reverse_only_parent_lemma == other.reverse_only_parent_lemma and \
+ str(self.frequency_factor) == str(other.frequency_factor) and \
+ str(self.parent_frequency_factor) == str(other.parent_frequency_factor) and \
+ str(self.child_frequency_factor) == str(other.child_frequency_factor)
+
+ def __hash__(self):
+ return hash((
+ self.label, self.template_label, self.parent_lemma, self.parent_derived_lemma,
+ self.parent_pos, self.parent_ent_type, self.parent_is_initial_question_word,
+ self.parent_has_initial_question_word_in_phrase, self.child_lemma,
+ self.child_derived_lemma, self.child_pos, self.child_ent_type,
+ self.child_is_initial_question_word, self.child_has_initial_question_word_in_phrase,
+ self.created_without_matching_tags, self.reverse_only_parent_lemma,
+ str(self.frequency_factor), str(self.parent_frequency_factor),
+ str(self.child_frequency_factor)))
+
+class SearchPhrase:
+
+ def __init__(self, doc, matchable_token_indexes, root_token_index,
+ matchable_non_entity_tokens_to_vectors, label, topic_match_phraselet,
+ topic_match_phraselet_created_without_matching_tags, question_phraselet, reverse_only,
+ treat_as_reverse_only_during_initial_relation_matching, words_matching_root_token,
+ root_word_to_match_info_dict, has_single_matchable_word):
+ """Args:
+
+ doc -- the Holmes document created for the search phrase
+ matchable_token_indexes -- a list of indexes of tokens all of which must have counterparts
+ in the document to produce a match
+ root_token_index -- the index of the token at which recursive matching starts
+ matchable_non_entity_tokens_to_vectors -- dictionary from token indexes to vectors.
+ Only used when embedding matching is active.
+ label -- a label for the search phrase.
+ topic_match_phraselet -- 'True' if a topic match phraselet, otherwise 'False'.
+ topic_match_phraselet_created_without_matching_tags -- 'True' if a topic match
+ phraselet created without matching tags (match_all_words), otherwise 'False'.
+ question_phraselet -- 'True' if a topic match phraselet where the child member is
+ an initial question word, otherwise 'False'
+ reverse_only -- 'True' if a phraselet that should only be reverse-matched.
+ treat_as_reverse_only_during_initial_relation_matching -- phraselets are
+ treat_as_reverse_only_during_initial_relation_matching -- set to *True* for
+ phraselets in the context of topic matching to prevent them from being taken into
+ account during initial relation matching because the parent relation occurs too
+ frequently within the corpus. *reverse_only* cannot be used instead because it
+ words_matching_root_token -- a list of words that match the root token.
+ root_word_to_match_info_dict -- a dictionary from words in *words_matching_root_token*
+ to match information tuples.
+ has_single_matchable_word -- **True** or **False**.
+ """
+ self.doc = doc
+ self.doc_text = doc.text
+ self.matchable_token_indexes = matchable_token_indexes
+ self.root_token_index = root_token_index
+ self.matchable_non_entity_tokens_to_vectors = matchable_non_entity_tokens_to_vectors
+ self.label = label
+ self.topic_match_phraselet = topic_match_phraselet
+ self.topic_match_phraselet_created_without_matching_tags = \
+ topic_match_phraselet_created_without_matching_tags
+ self.question_phraselet = question_phraselet
+ self.reverse_only = reverse_only
+ self.treat_as_reverse_only_during_initial_relation_matching = \
+ treat_as_reverse_only_during_initial_relation_matching
+ self.words_matching_root_token = words_matching_root_token
+ self.root_word_to_match_info_dict = root_word_to_match_info_dict
+ self.has_single_matchable_word = has_single_matchable_word  # len(matchable_token_indexes) == 1
+
+ @property
+ def matchable_tokens(self):
+ return [self.doc[index] for index in self.matchable_token_indexes]
+
+ @property
+ def root_token(self):
+ return self.doc[self.root_token_index]
+
+ def pack(self):
+ self.serialized_doc = self.doc.to_bytes()
+ self.doc = None
+
+ def unpack(self, vocab):
+ self.doc = Doc(vocab).from_bytes(self.serialized_doc)
+ self.serialized_doc = None
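+
+# Illustrative sketch (hypothetical 'search_phrase' and 'vocab'): pack() replaces the
+# spaCy Doc with its byte representation so the search phrase can be serialized, and
+# unpack() restores it against a Vocab:
+#
+# >>> search_phrase.pack()
+# >>> search_phrase.doc is None
+# True
+# >>> search_phrase.unpack(vocab)
+# >>> search_phrase.doc.text == search_phrase.doc_text
+# True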
+
+class SemanticAnalyzerFactory():
+ """Returns the correct *SemanticAnalyzer* for the model language.
+ This class must be added to if additional implementations are added for new languages.
+ """
+
+ def semantic_analyzer(self, *, nlp, vectors_nlp):
+ language = nlp.meta['lang']
+ try:
+ language_specific_rules_module = importlib.import_module(
+ '.'.join(('.lang', language, 'language_specific_rules')),
+ 'holmes_extractor')
+ except ModuleNotFoundError:
+ raise ValueError(' '.join(('Language', language, 'not supported')))
+ return language_specific_rules_module.\
+ LanguageSpecificSemanticAnalyzer(nlp=nlp, vectors_nlp=vectors_nlp)
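+
+# Illustrative sketch (hypothetical 'nlp' model): the factory resolves the implementation
+# from the model's language metadata, so an English model leads to
+# holmes_extractor.lang.en.language_specific_rules.LanguageSpecificSemanticAnalyzer:
+#
+# >>> analyzer = SemanticAnalyzerFactory().semantic_analyzer(nlp=nlp, vectors_nlp=nlp)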
+
+class SemanticAnalyzer(ABC):
+ """Abstract *SemanticAnalyzer* parent class. A *SemanticAnalyzer* is responsible for adding the
+ *token._.holmes* dictionaries to each token within a spaCy document. It requires full access to
+ the spaCy *Language* object, cannot be serialized and so may only be called within main
+ processes, not from worker processes.
+
+ Functionality is placed here that is common to all
+ current implementations. It follows that some functionality will probably have to be moved
+ out to specific implementations whenever an implementation for a new language is added.
+
+ For explanations of the abstract variables and methods, see the *EnglishSemanticAnalyzer*
+ implementation where they can be illustrated with direct examples.
+ """
+
+ language_name = NotImplemented
+
+ noun_pos = NotImplemented
+
+ predicate_head_pos = NotImplemented
+
+ matchable_pos = NotImplemented
+
+ adjectival_predicate_head_pos = NotImplemented
+
+ adjectival_predicate_subject_pos = NotImplemented
+
+ adjectival_predicate_subject_dep = NotImplemented
+
+ adjectival_predicate_predicate_dep = NotImplemented
+
+ adjectival_predicate_predicate_pos = NotImplemented
+
+ modifier_dep = NotImplemented
+
+ spacy_noun_to_preposition_dep = NotImplemented
+
+ spacy_verb_to_preposition_dep = NotImplemented
+
+ holmes_noun_to_preposition_dep = NotImplemented
+
+ holmes_verb_to_preposition_dep = NotImplemented
+
+ conjunction_deps = NotImplemented
+
+ interrogative_pronoun_tags = NotImplemented
+
+ semantic_dependency_excluded_tags = NotImplemented
+
+ generic_pronoun_lemmas = NotImplemented
+
+ or_lemma = NotImplemented
+
+ mark_child_dependencies_copied_to_siblings_as_uncertain = NotImplemented
+
+ maximum_mentions_in_coreference_chain = NotImplemented
+
+ maximum_word_distance_in_coreference_chain = NotImplemented
+
+ sibling_marker_deps = NotImplemented
+
+ entity_labels_to_corresponding_lexemes = NotImplemented
+
+ whose_lemma = NotImplemented
+
+ @abstractmethod
+ def add_subwords(self, token, subword_cache):
+ pass
+
+ @abstractmethod
+ def set_negation(self, token):
+ pass
+
+ @abstractmethod
+ def correct_auxiliaries_and_passives(self, token):
+ pass
+
+ @abstractmethod
+ def perform_language_specific_tasks(self, token):
+ pass
+
+ @abstractmethod
+ def handle_relative_constructions(self, token):
+ pass
+
+ @abstractmethod
+ def holmes_lemma(self, token):
+ pass
+
+ @abstractmethod
+ def language_specific_derived_holmes_lemma(self, token, lemma):
+ pass
+
+ def __init__(self, *, nlp, vectors_nlp):
+ """Args:
+
+ nlp -- the spaCy model
+ vectors_nlp -- the spaCy model to use for vocabularies and vectors
+ """
+ self.nlp = nlp
+ self.vectors_nlp = vectors_nlp
+ self.model = '_'.join((self.nlp.meta['lang'], self.nlp.meta['name']))
+ self.derivational_dictionary = self.load_derivational_dictionary()
+ self.serialized_document_version = SERIALIZED_DOCUMENT_VERSION
+
+ def load_derivational_dictionary(self):
+ in_package_filename = ''.join(('lang/', self.nlp.meta['lang'], '/data/derivation.csv'))
+ absolute_filename = pkg_resources.resource_filename(__name__, in_package_filename)
+ dictionary = {}
+ with open(absolute_filename, "r", encoding="utf-8") as file:
+ for line in file.readlines():
+ words = [word.strip() for word in line.split(',')]
+ for word in words:
+ dictionary[word] = words[0]
+ return dictionary
+
+ _maximum_document_size = 1000000
+
+ def spacy_parse(self, text):
+ """Performs a standard spaCy parse on a string.
+ """
+ if len(text) > self._maximum_document_size:
+ raise DocumentTooBigError(' '.join((
+ 'size:', str(len(text)), 'max:', str(self._maximum_document_size))))
+ return self.nlp(text, disable=['coreferee', 'holmes'])
+
+ def parse(self, text):
+ return self.nlp(text)
+
+ def get_vector(self, lemma):
+ """ Returns a vector representation of *lemma*, or *None* if none is available.
+ """
+ lexeme = self.vectors_nlp.vocab[lemma]
+ return lexeme.vector if lexeme.has_vector and lexeme.vector_norm > 0 else None
+
+ def holmes_parse(self, spacy_doc):
+ """Adds the Holmes-specific information to each token within a spaCy document.
+ """
+ spacy_doc._.set('holmes_document_info', HolmesDocumentInfo(self))
+ for token in spacy_doc:
+ lemma = self.holmes_lemma(token)
+ derived_lemma = self.derived_holmes_lemma(token, lemma)
+ lexeme = self.vectors_nlp.vocab[token.lemma_ if len(lemma.split()) > 1 else lemma]
+ vector = lexeme.vector if lexeme.has_vector and lexeme.vector_norm > 0 else None
+ token._.set('holmes', HolmesDictionary(token.i, lemma, derived_lemma, vector))
+ for token in spacy_doc:
+ self.set_negation(token)
+ for token in spacy_doc:
+ self.initialize_semantic_dependencies(token)
+ self.set_initial_question_words(spacy_doc)
+ for token in spacy_doc:
+ self.mark_if_righthand_sibling(token)
+ token._.holmes.token_or_lefthand_sibling_index = self._lefthand_sibling_recursively(
+ token)
+ for token in spacy_doc:
+ self.copy_any_sibling_info(token)
+ subword_cache = {}
+ for token in spacy_doc:
+ self.add_subwords(token, subword_cache)
+ for token in spacy_doc:
+ self.set_coreference_information(token)
+ for token in spacy_doc:
+ self.set_matchability(token)
+ for token in spacy_doc:
+ self.correct_auxiliaries_and_passives(token)
+ for token in spacy_doc:
+ self.copy_any_sibling_info(token)
+ for token in spacy_doc:
+ self.handle_relative_constructions(token)
+ for token in spacy_doc:
+ self.normalize_predicative_adjectives(token)
+ for token in spacy_doc:
+ self.create_additional_preposition_phrase_semantic_dependencies(token)
+ for token in spacy_doc:
+ self.perform_language_specific_tasks(token)
+ for token in spacy_doc:
+ self.create_convenience_dependencies(token)
+ return spacy_doc
+
+ def _lefthand_sibling_recursively(self, token):
+ """If *token* is a righthand sibling, return the index of the token that has a sibling
+ reference to it, otherwise return the index of *token* itself.
+ """
+ if token.dep_ not in self.conjunction_deps:
+ return token.i
+ else:
+ return self._lefthand_sibling_recursively(token.head)
+
+ def debug_structures(self, doc):
+ for token in doc:
+ if token._.holmes.derived_lemma is not None:
+ lemma_string = ''.join((
+ token._.holmes.lemma, '(', token._.holmes.derived_lemma, ')'))
+ else:
+ lemma_string = token._.holmes.lemma
+ subwords_strings = ';'.join(str(subword) for subword in token._.holmes.subwords)
+ subwords_strings = ''.join(('[', subwords_strings, ']'))
+ negation_string = 'negative' if token._.holmes.is_negated else 'positive'
+ uncertainty_string = 'uncertain' if token._.holmes.is_uncertain else 'certain'
+ matchability_string = 'matchable' if token._.holmes.is_matchable else 'unmatchable'
+ if token._.holmes.is_involved_in_coreference():
+ coreference_string = '; '.join(
+ str(mention) for mention in token._.holmes.mentions)
+ else:
+ coreference_string = ''
+ print(
+ token.i, token.text, lemma_string, subwords_strings, token.pos_, token.tag_,
+ token.dep_, token.ent_type_, token.head.i,
+ token._.holmes.string_representation_of_children(),
+ token._.holmes.righthand_siblings, negation_string,
+ uncertainty_string, matchability_string, coreference_string)
+
+ def set_coreference_information(self, token):
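+ # Hedged illustration: assuming a coreferee chain linking 'dog' and 'it' in
+ # 'I saw a dog. It was barking.', the token 'it' receives a Mention whose indexes point
+ # back at 'dog' (subject to the maximum-mention and maximum-word-distance limits below),
+ # and the index of 'dog' is added to the *token_and_coreference_chain_indexes* of 'it'.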
+ token._.holmes.token_and_coreference_chain_indexes = [token.i]
+ token._.holmes.most_specific_coreferring_term_index = None
+ for chain in token._.coref_chains:
+ this_token_mention_index = -1
+ for mention_index, mention in enumerate(chain):
+ if token.i in mention.token_indexes:
+ this_token_mention_index = mention_index
+ break
+ if this_token_mention_index > -1:
+ for mention_index, mention in enumerate(chain):
+ if this_token_mention_index - mention_index > \
+ self.maximum_mentions_in_coreference_chain or \
+ abs(mention.root_index - token.i) > \
+ self.maximum_word_distance_in_coreference_chain:
+ continue
+ if mention_index - this_token_mention_index > \
+ self.maximum_mentions_in_coreference_chain:
+ break
+ token._.holmes.mentions.append(Mention(mention.root_index,
+ [token.i] if token.i in mention.token_indexes
+ else mention.token_indexes))
+ if len(chain[0]) == 1: # chains with coordinated mentions are not relevant to
+ # most specific mentions
+ token._.holmes.most_specific_coreferring_term_index = \
+ chain[chain.most_specific_mention_index][0]
+ working_set = set()
+ for mention in (m for m in token._.holmes.mentions if token.i not in m.indexes):
+ working_set.update(mention.indexes)
+ token._.holmes.token_and_coreference_chain_indexes.extend(sorted(working_set))
+
+ def model_supports_embeddings(self):
+ return self.vectors_nlp.meta['vectors']['vectors'] > 0
+
+ def is_interrogative_pronoun(self, token:Token):
+ return token.tag_ in self.interrogative_pronoun_tags
+
+ def derived_holmes_lemma(self, token, lemma):
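+ # Hedged example (the CSV row is hypothetical): if derivation.csv contains the row
+ # 'pay,payment', the dictionary built in *load_derivational_dictionary* maps both words
+ # to 'pay', so 'payment' yields the derived lemma 'pay' here, while 'pay' itself is a
+ # basis entry and yields *None*.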
+ if lemma in self.derivational_dictionary:
+ derived_lemma = self.derivational_dictionary[lemma]
+ if lemma == derived_lemma: # basis entry, so do not call language specific method
+ return None
+ else:
+ return derived_lemma
+ else:
+ return self.language_specific_derived_holmes_lemma(token, lemma)
+
+ def initialize_semantic_dependencies(self, token):
+ for child in (
+ child for child in token.children if child.dep_ != 'punct' and
+ child.tag_ not in self.semantic_dependency_excluded_tags):
+ token._.holmes.children.append(SemanticDependency(token.i, child.i, child.dep_))
+
+ def set_initial_question_words(self, doc:Doc):
+ """ is_initial_question_word -- True on a token that represents an interrogative pronoun
+ within an initial phrase.
+ has_initial_question_word_in_phrase -- True on a token within an initial phrase that
+ governs an interrogative pronoun.
+ """
+ initial_sentence = next(doc.sents, None)
+ if initial_sentence is not None:
+ visited = set()
+ working_first_phrase_head = doc[0]
+ while working_first_phrase_head.head is not None and not \
+ working_first_phrase_head.head.pos_ in ('VERB', 'AUX') \
+ and working_first_phrase_head.head not in visited:
+ visited.add(working_first_phrase_head)
+ working_first_phrase_head = working_first_phrase_head.head
+ for token in initial_sentence:
+ if self.is_interrogative_pronoun(token) and \
+ token in working_first_phrase_head.subtree:
+ token._.holmes.is_initial_question_word = True
+ for token in initial_sentence:
+ if token.pos_ in self.noun_pos and len([1 for c in token._.holmes.children
+ if self.is_interrogative_pronoun(c.child_token(token.doc)) and
+ c.child_token(token.doc)._.holmes.lemma != self.whose_lemma]) > 0:
+ token._.holmes.has_initial_question_word_in_phrase = True
+
+ def mark_if_righthand_sibling(self, token):
+ if token.dep_ in self.sibling_marker_deps: # i.e. is righthand sibling
+ working_token = token
+ working_or_conjunction_flag = False
+ # work up through the tree until the lefthandmost sibling element with the
+ # semantic relationships to the rest of the sentence is reached
+ while working_token.dep_ in self.conjunction_deps:
+ working_token = working_token.head
+ for working_child in working_token.children:
+ if working_child.lemma_ == self.or_lemma:
+ working_or_conjunction_flag = True
+ # add this element to the lefthandmost sibling as a righthand sibling
+ working_token._.holmes.righthand_siblings.append(token.i)
+ if working_or_conjunction_flag:
+ working_token._.holmes.is_involved_in_or_conjunction = True
+
+ def copy_any_sibling_info(self, token):
+ # Copy the or conjunction flag to righthand siblings
+ if token._.holmes.is_involved_in_or_conjunction:
+ for righthand_sibling in token._.holmes.righthand_siblings:
+ token.doc[righthand_sibling]._.holmes.is_involved_in_or_conjunction = True
+ for dependency in (
+ dependency for dependency in token._.holmes.children
+ if dependency.child_index >= 0):
+ # where a token has a dependent token and the dependent token has righthand siblings,
+ # add dependencies from the parent token to the siblings
+ for child_righthand_sibling in \
+ token.doc[dependency.child_index]._.holmes.righthand_siblings:
+ # Check this token does not already have the dependency
+ if len([dependency for dependency in token._.holmes.children if
+ dependency.child_index == child_righthand_sibling]) == 0:
+ child_index_to_add = child_righthand_sibling
+ # If this token is a grammatical element, it needs to point to new
+ # child dependencies as a grammatical element as well
+ if dependency.child_index < 0:
+ child_index_to_add = 0 - (child_index_to_add + 1)
+ # Check adding the new dependency will not result in a loop and that
+ # this token still does not have the dependency now its index has
+ # possibly been changed
+ if token.i != child_index_to_add and not \
+ token._.holmes.has_dependency_with_child_index(child_index_to_add):
+ token._.holmes.children.append(SemanticDependency(
+ token.i, child_index_to_add, dependency.label, dependency.is_uncertain))
+ # where a token has a dependent token and the parent token has righthand siblings,
+ # add dependencies from the siblings to the dependent token, unless the dependent
+ # token is to the right of the parent token but to the left of the sibling.
+ for righthand_sibling in (
+ righthand_sibling for righthand_sibling in
+ token._.holmes.righthand_siblings if righthand_sibling !=
+ dependency.child_index and (
+ righthand_sibling < dependency.child_index or
+ dependency.child_index < token.i)):
+ # unless the sibling already contains a dependency with the same label
+ # or the sibling has this token as a dependent child
+ righthand_sibling_token = token.doc[righthand_sibling]
+ if len([sibling_dependency for sibling_dependency in
+ righthand_sibling_token._.holmes.children if
+ sibling_dependency.label == dependency.label and not
+ token._.holmes.has_dependency_with_child_index(
+ sibling_dependency.child_index)]) == 0 and \
+ dependency.label not in self.conjunction_deps and not \
+ righthand_sibling_token._.holmes.has_dependency_with_child_index(
+ dependency.child_index) \
+ and righthand_sibling != dependency.child_index:
+ righthand_sibling_token._.holmes.children.append(SemanticDependency(
+ righthand_sibling, dependency.child_index, dependency.label,
+ self.mark_child_dependencies_copied_to_siblings_as_uncertain
+ or dependency.is_uncertain))
+
+ def normalize_predicative_adjectives(self, token):
+ """Change phrases like *the town is old* and *the man is poor* so their
+ semantic structure is equivalent to *the old town* and *the poor man*.
+ """
+ if token.pos_ in self.adjectival_predicate_head_pos:
+ altered = False
+ for predicative_adjective_index in (
+ dependency.child_index for dependency in \
+ token._.holmes.children if dependency.label ==
+ self.adjectival_predicate_predicate_dep and
+ token.doc[dependency.child_index].pos_ ==
+ self.adjectival_predicate_predicate_pos and
+ dependency.child_index >= 0):
+ for subject_index in (
+ dependency.child_index for dependency in
+ token._.holmes.children if dependency.label ==
+ self.adjectival_predicate_subject_dep and (
+ dependency.child_token(token.doc).pos_ in
+ self.adjectival_predicate_subject_pos or
+ dependency.child_token(token.doc)._.holmes.is_involved_in_coreference()
+ and dependency.child_index >= 0
+ and dependency.child_index != predicative_adjective_index)):
+ token.doc[subject_index]._.holmes.children.append(
+ SemanticDependency(
+ subject_index, predicative_adjective_index, self.modifier_dep))
+ altered = True
+ if altered:
+ token._.holmes.children = [SemanticDependency(
+ token.i, 0 - (subject_index + 1), None)]
+
+ def create_additional_preposition_phrase_semantic_dependencies(self, token):
+ """In structures like 'Somebody needs insurance for a period' it seems to be
+ mainly language-dependent whether the preposition phrase is analysed as being
+ dependent on the preceding noun or the preceding verb. We add an additional, new
+ dependency to whichever of the noun or the verb does not already have one. In English,
+ the new label is defined in *match_implication_dict* in such a way that original
+ dependencies in search phrases match new dependencies in documents but not vice versa.
+ This restriction is not applied in German because the fact that the verb can be placed in
+ different positions within the sentence means there is considerable variation in how
+ prepositional phrases are analyzed by spaCy.
+ """
+
+ def add_dependencies_pointing_to_preposition_and_siblings(parent, label):
+ for working_preposition in token._.holmes.loop_token_and_righthand_siblings(token.doc):
+ if parent.i != working_preposition.i:
+ parent._.holmes.children.append(SemanticDependency(
+ parent.i, working_preposition.i, label, True))
+
+ # token is a preposition ...
+ if token.pos_ == 'ADP':
+ # directly preceded by a noun
+ if token.i > 0 and token.doc[token.i-1].sent == token.sent and \
+ token.doc[token.i-1].pos_ in ('NOUN', 'PROPN', 'PRON'):
+ preceding_noun = token.doc[token.i-1]
+ # and the noun is governed by at least one verb
+ governing_verbs = [
+ working_token for working_token in token.sent
+ if working_token.pos_ == 'VERB' and
+ working_token._.holmes.has_dependency_with_child_index(
+ preceding_noun.i)]
+ if len(governing_verbs) == 0:
+ return
+ # if the noun governs the preposition, add new possible dependencies
+ # from the verb(s)
+ for governing_verb in governing_verbs:
+ if preceding_noun._.holmes.has_dependency_with_child_index_and_label(
+ token.i, self.spacy_noun_to_preposition_dep) and not \
+ governing_verb._.holmes.has_dependency_with_child_index_and_label(
+ token.i, self.spacy_verb_to_preposition_dep):
+ add_dependencies_pointing_to_preposition_and_siblings(
+ governing_verb, self.holmes_verb_to_preposition_dep)
+ # if the verb(s) governs the preposition, add new possible dependencies
+ # from the noun
+ if governing_verbs[0]._.holmes.has_dependency_with_child_index_and_label(
+ token.i, self.spacy_verb_to_preposition_dep) and not \
+ preceding_noun._.holmes.has_dependency_with_child_index_and_label(
+ token.i, self.spacy_noun_to_preposition_dep):
+ # check the preposition is not pointing back to a relative clause
+ for preposition_dep_index in (
+ dep.child_index for dep in token._.holmes.children
+ if dep.child_index >= 0):
+ if token.doc[preposition_dep_index]._.holmes.\
+ has_dependency_with_label('relcl'):
+ return
+ add_dependencies_pointing_to_preposition_and_siblings(
+ preceding_noun, self.holmes_noun_to_preposition_dep)
+
+ def set_matchability(self, token):
+ """Marks whether this token, if it appears in a search phrase, should require a counterpart
+ in a document being matched.
+ """
+ token._.holmes.is_matchable = (
+ token.pos_ in self.matchable_pos or token._.holmes.is_involved_in_coreference()
+ or len(token._.holmes.subwords) > 0) \
+ and not self.is_interrogative_pronoun(token) and \
+ token._.holmes.lemma not in self.generic_pronoun_lemmas
+
+ def move_information_between_tokens(self, from_token, to_token):
+ """Moves semantic child and sibling information from one token to another.
+
+ Args:
+
+ from_token -- the source token, which will be marked as a grammatical token
+ pointing to *to_token*.
+ to_token -- the destination token.
+ """
+ linking_dependencies = [
+ dependency for dependency in from_token._.holmes.children
+ if dependency.child_index == to_token.i]
+ if len(linking_dependencies) == 0:
+ return # should only happen if there is a problem with the spaCy structure
+ # only loop dependencies whose label or index are not already present at the destination
+ for dependency in (
+ dependency for dependency in from_token._.holmes.children
+ if not to_token._.holmes.has_dependency_with_child_index(dependency.child_index)
+ and to_token.i != dependency.child_index and
+ to_token.i not in to_token.doc[dependency.child_index]._.holmes.righthand_siblings
+ and dependency.child_index not in to_token._.holmes.righthand_siblings):
+ to_token._.holmes.children.append(SemanticDependency(
+ to_token.i, dependency.child_index, dependency.label, dependency.is_uncertain))
+ from_token._.holmes.children = [SemanticDependency(from_token.i, 0 - (to_token.i + 1))]
+ to_token._.holmes.righthand_siblings.extend(
+ from_token._.holmes.righthand_siblings)
+ from_token._.holmes.righthand_siblings = []
+ if from_token._.holmes.is_involved_in_or_conjunction:
+ to_token._.holmes.is_involved_in_or_conjunction = True
+ if from_token._.holmes.is_negated:
+ to_token._.holmes.is_negated = True
+ # If from_token is the righthand sibling of some other token within the same sentence,
+ # replace that token's reference with a reference to to_token
+ for token in from_token.sent:
+ if from_token.i in token._.holmes.righthand_siblings:
+ token._.holmes.righthand_siblings.remove(from_token.i)
+ if token.i != to_token.i:
+ token._.holmes.righthand_siblings.append(to_token.i)
+
+ def create_convenience_dependencies(self, token):
+ for child_dependency in (
+ child_dependency for child_dependency in token._.holmes.children
+ if child_dependency.child_index >= 0):
+ child_token = child_dependency.child_token(token.doc)
+ child_token._.holmes.parents.append(child_dependency)
+ for linked_parent_index in token._.holmes.token_and_coreference_chain_indexes:
+ linked_parent = token.doc[linked_parent_index]
+ for child_dependency in (
+ child_dependency for child_dependency in linked_parent._.holmes.children
+ if child_dependency.child_index >= 0):
+ child_token = child_dependency.child_token(token.doc)
+ for linked_child_index in \
+ child_token._.holmes.token_and_coreference_chain_indexes:
+ linked_child = token.doc[linked_child_index]
+ token._.holmes.coreference_linked_child_dependencies.append([
+ linked_child.i, child_dependency.label])
+ linked_child._.holmes.coreference_linked_parent_dependencies.append([
+ token.i, child_dependency.label])
+
+ def get_entity_label_to_vector_dict(self):
+ return {label:
+ self.vectors_nlp.vocab[self.entity_labels_to_corresponding_lexemes[label]].vector
+ for label in self.entity_labels_to_corresponding_lexemes}
+
+class LinguisticObjectFactory:
+ """ Factory for search phrases and topic matching phraselets. """
+
+ def __init__(
+ self, semantic_analyzer, semantic_matching_helper, ontology,
+ overall_similarity_threshold, embedding_based_matching_on_root_words,
+ analyze_derivational_morphology, perform_coreference_resolution):
+ """Args:
+
+ semantic_analyzer -- the *SemanticAnalyzer* object to use
+ semantic_matching_helper -- the *SemanticMatchingHelper* object to use
+ ontology -- optionally, an *Ontology* object to use in matching
+ overall_similarity_threshold -- if embedding-based matching is to be activated, a float
+ value between 0 and 1. A match between a search phrase and a document is then valid
+ if the geometric mean of all the similarities between search phrase tokens and
+ document tokens is this value or greater. If this value is set to 1.0,
+ embedding-based matching is deactivated.
+ embedding_based_matching_on_root_words -- determines whether or not embedding-based
+ matching should be attempted on search-phrase root tokens, which has a considerable
+ performance hit. Defaults to *False*.
+ analyze_derivational_morphology -- *True* if matching should be attempted between different
+ words from the same word family. Defaults to *True*.
+ perform_coreference_resolution -- *True* if coreference resolution should be performed.
+ """
+ self.semantic_analyzer = semantic_analyzer
+ self.semantic_matching_helper = semantic_matching_helper
+ self.ontology = ontology
+ self.overall_similarity_threshold = overall_similarity_threshold
+ self.embedding_based_matching_on_root_words = embedding_based_matching_on_root_words
+ self.analyze_derivational_morphology = analyze_derivational_morphology
+ self.perform_coreference_resolution = perform_coreference_resolution
+
+ def add_phraselets_to_dict(
+ self, doc, *, phraselet_labels_to_phraselet_infos,
+ replace_with_hypernym_ancestors, match_all_words,
+ ignore_relation_phraselets, include_reverse_only, stop_lemmas, stop_tags,
+ reverse_only_parent_lemmas, words_to_corpus_frequencies, maximum_corpus_frequency,
+ process_initial_question_words):
+ """ Creates topic matching phraselets extracted from a matching text.
+
+ Args:
+
+ doc -- the Holmes-parsed document
+ phraselet_labels_to_phraselet_infos -- a dictionary from labels to phraselet info objects
+ that are used to generate phraselet search phrases.
+ replace_with_hypernym_ancestors -- if 'True', all words present in the ontology
+ are replaced with their most general (highest) ancestors.
+ match_all_words -- if 'True', word phraselets are generated for all matchable words
+ rather than just for words whose tags match the phraselet template; multiwords
+ are not taken into account when processing single-word phraselets; and single-word
+ phraselets are generated for subwords.
+ ignore_relation_phraselets -- if 'True', only single-word phraselets are processed.
+ include_reverse_only -- whether to generate phraselets that are only reverse-matched.
+ Reverse matching is used in topic matching but not in supervised document
+ classification.
+ stop_lemmas -- lemmas that should prevent all types of phraselet production.
+ stop_tags -- tags that should prevent all types of phraselet production.
+ reverse_only_parent_lemmas -- lemma / part-of-speech combinations that, when present at
+ the parent pole of a relation phraselet, should cause that phraselet to be
+ reverse-matched.
+ words_to_corpus_frequencies -- a dictionary from words to the number of times each
+ word occurs in the indexed documents.
+ maximum_corpus_frequency -- the maximum value within *words_to_corpus_frequencies*.
+ process_initial_question_words -- *True* if interrogative pronouns are permitted within
+ phraselets.
+ """
+
+ index_to_lemmas_cache = {}
+ def get_lemmas_from_index(index):
+ """ Returns the lemma and the derived lemma. Phraselets form a special case where
+ the derived lemma is set even if it is identical to the lemma. This is necessary
+ because the lemma may be set to a different value during the lifecycle of the
+ object. The property getter in the SemanticDictionary class ensures that *None* is
+ returned as the derived lemma whenever the two strings are identical.
+ """
+ if index in index_to_lemmas_cache:
+ return index_to_lemmas_cache[index]
+ token = doc[index.token_index]
+ if self.semantic_matching_helper.is_entity_search_phrase_token(token, False):
+ # False in order to get text rather than lemma
+ index_to_lemmas_cache[index] = token.text, token.text
+ return token.text, token.text
+ # keep the text, because the lemma will be lowercase
+ if index.is_subword():
+ lemma = token._.holmes.subwords[index.subword_index].lemma
+ if self.analyze_derivational_morphology:
+ derived_lemma = token._.holmes.subwords[index.subword_index].\
+ lemma_or_derived_lemma()
+ else:
+ derived_lemma = lemma
+ if self.ontology is not None and self.analyze_derivational_morphology:
+ for reverse_derived_word in self.semantic_matching_helper.\
+ reverse_derived_lemmas_in_ontology(
+ token._.holmes.subwords[index.subword_index]):
+ derived_lemma = reverse_derived_word.lower()
+ break
+ else:
+ lemma = token._.holmes.lemma
+ if self.analyze_derivational_morphology:
+ derived_lemma = token._.holmes.lemma_or_derived_lemma()
+ else:
+ derived_lemma = lemma
+ if self.ontology is not None and not self.ontology.contains(lemma):
+ if self.ontology.contains(token.text.lower()):
+ lemma = derived_lemma = token.text.lower()
+ # ontology contains text but not lemma, so return text
+ if self.ontology is not None and self.analyze_derivational_morphology:
+ for reverse_derived_word in self.semantic_matching_helper.\
+ reverse_derived_lemmas_in_ontology(token):
+ derived_lemma = reverse_derived_word.lower()
+ break
+ # ontology contains a word pointing to the same derived lemma,
+ # so return that. Note that if there are several such words the same
+ # one will always be returned.
+ index_to_lemmas_cache[index] = lemma, derived_lemma
+ return lemma, derived_lemma
+
+ def replace_lemmas_with_most_general_ancestor(lemma, derived_lemma):
+ new_derived_lemma = self.ontology.get_most_general_hypernym_ancestor(
+ derived_lemma).lower()
+ if derived_lemma != new_derived_lemma:
+ lemma = derived_lemma = new_derived_lemma
+ return lemma, derived_lemma
+
+ def lemma_replacement_indicated(existing_lemma, existing_pos, new_lemma, new_pos):
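+ # Hedged illustration (assuming nouns are among *preferred_phraselet_pos*): an existing
+ # entry with a verb POS is replaced by a new noun candidate, while among candidates with
+ # equally preferred POS the shorter lemma wins.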
+ if existing_lemma is None:
+ return False
+ if existing_pos not in self.semantic_matching_helper.preferred_phraselet_pos and \
+ new_pos in self.semantic_matching_helper.preferred_phraselet_pos:
+ return True
+ if existing_pos in self.semantic_matching_helper.preferred_phraselet_pos and \
+ new_pos not in self.semantic_matching_helper.preferred_phraselet_pos:
+ return False
+ return len(new_lemma) < len(existing_lemma)
+
+ def add_new_phraselet_info(
+ phraselet_label, phraselet_template, created_without_matching_tags,
+ is_reverse_only_parent_lemma, parent_lemma, parent_derived_lemma,
+ parent_pos, parent_ent_type, parent_is_initial_question_word,
+ parent_has_initial_question_word_in_phrase, child_lemma, child_derived_lemma,
+ child_pos, child_ent_type, child_is_initial_question_word,
+ child_has_initial_question_word_in_phrase):
+
+ def get_frequency_factor_for_pole(parent): # pole is 'True' -> parent, 'False' -> child
+ original_word_set = {parent_lemma, parent_derived_lemma} if parent else \
+ {child_lemma, child_derived_lemma}
+ word_set_including_any_ontology = original_word_set.copy()
+ if self.ontology is not None:
+ for word in original_word_set:
+ for word_matching, _ in \
+ self.ontology.get_words_matching_and_depths(word):
+ word_set_including_any_ontology.add(word_matching)
+ frequencies = []
+ for word in word_set_including_any_ontology:
+ if word in words_to_corpus_frequencies:
+ frequencies.append(float(words_to_corpus_frequencies[word]))
+ if len(frequencies) == 0:
+ return 1.0
+ adjusted_max_frequency = max(frequencies) - 1.0
+ if adjusted_max_frequency <= 0.0:
+ return 1.0
+ return 1 - (math.log(adjusted_max_frequency) / math.log(maximum_corpus_frequency))
+
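+ # Rough numeric illustration (hypothetical figures): if the most frequent word at a pole
+ # occurs 100 times and *maximum_corpus_frequency* is 1000, the factor is
+ # 1 - log(99)/log(1000), i.e. roughly 0.33; rarer words therefore receive factors closer
+ # to 1.0.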
+ frequency_factor = parent_frequency_factor = child_frequency_factor = None
+ if words_to_corpus_frequencies is not None:
+ parent_frequency_factor = get_frequency_factor_for_pole(True)
+ frequency_factor = parent_frequency_factor
+ if child_lemma is not None:
+ child_frequency_factor = get_frequency_factor_for_pole(False)
+ frequency_factor *= child_frequency_factor
+ if phraselet_label not in phraselet_labels_to_phraselet_infos:
+ phraselet_labels_to_phraselet_infos[phraselet_label] = PhraseletInfo(
+ phraselet_label, phraselet_template.label, parent_lemma,
+ parent_derived_lemma, parent_pos, parent_ent_type,
+ parent_is_initial_question_word, parent_has_initial_question_word_in_phrase,
+ child_lemma, child_derived_lemma, child_pos, child_ent_type,
+ child_is_initial_question_word, child_has_initial_question_word_in_phrase,
+ created_without_matching_tags, is_reverse_only_parent_lemma, frequency_factor,
+ parent_frequency_factor, child_frequency_factor)
+ else:
+ existing_phraselet = phraselet_labels_to_phraselet_infos[phraselet_label]
+ if lemma_replacement_indicated(
+ existing_phraselet.parent_lemma, existing_phraselet.parent_pos,
+ parent_lemma, parent_pos):
+ existing_phraselet.parent_lemma = parent_lemma
+ existing_phraselet.parent_pos = parent_pos
+ if lemma_replacement_indicated(
+ existing_phraselet.child_lemma, existing_phraselet.child_pos, child_lemma,
+ child_pos):
+ existing_phraselet.child_lemma = child_lemma
+ existing_phraselet.child_pos = child_pos
+
+ def process_single_word_phraselet_templates(
+ token, subword_index, checking_tags, token_indexes_to_multiword_lemmas):
+ for phraselet_template in (
+ phraselet_template for phraselet_template in
+ self.semantic_matching_helper.phraselet_templates if
+ phraselet_template.single_word() and (
+ token._.holmes.is_matchable or subword_index is not None)):
+ # see note below for explanation
+ if (not checking_tags or token.tag_ in phraselet_template.parent_tags) and \
+ token.tag_ not in stop_tags:
+ if token.i in token_indexes_to_multiword_lemmas and not match_all_words:
+ lemma = derived_lemma = token_indexes_to_multiword_lemmas[token.i]
+ else:
+ lemma, derived_lemma = get_lemmas_from_index(Index(token.i, subword_index))
+ if self.ontology is not None and replace_with_hypernym_ancestors:
+ lemma, derived_lemma = replace_lemmas_with_most_general_ancestor(
+ lemma, derived_lemma)
+ phraselet_label = ''.join((phraselet_template.label, ': ', derived_lemma))
+ if derived_lemma not in stop_lemmas and derived_lemma != 'ENTITYNOUN':
+ # ENTITYNOUN has to be excluded as single word although it is still
+ # permitted as the child of a relation phraselet template
+ add_new_phraselet_info(
+ phraselet_label, phraselet_template, not checking_tags,
+ None, lemma, derived_lemma, token.pos_, token.ent_type_,
+ token._.holmes.is_initial_question_word,
+ token._.holmes.has_initial_question_word_in_phrase,
+ None, None, None, None, None, None)
+
+ def add_head_subwords_to_token_list_and_remove_words_with_subword_conjunction(index_list):
+ # for each token in the list, find out whether it has subwords and if so add the
+ # head subword to the list
+ for index in index_list.copy():
+ token = doc[index.token_index]
+ for subword in (
+ subword for subword in token._.holmes.subwords if
+ subword.is_head and subword.containing_token_index == token.i):
+ index_list.append(Index(token.i, subword.index))
+ # if one or more subwords do not belong to this token, it is a hyphenated word
+ # within conjunction and the whole word should not be used to build relation phraselets.
+ if len([
+ subword for subword in token._.holmes.subwords if
+ subword.containing_token_index != token.i]) > 0:
+ index_list.remove(index)
+
+ self.redefine_multiwords_on_head_tokens(doc)
+ token_indexes_to_multiword_lemmas = {}
+ token_indexes_within_multiwords_to_ignore = []
+ for token in (token for token in doc if len(token._.holmes.lemma.split()) == 1):
+ entity_defined_multiword, indexes = \
+ self.semantic_matching_helper.get_entity_defined_multiword(token)
+ if entity_defined_multiword is not None:
+ for index in indexes:
+ if index == token.i:
+ token_indexes_to_multiword_lemmas[token.i] = entity_defined_multiword
+ else:
+ token_indexes_within_multiwords_to_ignore.append(index)
+ for token in doc:
+ if token.i in token_indexes_within_multiwords_to_ignore:
+ if match_all_words:
+ process_single_word_phraselet_templates(
+ token, None, False, token_indexes_to_multiword_lemmas)
+ continue
+ if len([
+ subword for subword in token._.holmes.subwords if
+ subword.containing_token_index != token.i]) == 0:
+ # whole single words involved in subword conjunction should not be included as
+ # these are partial words including hyphens.
+ process_single_word_phraselet_templates(
+ token, None, not match_all_words, token_indexes_to_multiword_lemmas)
+ if match_all_words:
+ for subword in (
+ subword for subword in token._.holmes.subwords if
+ token.i == subword.containing_token_index):
+ process_single_word_phraselet_templates(
+ token, subword.index, False, token_indexes_to_multiword_lemmas)
+ if ignore_relation_phraselets:
+ continue
+ if self.perform_coreference_resolution:
+ parents = [
+ Index(token_index, None) for token_index in
+ token._.holmes.token_and_coreference_chain_indexes]
+ else:
+ parents = [Index(token.i, None)]
+ add_head_subwords_to_token_list_and_remove_words_with_subword_conjunction(parents)
+ for parent in parents:
+ for dependency in (
+ dependency for dependency in doc[parent.token_index]._.holmes.children
+ if dependency.child_index not in token_indexes_within_multiwords_to_ignore):
+ if self.perform_coreference_resolution:
+ children = [
+ Index(token_index, None) for token_index in
+ dependency.child_token(doc)._.holmes.
+ token_and_coreference_chain_indexes]
+ else:
+ children = [Index(dependency.child_token(doc).i, None)]
+ add_head_subwords_to_token_list_and_remove_words_with_subword_conjunction(
+ children)
+ for child in children:
+ for phraselet_template in (
+ phraselet_template for phraselet_template in
+ self.semantic_matching_helper.phraselet_templates if not
+ phraselet_template.single_word() and (
+ not phraselet_template.reverse_only or include_reverse_only)):
+ if dependency.label in \
+ phraselet_template.dependency_labels and \
+ doc[parent.token_index].tag_ in phraselet_template.parent_tags\
+ and doc[child.token_index].tag_ in \
+ phraselet_template.child_tags and \
+ doc[parent.token_index]._.holmes.is_matchable and \
+ (doc[child.token_index]._.holmes.is_matchable or
+ (process_initial_question_words and
+ doc[child.token_index]._.holmes.is_initial_question_word)):
+ if parent.token_index in token_indexes_to_multiword_lemmas:
+ parent_lemma = parent_derived_lemma = \
+ token_indexes_to_multiword_lemmas[parent.token_index]
+ else:
+ parent_lemma, parent_derived_lemma = \
+ get_lemmas_from_index(parent)
+ if self.ontology is not None and replace_with_hypernym_ancestors:
+ parent_lemma, parent_derived_lemma = \
+ replace_lemmas_with_most_general_ancestor(
+ parent_lemma, parent_derived_lemma)
+ if child.token_index in token_indexes_to_multiword_lemmas:
+ child_lemma = child_derived_lemma = \
+ token_indexes_to_multiword_lemmas[child.token_index]
+ else:
+ child_lemma, child_derived_lemma = get_lemmas_from_index(child)
+ if self.ontology is not None and replace_with_hypernym_ancestors:
+ child_lemma, child_derived_lemma = \
+ replace_lemmas_with_most_general_ancestor(
+ child_lemma, child_derived_lemma)
+ phraselet_label = ''.join((
+ phraselet_template.label, ': ', parent_derived_lemma,
+ '-', child_derived_lemma))
+ is_reverse_only_parent_lemma = False
+ if reverse_only_parent_lemmas is not None:
+ for entry in reverse_only_parent_lemmas:
+ if entry[0] == doc[parent.token_index]._.holmes.lemma \
+ and entry[1] == doc[parent.token_index].pos_:
+ is_reverse_only_parent_lemma = True
+ if parent_lemma not in stop_lemmas and child_lemma not in \
+ stop_lemmas and not (
+ is_reverse_only_parent_lemma
+ and not include_reverse_only):
+ add_new_phraselet_info(
+ phraselet_label, phraselet_template, match_all_words,
+ is_reverse_only_parent_lemma,
+ parent_lemma, parent_derived_lemma,
+ doc[parent.token_index].pos_,
+ doc[parent.token_index].ent_type_,
+ doc[parent.token_index]._.holmes.is_initial_question_word,
+ doc[parent.token_index]._.holmes.
+ has_initial_question_word_in_phrase,
+ child_lemma, child_derived_lemma,
+ doc[child.token_index].pos_,
+ doc[child.token_index].ent_type_,
+ doc[child.token_index]._.holmes.is_initial_question_word,
+ doc[child.token_index]._.holmes.
+ has_initial_question_word_in_phrase)
+
+ # We do not check for matchability in order to catch pos_='X', tag_='TRUNC'. This
+ # is not a problem as only a limited range of parts of speech receive subwords in
+ # the first place.
+ for subword in (
+ subword for subword in token._.holmes.subwords if
+ subword.dependent_index is not None):
+ parent_subword_index = subword.index
+ child_subword_index = subword.dependent_index
+ if token._.holmes.subwords[parent_subword_index].containing_token_index != \
+ token.i and \
+ token._.holmes.subwords[child_subword_index].containing_token_index != \
+ token.i:
+ continue
+ for phraselet_template in (
+ phraselet_template for phraselet_template in
+ self.semantic_matching_helper.phraselet_templates if not
+ phraselet_template.single_word() and (
+ not phraselet_template.reverse_only or include_reverse_only)
+ and subword.dependency_label in phraselet_template.dependency_labels and
+ token.tag_ in phraselet_template.parent_tags):
+ parent_lemma, parent_derived_lemma = get_lemmas_from_index(Index(
+ token.i, parent_subword_index))
+ if self.ontology is not None and replace_with_hypernym_ancestors:
+ parent_lemma, parent_derived_lemma = \
+ replace_lemmas_with_most_general_ancestor(
+ parent_lemma, parent_derived_lemma)
+ child_lemma, child_derived_lemma = get_lemmas_from_index(Index(
+ token.i, child_subword_index))
+ if self.ontology is not None and replace_with_hypernym_ancestors:
+ child_lemma, child_derived_lemma = \
+ replace_lemmas_with_most_general_ancestor(
+ child_lemma, child_derived_lemma)
+ phraselet_label = ''.join((
+ phraselet_template.label, ': ', parent_derived_lemma, '-',
+ child_derived_lemma))
+ add_new_phraselet_info(
+ phraselet_label, phraselet_template, match_all_words,
+ False, parent_lemma, parent_derived_lemma, token.pos_, token.ent_type_,
+ token._.holmes.is_initial_question_word,
+ token._.holmes.has_initial_question_word_in_phrase,
+ child_lemma, child_derived_lemma, token.pos_, token.ent_type_,
+ token._.holmes.is_initial_question_word,
+ token._.holmes.has_initial_question_word_in_phrase)
+ if len(phraselet_labels_to_phraselet_infos) == 0 and not match_all_words:
+ for token in doc:
+ process_single_word_phraselet_templates(
+ token, None, False, token_indexes_to_multiword_lemmas)
+
+ def create_search_phrases_from_phraselet_infos(self, phraselet_infos,
+ reverse_matching_frequency_threshold=None):
+ """ Creates search phrases from phraselet info objects, returning a dictionary from
+ phraselet labels to the created search phrases.
+
+ reverse_matching_frequency_threshold: an optional threshold between 0.0 and 1.0.
+ Where the parent word in a phraselet has a frequency factor below the threshold,
+ the search phrase will be set to
+ *treat_as_reverse_only_during_initial_relation_matching=True*.
+ """
+
+ def create_phraselet_label(phraselet_info):
+ if phraselet_info.child_lemma is not None:
+ return ''.join((
+ phraselet_info.template_label, ': ', phraselet_info.parent_derived_lemma, '-',
+ phraselet_info.child_derived_lemma))
+ else:
+ return ''.join((
+ phraselet_info.template_label, ': ', phraselet_info.parent_derived_lemma))
+
+ def create_search_phrase_from_phraselet(phraselet_info):
+ for phraselet_template in self.semantic_matching_helper.phraselet_templates:
+ if phraselet_info.template_label == phraselet_template.label:
+ phraselet_doc = phraselet_template.template_doc.copy()
+ phraselet_doc[phraselet_template.parent_index]._.holmes.lemma = \
+ phraselet_info.parent_lemma
+ phraselet_doc[phraselet_template.parent_index]._.holmes.derived_lemma = \
+ phraselet_info.parent_derived_lemma
+ phraselet_doc[phraselet_template.parent_index]._.holmes.ent_type = \
+ phraselet_info.parent_ent_type
+ phraselet_doc[phraselet_template.parent_index]._.holmes.\
+ is_initial_question_word = \
+ phraselet_info.parent_is_initial_question_word
+ phraselet_doc[phraselet_template.parent_index]._.holmes.\
+ has_initial_question_word_in_phrase = \
+ phraselet_info.parent_has_initial_question_word_in_phrase
+ if phraselet_info.child_lemma is not None:
+ phraselet_doc[phraselet_template.child_index]._.holmes.lemma = \
+ phraselet_info.child_lemma
+ phraselet_doc[phraselet_template.child_index]._.holmes.derived_lemma = \
+ phraselet_info.child_derived_lemma
+ phraselet_doc[phraselet_template.child_index]._.holmes.ent_type = \
+ phraselet_info.child_ent_type
+ phraselet_doc[phraselet_template.child_index]._.holmes.\
+ is_initial_question_word = \
+ phraselet_info.child_is_initial_question_word
+ phraselet_doc[phraselet_template.child_index]._.holmes.\
+ has_initial_question_word_in_phrase = \
+ phraselet_info.child_has_initial_question_word_in_phrase
+ return self.create_search_phrase(
+ 'topic match phraselet', phraselet_doc,
+ create_phraselet_label(phraselet_info), phraselet_template,
+ phraselet_info.created_without_matching_tags,
+ reverse_matching_frequency_threshold is not None and
+ phraselet_info.parent_frequency_factor <
+ reverse_matching_frequency_threshold and
+ phraselet_info.child_lemma is not None and not
+ phraselet_template.question,
+ phraselet_info.reverse_only_parent_lemma,
+ True)
+ raise RuntimeError(' '.join((
+ 'Phraselet template', phraselet_info.template_label, 'not found.')))
+
+ return { create_phraselet_label(phraselet_info) :
+ create_search_phrase_from_phraselet(phraselet_info) for phraselet_info in
+ phraselet_infos}
+
+ def redefine_multiwords_on_head_tokens(self, doc):
+
+ def loop_textual_representations(multiword_span):
+ for representation, _ in self.semantic_matching_helper.loop_textual_representations(
+ multiword_span):
+ yield representation, multiword_span.derived_lemma
+ if self.analyze_derivational_morphology:
+ for reverse_derived_lemma in \
+ self.semantic_matching_helper.reverse_derived_lemmas_in_ontology(
+ multiword_span):
+ yield reverse_derived_lemma, multiword_span.derived_lemma
+
+ if self.ontology is not None:
+ for token in (token for token in doc if len(token._.holmes.lemma.split()) == 1):
+ matched = False
+ for multiword_span in \
+ self.semantic_matching_helper.multiword_spans_with_head_token(token):
+ for representation, derived_lemma in \
+ loop_textual_representations(multiword_span):
+ if self.ontology.contains_multiword(representation):
+ matched = True
+ token._.holmes.lemma = representation.lower()
+ token._.holmes.derived_lemma = derived_lemma
+ # mark the dependent tokens as grammatical and non-matchable
+ for multiword_token in (
+ multiword_token for multiword_token in multiword_span.tokens
+ if multiword_token.i != token.i):
+ multiword_token._.holmes.children = [SemanticDependency(
+ multiword_token.i, 0 - (token.i + 1), None)]
+ multiword_token._.holmes.is_matchable = False
+ break
+ if matched:
+ break
+
+ def get_phraselet_labels_to_phraselet_infos(self, *, text_to_match_doc,
+ words_to_corpus_frequencies, maximum_corpus_frequency, process_initial_question_words):
+ phraselet_labels_to_phraselet_infos = {}
+ self.add_phraselets_to_dict(
+ text_to_match_doc,
+ phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
+ replace_with_hypernym_ancestors=False,
+ match_all_words=False,
+ ignore_relation_phraselets=False,
+ include_reverse_only=True,
+ stop_tags=self.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ stop_lemmas=self.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ reverse_only_parent_lemmas=
+ self.semantic_matching_helper.topic_matching_reverse_only_parent_lemmas,
+ words_to_corpus_frequencies=words_to_corpus_frequencies,
+ maximum_corpus_frequency=maximum_corpus_frequency,
+ process_initial_question_words=process_initial_question_words)
+
+ # now add the single word phraselets whose tags did not match.
+ self.add_phraselets_to_dict(
+ text_to_match_doc,
+ phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
+ replace_with_hypernym_ancestors=False,
+ match_all_words=True,
+ ignore_relation_phraselets=True,
+ include_reverse_only=False, # value is irrelevant with
+ # ignore_relation_phraselets == True
+ stop_lemmas=self.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ stop_tags=self.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=
+ self.semantic_matching_helper.topic_matching_reverse_only_parent_lemmas,
+ words_to_corpus_frequencies=words_to_corpus_frequencies,
+ maximum_corpus_frequency=maximum_corpus_frequency,
+ process_initial_question_words=False)
+ return phraselet_labels_to_phraselet_infos
+
+ def create_search_phrase(
+ self, search_phrase_text, search_phrase_doc,
+ label, phraselet_template, topic_match_phraselet_created_without_matching_tags,
+ treat_as_reverse_only_during_initial_relation_matching,
+ is_reverse_only_parent_lemma, process_initial_question_words):
+ """phraselet_template -- 'None' if this search phrase is not a topic match phraselet"""
+
+ def replace_grammatical_root_token_recursively(token):
+ """Where the syntactic root of a search phrase document is a grammatical token or is
+ marked as non-matchable, loop through the semantic dependencies to find the
+ semantic root.
+ """
+ for dependency in token._.holmes.children:
+ if dependency.child_index < 0:
+ return replace_grammatical_root_token_recursively(
+ token.doc[(0 - dependency.child_index) - 1])
+ if not token._.holmes.is_matchable:
+ for dependency in token._.holmes.children:
+ if dependency.child_index >= 0 and \
+ dependency.child_token(token.doc)._.holmes.is_matchable:
+ return replace_grammatical_root_token_recursively(
+ token.doc[dependency.child_index])
+ return token
+
+ def add_word_information(word, match_type, depth):
+ if word not in words_matching_root_token:
+ words_matching_root_token.append(word)
+ if word not in root_word_to_match_info_dict:
+ root_word_to_match_info_dict[word] = (match_type, depth)
+
+ def add_word_information_from_ontology(word):
+ for entry_word, entry_depth in \
+ self.semantic_matching_helper.ontology.get_words_matching_and_depths(word):
+ add_word_information(entry_word, 'ontology', entry_depth)
+ if self.semantic_matching_helper.analyze_derivational_morphology:
+ working_derived_lemma = self.semantic_analyzer.derived_holmes_lemma(
+ None, entry_word.lower())
+ if working_derived_lemma is not None:
+ add_word_information(working_derived_lemma, 'ontology', entry_depth)
+
+ if phraselet_template is None:
+ self.redefine_multiwords_on_head_tokens(search_phrase_doc)
+ # where a multiword exists as an ontology entry, the multiword should be used for
+ # matching rather than the individual words. Not relevant for topic matching
+ # phraselets because the multiword will already have been set as the Holmes
+ # lemma of the word.
+
+ for token in search_phrase_doc:
+ if len(token._.holmes.righthand_siblings) > 0:
+ # SearchPhrases may not themselves contain conjunctions like 'and'
+ # because then the matching becomes too complicated
+ raise SearchPhraseContainsConjunctionError(search_phrase_text)
+ if token._.holmes.is_negated:
+ # SearchPhrases may not themselves contain negation
+ # because then the matching becomes too complicated
+ raise SearchPhraseContainsNegationError(search_phrase_text)
+ if self.perform_coreference_resolution and token.pos_ == 'PRON' and \
+ token._.holmes.is_involved_in_coreference():
+ # SearchPhrases may not themselves contain coreferring pronouns
+ # because then the matching becomes too complicated
+ raise SearchPhraseContainsCoreferringPronounError(search_phrase_text)
+
+ root_tokens = []
+ tokens_to_match = []
+ matchable_non_entity_tokens_to_vectors = {}
+ for token in search_phrase_doc:
+ # check whether grammatical token
+ if phraselet_template is not None and phraselet_template.parent_index != token.i and \
+ phraselet_template.child_index != token.i:
+ token._.holmes.is_matchable = False
+ if phraselet_template is not None and phraselet_template.parent_index == token.i and \
+ not phraselet_template.single_word() and \
+ phraselet_template.assigned_dependency_label is not None:
+ for dependency in (
+ dependency for dependency in token._.holmes.children if \
+ dependency.child_index == phraselet_template.child_index):
+ dependency.label = phraselet_template.assigned_dependency_label
+ if token._.holmes.is_matchable and not (
+ len(token._.holmes.children) > 0 and
+ token._.holmes.children[0].child_index < 0):
+ tokens_to_match.append(token)
+ if not self.semantic_matching_helper.is_entity_search_phrase_token(
+ token, phraselet_template is not None):
+ if phraselet_template is None and len(token._.holmes.lemma.split()) > 1:
+ working_lexeme = self.semantic_analyzer.vectors_nlp.vocab[token.lemma_]
+ else:
+ working_lexeme = \
+ self.semantic_analyzer.vectors_nlp.vocab[token._.holmes.lemma]
+ if working_lexeme.has_vector and working_lexeme.vector_norm > 0:
+ matchable_non_entity_tokens_to_vectors[token.i] = \
+ working_lexeme.vector
+ else:
+ matchable_non_entity_tokens_to_vectors[token.i] = None
+ if process_initial_question_words and \
+ self.semantic_analyzer.is_interrogative_pronoun(token):
+ tokens_to_match.append(token)
+ matchable_non_entity_tokens_to_vectors[token.i] = None
+ if token.dep_ == 'ROOT': # syntactic root
+ root_tokens.append(replace_grammatical_root_token_recursively(token))
+ if len(tokens_to_match) == 0:
+ raise SearchPhraseWithoutMatchableWordsError(search_phrase_text)
+ if len(root_tokens) > 1:
+ raise SearchPhraseContainsMultipleClausesError(search_phrase_text)
+ root_token = root_tokens[0]
+ if phraselet_template is None:
+ reverse_only = False
+ else:
+ reverse_only = not phraselet_template.question and (is_reverse_only_parent_lemma or
+ phraselet_template.reverse_only)
+
+ words_matching_root_token = []
+ root_word_to_match_info_dict = {}
+
+ add_word_information(root_token._.holmes.lemma, 'direct', 0)
+ if phraselet_template is None and root_token.lemma_.lower() == \
+ root_token._.holmes.lemma.lower():
+ add_word_information(root_token.text.lower(), 'direct', 0)
+ hyphen_normalized_text = \
+ self.semantic_matching_helper.normalize_hyphens(root_token.text)
+ if root_token.text != hyphen_normalized_text:
+ add_word_information(hyphen_normalized_text.lower(), 'direct', 0)
+ if self.semantic_matching_helper.analyze_derivational_morphology and \
+ root_token._.holmes.derived_lemma is not None:
+ add_word_information(root_token._.holmes.derived_lemma, 'derivation', 0)
+ if self.semantic_matching_helper.ontology is not None and not \
+ self.semantic_matching_helper.is_entity_search_phrase_token(
+ root_token, phraselet_template is not None):
+ add_word_information_from_ontology(root_token._.holmes.lemma)
+ if self.semantic_matching_helper.analyze_derivational_morphology and \
+ root_token._.holmes.derived_lemma is not None:
+ add_word_information_from_ontology(root_token._.holmes.derived_lemma)
+ if phraselet_template is None and root_token.lemma_.lower() == \
+ root_token._.holmes.lemma.lower():
+ add_word_information_from_ontology(root_token.text.lower())
+ if root_token.text != hyphen_normalized_text:
+ add_word_information_from_ontology(hyphen_normalized_text.lower())
+ if self.semantic_matching_helper.analyze_derivational_morphology:
+ for reverse_derived_lemma in \
+ self.semantic_matching_helper.reverse_derived_lemmas_in_ontology(
+ root_token):
+ add_word_information_from_ontology(reverse_derived_lemma)
+
+ return SearchPhrase(
+ search_phrase_doc, [token.i for token in tokens_to_match], root_token.i,
+ matchable_non_entity_tokens_to_vectors, label, phraselet_template is not None,
+ topic_match_phraselet_created_without_matching_tags,
+ (phraselet_template is not None and phraselet_template.question),
+ reverse_only, treat_as_reverse_only_during_initial_relation_matching,
+ words_matching_root_token, root_word_to_match_info_dict, len(tokens_to_match) == 1 and
+ not (phraselet_template is not None and phraselet_template.question))
+
+ def get_ontology_reverse_derivational_dict(self):
+ """During structural matching, a lemma or derived lemma matches any words in the ontology
+ that yield the same word as their derived lemmas. This method generates a dictionary
+ from derived lemmas to ontology words that yield them to facilitate such matching.
+ """
+ if self.analyze_derivational_morphology and self.ontology is not None:
+ ontology_reverse_derivational_dict = {}
+ for ontology_word in self.ontology.words:
+ derived_lemmas = []
+ normalized_ontology_word = \
+ self.semantic_matching_helper.normalize_hyphens(ontology_word)
+ for textual_word in normalized_ontology_word.split():
+ derived_lemma = self.semantic_analyzer.derived_holmes_lemma(
+ None, textual_word.lower())
+ if derived_lemma is None:
+ derived_lemma = textual_word
+ derived_lemmas.append(derived_lemma)
+ derived_ontology_word = ' '.join(derived_lemmas)
+ if derived_ontology_word != ontology_word:
+ if derived_ontology_word in ontology_reverse_derivational_dict:
+ ontology_reverse_derivational_dict[derived_ontology_word].append(
+ ontology_word)
+ else:
+ ontology_reverse_derivational_dict[derived_ontology_word] = [ontology_word]
+ # sort entry lists to ensure deterministic behaviour
+ for derived_ontology_word in ontology_reverse_derivational_dict:
+ ontology_reverse_derivational_dict[derived_ontology_word] = \
+ sorted(ontology_reverse_derivational_dict[derived_ontology_word])
+ return ontology_reverse_derivational_dict
+ else:
+ return None
+
+class SemanticMatchingHelperFactory():
+ """Returns the correct *SemanticMatchingHelperFactory* for the language in use.
+ This class must be added to if additional implementations are added for new languages.
+ """
+
+ def semantic_matching_helper(self, *, language, ontology, analyze_derivational_morphology):
+ language_specific_rules_module = importlib.import_module(
+ '.'.join(('.lang', language, 'language_specific_rules')),
+ 'holmes_extractor')
+ return language_specific_rules_module.\
+ LanguageSpecificSemanticMatchingHelper(ontology, analyze_derivational_morphology)
+
+class SemanticMatchingHelper(ABC):
+ """Abstract *SemanticMatchingHelper* parent class containing language-specific properties and
+ methods that are required for matching and can be successfully and efficiently serialized.
+ Functionality is placed here that is common to all current implementations. It follows that
+ some functionality will probably have to be moved out to specific implementations whenever
+ an implementation for a new language is added.
+
+ For explanations of the abstract variables and methods, see the
+ *EnglishSemanticMatchingHelper* implementation where they can be illustrated with direct
+ examples.
+ """
+
+ noun_pos = NotImplemented
+
+ permissible_embedding_pos = NotImplemented
+
+ noun_kernel_dep = NotImplemented
+
+ minimum_embedding_match_word_length = NotImplemented
+
+ topic_matching_phraselet_stop_lemmas = NotImplemented
+
+ topic_matching_reverse_only_parent_lemmas = NotImplemented
+
+ topic_matching_phraselet_stop_tags = NotImplemented
+
+ supervised_document_classification_phraselet_stop_lemmas = NotImplemented
+
+ match_implication_dict = NotImplemented
+
+ phraselet_templates = NotImplemented
+
+ preferred_phraselet_pos = NotImplemented
+
+ entity_defined_multiword_pos = NotImplemented
+
+ entity_defined_multiword_entity_types = NotImplemented
+
+ sibling_marker_deps = NotImplemented
+
+ preposition_deps = NotImplemented
+
+ question_answer_blacklist_deps = NotImplemented
+
+ question_answer_final_blacklist_deps = NotImplemented
+
+ @abstractmethod
+ def normalize_hyphens(self, word):
+ pass
+
+ @abstractmethod
+ def question_word_matches(self, search_phrase_label:str, search_phrase_token:Token,
+ document_token:Token, document_vector, entity_label_to_vector_dict:dict,
+ initial_question_word_embedding_match_threshold:float) -> bool:
+ pass
+
+ def __init__(self, ontology, analyze_derivational_morphology):
+ self.ontology = ontology
+ self.analyze_derivational_morphology = analyze_derivational_morphology
+ for key, match_implication in self.match_implication_dict.items():
+ assert key == match_implication.search_phrase_dependency
+ assert key not in match_implication.document_dependencies
+ assert len([dep for dep in match_implication.document_dependencies
+ if match_implication.document_dependencies.count(dep) > 1]) == 0
+ assert key not in match_implication.reverse_document_dependencies
+ assert len([dep for dep in match_implication.reverse_document_dependencies
+ if match_implication.reverse_document_dependencies.count(dep) > 1]) == 0
+
+ def get_subtree_list_for_question_answer(self, token:Token):
+ """ Returns the part of the subtree of a token that has matched a question word
+ that is analysed as answering the question. Essentially, this is all the subtree but
+ excluding any areas that are in a conjunction relationship with *token*; these will be
+ returned as separate answers in their own right.
+ """
+ list_to_return = []
+ for working_token in token.subtree:
+ if token == working_token or working_token.dep_ not in \
+ self.question_answer_blacklist_deps or working_token.text == '-':
+ list_to_return.append(working_token)
+ else:
+ return [token] if len(list_to_return) == 0 else list_to_return
+ if len(list_to_return) > 1 and list_to_return[-1].dep_ in \
+ self.question_answer_final_blacklist_deps:
+ list_to_return = list_to_return[:-1]
+ return list_to_return
+
+ def cosine_similarity(self, vector1, vector2):
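+ # Plain cosine similarity: 1.0 for vectors pointing in the same direction, 0.0 for
+ # orthogonal vectors; undefined (division by zero) if either vector has zero norm.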
+ return dot(vector1, vector2) / (norm(vector1) * norm(vector2))
+
+ def token_matches_ent_type(self, token_vector, entity_label_to_vector_dict:dict,
+ entity_labels:tuple, initial_question_word_embedding_match_threshold:float) -> float:
+ """ Checks if the vector of a token lexeme has a similarity to a lexeme regarded as typical
+ for one of a group of entity labels above a threshold. If so, returns the similarity;
+ if not, returns *0.0*.
+
+ Parameters:
+
+ token_vector -- the document token vector.
+ entity_label_to_vector_dict -- a dictionary from entity labels to vectors for lexemes
+ regarded as typical for those entity labels.
+ entity_labels -- the entity labels to check for similarity.
+ initial_question_word_embedding_match_threshold -- the threshold above which a similarity
+ is regarded as significant.
+ """
+
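+ # Example (illustrative only): for a document token 'London' and entity_labels ('GPE',),
+ # the similarity between the token vector and the vector stored for 'GPE' is returned
+ # provided it exceeds the threshold; otherwise 0.0.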
+ if token_vector is not None:
+ for ent_type in entity_labels:
+ cosine_similarity = self.cosine_similarity(entity_label_to_vector_dict[ent_type],
+ token_vector)
+ if cosine_similarity > initial_question_word_embedding_match_threshold:
+ return cosine_similarity
+ return 0.0
+
+ def add_to_corpus_index(self, corpus_index_dict, parsed_document, document_label):
+ """ Indexes a parsed document. """
+
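+ # *corpus_index_dict* maps each indexed word form (lower-cased except for ENTITY labels)
+ # to a list of (CorpusWordPosition, original word, is-derived-match) tuples.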
+ def add_dict_entry(dictionary, word, token_index, subword_index, match_type):
+ index = Index(token_index, subword_index)
+ corpus_word_position = CorpusWordPosition(document_label, index)
+ if match_type == 'entity':
+ key_word = word
+ else:
+ key_word = word.lower()
+ if key_word in dictionary.keys():
+ if index not in dictionary[key_word]:
+ dictionary[key_word].append((
+ corpus_word_position, word, match_type == 'derivation'))
+ else:
+ dictionary[key_word] = [(corpus_word_position, word, match_type == 'derivation')]
+
+ def get_ontology_defined_multiword(token):
+ for multiword_span in \
+ self.multiword_spans_with_head_token(token):
+ if self.ontology.contains_multiword(multiword_span.text):
+ return multiword_span.text, 'direct'
+ hyphen_normalized_text = self.normalize_hyphens(multiword_span.text)
+ if self.ontology.contains_multiword(hyphen_normalized_text):
+ return hyphen_normalized_text, 'direct'
+ elif self.ontology.contains_multiword(multiword_span.lemma):
+ return multiword_span.lemma, 'direct'
+ elif self.ontology.contains_multiword(multiword_span.derived_lemma):
+ return multiword_span.derived_lemma, 'derivation'
+ if self.analyze_derivational_morphology and self.ontology is not None:
+ for reverse_lemma in \
+ self.reverse_derived_lemmas_in_ontology(multiword_span):
+ return reverse_lemma, 'derivation'
+ return None, None
+
+ for token in parsed_document:
+
+ # The parent check is necessary so that each multiword entity is only registered
+ # once. sibling_marker_deps applies to siblings which would
+ # otherwise be excluded because the main sibling would normally also match the
+ # entity root word.
+ if len(token.ent_type_) > 0 and (
+ token.dep_ == 'ROOT' or token.dep_ in self.sibling_marker_deps
+ or token.ent_type_ != token.head.ent_type_):
+ entity_label = ''.join(('ENTITY', token.ent_type_))
+ add_dict_entry(corpus_index_dict, entity_label, token.i, None, 'entity')
+ if self.ontology is not None:
+ ontology_defined_multiword, match_type = get_ontology_defined_multiword(token)
+ if ontology_defined_multiword is not None:
+ add_dict_entry(
+ corpus_index_dict, ontology_defined_multiword, token.i, None,
+ match_type)
+ continue
+ entity_defined_multiword, _ = \
+ self.get_entity_defined_multiword(token)
+ if entity_defined_multiword is not None:
+ add_dict_entry(
+ corpus_index_dict, entity_defined_multiword, token.i, None, 'direct')
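+ # Index every textual representation of the token itself (text, hyphen-normalized text,
+ # lemma, derived lemma) and of each of its subwords.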
+ for representation, match_type in self.loop_textual_representations(token):
+ add_dict_entry(
+ corpus_index_dict, representation, token.i, None, match_type)
+ for subword in token._.holmes.subwords:
+ for representation, match_type in self.loop_textual_representations(subword):
+ add_dict_entry(
+ corpus_index_dict, representation, token.i, subword.index,
+ match_type)
+
+ def get_corpus_index_removing_document(self, corpus_index_dict, document_label):
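+ """ Returns a copy of *corpus_index_dict* from which all entries relating to
+ *document_label* have been removed. """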
+ new_corpus_index_dict = {}
+ for entry in corpus_index_dict:
+ new_value = [(c, w, m) for (c, w, m) in corpus_index_dict[entry]
+ if c.document_label != document_label]
+ if len(new_value) > 0:
+ new_corpus_index_dict[entry] = new_value
+ return new_corpus_index_dict
+
+ def dependency_labels_match(self, *, search_phrase_dependency_label, document_dependency_label,
+ inverse_polarity:bool):
+ """Determines whether a dependency label in a search phrase matches a dependency label in
+ a document being searched.
+ inverse_polarity: *True* if the matching dependencies have to point in opposite
+ directions.
+ """
+ if not inverse_polarity:
+ if search_phrase_dependency_label == document_dependency_label:
+ return True
+ if search_phrase_dependency_label not in self.match_implication_dict.keys():
+ return False
+ return document_dependency_label in \
+ self.match_implication_dict[search_phrase_dependency_label].document_dependencies
+ else:
+ return search_phrase_dependency_label in self.match_implication_dict.keys() and \
+ document_dependency_label in self.match_implication_dict[
+ search_phrase_dependency_label].reverse_document_dependencies
+
+ def multiword_spans_with_head_token(self, token):
+ """Generator over *MultiwordSpan* objects with *token* at their head. Dependent phrases
+ are only returned for nouns because e.g. for verbs the whole sentence would be returned.
+ """
+
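+ # Illustration (assumed analysis): for the noun phrase 'information extraction system'
+ # headed by 'system', the generator yields spans such as 'information extraction system'
+ # and 'extraction system', but never spans that exclude the head token.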
+ if token.pos_ not in self.noun_pos:
+ return
+ pointer = token.left_edge.i
+ while pointer <= token.right_edge.i:
+ working_text = ''
+ working_lemma = ''
+ working_derived_lemma = ''
+ working_tokens = []
+ inner_pointer = pointer
+ while inner_pointer <= token.right_edge.i and \
+ (token.doc[inner_pointer]._.holmes.is_matchable or
+ token.doc[inner_pointer].text == '-'):
+ if token.doc[inner_pointer].text != '-':
+ working_text = ' '.join((working_text, token.doc[inner_pointer].text))
+ working_lemma = ' '.join((
+ working_lemma, token.doc[inner_pointer]._.holmes.lemma))
+ if self.analyze_derivational_morphology and \
+ token.doc[inner_pointer]._.holmes.derived_lemma is not None:
+ this_token_derived_lemma = token.doc[inner_pointer]._.holmes.derived_lemma
+ else:
+ # if derivational morphology analysis is switched off, the derived lemma
+ # will be identical to the lemma and will not be yielded by
+ # loop_textual_representations().
+ this_token_derived_lemma = token.doc[inner_pointer]._.holmes.lemma
+ working_derived_lemma = ' '.join((
+ working_derived_lemma, this_token_derived_lemma))
+ working_tokens.append(token.doc[inner_pointer])
+ inner_pointer += 1
+ if pointer + 1 < inner_pointer and token in working_tokens:
+ yield MultiwordSpan(
+ working_text.strip(), working_lemma.strip(), working_derived_lemma.strip(),
+ working_tokens)
+ pointer += 1
+
+ def reverse_derived_lemmas_in_ontology(self, obj):
+ """ Returns all ontology entries that point to the derived lemma of a token or token-like
+ object.
+ """
+ if isinstance(obj, Token):
+ derived_lemma = obj._.holmes.lemma_or_derived_lemma()
+ elif isinstance(obj, Subword):
+ derived_lemma = obj.lemma_or_derived_lemma()
+ elif isinstance(obj, MultiwordSpan):
+ derived_lemma = obj.derived_lemma
+ else:
+ raise RuntimeError(': '.join(('Unsupported type', str(type(obj)))))
+ derived_lemma = self.normalize_hyphens(derived_lemma)
+ if derived_lemma in self.ontology_reverse_derivational_dict:
+ return self.ontology_reverse_derivational_dict[derived_lemma]
+ else:
+ return []
+
+ def is_entity_search_phrase_token(
+ self, search_phrase_token, examine_lemma_rather_than_text):
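+ # A token counts as an entity search-phrase token if its text (or lemma) is 'ENTITY'
+ # followed by at least one further character, e.g. 'ENTITYPERSON'; the bare string
+ # 'ENTITY' does not qualify.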
+ if examine_lemma_rather_than_text:
+ word_to_check = search_phrase_token._.holmes.lemma
+ else:
+ word_to_check = search_phrase_token.text
+ return word_to_check[:6] == 'ENTITY' and len(word_to_check) > 6
+
+ def is_entitynoun_search_phrase_token(
+ self, search_phrase_token):
+ return search_phrase_token.text == 'ENTITYNOUN'
+
+ def entity_search_phrase_token_matches(
+ self, search_phrase_token, topic_match_phraselet, document_token):
+ if topic_match_phraselet:
+ word_to_check = search_phrase_token._.holmes.lemma
+ else:
+ word_to_check = search_phrase_token.text
+ # len(document_token._.holmes.lemma.strip()) > 0: in German spaCy sometimes
+ # classifies whitespace as entities.
+ return (
+ document_token.ent_type_ == word_to_check[6:] and
+ len(document_token._.holmes.lemma.strip()) > 0) or (
+ word_to_check == 'ENTITYNOUN' and
+ document_token.pos_ in self.noun_pos)
+
+ def loop_textual_representations(self, obj):
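+ # Yields (representation, match_type) pairs for a Token, Subword or MultiwordSpan: the
+ # raw text, any distinct hyphen-normalized text and lemma as 'direct' matches, and any
+ # distinct derived lemma as a 'derivation' match.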
+ if isinstance(obj, Token):
+ yield obj.text, 'direct'
+ hyphen_normalized_text = self.normalize_hyphens(obj.text)
+ if hyphen_normalized_text != obj.text:
+ yield hyphen_normalized_text, 'direct'
+ if obj._.holmes.lemma != obj.text:
+ yield obj._.holmes.lemma, 'direct'
+ if self.analyze_derivational_morphology and obj._.holmes.derived_lemma is not None:
+ yield obj._.holmes.derived_lemma, 'derivation'
+ elif isinstance(obj, Subword):
+ yield obj.text, 'direct'
+ hyphen_normalized_text = self.normalize_hyphens(obj.text)
+ if hyphen_normalized_text != obj.text:
+ yield hyphen_normalized_text, 'direct'
+ if obj.text != obj.lemma:
+ yield obj.lemma, 'direct'
+ if self.analyze_derivational_morphology and obj.derived_lemma is not None:
+ yield obj.derived_lemma, 'derivation'
+ elif isinstance(obj, MultiwordSpan):
+ yield obj.text, 'direct'
+ hyphen_normalized_text = self.normalize_hyphens(obj.text)
+ if hyphen_normalized_text != obj.text:
+ yield hyphen_normalized_text, 'direct'
+ if obj.text != obj.lemma:
+ yield obj.lemma, 'direct'
+ if obj.lemma != obj.derived_lemma:
+ yield obj.derived_lemma, 'derivation'
+ else:
+ raise RuntimeError(': '.join(('Unsupported type', str(type(obj)))))
+
+ def belongs_to_entity_defined_multiword(self, token):
+ return token.pos_ in self.entity_defined_multiword_pos and token.ent_type_ in \
+ self.entity_defined_multiword_entity_types
+
+ def get_entity_defined_multiword(self, token):
+ """ If this token is at the head of a multiword recognized by spaCy named entity processing,
+ returns the multiword string in lower case and the indexes of the tokens that make up
+ the multiword, otherwise *None, None*.
+ """
+ if not self.belongs_to_entity_defined_multiword(token) or (
+ token.dep_ != 'ROOT' and self.belongs_to_entity_defined_multiword(token.head)) or \
+ token.ent_type_ == '' or token.left_edge.i == token.right_edge.i:
+ return None, None
+ working_ent = token.ent_type_
+ working_text = ''
+ working_indexes = []
+ for counter in range(token.left_edge.i, token.right_edge.i + 1):
+ multiword_token = token.doc[counter]
+ if not self.belongs_to_entity_defined_multiword(multiword_token) or \
+ multiword_token.ent_type_ != working_ent:
+ if working_text != '':
+ return None, None
+ else:
+ continue
+ working_text = ' '.join((working_text, multiword_token.text))
+ working_indexes.append(multiword_token.i)
+ if len(working_text.split()) > 1:
+ return working_text.strip().lower(), working_indexes
+ else:
+ return None, None
+
+ def get_dependent_phrase(self, token, subword):
+ """Return the dependent phrase of a token, with an optional subword reference. Used in
+ building match dictionaries."""
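+ # Tokens are collected from the token's left edge rightwards; collection stops once a
+ # token beyond *token* itself is neither a noun nor a noun-kernel dependent, or once
+ # the right edge of the subtree has been reached.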
+ if subword is not None:
+ return subword.text
+ if token.pos_ not in self.noun_pos:
+ return token.text
+ return_string = ''
+ pointer = token.left_edge.i - 1
+ while True:
+ pointer += 1
+ if token.doc[pointer].pos_ not in self.noun_pos and \
+ token.doc[pointer].dep_ not in self.noun_kernel_dep and pointer > token.i:
+ return return_string.strip()
+ if return_string == '':
+ return_string = token.doc[pointer].text
+ else:
+ return_string = ' '.join((return_string, token.doc[pointer].text))
+ if token.right_edge.i <= pointer:
+ return return_string
diff --git a/holmes_extractor/semantics.py b/holmes_extractor/semantics.py
deleted file mode 100644
index eee64ce..0000000
--- a/holmes_extractor/semantics.py
+++ /dev/null
@@ -1,2527 +0,0 @@
-from abc import ABC, abstractmethod
-import spacy
-import neuralcoref
-import jsonpickle
-import pkg_resources
-from spacy.tokens import Token, Doc
-from .errors import WrongModelDeserializationError, WrongVersionDeserializationError, \
- DocumentTooBigError
-
-SERIALIZED_DOCUMENT_VERSION = 3
-
-class SemanticDependency:
- """A labelled semantic dependency between two tokens."""
-
- def __init__(self, parent_index, child_index, label=None, is_uncertain=False):
- """Args:
-
- parent_index -- the index of the parent token within the document. The dependency will
- always be managed by the parent token, but the index is maintained within the
- object for convenience.
- child_index -- the index of the child token within the document, or one less than the zero
- minus the index of the child token within the document to indicate a grammatical
- dependency. A grammatical dependency means that the parent should be replaced by the
- child during matching.
- label -- the label of the semantic dependency, which must be *None* for grammatical
- dependencies.
- is_uncertain -- if *True*, any match involving this dependency will itself be uncertain.
- """
- if child_index < 0 and label is not None:
- raise RuntimeError(
- 'Semantic dependency with negative child index may not have a label.')
- if parent_index == child_index:
- raise RuntimeError(' '.join((
- 'Attempt to create self-referring semantic dependency with index',
- str(parent_index))))
- self.parent_index = parent_index
- self.child_index = child_index
- self.label = label
- self.is_uncertain = is_uncertain
-
- def child_token(self, doc):
- """Convenience method to return the child token of this dependency.
-
- doc -- the document containing the token.
- """
- return doc[self.child_index]
-
- def __str__(self):
- """e.g. *2:nsubj* or *2:nsubj(U)* to represent uncertainty."""
- working_label = str(self.label)
- if self.is_uncertain:
- working_label = ''.join((working_label, '(U)'))
- return ':'.join((str(self.child_index), working_label))
-
- def __eq__(self, other):
- return isinstance(other, SemanticDependency) and \
- self.parent_index == other.parent_index and self.child_index == other.child_index \
- and self.label == other.label and self.is_uncertain == other.is_uncertain
-
- def __hash__(self):
- return hash((self.parent_index, self.child_index, self.label, self.is_uncertain))
-
-class Mention:
- """ Simplified information about a coreference mention with respect to a specific token. """
-
- def __init__(self, root_index, indexes):
- self.root_index = root_index
- self.indexes = indexes
-
- def __str__(self):
- return ''.join(('[', str(self.root_index), '; ', str(self.indexes), ']'))
-
-class Subword:
- """A semantically atomic part of a word. Currently only used for German.
-
- containing_token_index -- the index of the containing token within the document.
- index -- the index of the subword within the word.
- text -- the original subword string.
- lemma -- the model-normalized representation of the subword string.
- derived_lemma -- where relevant, another lemma with which *lemma* is derivationally related
- and which can also be useful for matching in some usecases; otherwise *None*
- char_start_index -- the character index of the subword within the containing word.
- dependent_index -- the index of a subword that is dependent on this subword, or *None*
- if there is no such subword.
- dependency_label -- the label of the dependency between this subword and its dependent,
- or *None* if it has no dependent.
- governor_index -- the index of a subword on which this subword is dependent, or *None*
- if there is no such subword.
- governing_dependency_label -- the label of the dependency between this subword and its
- governor, or *None* if it has no governor.
- """
- def __init__(
- self, containing_token_index, index, text, lemma, derived_lemma, char_start_index,
- dependent_index, dependency_label, governor_index, governing_dependency_label):
- self.containing_token_index = containing_token_index
- self.index = index
- self.text = text
- self.lemma = lemma
- self.derived_lemma = derived_lemma
- self.char_start_index = char_start_index
- self.dependent_index = dependent_index
- self.dependency_label = dependency_label
- self.governor_index = governor_index
- self.governing_dependency_label = governing_dependency_label
-
- def lemma_or_derived_lemma(self):
- if self.derived_lemma is not None:
- return self.derived_lemma
- else:
- return self.lemma
-
- @property
- def is_head(self):
- return self.governor_index is None
-
- def __str__(self):
- if self.derived_lemma is not None:
- lemma_string = ''.join((self.lemma, '(', self.derived_lemma, ')'))
- else:
- lemma_string = self.lemma
- return '/'.join((self.text, lemma_string))
-
-class HolmesDictionary:
- """The holder object for token-level semantic information managed by Holmes
-
- Holmes dictionaries are accessed using the syntax *token._.holmes*.
-
- index -- the index of the token
- lemma -- the value returned from *._.holmes.lemma* for the token.
- derived_lemma -- the value returned from *._.holmes.derived_lemma for the token; where relevant,
- another lemma with which *lemma* is derivationally related and which can also be useful for
- matching in some usecases; otherwise *None*.
- """
-
- def __init__(self, index, lemma, derived_lemma):
- self.index = index
- self.lemma = lemma
- self._derived_lemma = derived_lemma
- self.children = [] # list of *SemanticDependency* objects where this token is the parent.
- self.righthand_siblings = [] # list of tokens to the right of this token that stand in a
- # conjunction relationship to this token and that share its semantic parents.
- self.token_or_lefthand_sibling_index = None # the index of this token's lefthand sibling,
- # or this token's own index if this token has no lefthand sibling.
- self.is_involved_in_or_conjunction = False
- self.is_negated = None
- self.is_matchable = None
- self.parent_dependencies = [] # list of [index, label] specifications of dependencies
- # where this token is the child. Takes any coreference resolution into account. Used in
- # topic matching.
- self.token_and_coreference_chain_indexes = None # where no coreference, only the token
- # index; where coreference, the token index followed by the indexes of coreferring tokens
- self.mentions = []
- self.mention_root_index = None # the lefthandmost token within of the mention that contains
- # this token within the first cluster to which this token
- # belongs, which will most often be this token itself
- self.subwords = []
-
- @property
- def derived_lemma(self):
- if self.lemma == self._derived_lemma: # can occur with phraselets
- return None
- else:
- return self._derived_lemma
-
- @derived_lemma.setter
- def derived_lemma(self, derived_lemma):
- self._derived_lemma = derived_lemma
-
- def lemma_or_derived_lemma(self):
- if self.derived_lemma is not None:
- return self.derived_lemma
- else:
- return self.lemma
-
- @property
- def is_uncertain(self):
- """if *True*, a match involving this token will itself be uncertain."""
- return self.is_involved_in_or_conjunction
-
- def loop_token_and_righthand_siblings(self, doc):
- """Convenience generator to loop through this token and any righthand siblings."""
- indexes = [self.index]
- indexes.extend(self.righthand_siblings)
- indexes = sorted(indexes) # in rare cases involving truncated nouns in German, righthand
- #siblings can actually end up to the left of the head word.
- for index in indexes:
- yield doc[index]
-
- def get_sibling_indexes(self, doc):
- """ Returns the indexes of this token and any siblings, ordered from left to right. """
- # with truncated nouns in German, the righthand siblings may occasionally occur to the left
- # of the head noun
- head_sibling = doc[self.token_or_lefthand_sibling_index]
- indexes = [self.token_or_lefthand_sibling_index]
- indexes.extend(head_sibling._.holmes.righthand_siblings)
- return sorted(indexes)
-
- def has_dependency_with_child_index(self, index):
- for dependency in self.children:
- if dependency.child_index == index:
- return True
- return False
-
- def get_label_of_dependency_with_child_index(self, index):
- for dependency in self.children:
- if dependency.child_index == index:
- return dependency.label
- return None
-
- def has_dependency_with_label(self, label):
- for dependency in self.children:
- if dependency.label == label:
- return True
- return False
-
- def has_dependency_with_child_index_and_label(self, index, label):
- for dependency in self.children:
- if dependency.child_index == index and dependency.label == label:
- return True
- return False
-
- def remove_dependency_with_child_index(self, index):
- self.children = [dep for dep in self.children if dep.child_index != index]
-
- def string_representation_of_children(self):
- children = sorted(
- self.children, key=lambda dependency: dependency.child_index)
- return '; '.join(str(child) for child in children)
-
-
-class SerializedHolmesDocument:
- """Consists of the spaCy represention returned by *get_bytes()* plus a jsonpickle representation
- of each token's *SemanticDictionary*.
- """
-
- def __init__(self, serialized_spacy_document, dictionaries, model):
- self._serialized_spacy_document = serialized_spacy_document
- self._dictionaries = dictionaries
- self._model = model
- self._version = SERIALIZED_DOCUMENT_VERSION
-
- def holmes_document(self, semantic_analyzer):
- doc = Doc(semantic_analyzer.nlp.vocab).from_bytes(
- self._serialized_spacy_document)
- for token in doc:
- token._.holmes = self._dictionaries[token.i]
- return doc
-
-class PhraseletTemplate:
- """A template for a phraselet used in topic matching.
-
- Properties:
-
- label -- a label for the relation which will be used to form part of the labels of phraselets
- derived from this template.
- template_sentence -- a sentence with the target grammatical structure for phraselets derived
- from this template.
- parent_index -- the index within 'template_sentence' of the parent participant in the dependency
- (for relation phraselets) or of the word (for single-word phraselets).
- child_index -- the index within 'template_sentence' of the child participant in the dependency
- (for relation phraselets) or 'None' for single-word phraselets.
- dependency_labels -- the labels of dependencies that match the template
- (for relation phraselets) or 'None' for single-word phraselets.
- parent_tags -- the tag_ values of parent participants in the dependency (for parent phraselets)
- of of the word (for single-word phraselets) that match the template.
- child_tags -- the tag_ values of child participants in the dependency (for parent phraselets)
- that match the template, or 'None' for single-word phraselets.
- reverse_only -- specifies that relation phraselets derived from this template should only be
- reverse-matched, e.g. matching should only be attempted during topic matching when the
- possible child token has already been matched to a single-word phraselet. This
- is used for performance reasons when the parent tag belongs to a closed word class like
- prepositions. Reverse-only phraselets are ignored in supervised document classification.
- assigned_dependency_label -- if a value other than 'None', specifies a dependency label that
- should be used to relabel the relationship between the parent and child participants.
- Has no effect if child_index is None.
- """
-
- def __init__(
- self, label, template_sentence, parent_index, child_index,
- dependency_labels, parent_tags, child_tags, *, reverse_only,
- assigned_dependency_label=None):
- self.label = label
- self.template_sentence = template_sentence
- self.parent_index = parent_index
- self.child_index = child_index
- self.dependency_labels = dependency_labels
- self.parent_tags = parent_tags
- self.child_tags = child_tags
- self.reverse_only = reverse_only
- self.assigned_dependency_label = assigned_dependency_label
-
- def single_word(self):
- """ 'True' if this is a template for single-word phraselets, otherwise 'False'. """
- return self.child_index is None
-
-class SemanticAnalyzerFactory():
- """Returns the correct *SemanticAnalyzer* for the model language. This class must be added to
- if additional *SemanticAnalyzer* implementations are added for new languages.
- """
-
- def semantic_analyzer(self, *, model, perform_coreference_resolution, debug=False):
- language = model[0:2]
- if language == 'en':
- return EnglishSemanticAnalyzer(
- model=model, perform_coreference_resolution=perform_coreference_resolution,
- debug=debug)
- elif language == 'de':
- return GermanSemanticAnalyzer(
- model=model, perform_coreference_resolution=perform_coreference_resolution,
- debug=debug)
- else:
- raise ValueError(
- ' '.join(['No semantic analyzer for model', language]))
-
-class SemanticAnalyzer(ABC):
- """Abstract *SemanticAnalyzer* parent class. Functionality is placed here that is common to all
- current implementations. It follows that some functionality will probably have to be moved
- out to specific implementations whenever an implementation for a new language is added.
-
- For explanations of the abstract variables and methods, see the *EnglishSemanticAnalyzer*
- implementation where they can be illustrated with direct examples.
- """
-
- def __init__(self, *, model, perform_coreference_resolution, debug):
- """Args:
-
- model -- the name of the spaCy model
- perform_coreference_resolution -- *True* if neuralcoref should be added to the pipe,
- *None* if neuralcoref should be added to the pipe if coreference resolution is
- available for the model
- debug -- *True* if the object should print a representation of each parsed document
- """
- self.nlp = spacy.load(model)
- if perform_coreference_resolution is None and self.model_supports_coreference_resolution():
- perform_coreference_resolution = True
- if perform_coreference_resolution:
- neuralcoref.add_to_pipe(self.nlp)
- self.model = model
- self.perform_coreference_resolution = perform_coreference_resolution
- self.debug = debug
- self._derivational_dictionary = self._load_derivational_dictionary()
-
- Token.set_extension('holmes', default='')
-
- def _load_derivational_dictionary(self):
- in_package_filename = ''.join(('data/derivation_', self.model[0:2], '.csv'))
- absolute_filename = pkg_resources.resource_filename(__name__, in_package_filename)
- dictionary = {}
- with open(absolute_filename, "r", encoding="utf-8") as file:
- for line in file.readlines():
- words = [word.strip() for word in line.split(',')]
- for index in range(len(words)):
- dictionary[words[index]] = words[0]
- return dictionary
-
- def reload_model(self):
- spacy.load(self.model)
-
- def parse(self, text):
- """Performs a full spaCy and Holmes parse on a string.
- """
- spacy_doc = self.spacy_parse(text)
- holmes_doc = self.holmes_parse(spacy_doc)
- return holmes_doc
-
- _maximum_document_size = 1000000
-
- def spacy_parse(self, text):
- """Performs a standard spaCy parse on a string.
- """
- if len(text) > self._maximum_document_size:
- raise DocumentTooBigError(' '.join((
- 'size:', str(len(text)), 'max:', str(self._maximum_document_size))))
- return self.nlp(text)
-
- def holmes_parse(self, spacy_doc):
- """Adds the Holmes-specific information to each token within a spaCy document.
- """
- for token in spacy_doc:
- lemma = self._holmes_lemma(token)
- derived_lemma = self.derived_holmes_lemma(token, lemma)
- token._.set('holmes', HolmesDictionary(token.i, lemma, derived_lemma))
- for token in spacy_doc:
- self._set_negation(token)
- for token in spacy_doc:
- self._initialize_semantic_dependencies(token)
- for token in spacy_doc:
- self._mark_if_righthand_sibling(token)
- token._.holmes.token_or_lefthand_sibling_index = self._lefthand_sibling_recursively(
- token)
- for token in spacy_doc:
- self._copy_any_sibling_info(token)
- subword_cache = {}
- for token in spacy_doc:
- self._add_subwords(token, subword_cache)
- for token in spacy_doc:
- self._set_coreference_information(token)
- for token in spacy_doc:
- self._set_matchability(token)
- for token in spacy_doc:
- self._correct_auxiliaries_and_passives(token)
- for token in spacy_doc:
- self._copy_any_sibling_info(token)
- for token in spacy_doc:
- self._normalize_predicative_adjectives(token)
- for token in spacy_doc:
- self._handle_relative_constructions(token)
- for token in spacy_doc:
- self._create_additional_preposition_phrase_semantic_dependencies(token)
- for token in spacy_doc:
- self._perform_language_specific_tasks(token)
- for token in spacy_doc:
- self._create_parent_dependencies(token)
- self.debug_structures(spacy_doc)
- return spacy_doc
-
- def model_supports_embeddings(self):
- return self.nlp.meta['vectors']['vectors'] > 0
-
- def model_supports_coreference_resolution(self):
- return self._model_supports_coreference_resolution
-
- def dependency_labels_match(self, *, search_phrase_dependency_label, document_dependency_label):
- """Determines whether a dependency label in a search phrase matches a dependency label in
- a document being searched.
- """
- if search_phrase_dependency_label == document_dependency_label:
- return True
- if search_phrase_dependency_label not in self._matching_dep_dict.keys():
- return False
- return document_dependency_label in self._matching_dep_dict[search_phrase_dependency_label]
-
- def _lefthand_sibling_recursively(self, token):
- """If *token* is a righthand sibling, return the index of the token that has a sibling
- reference to it, otherwise return the index of *token* itself.
- """
- if token.dep_ not in self._conjunction_deps:
- return token.i
- else:
- return self._lefthand_sibling_recursively(token.head)
-
- def debug_structures(self, doc):
- if self.debug:
- for token in doc:
- if token._.holmes.derived_lemma is not None:
- lemma_string = ''.join((
- token._.holmes.lemma, '(', token._.holmes.derived_lemma, ')'))
- else:
- lemma_string = token._.holmes.lemma
- subwords_strings = ';'.join(str(subword) for subword in token._.holmes.subwords)
- subwords_strings = ''.join(('[', subwords_strings, ']'))
- negation_string = 'negative' if token._.holmes.is_negated else 'positive'
- uncertainty_string = 'uncertain' if token._.holmes.is_uncertain else 'certain'
- matchability_string = 'matchable' if token._.holmes.is_matchable else 'unmatchable'
- if self.is_involved_in_coreference(token):
- coreference_string = '; '.join(
- str(mention) for mention in token._.holmes.mentions)
- else:
- coreference_string = ''
- print(
- token.i, token.text, lemma_string, subwords_strings, token.pos_, token.tag_,
- token.dep_, token.ent_type_, token.head.i,
- token._.holmes.string_representation_of_children(),
- token._.holmes.righthand_siblings, negation_string,
- uncertainty_string, matchability_string, coreference_string)
-
- def to_serialized_string(self, spacy_doc):
- dictionaries = []
- for token in spacy_doc:
- dictionaries.append(token._.holmes)
- token._.holmes = None
- serialized_document = SerializedHolmesDocument(
- spacy_doc.to_bytes(), dictionaries, self.model)
- for token in spacy_doc:
- token._.holmes = dictionaries[token.i]
- return jsonpickle.encode(serialized_document)
-
- def from_serialized_string(self, serialized_spacy_doc):
- serialized_document = jsonpickle.decode(serialized_spacy_doc)
- if serialized_document._model != self.model:
- raise WrongModelDeserializationError(serialized_document._model)
- if serialized_document._version != SERIALIZED_DOCUMENT_VERSION:
- raise WrongVersionDeserializationError(serialized_document._version)
- return serialized_document.holmes_document(self)
-
- def get_dependent_phrase(self, token, subword):
- """Return the dependent phrase of a token, with an optional subword reference. Used in
- building match dictionaries"""
- if subword is not None:
- return subword.text
- if not token.pos_ in self.noun_pos:
- return token.text
- return_string = ''
- pointer = token.left_edge.i - 1
- while True:
- pointer += 1
- if token.doc[pointer].pos_ not in self.noun_pos and token.doc[pointer].dep_ not in \
- self.noun_kernel_dep and pointer > token.i:
- return return_string.strip()
- if return_string == '':
- return_string = token.doc[pointer].text
- else:
- return_string = ' '.join((return_string, token.doc[pointer].text))
- if token.right_edge.i <= pointer:
- return return_string
-
- def is_involved_in_coreference(self, token):
- return len(token._.holmes.mentions) > 0
-
- def _set_coreference_information(self, token):
- token._.holmes.token_and_coreference_chain_indexes = [token.i]
- if not self.perform_coreference_resolution or not token.doc._.has_coref or not \
- token._.in_coref:
- return
- for cluster in token._.coref_clusters:
- counter = 0
- this_token_mention_index = -1
- for span in cluster:
- for candidate in span.root._.holmes.loop_token_and_righthand_siblings(
- token.doc):
- if candidate.i == token.i and candidate.i >= span.start and candidate.i < \
- span.end:
- this_token_mention_index = counter
- if token._.holmes.mention_root_index is None:
- token._.holmes.mention_root_index = span.root.i
- break
- if this_token_mention_index > -1:
- break
- counter += 1
- counter = 0
- if this_token_mention_index > -1:
- for span in cluster:
- if abs(counter - this_token_mention_index) <= \
- self._maximum_mentions_in_coreference_chain and \
- abs(span.root.i - token.i) < \
- self._maximum_word_distance_in_coreference_chain:
- siblings_of_span_root = [span.root.i]
- siblings_of_span_root.extend(span.root._.holmes.righthand_siblings)
- indexes_within_mention = []
- for candidate in siblings_of_span_root:
- if span.start <= candidate < span.end and not \
- (candidate != token.i and token.i in siblings_of_span_root):
- indexes_within_mention.append(candidate)
- token._.holmes.mentions.append(Mention(span.root.i, indexes_within_mention))
- counter += 1
- working_set = set()
- for mention in token._.holmes.mentions:
- working_set.update(mention.indexes)
- if len(working_set) > 1:
- working_set.remove(token.i)
- token._.holmes.token_and_coreference_chain_indexes.extend(sorted(working_set))
- # this token must always be the first in the list to ensure it is recorded as the
- # structurally matched token during structural matching
-
- def belongs_to_entity_defined_multiword(self, token):
- return token.pos_ in self._entity_defined_multiword_pos and token.ent_type_ in \
- self._entity_defined_multiword_entity_types
-
- def get_entity_defined_multiword(self, token):
- """ If this token is at the head of a multiword recognized by spaCy named entity processing,
- returns the multiword string in lower case and the indexes of the tokens that make up
- the multiword, otherwise *None, None*.
- """
- if not self.belongs_to_entity_defined_multiword(token) or (
- token.dep_ != 'ROOT' and self.belongs_to_entity_defined_multiword(token.head)) or \
- token.ent_type_ == '' or token.left_edge.i == token.right_edge.i:
- return None, None
- working_ent = token.ent_type_
- working_text = ''
- working_indexes = []
- for counter in range(token.left_edge.i, token.right_edge.i +1):
- multiword_token = token.doc[counter]
- if not self.belongs_to_entity_defined_multiword(multiword_token) or \
- multiword_token.ent_type_ != working_ent:
- if working_text != '':
- return None, None
- else:
- continue
- working_text = ' '.join((working_text, multiword_token.text))
- working_indexes.append(multiword_token.i)
- if len(working_text.split()) > 1:
- return working_text.strip().lower(), working_indexes
- else:
- return None, None
-
- def embedding_matching_permitted(self, obj):
- if isinstance(obj, Token):
- if len(obj._.holmes.lemma.split()) > 1:
- working_lemma = obj.lemma_
- else:
- working_lemma = obj._.holmes.lemma
- return obj.pos_ in self._permissible_embedding_pos and \
- len(working_lemma) >= self._minimum_embedding_match_word_length
- elif isinstance(obj, Subword):
- return len(obj.lemma) >= self._minimum_embedding_match_word_length
-
- language_name = NotImplemented
-
- noun_pos = NotImplemented
-
- _matchable_pos = NotImplemented
-
- _adjectival_predicate_head_pos = NotImplemented
-
- _adjectival_predicate_subject_pos = NotImplemented
-
- noun_kernel_dep = NotImplemented
-
- sibling_marker_deps = NotImplemented
-
- _adjectival_predicate_subject_dep = NotImplemented
-
- _adjectival_predicate_predicate_dep = NotImplemented
-
- _modifier_dep = NotImplemented
-
- _spacy_noun_to_preposition_dep = NotImplemented
-
- _spacy_verb_to_preposition_dep = NotImplemented
-
- _holmes_noun_to_preposition_dep = NotImplemented
-
- _holmes_verb_to_preposition_dep = NotImplemented
-
- _conjunction_deps = NotImplemented
-
- _interrogative_pronoun_tags = NotImplemented
-
- _semantic_dependency_excluded_tags = NotImplemented
-
- _generic_pronoun_lemmas = NotImplemented
-
- _or_lemma = NotImplemented
-
- _matching_dep_dict = NotImplemented
-
- _mark_child_dependencies_copied_to_siblings_as_uncertain = NotImplemented
-
- _maximum_mentions_in_coreference_chain = NotImplemented
-
- _maximum_word_distance_in_coreference_chain = NotImplemented
-
- _model_supports_coreference_resolution = NotImplemented
-
- _entity_defined_multiword_pos = NotImplemented
-
- _entity_defined_multiword_entity_types = NotImplemented
-
- phraselet_templates = NotImplemented
-
- topic_matching_phraselet_stop_lemmas = NotImplemented
-
- supervised_document_classification_phraselet_stop_lemmas = NotImplemented
-
- topic_matching_reverse_only_parent_lemmas = NotImplemented
-
- preferred_phraselet_pos = NotImplemented
-
- _permissible_embedding_pos = NotImplemented
-
- _minimum_embedding_match_word_length = NotImplemented
-
- @abstractmethod
- def _add_subwords(self, token, subword_cache):
- pass
-
- @abstractmethod
- def _set_negation(self, token):
- pass
-
- @abstractmethod
- def _correct_auxiliaries_and_passives(self, token):
- pass
-
- @abstractmethod
- def _perform_language_specific_tasks(self, token):
- pass
-
- @abstractmethod
- def _handle_relative_constructions(self, token):
- pass
-
- @abstractmethod
- def _holmes_lemma(self, token):
- pass
-
- def derived_holmes_lemma(self, token, lemma):
- if lemma in self._derivational_dictionary:
- derived_lemma = self._derivational_dictionary[lemma]
- if lemma == derived_lemma: # basis entry, so do not call language specific method
- return None
- else:
- return derived_lemma
- else:
- return self._language_specific_derived_holmes_lemma(token, lemma)
-
- @abstractmethod
- def normalize_hyphens(self, word):
- pass
-
- @abstractmethod
- def _language_specific_derived_holmes_lemma(self, token, lemma):
- pass
-
- def _initialize_semantic_dependencies(self, token):
- for child in (
- child for child in token.children if child.dep_ != 'punct' and
- child.tag_ not in self._semantic_dependency_excluded_tags):
- token._.holmes.children.append(SemanticDependency(token.i, child.i, child.dep_))
-
- def _mark_if_righthand_sibling(self, token):
- if token.dep_ in self.sibling_marker_deps: # i.e. is righthand sibling
- working_token = token
- working_or_conjunction_flag = False
- # work up through the tree until the lefthandmost sibling element with the
- # semantic relationships to the rest of the sentence is reached
- while working_token.dep_ in self._conjunction_deps:
- working_token = working_token.head
- for working_child in working_token.children:
- if working_child.lemma_ == self._or_lemma:
- working_or_conjunction_flag = True
- # add this element to the lefthandmost sibling as a righthand sibling
- working_token._.holmes.righthand_siblings.append(token.i)
- if working_or_conjunction_flag:
- working_token._.holmes.is_involved_in_or_conjunction = True
-
- def _copy_any_sibling_info(self, token):
- # Copy the or conjunction flag to righthand siblings
- if token._.holmes.is_involved_in_or_conjunction:
- for righthand_sibling in token._.holmes.righthand_siblings:
- token.doc[righthand_sibling]._.holmes.is_involved_in_or_conjunction = True
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.child_index >= 0):
- # where a token has a dependent token and the dependent token has righthand siblings,
- # add dependencies from the parent token to the siblings
- for child_righthand_sibling in \
- token.doc[dependency.child_index]._.holmes.righthand_siblings:
- # Check this token does not already have the dependency
- if len([dependency for dependency in token._.holmes.children if
- dependency.child_index == child_righthand_sibling]) == 0:
- child_index_to_add = child_righthand_sibling
- # If this token is a grammatical element, it needs to point to new
- # child dependencies as a grammatical element as well
- if dependency.child_index < 0:
- child_index_to_add = 0 - (child_index_to_add + 1)
- # Check adding the new dependency will not result in a loop and that
- # this token still does not have the dependency now its index has
- # possibly been changed
- if token.i != child_index_to_add and not \
- token._.holmes.has_dependency_with_child_index(child_index_to_add):
- token._.holmes.children.append(SemanticDependency(
- token.i, child_index_to_add, dependency.label, dependency.is_uncertain))
- # where a token has a dependent token and the parent token has righthand siblings,
- # add dependencies from the siblings to the dependent token, unless the dependent
- # token is to the right of the parent token but to the left of the sibling.
- for righthand_sibling in (
- righthand_sibling for righthand_sibling in
- token._.holmes.righthand_siblings if righthand_sibling !=
- dependency.child_index and (
- righthand_sibling < dependency.child_index or
- dependency.child_index < token.i)):
- # unless the sibling already contains a dependency with the same label
- # or the sibling has this token as a dependent child
- righthand_sibling_token = token.doc[righthand_sibling]
- if len([sibling_dependency for sibling_dependency in
- righthand_sibling_token._.holmes.children if
- sibling_dependency.label == dependency.label]) == 0 and \
- dependency.label not in self._conjunction_deps and not \
- righthand_sibling_token._.holmes.has_dependency_with_child_index(
- dependency.child_index) \
- and righthand_sibling != dependency.child_index:
- righthand_sibling_token._.holmes.children.append(SemanticDependency(
- righthand_sibling, dependency.child_index, dependency.label,
- self._mark_child_dependencies_copied_to_siblings_as_uncertain
- or dependency.is_uncertain))
-
- def _normalize_predicative_adjectives(self, token):
- """Change phrases like *the town is old* and *the man is poor* so their
- semantic structure is equivalent to *the old town* and *the poor man*.
- """
- if token.pos_ == self._adjectival_predicate_head_pos:
- altered = False
- for predicative_adjective_index in (
- dependency.child_index for dependency in \
- token._.holmes.children if dependency.label ==
- self._adjectival_predicate_predicate_dep and
- token.doc[dependency.child_index].pos_ == 'ADJ' and
- dependency.child_index >= 0):
- for subject_index in (
- dependency.child_index for dependency in
- token._.holmes.children if dependency.label ==
- self._adjectival_predicate_subject_dep and (
- dependency.child_token(token.doc).pos_ in
- self._adjectival_predicate_subject_pos or
- self.is_involved_in_coreference(dependency.child_token(token.doc))) and
- dependency.child_index >= 0 and \
- dependency.child_index != predicative_adjective_index):
- token.doc[subject_index]._.holmes.children.append(
- SemanticDependency(
- subject_index, predicative_adjective_index, self._modifier_dep))
- altered = True
- if altered:
- token._.holmes.children = [SemanticDependency(
- token.i, 0 - (subject_index + 1), None)]
-
- def _create_additional_preposition_phrase_semantic_dependencies(self, token):
- """In structures like 'Somebody needs insurance for a period' it seems to be
- mainly language-dependent whether the preposition phrase is analysed as being
- dependent on the preceding noun or the preceding verb. We add an additional, new
- dependency to whichever of the noun or the verb does not already have one. In English,
- the new label is defined in *_matching_dep_dict* in such a way that original
- dependencies in search phrases match new dependencies in documents but not vice versa.
- This restriction is not applied in German because the fact the verb can be placed in
- different positions within the sentence means there is considerable variation around
- how prepositional phrases are analyzed by spaCy.
- """
-
- def add_dependencies_pointing_to_preposition_and_siblings(parent, label):
- for working_preposition in token._.holmes.loop_token_and_righthand_siblings(token.doc):
- if parent.i != working_preposition.i:
- parent._.holmes.children.append(SemanticDependency(
- parent.i, working_preposition.i, label, True))
-
- # token is a preposition ...
- if token.pos_ == 'ADP':
- # directly preceded by a noun
- if token.i > 0 and token.doc[token.i-1].sent == token.sent and (
- token.doc[token.i-1].pos_ in ('NOUN', 'PROPN') or
- self.is_involved_in_coreference(token.doc[token.i-1])):
- preceding_noun = token.doc[token.i-1]
- # and the noun is governed by at least one verb
- governing_verbs = [
- working_token for working_token in token.sent
- if working_token.pos_ == 'VERB' and
- working_token._.holmes.has_dependency_with_child_index(
- preceding_noun.i)]
- if len(governing_verbs) == 0:
- return
- # if the noun governs the preposition, add new possible dependencies
- # from the verb(s)
- for governing_verb in governing_verbs:
- if preceding_noun._.holmes.has_dependency_with_child_index_and_label(
- token.i, self._spacy_noun_to_preposition_dep) and not \
- governing_verb._.holmes.has_dependency_with_child_index_and_label(
- token.i, self._spacy_verb_to_preposition_dep):
- add_dependencies_pointing_to_preposition_and_siblings(
- governing_verb, self._holmes_verb_to_preposition_dep)
- # if the verb(s) governs the preposition, add new possible dependencies
- # from the noun
- if governing_verbs[0]._.holmes.has_dependency_with_child_index_and_label(
- token.i, self._spacy_verb_to_preposition_dep) and not \
- preceding_noun._.holmes.has_dependency_with_child_index_and_label(
- token.i, self._spacy_noun_to_preposition_dep):
- # check the preposition is not pointing back to a relative clause
- for preposition_dep_index in (
- dep.child_index for dep in token._.holmes.children
- if dep.child_index >= 0):
- if token.doc[preposition_dep_index]._.holmes.\
- has_dependency_with_label('relcl'):
- return
- add_dependencies_pointing_to_preposition_and_siblings(
- preceding_noun, self._holmes_noun_to_preposition_dep)
-
- def _set_matchability(self, token):
- """Marks whether this token, if it appears in a search phrase, should require a counterpart
- in a document being matched.
- """
- token._.holmes.is_matchable = (
- token.pos_ in self._matchable_pos or self.is_involved_in_coreference(token)) \
- and token.tag_ not in self._interrogative_pronoun_tags and \
- token._.holmes.lemma not in self._generic_pronoun_lemmas
-
- def _move_information_between_tokens(self, from_token, to_token):
- """Moves semantic child and sibling information from one token to another.
-
- Args:
-
- from_token -- the source token, which will be marked as a grammatical token
- pointing to *to_token*.
- to_token -- the destination token.
- """
- linking_dependencies = [
- dependency for dependency in from_token._.holmes.children
- if dependency.child_index == to_token.i]
- if len(linking_dependencies) == 0:
- return # should only happen if there is a problem with the spaCy structure
- linking_dependency_label = linking_dependencies[0].label
- # only loop dependencies whose label or index are not already present at the destination
- for dependency in (
- dependency for dependency in from_token._.holmes.children
- if dependency.label != linking_dependency_label and not
- to_token._.holmes.has_dependency_with_child_index(dependency.child_index) and
- to_token.i != dependency.child_index):
- to_token._.holmes.children.append(SemanticDependency(
- to_token.i, dependency.child_index, dependency.label, dependency.is_uncertain))
- from_token._.holmes.children = [SemanticDependency(from_token.i, 0 - (to_token.i + 1))]
- to_token._.holmes.righthand_siblings.extend(
- from_token._.holmes.righthand_siblings)
- from_token._.holmes.righthand_siblings = []
- if from_token._.holmes.is_involved_in_or_conjunction:
- to_token._.holmes.is_involved_in_or_conjunction = True
- if from_token._.holmes.is_negated:
- to_token._.holmes.is_negated = True
- # If from_token is the righthand sibling of some other token within the same sentence,
- # replace that token's reference with a reference to to_token
- for token in from_token.sent:
- if from_token.i in token._.holmes.righthand_siblings:
- token._.holmes.righthand_siblings.remove(from_token.i)
- if token.i != to_token.i:
- token._.holmes.righthand_siblings.append(to_token.i)
-
- def _create_parent_dependencies(self, token):
- if self.perform_coreference_resolution:
- for linked_parent_index in token._.holmes.token_and_coreference_chain_indexes:
- linked_parent = token.doc[linked_parent_index]
- for child_dependency in (
- child_dependency for child_dependency in linked_parent._.holmes.children
- if child_dependency.child_index >= 0):
- child_token = child_dependency.child_token(token.doc)
- for linked_child_index in \
- child_token._.holmes.token_and_coreference_chain_indexes:
- linked_child = token.doc[linked_child_index]
- linked_child._.holmes.parent_dependencies.append([
- token.i, child_dependency.label])
- else:
- for child_dependency in (
- child_dependency for child_dependency in token._.holmes.children
- if child_dependency.child_index >= 0):
- child_token = child_dependency.child_token(token.doc)
- child_token._.holmes.parent_dependencies.append([
- token.i, child_dependency.label])
-
-class EnglishSemanticAnalyzer(SemanticAnalyzer):
-
- language_name = 'English'
-
- # The part of speech tags that require a match in the search sentence when they occur within a
- # search_phrase
- _matchable_pos = ('ADJ', 'ADP', 'ADV', 'NOUN', 'NUM', 'PROPN', 'VERB')
-
- # The part of speech tags that can refer to nouns
- noun_pos = ('NOUN', 'PROPN')
-
- # The part of speech tags that can refer to the head of an adjectival predicate phrase
- # ("is" in "The dog is tired")
- _adjectival_predicate_head_pos = 'VERB'
-
- # The part of speech tags that can refer to the subject of a adjectival predicate
- # ("dog" in "The dog is tired")
- _adjectival_predicate_subject_pos = ('NOUN', 'PROPN', 'PRON')
-
- # Dependency labels that mark noun kernel elements that are not the head noun
- noun_kernel_dep = ('nmod', 'compound', 'appos', 'nummod')
-
- # Dependency labels that can mark righthand siblings
- sibling_marker_deps = ('conj', 'appos')
-
- # Dependency label that marks the subject of an adjectival predicate
- _adjectival_predicate_subject_dep = 'nsubj'
-
- # Dependency label that marks the predicate of an adjectival predicate
- _adjectival_predicate_predicate_dep = 'acomp'
-
- # Dependency label that marks a modifying adjective
- _modifier_dep = 'amod'
-
- # Original dependency label from nouns to prepositions
- _spacy_noun_to_preposition_dep = 'prep'
-
- # Original dependency label from verbs to prepositions
- _spacy_verb_to_preposition_dep = 'prep'
-
- # Added possible dependency label from nouns to prepositions
- _holmes_noun_to_preposition_dep = 'prepposs'
-
- # Added possible dependency label from verbs to prepositions
- _holmes_verb_to_preposition_dep = 'prepposs'
-
- # Dependency labels that occur in a conjunction phrase (righthand siblings and conjunctions)
- _conjunction_deps = ('conj', 'appos', 'cc')
-
- # Syntactic tags that can mark interrogative pronouns
- _interrogative_pronoun_tags = ('WDT', 'WP', 'WRB')
-
- # Syntactic tags that exclude a token from being the child token within a semantic dependency
- _semantic_dependency_excluded_tags = ('DT')
-
- # Generic pronouns
- _generic_pronoun_lemmas = ('something', 'somebody', 'someone')
-
- # The word for 'or' in this language
- _or_lemma = 'or'
-
- # Map from dependency tags as occurring within search phrases to corresponding dependency tags
- # as occurring within documents being searched. This is the main source of the asymmetry
- # in matching from search phrases to documents versus from documents to search phrases.
- _matching_dep_dict = {
- 'nsubj': ['csubj', 'poss', 'pobjb', 'pobjo', 'advmodsubj', 'arg'],
- 'acomp': ['amod', 'advmod', 'npmod', 'advcl'],
- 'amod': ['acomp', 'advmod', 'npmod', 'advcl'],
- 'advmod': ['acomp', 'amod', 'npmod', 'advcl'],
- 'arg': [
- 'nsubj', 'csubj', 'poss', 'pobjb', 'advmodsubj', 'dobj', 'pobjo', 'relant',
- 'nsubjpass', 'csubjpass', 'compound', 'advmodobj', 'dative', 'pobjp'],
- 'compound': [
- 'nmod', 'appos', 'nounmod', 'nsubj', 'csubj', 'poss', 'pobjb',
- 'advmodsubj', 'dobj', 'pobjo', 'relant', 'pobjp',
- 'nsubjpass', 'csubjpass', 'arg', 'advmodobj', 'dative'],
- 'dative': ['pobjt', 'relant', 'nsubjpass'],
- 'pobjt': ['dative', 'relant'],
- 'nsubjpass': [
- 'dobj', 'pobjo', 'poss', 'relant', 'csubjpass',
- 'compound', 'advmodobj', 'arg', 'dative'],
- 'dobj': [
- 'pobjo', 'poss', 'relant', 'nsubjpass', 'csubjpass',
- 'compound', 'advmodobj', 'arg', 'xcomp'],
- 'nmod': ['appos', 'compound', 'nummod'],
- 'poss': [
- 'pobjo', 'nsubj', 'csubj', 'pobjb', 'advmodsubj', 'arg', 'relant',
- 'nsubjpass', 'csubjpass', 'compound', 'advmodobj'],
- 'pobjo': [
- 'poss', 'dobj', 'relant', 'nsubjpass', 'csubjpass',
- 'compound', 'advmodobj', 'arg', 'xcomp', 'nsubj', 'csubj', 'advmodsubj'],
- 'pobjb': ['nsubj', 'csubj', 'poss', 'advmodsubj', 'arg'],
- 'pobjp': ['compound'],
- 'prep': ['prepposs'],
- 'xcomp': [
- 'pobjo', 'poss', 'relant', 'nsubjpass', 'csubjpass',
- 'compound', 'advmodobj', 'arg', 'dobj']}
-
- # Where dependencies from a parent to a child are copied to the parent's righthand siblings,
- # it can make sense to mark the dependency as uncertain depending on the underlying spaCy
- # representations for the individual language
- _mark_child_dependencies_copied_to_siblings_as_uncertain = True
-
- # Coreference chains are only processed up to this number of mentions away from the currently
- # matched document location
- _maximum_mentions_in_coreference_chain = 3
-
- # Coreference chains are only processed up to this number of words away from the currently
- # matched document location
- _maximum_word_distance_in_coreference_chain = 300
-
- # Presently depends purely on the language
- _model_supports_coreference_resolution = True
-
- # The part-of-speech labels permitted for elements of an entity-defined multiword.
- _entity_defined_multiword_pos = ('NOUN', 'PROPN')
-
- # The entity labels permitted for elements of an entity-defined multiword.
- _entity_defined_multiword_entity_types = ('PERSON', 'ORG', 'GPE', 'WORK_OF_ART')
-
- # The templates used to generate topic matching phraselets.
- phraselet_templates = [
- PhraseletTemplate(
- "predicate-actor", "A thing does", 2, 1,
- ['nsubj', 'csubj', 'pobjb', 'advmodsubj'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False),
- PhraseletTemplate(
- "predicate-patient", "Somebody does a thing", 1, 3,
- ['dobj', 'relant', 'advmodobj', 'xcomp'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- reverse_only=False),
- PhraseletTemplate(
- "word-ofword", "A thing of a thing", 1, 4,
- ['pobjo', 'poss'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- reverse_only=False),
- PhraseletTemplate(
- "predicate-toughmovedargument", "A thing is easy to do", 5, 1,
- ['arg'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False),
- PhraseletTemplate(
- "predicate-passivesubject", "A thing is done", 3, 1,
- ['nsubjpass', 'csubjpass'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False),
- PhraseletTemplate(
- "be-attribute", "Something is a thing", 1, 3,
- ['attr'],
- ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=True),
- PhraseletTemplate(
- "predicate-recipient", "Somebody gives a thing something", 1, 3,
- ['dative', 'pobjt'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False),
- PhraseletTemplate(
- "governor-adjective", "A described thing", 2, 1,
- ['acomp', 'amod', 'advmod', 'npmod', 'advcl'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- ['JJ', 'JJR', 'JJS', 'VBN', 'RB', 'RBR', 'RBS'], reverse_only=False),
- PhraseletTemplate(
- "noun-noun", "A thing thing", 2, 1,
- ['nmod', 'appos', 'compound', 'nounmod'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False),
- PhraseletTemplate(
- "number-noun", "Seven things", 1, 0,
- ['nummod'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS'],
- ['CD'], reverse_only=False),
- PhraseletTemplate(
- "prepgovernor-noun", "A thing in a thing", 1, 4,
- ['pobjp'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=False),
- PhraseletTemplate(
- "prep-noun", "in a thing", 0, 2,
- ['pobj'],
- ['IN'], ['FW', 'NN', 'NNP', 'NNPS', 'NNS'], reverse_only=True),
- PhraseletTemplate(
- "word", "thing", 0, None,
- None,
- ['FW', 'NN', 'NNP', 'NNPS', 'NNS'],
- None, reverse_only=False)
- ]
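-
- # Note added for clarity (an interpretation, not part of the original code): in each
- # PhraseletTemplate above, the two integers appear to be the indices of the parent and
- # child words within the template sentence. For "predicate-actor" ("A thing does", 2, 1),
- # the parent slot would be 'does' (index 2) and the child slot 'thing' (index 1); the two
- # tag lists restrict the part-of-speech tags that may fill the parent and child slots,
- # and reverse_only appears to mark templates that are only reverse-matched during topic
- # matching (compare topic_matching_reverse_only_parent_lemmas below).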
-
- # Lemmas that should be suppressed within relation phraselets or as words of
- # single-word phraselets during topic matching.
- topic_matching_phraselet_stop_lemmas = ('then', 'therefore', 'so', '-pron-')
-
- # Lemmas that should be suppressed within relation phraselets or as words of
- # single-word phraselets during supervised document classification.
- supervised_document_classification_phraselet_stop_lemmas = ('be', 'have')
-
- # Parent lemma / part-of-speech combinations that should lead to phraselets being
- # reverse-matched only during topic matching.
- topic_matching_reverse_only_parent_lemmas = (
- ('be', 'VERB'), ('have', 'VERB'), ('do', 'VERB'),
- ('say', 'VERB'), ('go', 'VERB'), ('get', 'VERB'), ('make', 'VERB'))
-
- # Parts of speech that are preferred as lemmas within phraselets
- preferred_phraselet_pos = ('NOUN', 'PROPN')
-
- # Parts of speech for which embedding matching is attempted
- _permissible_embedding_pos = ('NOUN', 'PROPN', 'ADJ', 'ADV')
-
- # Minimum length of a word taking part in an embedding-based match.
- # Necessary because of the proliferation of short nonsense strings in the vocabularies.
- _minimum_embedding_match_word_length = 3
-
- def _add_subwords(self, token, subword_cache):
- """ Analyses the internal structure of the word to find atomic semantic elements. Is
- relevant for German and not currently implemented for English.
- """
- pass
-
- def _set_negation(self, token):
- """Marks the negation on the token. A token is negative if it or one of its ancestors
- has a negation word as a syntactic (not semantic!) child.
- """
- if token._.holmes.is_negated is not None:
- return
- for child in token.children:
- if child._.holmes.lemma in (
- 'nobody', 'nothing', 'nowhere', 'noone', 'neither', 'nor', 'no') \
- or child.dep_ == 'neg':
- token._.holmes.is_negated = True
- return
- if child._.holmes.lemma in ('more', 'longer'):
- for grandchild in child.children:
- if grandchild._.holmes.lemma == 'no':
- token._.holmes.is_negated = True
- return
- if token.dep_ == 'ROOT':
- token._.holmes.is_negated = False
- return
- self._set_negation(token.head)
- token._.holmes.is_negated = token.head._.holmes.is_negated
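-
- # Example (added comment, assuming a standard spaCy English parse): in 'Nobody has seen
- # it', 'nobody' is a syntactic child of 'seen', so the loop above sets is_negated=True on
- # 'seen'; the remaining tokens then copy the value from their heads via the recursive call.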
-
- def _correct_auxiliaries_and_passives(self, token):
- """Wherever auxiliaries and passives are found, derive the semantic information
- from the syntactic information supplied by spaCy.
- """
- # 'auxpass' means an auxiliary used in a passive context. We mark its subject with
- # a new dependency label 'nsubjpass'.
- if len([
- dependency for dependency in token._.holmes.children
- if dependency.label == 'auxpass']) > 0:
- for dependency in token._.holmes.children:
- if dependency.label == 'nsubj':
- dependency.label = 'nsubjpass'
-
- # Structures like 'he used to' and 'he is going to'
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label == 'xcomp'):
- child = dependency.child_token(token.doc)
- # distinguish 'he used to ...' from 'he used it to ...'
- if token._.holmes.lemma == 'use' and token.tag_ == 'VBD' and len([
- element for element in token._.holmes.children
- if element.label == 'dobj']) == 0:
- self._move_information_between_tokens(token, child)
- elif token._.holmes.lemma == 'go':
- # 'was going to' is marked as uncertain, 'is going to' is not marked as uncertain
- uncertainty_flag = False
- for other_dependency in (
- other_dependency for other_dependency in
- token._.holmes.children if other_dependency.label == 'aux'):
- other_dependency_token = other_dependency.child_token(token.doc)
- if other_dependency_token._.holmes.lemma == 'be' and \
- other_dependency_token.tag_ == 'VBD': # 'was going to'
- uncertainty_flag = True
- self._move_information_between_tokens(token, child)
- if uncertainty_flag:
- for child_dependency in child._.holmes.children:
- child_dependency.is_uncertain = True
- else:
- # constructions like:
- #
- #'she told him to close the contract'
- #'he decided to close the contract'
- for other_dependency in token._.holmes.children:
- if other_dependency.label in ('dobj', 'nsubjpass') or (
- other_dependency.label == 'nsubj' and \
- len([
- element for element in token._.holmes.children
- if element.label == 'dobj'])
- == 0):
- if len([
- element for element in child._.holmes.children
- if element.label == 'auxpass']) > 0:
- if not child._.holmes.has_dependency_with_child_index(
- other_dependency.child_index) and \
- dependency.child_index > other_dependency.child_index:
- child._.holmes.children.append(SemanticDependency(
- dependency.child_index, other_dependency.child_index,
- 'nsubjpass', True))
- else:
- if not child._.holmes.has_dependency_with_child_index(
- other_dependency.child_index) and \
- dependency.child_index > other_dependency.child_index:
- child._.holmes.children.append(SemanticDependency(
- dependency.child_index, other_dependency.child_index,
- 'nsubj', True))
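-
- # Example (added comment, assuming a standard spaCy parse): in 'she told him to close the
- # contract', 'him' is the dobj of 'told'; the else-branch above copies it onto the xcomp
- # child 'close' as an uncertain 'nsubj' dependency, so that 'close' gains a semantic
- # subject.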
-
- def _handle_relative_constructions(self, token):
- if token.dep_ == 'relcl':
- for dependency in token._.holmes.children:
- child = dependency.child_token(token.doc)
- # handle 'whose' clauses
- for child_dependency in (
- child_dependency for child_dependency in
- child._.holmes.children if child_dependency.child_index >= 0
- and child_dependency.label == 'poss' and
- child_dependency.child_token(token.doc).tag_ == 'WP$'):
- whose_pronoun_token = child_dependency.child_token(
- token.doc)
- working_index = whose_pronoun_token.i
- while working_index >= token.sent.start:
- # find the antecedent (possessed entity)
- for dependency in (
- dependency for dependency in
- whose_pronoun_token.doc[working_index]._.holmes.children
- if dependency.label == 'relcl'):
- working_token = child.doc[working_index]
- working_token = working_token.doc[
- working_token._.holmes.token_or_lefthand_sibling_index]
- for lefthand_sibling_of_antecedent in \
- working_token._.holmes.loop_token_and_righthand_siblings(
- token.doc):
- # find the possessing noun
- for possessing_noun in (
- possessing_noun for possessing_noun in
- child._.holmes.loop_token_and_righthand_siblings(token.doc)
- if possessing_noun.i != lefthand_sibling_of_antecedent.i):
- # add the semantic dependency
- possessing_noun._.holmes.children.append(
- SemanticDependency(
- possessing_noun.i,
- lefthand_sibling_of_antecedent.i, 'poss',
- lefthand_sibling_of_antecedent.i != working_index))
- # remove the syntactic dependency
- possessing_noun._.holmes.remove_dependency_with_child_index(
- whose_pronoun_token.i)
- whose_pronoun_token._.holmes.children = [SemanticDependency(
- whose_pronoun_token.i, 0 - (working_index + 1), None)]
- return
- working_index -= 1
- return
- if child.tag_ in ('WP', 'WRB', 'WDT'): # 'that' or 'which'
- working_dependency_label = dependency.label
- child._.holmes.children = [SemanticDependency(
- child.i, 0 - (token.head.i + 1), None)]
- else:
- # relative clause without a relative pronoun ('the man I saw yesterday'): use the new dependency label 'relant'
- working_dependency_label = 'relant'
- last_righthand_sibling_of_predicate = list(
- token._.holmes.loop_token_and_righthand_siblings(token.doc))[-1]
- for preposition_dependency in (
- dep for dep in last_righthand_sibling_of_predicate._.holmes.children
- if dep.label == 'prep' and
- dep.child_token(token.doc)._.holmes.is_matchable):
- preposition = preposition_dependency.child_token(token.doc)
- for grandchild_dependency in (
- dep for dep in preposition._.holmes.children if
- dep.child_token(token.doc).tag_ in ('WP', 'WRB', 'WDT')
- and dep.child_token(token.doc).i >= 0):
- # 'that' or 'which'
- complementizer = grandchild_dependency.child_token(token.doc)
- preposition._.holmes.remove_dependency_with_child_index(
- grandchild_dependency.child_index)
- # a new relation pointing directly to the antecedent noun
- # will be added in the section below
- complementizer._.holmes.children = [SemanticDependency(
- grandchild_dependency.child_index, 0 - (token.head.i + 1), None)]
- displaced_preposition_dependencies = [
- dep for dep in
- last_righthand_sibling_of_predicate._.holmes.children
- if dep.label == 'prep'
- and len(dep.child_token(token.doc)._.holmes.children) == 0
- and dep.child_token(token.doc)._.holmes.is_matchable]
- antecedent = token.doc[token.head._.holmes.token_or_lefthand_sibling_index]
- if len(displaced_preposition_dependencies) > 0:
- displaced_preposition = \
- displaced_preposition_dependencies[0].child_token(token.doc)
- for lefthand_sibling_of_antecedent in (
- lefthand_sibling_of_antecedent for lefthand_sibling_of_antecedent in
- antecedent._.holmes.loop_token_and_righthand_siblings(token.doc)
- if displaced_preposition.i != lefthand_sibling_of_antecedent.i):
- displaced_preposition._.holmes.children.append(SemanticDependency(
- displaced_preposition.i, lefthand_sibling_of_antecedent.i,
- 'pobj', lefthand_sibling_of_antecedent.i != token.head.i))
- #Where the antecedent is not the final one before the relative
- #clause, mark the dependency as uncertain
- for sibling_of_pred in \
- token._.holmes.loop_token_and_righthand_siblings(token.doc):
- if not sibling_of_pred._.holmes.has_dependency_with_child_index(
- displaced_preposition.i) and \
- sibling_of_pred.i != displaced_preposition.i:
- sibling_of_pred._.holmes.children.append(SemanticDependency(
- sibling_of_pred.i, displaced_preposition.i, 'prep', True))
- if working_dependency_label != 'relant':
- # if 'that' or 'which', remove it
- sibling_of_pred._.holmes.remove_dependency_with_child_index(
- child.i)
- else:
- for lefthand_sibling_of_antecedent in \
- antecedent._.holmes.loop_token_and_righthand_siblings(token.doc):
- for sibling_of_predicate in (
- sibling_of_predicate for sibling_of_predicate
- in token._.holmes.loop_token_and_righthand_siblings(token.doc)
- if sibling_of_predicate.i != lefthand_sibling_of_antecedent.i):
- sibling_of_predicate._.holmes.children.append(SemanticDependency(
- sibling_of_predicate.i, lefthand_sibling_of_antecedent.i,
- working_dependency_label,
- lefthand_sibling_of_antecedent.i != token.head.i))
- #Where the antecedent is not the final one before the relative
- #clause, mark the dependency as uncertain
- if working_dependency_label != 'relant':
- sibling_of_predicate._.holmes.remove_dependency_with_child_index(
- child.i)
- break
-
- def _holmes_lemma(self, token):
- """Relabel the lemmas of phrasal verbs in sentences like 'he gets up' to incorporate
- the entire phrasal verb to facilitate matching.
- """
- if token.pos_ == 'VERB':
- for child in token.children:
- if child.tag_ == 'RP':
- return ' '.join([token.lemma_.lower(), child.lemma_.lower()])
- return token.lemma_.lower()
-
- def normalize_hyphens(self, word):
- """ Normalizes hyphens for ontology matching. Depending on the language,
- this may involve replacing them with spaces (English) or deleting them entirely
- (German).
- """
- if word.strip().startswith('-') or word.endswith('-'):
- return word
- else:
- return word.replace('-', ' ')
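-
- # Usage sketch (added comment): for English, 'credit-card' would be normalized to
- # 'credit card', while a form with a leading or trailing hyphen such as 'anti-' is
- # returned unchanged.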
-
- def _language_specific_derived_holmes_lemma(self, token, lemma):
- """Generates and returns a derived lemma where appropriate, otherwise returns *None*."""
- if (token is None or token.pos_ == 'NOUN') and len(lemma) >= 10:
- possible_lemma = None
- if lemma.endswith('isation') or lemma.endswith('ization'):
- possible_lemma = ''.join((lemma[:-5], 'e')) # 'isation', 'ization' -> 'ise', 'ize'
- if possible_lemma.endswith('ise'):
- lemma_to_test_in_vocab = ''.join((possible_lemma[:-3], 'ize'))
- # only American spellings in vocab
- else:
- lemma_to_test_in_vocab = possible_lemma
- elif lemma.endswith('ication'):
- possible_lemma = ''.join((lemma[:-7], 'y')) # implication -> imply
- lemma_to_test_in_vocab = possible_lemma
- if (possible_lemma is None or self.nlp.vocab[lemma_to_test_in_vocab].is_oov) and \
- lemma.endswith('ation'):
- possible_lemma = ''.join((lemma[:-3], 'e')) # manipulation -> manipulate
- lemma_to_test_in_vocab = possible_lemma
- if possible_lemma is not None and not self.nlp.vocab[lemma_to_test_in_vocab].is_oov:
- return possible_lemma
- # deadjectival nouns in -ness
- if (token is None or token.pos_ == 'NOUN') and len(lemma) >= 7 and lemma.endswith('ness'):
- working_possible_lemma = lemma[:-4]
- # 'bawdiness'
- if working_possible_lemma[-1] == 'i':
- working_possible_lemma = ''.join((working_possible_lemma[:-1], 'y'))
- if not self.nlp.vocab[working_possible_lemma].is_oov:
- return working_possible_lemma
- else:
- return None
- # adverb with 'ly' -> adjective without 'ly'
- if token is None or token.tag_ == 'RB':
- # domestically -> domestic
- if lemma.endswith('ically'):
- return lemma[:-4]
- # 'regrettably', 'horribly' -> 'regrettable', 'horrible'
- if lemma.endswith('ably') or lemma.endswith('ibly'):
- return ''.join((lemma[:-1], 'e'))
- if lemma.endswith('ly'):
- derived_lemma = lemma[:-2]
- # 'happily' -> 'happy'
- if derived_lemma[-1] == 'i':
- derived_lemma = ''.join((derived_lemma[:-1], 'y'))
- return derived_lemma
- # singing -> sing
- if (token is None or token.tag_ == 'NN') and lemma.endswith('ing'):
- lemmatization_sentence = ' '.join(('it is', lemma))
- lemmatization_doc = self.spacy_parse(lemmatization_sentence)
- return lemmatization_doc[2].lemma_.lower()
- return None
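-
- # Examples of the derivations above (added comment; the '-ation' and '-ness' candidates
- # are additionally checked against the spaCy vocabulary): 'globalisation'/'globalization'
- # -> 'globalise'/'globalize', 'implication' -> 'imply', 'manipulation' -> 'manipulate',
- # 'bawdiness' -> 'bawdy', 'domestically' -> 'domestic', 'regrettably' -> 'regrettable',
- # 'happily' -> 'happy', and 'singing' -> 'sing' via a small auxiliary spaCy parse.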
-
- def _perform_language_specific_tasks(self, token):
-
- # Because phrasal verbs are conflated into a single lemma, remove the dependency
- # from the verb to the preposition
- if token.tag_ == 'RP':
- token.head._.holmes.remove_dependency_with_child_index(token.i)
-
- # mark modal verb dependencies as uncertain
- if token.pos_ == 'VERB':
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label == 'aux'):
- child = dependency.child_token(token.doc)
- if child.pos_ == 'VERB' and child._.holmes.lemma not in \
- ('be', 'have', 'do', 'go', 'use', 'will', 'shall'):
- for other_dependency in (
- other_dependency for other_dependency in
- token._.holmes.children if other_dependency.label != 'aux'):
- other_dependency.is_uncertain = True
-
- # set auxiliaries as not matchable
- if token.dep_ in ('aux', 'auxpass'):
- token._.holmes.is_matchable = False
-
- # Add new dependencies to phrases with 'by', 'of' and 'to' to enable the matching
- # of deverbal nominal phrases with verb phrases; add 'dative' dependency to
- # nouns within dative 'to' phrases; add new dependency spanning other prepositions
- # to facilitate topic matching and supervised document classification
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label in ('prep', 'agent', 'dative')):
- child = dependency.child_token(token.doc)
- if child._.holmes.lemma == 'by':
- working_dependency_label = 'pobjb'
- elif child._.holmes.lemma == 'of':
- working_dependency_label = 'pobjo'
- elif child._.holmes.lemma == 'to':
- if dependency.label == 'dative':
- working_dependency_label = 'dative'
- else:
- working_dependency_label = 'pobjt'
- else:
- working_dependency_label = 'pobjp'
- # for 'by', 'of' and 'to' the preposition is marked as not matchable
- if working_dependency_label != 'pobjp':
- child._.holmes.is_matchable = False
- for child_dependency in (
- child_dependency for child_dependency in child._.holmes.children
- if child_dependency.label == 'pobj' and token.i !=
- child_dependency.child_index):
- token._.holmes.children.append(SemanticDependency(
- token.i, child_dependency.child_index, working_dependency_label,
- dependency.is_uncertain or child_dependency.is_uncertain))
-
- # where a 'prepposs' dependency has been added and the preposition is not 'by', 'of' or
- # 'to', add a corresponding uncertain 'pobjp'
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label == 'prepposs'):
- child = dependency.child_token(token.doc)
- for child_dependency in (
- child_dependency for child_dependency in
- child._.holmes.children if child_dependency.label == 'pobj' and token.i !=
- child_dependency.child_index and child._.holmes.is_matchable):
- token._.holmes.children.append(
- SemanticDependency(token.i, child_dependency.child_index, 'pobjp', True))
-
- # handle present active participles
- if token.dep_ == 'acl' and token.tag_ == 'VBG':
- lefthand_sibling = token.doc[token.head._.holmes.token_or_lefthand_sibling_index]
- for antecedent in \
- lefthand_sibling._.holmes.loop_token_and_righthand_siblings(token.doc):
- if token.i != antecedent.i:
- token._.holmes.children.append(
- SemanticDependency(token.i, antecedent.i, 'nsubj'))
-
- # handle past passive participles
- if token.dep_ == 'acl' and token.tag_ == 'VBN':
- lefthand_sibling = token.doc[token.head._.holmes.token_or_lefthand_sibling_index]
- for antecedent in \
- lefthand_sibling._.holmes.loop_token_and_righthand_siblings(token.doc):
- if token.i != antecedent.i:
- token._.holmes.children.append(
- SemanticDependency(token.i, antecedent.i, 'dobj'))
-
- # handle phrases like 'cat-eating dog' and 'dog-eaten cat', adding new dependencies
- if token.dep_ == 'amod' and token.pos_ == 'VERB':
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label == 'npadvmod'):
- if token.tag_ == 'VBG':
- dependency.label = 'advmodobj'
- noun_dependency = 'advmodsubj'
- elif token.tag_ == 'VBN':
- dependency.label = 'advmodsubj'
- noun_dependency = 'advmodobj'
- else:
- break
- for noun in token.head._.holmes.loop_token_and_righthand_siblings(token.doc):
- if token.i != noun.i:
- token._.holmes.children.append(SemanticDependency(
- token.i, noun.i, noun_dependency, noun.i != token.head.i))
- break # we only handle one antecedent, spaCy never seems to produce more anyway
-
- # handle phrases like 'he is thinking about singing', 'he keeps on singing'
- # find governed verb
- if token.pos_ == 'VERB' and token.dep_ == 'pcomp':
- # choose correct noun dependency for passive or active structure
- if len([
- dependency for dependency in token._.holmes.children
- if dependency.label == 'auxpass']) > 0:
- new_dependency_label = 'nsubjpass'
- else:
- new_dependency_label = 'nsubj'
- # check that governed verb does not already have a dependency with the same label
- if len([
- target_token_dependency for target_token_dependency in token._.holmes.children
- if target_token_dependency.label == new_dependency_label]) == 0:
- # Go back in the sentence to find the first subject phrase
- counter = token.i
- while True:
- counter -= 1
- if counter < token.sent.start:
- return
- if token.doc[counter].dep_ in ('nsubj', 'nsubjpass'):
- break
- # From the subject phrase loop up through the syntactic parents
- # to handle relative constructions
- working_token = token.doc[counter]
- while True:
- if working_token.tag_.startswith('NN') or \
- self.is_involved_in_coreference(working_token):
- for source_token in \
- working_token._.holmes.loop_token_and_righthand_siblings(token.doc):
- for target_token in \
- token._.holmes.loop_token_and_righthand_siblings(token.doc):
- if target_token.i != source_token.i:
- # such dependencies are always uncertain
- target_token._.holmes.children.append(SemanticDependency(
- target_token.i, source_token.i, new_dependency_label, True))
- return
- if working_token.dep_ != 'ROOT':
- working_token = working_token.head
- else:
- return
-
- # handle phrases like 'he is easy to find', 'he is ready to go'
- # There is no way of knowing from the syntax whether the noun is a semantic
- # subject or object of the verb, so the new dependency label 'arg' is added.
- if token.tag_.startswith('NN') or self.is_involved_in_coreference(token):
- for adjective_dep in (
- dep for dep in token._.holmes.children if
- dep.label == self._modifier_dep and dep.child_token(token.doc).pos_ == 'ADJ'):
- adj_token = adjective_dep.child_token(token.doc)
- for verb_dep in (
- dep for dep in adj_token._.holmes.children if
- dep.label == 'xcomp' and dep.child_token(token.doc).pos_ == 'VERB'):
- verb_token = verb_dep.child_token(token.doc)
- verb_token._.holmes.children.append(SemanticDependency(
- verb_token.i, token.i, 'arg', True))
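-
- # Illustrative example for the preposition handling above (added comment, assuming a
- # standard spaCy parse): in 'the assessment of the situation by the manager', 'assessment'
- # receives a 'pobjo' dependency to 'situation' and a 'pobjb' dependency to 'manager',
- # which allows the deverbal noun phrase to match verbal phrases such as 'a manager
- # assesses a situation'.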
-
-class GermanSemanticAnalyzer(SemanticAnalyzer):
-
- language_name = 'German'
-
- noun_pos = ('NOUN', 'PROPN', 'ADJ')
-
- _matchable_pos = ('ADJ', 'ADP', 'ADV', 'NOUN', 'NUM', 'PROPN', 'VERB', 'AUX')
-
- _adjectival_predicate_head_pos = 'AUX'
-
- _adjectival_predicate_subject_pos = ('NOUN', 'PROPN', 'PRON')
-
- noun_kernel_dep = ('nk', 'pnc')
-
- sibling_marker_deps = ('cj', 'app')
-
- _adjectival_predicate_subject_dep = 'sb'
-
- _adjectival_predicate_predicate_dep = 'pd'
-
- _modifier_dep = 'nk'
-
- _spacy_noun_to_preposition_dep = 'mnr'
-
- _spacy_verb_to_preposition_dep = 'mo'
-
- _holmes_noun_to_preposition_dep = 'mnrposs'
-
- _holmes_verb_to_preposition_dep = 'moposs'
-
- _conjunction_deps = ('cj', 'cd', 'punct', 'app')
-
- _interrogative_pronoun_tags = ('PWAT', 'PWAV', 'PWS')
-
- _semantic_dependency_excluded_tags = ('ART',)
-
- _generic_pronoun_lemmas = ('jemand', 'etwas')
-
- _or_lemma = 'oder'
-
- _matching_dep_dict = {
- 'sb': ['pobjb', 'ag', 'arg', 'intcompound'],
- 'ag': ['nk', 'pobjo', 'intcompound'],
- 'oa': ['pobjo', 'ag', 'arg', 'intcompound', 'og', 'oc'],
- 'arg': ['sb', 'oa', 'ag', 'intcompound', 'pobjb', 'pobjo'],
- 'mo': ['moposs', 'mnr', 'mnrposs', 'nk', 'oc'],
- 'mnr': ['mnrposs', 'mo', 'moposs', 'nk', 'oc'],
- 'nk': ['ag', 'pobjo', 'intcompound', 'oc', 'mo'],
- 'pobjo': ['ag', 'intcompound'],
- 'pobjp': ['intcompound'],
- # intcompound is only used within extensive matching because it is not assigned
- # in the context of registering search phrases.
- 'intcompound': ['sb', 'oa', 'ag', 'og', 'nk', 'mo', 'pobjo', 'pobjp']
- }
-
- _mark_child_dependencies_copied_to_siblings_as_uncertain = False
-
- # Never used at the time of writing
- _maximum_mentions_in_coreference_chain = 3
-
- # Never used at the time of writing
- _maximum_word_distance_in_coreference_chain = 300
-
- _model_supports_coreference_resolution = False
-
- _entity_defined_multiword_pos = ('NOUN', 'PROPN')
-
- _entity_defined_multiword_entity_types = ('PER', 'LOC')
-
- phraselet_templates = [
- PhraseletTemplate(
- "verb-nom", "Eine Sache tut", 2, 1,
- ['sb', 'pobjb'],
- [
- 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
- 'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'FM', 'NE', 'NNE', 'NN'],
- ['FM', 'NE', 'NNE', 'NN'], reverse_only=False),
- PhraseletTemplate(
- "verb-acc", "Jemand tut eine Sache", 1, 3,
- ['oa', 'pobjo', 'ag', 'og', 'oc'],
- [
- 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
- 'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'FM', 'NE', 'NNE', 'NN'],
- ['FM', 'NE', 'NNE', 'NN'], reverse_only=False),
- PhraseletTemplate(
- "verb-dat", "Jemand gibt einer Sache etwas", 1, 3,
- ['da'],
- [
- 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
- 'VAFIN', 'VAIMP', 'VAINF', 'VAPP'],
- ['FM', 'NE', 'NNE', 'NN'], reverse_only=False),
- PhraseletTemplate(
- "verb-pd", "Jemand ist eine Sache", 1, 3,
- ['pd'],
- [
- 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
- 'VAFIN', 'VAIMP', 'VAINF', 'VAPP'],
- ['FM', 'NE', 'NNE', 'NN'], reverse_only=True),
- PhraseletTemplate(
- "noun-dependent", "Eine beschriebene Sache", 2, 1,
- ['nk'],
- ['FM', 'NE', 'NNE', 'NN'],
- ['FM', 'NE', 'NNE', 'NN', 'ADJA', 'ADJD', 'ADV', 'CARD'], reverse_only=False),
- PhraseletTemplate(
- "verb-adverb", "schnell machen", 1, 0,
- ['mo', 'moposs', 'oc'],
- [
- 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
- 'VAFIN', 'VAIMP', 'VAINF', 'VAPP'],
- ['ADJA', 'ADJD', 'ADV'], reverse_only=False),
- PhraseletTemplate(
- "prepgovernor-noun", "Eine Sache in einer Sache", 1, 4,
- ['pobjp'],
- [
- 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
- 'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'FM', 'NE', 'NNE', 'NN'],
- ['FM', 'NE', 'NNE', 'NN'], reverse_only=False),
- PhraseletTemplate(
- "prep-noun", "in einer Sache", 0, 2,
- ['nk'],
- ['APPO', 'APPR', 'APPRART', 'APZR'],
- ['FM', 'NE', 'NNE', 'NN'], reverse_only=True),
- PhraseletTemplate(
- "verb-toughmovedargument", "Eine Sache ist schwer zu tun", 5, 1,
- ['arg'],
- [
- 'VMFIN', 'VMINF', 'VMPP', 'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
- 'VAFIN', 'VAIMP', 'VAINF', 'VAPP'],
- ['FM', 'NE', 'NNE', 'NN'], reverse_only=False),
- PhraseletTemplate(
- "intcompound", "Eine Sache in einer Sache", 1, 4,
- ['intcompound'],
- ['NE', 'NNE', 'NN', 'TRUNC', 'ADJA', 'ADJD'],
- ['NE', 'NNE', 'NN', 'TRUNC', 'ADJA', 'ADJD'], reverse_only=False,
- assigned_dependency_label='intcompound'),
- PhraseletTemplate(
- "word", "Sache", 0, None,
- None,
- ['FM', 'NE', 'NNE', 'NN'],
- None, reverse_only=False)]
-
- topic_matching_phraselet_stop_lemmas = ('dann', 'danach', 'so', 'ich')
-
- supervised_document_classification_phraselet_stop_lemmas = ('sein', 'haben')
-
- topic_matching_reverse_only_parent_lemmas = (
- ('sein', 'AUX'), ('werden', 'AUX'), ('haben', 'AUX'), ('sagen', 'VERB'),
- ('machen', 'VERB'), ('tun', 'VERB'))
-
- preferred_phraselet_pos = ('NOUN', 'PROPN')
-
- _permissible_embedding_pos = ('NOUN', 'PROPN', 'ADJ', 'ADV')
-
- _minimum_embedding_match_word_length = 4
-
- # Only words at least this long are examined for possible subwords
- _minimum_length_for_subword_search = 10
-
- # Part-of-speech tags examined for subwords
- # Verbs are not examined because the separable parts that would typically be found as
- # subwords are too short to be found.
- _tag_for_subword_search = ('NE', 'NNE', 'NN', 'TRUNC', 'ADJA', 'ADJD')
-
- # Absolute minimum length of a subword.
- _minimum_subword_length = 3
-
- # Subwords at least this long are more likely to be genuine (not nonsensical) vocab entries.
- _minimum_long_subword_length = 6
-
- # Subwords longer than this are unlikely to be atomic, so solutions that split them up
- # are preferred
- _maximum_realistic_subword_length = 12
-
- # Scoring bonus where a Fugen-S follows a whitelisted ending
- # (one where a Fugen-S is normally expected)
- _fugen_s_after_whitelisted_ending_bonus = 5
-
- # Scoring bonus where a Fugen-S follows an ending where it is neither expected nor disallowed
- _fugen_s_after_non_whitelisted_non_blacklisted_ending_bonus = 3
-
- # Both words around a Fugen-S have to be at least this long for the scoring bonus to be applied
- _fugen_s_whitelist_bonus_surrounding_word_minimum_length = 5
-
- # Endings after which a Fugen-S is normally expected
- _fugen_s_ending_whitelist = (
- 'tum', 'ling', 'ion', 'tät', 'heit', 'keit', 'schaft', 'sicht', 'ung')
-
- # Endings after which a Fugen-S is normally disallowed
- _fugen_s_ending_blacklist = (
- 'a', 'ä', 'e', 'i', 'o', 'ö', 'u', 'ü', 'nt', 'sch', 's', 'ß', 'st', 'tz', 'z')
-
- # Blacklisted subwords
- _subword_blacklist = (
- 'igkeit', 'igkeiten', 'digkeit', 'digkeiten', 'schaft', 'schaften',
- 'keit', 'keiten', 'lichkeit', 'lichkeiten', 'tigten', 'tigung', 'tigungen', 'barkeit',
- 'barkeiten', 'heit', 'heiten', 'ung', 'ungen', 'aften', 'erung', 'erungen', 'mungen')
-
- # Bigraphs of two consonants that can occur at the start of a subword.
- _subword_start_consonant_bigraph_whitelist = (
- 'bl', 'br', 'ch', 'cl', 'cr', 'dr', 'fl', 'fr', 'gl', 'gm', 'gn', 'gr', 'kl', 'kn', 'kr',
- 'kw', 'pf', 'ph', 'pl', 'pn', 'pr', 'ps', 'rh', 'sc', 'sh', 'sk', 'sl', 'sm', 'sp', 'st',
- 'sw', 'sz', 'th', 'tr', 'vl', 'vr', 'wr', 'zw')
-
- # Bigraphs of two consonants that can occur at the end of a subword.
- # Bigraphs where the second consonant is 's' are always allowed.
- _subword_end_consonant_bigraph_whitelist = (
- 'bb', 'bs', 'bt', 'ch', 'ck', 'ct', 'dd', 'ds', 'dt', 'ff', 'fs', 'ft', 'gd', 'gg', 'gn',
- 'gs', 'gt', 'hb', 'hd', 'hf', 'hg', 'hk', 'hl', 'hm', 'hn', 'hp', 'hr', 'hs', 'ht', 'ks',
- 'kt', 'lb', 'lc', 'ld', 'lf', 'lg', 'lk', 'll', 'lm', 'ln', 'lp', 'ls', 'lt', 'lx', 'lz',
- 'mb', 'md', 'mk', 'mm', 'mp', 'ms', 'mt', 'mx', 'nb', 'nd', 'nf', 'ng', 'nk', 'nn', 'np',
- 'ns', 'nt', 'nx', 'nz', 'pf', 'ph', 'pp', 'ps', 'pt', 'rb', 'rc', 'rd', 'rf', 'rg', 'rk',
- 'rl', 'rm', 'rn', 'rp', 'rr', 'rs', 'rt', 'rx', 'rz', 'sk', 'sl', 'sp', 'ss', 'st', 'th',
- 'ts', 'tt', 'tz', 'xt', 'zt', 'ßt')
-
- # Letters that can represent vowel sounds
- _vowels = ('a', 'e', 'i', 'o', 'u', 'ä', 'ö', 'ü', 'y')
-
- # Subwords used in analysis but not recorded on the Holmes dictionary instances. At present
- # the code only supports these in word-final position; word-initial position would require
- # a code change.
- _non_recorded_subword_list = ('lein', 'chen')
-
- # Subword solutions that scored higher than this are regarded as probably wrong and so are
- # not recorded.
- _maximum_acceptable_subword_score = 8
-
- def _add_subwords(self, token, subword_cache):
-
- class PossibleSubword:
- """ A subword within a possible solution.
-
- text -- the text
- char_start_index -- the character start index of the subword within the word.
- fugen_s_status --
- '2' if the preceding subword has an ending after which a Fugen-s is normally expected,
- '1' if the preceding subword has an ending that is neither whitelisted nor blacklisted,
- '0' otherwise (including after blacklisted endings).
- """
-
- def __init__(self, text, char_start_index, fugen_s_status):
- self.text = text
- self.char_start_index = char_start_index
- self.fugen_s_status = fugen_s_status
-
- def get_subword(lemma, initial_index, length):
- # find the shortest subword of at least *length* characters, unless *length* is less
- # than _minimum_long_subword_length, in which case only that exact length is tried.
- # This strategy is necessary because of the large number of nonsensical short
- # vocabulary entries.
- for end_index in range(initial_index + length, len(lemma) + 1):
- possible_word = lemma[initial_index: end_index]
- if not self.nlp.vocab[possible_word].is_oov and len(possible_word) >= 2 and \
- (
- possible_word[0] in self._vowels or possible_word[1] in self._vowels
- or
- possible_word[:2] in self._subword_start_consonant_bigraph_whitelist) \
- and (
- possible_word[-1] in self._vowels or possible_word[-2] in self._vowels
- or
- possible_word[-2:] in self._subword_end_consonant_bigraph_whitelist):
- return possible_word
- elif length < self._minimum_long_subword_length:
- break
- return None
-
- def score(possible_solution):
- # Lower scores are better.
- number = 0
- for subword in possible_solution:
- # subwords shorter than _minimum_long_subword_length: penalty of 2 per missing character
- if len(subword.text) < self._minimum_long_subword_length:
- number += 2 * (self._minimum_long_subword_length - len(subword.text))
- # subwords longer than _maximum_realistic_subword_length: penalty of 1 per extra character
- elif len(subword.text) > self._maximum_realistic_subword_length:
- number += len(subword.text) - self._maximum_realistic_subword_length
- # fugen-s after a whitelist ending
- if subword.fugen_s_status == 2:
- number -= self._fugen_s_after_whitelisted_ending_bonus
- # fugen-s after an ending that is neither whitelist nor blacklist
- elif subword.fugen_s_status == 1:
- number -= self._fugen_s_after_non_whitelisted_non_blacklisted_ending_bonus
- return number
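-
- # Worked example for the scoring above (added comment): a candidate solution consisting
- # of the subwords 'sicherheit' and 'überprüfung', where the second subword follows a
- # Fugen-s after the whitelisted ending 'heit' (fugen_s_status == 2), incurs no length
- # penalties (both subwords are between 6 and 12 characters long) and receives the
- # whitelist bonus, giving a score of -5; a solution containing a 4-character subword
- # would instead incur a penalty of 2 * (6 - 4) = 4 for that subword.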
-
- def scan_recursively_for_subwords(lemma, initial_index=0):
-
- if initial_index == 0: # only need to check on the initial (outermost) call
- for char in lemma:
- if not char.isalpha() and char != '-':
- return
- if initial_index + 1 < len(lemma) and lemma[initial_index] == '-':
- return scan_recursively_for_subwords(lemma, initial_index + 1)
- lengths = list(range(self._minimum_subword_length, 1 + len(lemma) - initial_index))
- possible_solutions = []
- working_subword = None
- for length in lengths:
- if working_subword is not None and len(working_subword) >= length:
- # we are catching up with the length already returned by get_subword
- continue
- working_subword = get_subword(lemma, initial_index, length)
- if working_subword is None or working_subword in self._subword_blacklist or \
- '-' in working_subword:
- continue
- possible_solution = [PossibleSubword(working_subword, initial_index, 0)]
- if \
- (
- initial_index + len(working_subword) == len(lemma)) or (
- initial_index + len(working_subword)
- + 1 == len(lemma) and lemma[-1] == '-') \
- or (
- initial_index + len(working_subword) + 2 == len(lemma) and lemma[-2:] ==
- 's-'):
- # we have reached the end of the word
- possible_solutions.append(possible_solution)
- break
- following_subwords = scan_recursively_for_subwords(
- lemma, initial_index + len(working_subword))
- if following_subwords is not None:
- possible_solution.extend(following_subwords)
- possible_solutions.append(possible_solution)
- if initial_index + len(working_subword) + 2 < len(lemma) and lemma[
- initial_index + len(working_subword): initial_index +
- len(working_subword) + 2] == 's-':
- following_initial_index = initial_index + len(working_subword) + 2
- elif initial_index + len(working_subword) + 1 < len(lemma) and \
- lemma[initial_index + len(working_subword)] == 's':
- following_initial_index = initial_index + len(working_subword) + 1
- else:
- continue
- possible_solution = [PossibleSubword(working_subword, initial_index, 0)]
- following_subwords = scan_recursively_for_subwords(lemma, following_initial_index)
- if following_subwords is not None:
- for ending in self._fugen_s_ending_whitelist:
- if working_subword.endswith(ending):
- following_subwords[0].fugen_s_status = 2
- if following_subwords[0].fugen_s_status == 0 and len(working_subword) >= \
- self._fugen_s_whitelist_bonus_surrounding_word_minimum_length and \
- len(following_subwords[0].text) >= \
- self._fugen_s_whitelist_bonus_surrounding_word_minimum_length:
- # if the first does not have a whitelist ending and one of the words is
- # short, do not give the score bonus
- following_subwords[0].fugen_s_status = 1
- for ending in self._fugen_s_ending_blacklist:
- # blacklist ending: take the bonus away again
- if working_subword.endswith(ending):
- following_subwords[0].fugen_s_status = 0
- possible_solution.extend(following_subwords)
- possible_solutions.append(possible_solution)
- if len(possible_solutions) > 0:
- possible_solutions = sorted(
- possible_solutions, key=lambda possible_solution: score(possible_solution))
- return possible_solutions[0]
-
- def get_lemmatization_doc(possible_subwords, pos):
- # We retrieve the lemma for each subword by calling spaCy. To reduce the
- # overhead, we concatenate the subwords into a single string of the form
- # 'Subword1 . Subword2 . Subword3' and parse it once.
- entry_words = []
- for counter in range(len(possible_subwords)):
- if counter + 1 == len(possible_subwords) and pos == 'ADJ':
- entry_words.append(possible_subwords[counter].text)
- else:
- entry_words.append(possible_subwords[counter].text.capitalize())
- subword_lemmatization_string = ' . '.join(entry_words)
- return self.spacy_parse(subword_lemmatization_string)
-
- if token.tag_ not in self._tag_for_subword_search or (
- len(token._.holmes.lemma) < self._minimum_length_for_subword_search and
- '-' not in token._.holmes.lemma):
- return
- if token.text in subword_cache:
- cached_subwords = subword_cache[token.text]
- for cached_subword in cached_subwords:
- token._.holmes.subwords.append(Subword(
- token.i, cached_subword.index, cached_subword.text, cached_subword.lemma,
- cached_subword.derived_lemma, cached_subword.char_start_index,
- cached_subword.dependent_index, cached_subword.dependency_label,
- cached_subword.governor_index, cached_subword.governing_dependency_label))
- else:
- working_subwords = []
- possible_subwords = scan_recursively_for_subwords(token._.holmes.lemma)
- if possible_subwords is None or score(possible_subwords) > \
- self._maximum_acceptable_subword_score:
- return
- if len(possible_subwords) == 1 and token._.holmes.lemma.isalpha():
- # a lemma that fails isalpha() contains a hyphen, so hyphenated words are not cached here
- subword_cache[token.text] = []
- else:
- index = 0
- if token._.holmes.lemma[0] == '-':
- # with truncated nouns, the righthand siblings may actually occur to the left
- # of the head noun
- head_sibling = token.doc[token._.holmes.token_or_lefthand_sibling_index]
- if len(head_sibling._.holmes.righthand_siblings) > 0:
- indexes = token._.holmes.get_sibling_indexes(token.doc)
- first_sibling = token.doc[indexes[0]]
- first_sibling_possible_subwords = \
- scan_recursively_for_subwords(first_sibling._.holmes.lemma)
- if first_sibling_possible_subwords is not None:
- first_sibling_lemmatization_doc = get_lemmatization_doc(
- first_sibling_possible_subwords, token.pos_)
- final_subword_counter = len(first_sibling_possible_subwords) - 1
- if final_subword_counter > 0 and \
- first_sibling_possible_subwords[
- final_subword_counter].text \
- in self._non_recorded_subword_list:
- final_subword_counter -= 1
- for counter in range(final_subword_counter):
- first_sibling_possible_subword = \
- first_sibling_possible_subwords[counter]
- if first_sibling_possible_subword.text in \
- self._non_recorded_subword_list:
- continue
- text = first_sibling.text[
- first_sibling_possible_subword.char_start_index:
- first_sibling_possible_subword.char_start_index +
- len(first_sibling_possible_subword.text)]
- lemma = first_sibling_lemmatization_doc[counter*2].lemma_.lower()
- derived_lemma = self.derived_holmes_lemma(None, lemma)
- working_subwords.append(Subword(
- first_sibling.i, index, text, lemma, derived_lemma,
- first_sibling_possible_subword.char_start_index,
- None, None, None, None))
- index += 1
- lemmatization_doc = get_lemmatization_doc(possible_subwords, token.pos_)
- for counter, possible_subword in enumerate(possible_subwords):
- if possible_subword.text in self._non_recorded_subword_list:
- continue
- text = token.text[
- possible_subword.char_start_index:
- possible_subword.char_start_index + len(possible_subword.text)]
- lemma = lemmatization_doc[counter*2].lemma_.lower()
- derived_lemma = self.derived_holmes_lemma(None, lemma)
- working_subwords.append(Subword(
- token.i, index, text, lemma, derived_lemma,
- possible_subword.char_start_index, None, None, None, None))
- index += 1
- if token._.holmes.lemma[-1] == '-':
- # with truncated nouns, the righthand siblings may actually occur to the left
- # of the head noun
- head_sibling = token.doc[token._.holmes.token_or_lefthand_sibling_index]
- if len(head_sibling._.holmes.righthand_siblings) > 0:
- indexes = token._.holmes.get_sibling_indexes(token.doc)
- last_sibling_index = indexes[-1]
- if token.i != last_sibling_index:
- last_sibling = token.doc[last_sibling_index]
- last_sibling_possible_subwords = \
- scan_recursively_for_subwords(last_sibling._.holmes.lemma)
- if last_sibling_possible_subwords is not None:
- last_sibling_lemmatization_doc = get_lemmatization_doc(
- last_sibling_possible_subwords, token.pos_)
- for counter in range(1, len(last_sibling_possible_subwords)):
- last_sibling_possible_subword = \
- last_sibling_possible_subwords[counter]
- if last_sibling_possible_subword.text in \
- self._non_recorded_subword_list:
- continue
- text = last_sibling.text[
- last_sibling_possible_subword.char_start_index:
- last_sibling_possible_subword.char_start_index +
- len(last_sibling_possible_subword.text)]
- lemma = last_sibling_lemmatization_doc[counter*2].lemma_.lower()
- derived_lemma = self.derived_holmes_lemma(None, lemma)
- working_subwords.append(Subword(
- last_sibling.i, index, text, lemma, derived_lemma,
- last_sibling_possible_subword.char_start_index,
- None, None, None, None))
- index += 1
-
- if index > 1: # if only one subword was found, no need to record it on ._.holmes
- for counter, working_subword in enumerate(working_subwords):
- if counter > 0:
- dependency_label = 'intcompound'
- dependent_index = counter - 1
- else:
- dependency_label = None
- dependent_index = None
- if counter + 1 < len(working_subwords):
- governing_dependency_label = 'intcompound'
- governor_index = counter + 1
- else:
- governing_dependency_label = None
- governor_index = None
- token._.holmes.subwords.append(Subword(
- working_subword.containing_token_index,
- working_subword.index, working_subword.text, working_subword.lemma,
- working_subword.derived_lemma, working_subword.char_start_index,
- dependent_index, dependency_label, governor_index,
- governing_dependency_label))
- if token._.holmes.lemma.isalpha(): # caching only where no hyphenation
- subword_cache[token.text] = token._.holmes.subwords
- if len(token._.holmes.subwords) > 1 and 'nicht' in (
- subword.lemma for subword in token._.holmes.subwords):
- token._.holmes.is_negated = True
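-
- # Illustrative example for the subword machinery above (added comment; the exact split
- # depends on the spaCy vocabulary): a compound like 'Informationsextraktion' would
- # typically yield the subwords 'information' and 'extraktion', linked to one another via
- # 'intcompound' dependencies, while a compound containing 'nicht' (e.g. 'Nichtbeachtung')
- # additionally marks the containing token as negated.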
-
- def _set_negation(self, token):
- """Marks the negation on the token. A token is negative if it or one of its ancestors
- has a negation word as a syntactic (not semantic!) child.
- """
- if token._.holmes.is_negated is not None:
- return
- for child in token.children:
- if child._.holmes.lemma in ('nicht', 'kein', 'keine', 'nie') or \
- child._.holmes.lemma.startswith('nirgend'):
- token._.holmes.is_negated = True
- return
- if token.dep_ == 'ROOT':
- token._.holmes.is_negated = False
- return
- self._set_negation(token.head)
- token._.holmes.is_negated = token.head._.holmes.is_negated
-
- def _correct_auxiliaries_and_passives(self, token):
- """Wherever auxiliaries and passives are found, derive the semantic information
- from the syntactic information supplied by spaCy.
- """
-
- def correct_auxiliaries_and_passives_recursively(token, processed_auxiliary_indexes):
- if token.i not in processed_auxiliary_indexes:
- processed_auxiliary_indexes.append(token.i)
- if (token.pos_ == 'AUX' or token.tag_.startswith('VM')) and len([
- dependency for dependency in token._.holmes.children if
- dependency.child_index >= 0 and
- token.doc[dependency.child_index].tag_ == 'PTKVZ']) == 0: # 'vorhaben'
- for dependency in (
- dependency for dependency in token._.holmes.children
- if token.doc[dependency.child_index].pos_ in ('VERB', 'AUX') and
- token.doc[dependency.child_index].dep_ in ('oc', 'pd')):
- token._.holmes.is_matchable = False
- child = token.doc[dependency.child_index]
- self._move_information_between_tokens(token, child)
- # VM indicates a modal verb, which has to be marked as uncertain
- if token.tag_.startswith('VM') or dependency.is_uncertain:
- for child_dependency in child._.holmes.children:
- child_dependency.is_uncertain = True
- # 'er ist froh zu kommen' / 'er ist schwer zu erreichen'
- # set dependency label to 'arg' because semantic role could be either
- # subject or object
- if token._.holmes.lemma == 'sein' and (
- len([
- child_dependency for child_dependency in
- child._.holmes.children if child_dependency.label == 'pm' and
- child_dependency.child_token(token.doc).tag_ == 'PTKZU']) > 0
- or child.tag_ == 'VVIZU'):
- for new_dependency in (
- new_dependency for new_dependency in
- child._.holmes.children if new_dependency.label == 'sb'):
- new_dependency.label = 'arg'
- new_dependency.is_uncertain = True
- # passive construction
- if (token._.holmes.lemma == 'werden' and child.tag_ not in (
- 'VVINF', 'VAINF', 'VAFIN', 'VAINF')):
- for child_or_sib in \
- child._.holmes.loop_token_and_righthand_siblings(token.doc):
- #mark syntactic subject as semantic object
- for grandchild_dependency in [
- grandchild_dependency for
- grandchild_dependency in child_or_sib._.holmes.children
- if grandchild_dependency.label == 'sb']:
- grandchild_dependency.label = 'oa'
- # mark the agent within the 'von'/'vom'/'durch' phrase as semantic
- # subject, removing the preposition from the construction and
- # marking it as non-matchable
- for grandchild_dependency in (
- gd for gd in
- child_or_sib._.holmes.children if gd.child_index >= 0):
- grandchild = token.doc[grandchild_dependency.child_index]
- if (
- grandchild_dependency.label == 'sbp' and
- grandchild._.holmes.lemma in ('von', 'vom')) or (
- grandchild_dependency.label == 'mo' and
- grandchild._.holmes.lemma in (
- 'von', 'vom', 'durch')):
- grandchild._.holmes.is_matchable = False
- for great_grandchild_dependency in \
- grandchild._.holmes.children:
- if child_or_sib.i != \
- great_grandchild_dependency.child_index:
- child_or_sib._.holmes.children.append(
- SemanticDependency(
- child_or_sib.i,
- great_grandchild_dependency.child_index,
- 'sb', dependency.is_uncertain))
- child_or_sib._.holmes.remove_dependency_with_child_index(
- grandchild_dependency.child_index)
- for syntactic_child in token.children:
- correct_auxiliaries_and_passives_recursively(
- syntactic_child, processed_auxiliary_indexes)
-
- if token.dep_ == 'ROOT':
- correct_auxiliaries_and_passives_recursively(token, [])
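-
- # Example (added comment, assuming a standard spaCy German parse): in 'Der Hund wurde von
- # der Katze gejagt', the auxiliary 'wurde' passes its information to 'gejagt'; the
- # syntactic subject 'Hund' is relabelled as 'oa' (semantic object), the preposition 'von'
- # is marked as non-matchable and 'Katze' is attached to 'gejagt' with an 'sb' (semantic
- # subject) dependency.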
-
- def _handle_relative_constructions(self, token):
- for dependency in (
- dependency for dependency in token._.holmes.children if
- dependency.child_index >= 0 and
- dependency.child_token(token.doc).tag_ in ('PRELS', 'PRELAT') and
- dependency.child_token(token.doc).dep_ != 'par'):
- counter = dependency.child_index
- while counter > token.sent.start:
- # find the antecedent
- counter -= 1
- working_token = token.doc[counter]
- if working_token.pos_ in ('NOUN', 'PROPN') and working_token.dep_ not in \
- self.sibling_marker_deps:
- working_dependency = None
- for antecedent in (
- antecedent for antecedent in
- working_token._.holmes.loop_token_and_righthand_siblings(token.doc)
- if antecedent.i != token.i):
- # add new dependency from the verb to the antecedent
- working_dependency = SemanticDependency(
- token.i, antecedent.i, dependency.label, True)
- token._.holmes.children.append(working_dependency)
- # the last antecedent before the pronoun is not uncertain, so reclassify it
- if working_dependency is not None:
- working_dependency.is_uncertain = False
- # remove the dependency from the verb to the relative pronoun
- token._.holmes.remove_dependency_with_child_index(
- dependency.child_index)
- # label the relative pronoun as a grammatical token pointing to its
- # direct antecedent
- dependency.child_token(token.doc)._.holmes.children = [SemanticDependency(
- dependency.child_index, 0 - (working_dependency.child_index + 1),
- None)]
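-
- # Example (added comment, assuming a standard spaCy German parse): in 'Der Hund, der die
- # Katze jagte, bellte', the relative pronoun 'der' is the 'sb' child of 'jagte'; the code
- # above locates the antecedent 'Hund', adds a certain 'sb' dependency from 'jagte' to
- # 'Hund', removes the dependency to the pronoun and turns the pronoun into a grammatical
- # pointer to 'Hund'.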
-
- def _holmes_lemma(self, token):
- """Relabel the lemmas of separable verbs in sentences like 'er steht auf' to incorporate
- the entire separable verb to facilitate matching.
- """
- if token.pos_ in ('VERB', 'AUX') and token.tag_ not in ('VAINF', 'VMINF', 'VVINF', 'VVIZU'):
- for child in token.children:
- if child.tag_ == 'PTKVZ':
- child_lemma = child.lemma_.lower()
- if child_lemma == 'einen':
- child_lemma = 'ein'
- return ''.join([child_lemma, token.lemma_.lower()])
- if token.tag_ == 'APPRART':
- if token.lemma_.lower() == 'im':
- return 'in'
- if token.lemma_.lower() == 'am':
- return 'an'
- if token.lemma_.lower() == 'beim':
- return 'bei'
- if token.lemma_.lower() == 'zum':
- return 'zu'
- if token.lemma_.lower() == 'zur':
- return 'zu'
- # sometimes adjectives retain their inflectional endings
- if token.tag_ == 'ADJA' and len(token.lemma_.lower()) > 5 and \
- token.lemma_.lower().endswith('en'):
- return token.lemma_.lower()[:-2] # remove only the 'en' suffix
- if token.tag_ == 'ADJA' and len(token.lemma_.lower()) > 5 and \
- token.lemma_.lower().endswith('e'):
- return token.lemma_.lower()[:-1] # remove only the 'e' suffix
- return token.lemma_.lower()
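-
- # Examples (added comment): in 'er steht auf', the separable prefix 'auf' (PTKVZ) is
- # prepended to the verb lemma, giving 'aufstehen'; fused preposition-article forms such as
- # 'im' and 'zur' are reduced to 'in' and 'zu'; and an adjective lemma that has retained an
- # inflectional ending, such as 'seltene', is reduced to 'selten'.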
-
- _ung_ending_blacklist = ('sprung', 'schwung', 'nibelung')
-
- def normalize_hyphens(self, word):
- """ Normalizes hyphens in a multiword for ontology matching. Depending on the language,
- this may involve replacing them with spaces (English) or deleting them entirely
- (German).
- """
- if word.strip().startswith('-') or word.endswith('-'):
- return word
- else:
- return word.replace('-', '')
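-
- # Usage sketch (added comment): for German, 'Mozart-Konzert' would be normalized to
- # 'MozartKonzert' for ontology lookup, whereas a truncated form such as 'Mozart-' is
- # returned unchanged.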
-
- def _language_specific_derived_holmes_lemma(self, token, lemma):
- """ token is None where *lemma* belongs to a subword """
-
- # verbs with 'ieren' -> 'ation'
- if (token is None or token.pos_ == 'VERB') and len(lemma) > 9 and \
- lemma.endswith('ieren'):
- working_lemma = ''.join((lemma[:-5], 'ation'))
- if not self.nlp.vocab[working_lemma].is_oov:
- return working_lemma
- # nouns with 'ierung' -> 'ation'
- if (token is None or token.pos_ == 'NOUN') and len(lemma) > 10 and \
- lemma.endswith('ierung'):
- working_lemma = ''.join((lemma[:-6], 'ation'))
- if not self.nlp.vocab[working_lemma].is_oov:
- return working_lemma
- # nominalization with 'ung'
- if (token is None or token.tag_ == 'NN') and lemma.endswith('ung'):
- for word in self._ung_ending_blacklist:
- if lemma.endswith(word):
- return None
- if (lemma.endswith('erung') and not lemma.endswith('ierung')) or \
- lemma.endswith('elung'):
- return ''.join((lemma[:-3], 'n'))
- elif lemma.endswith('lung') and len(lemma) >= 5 and \
- lemma[-5] not in ('a', 'e', 'i', 'o', 'u', 'ä', 'ö', 'ü', 'h'):
- return ''.join((lemma[:-4], 'eln'))
- return ''.join((lemma[:-3], 'en'))
- # nominalization with 'heit', 'keit'
- if (token is None or token.tag_ == 'NN') and (
- lemma.endswith('keit') or lemma.endswith('heit')):
- return lemma[:-4]
- if (token is None or token.pos_ in ('NOUN', 'PROPN')) and len(lemma) > 6 and \
- (lemma.endswith('chen') or lemma.endswith('lein')):
- # len > 6: because e.g. Dach and Loch have lemmas 'dachen' and 'lochen'
- working_lemma = lemma[-12:-4]
- # replace umlauts in the last 8 characters of the derived lemma
- working_lemma = working_lemma.replace('ä', 'a').replace('ö', 'o').replace('ü', 'u')
- working_lemma = ''.join((lemma[:-12], working_lemma))
- if not self.nlp.vocab[working_lemma].is_oov:
- return working_lemma
- if lemma[-4] == 'l': # 'lein' where original word ends in 'l'
- second_working_lemma = ''.join((working_lemma, 'l'))
- if not self.nlp.vocab[second_working_lemma].is_oov:
- return second_working_lemma
- second_working_lemma = lemma[:-4] # 'Löffelchen'
- if not self.nlp.vocab[second_working_lemma].is_oov:
- return second_working_lemma
- if lemma[-4] == 'l': # 'Schlüsselein'
- second_working_lemma = ''.join((second_working_lemma, 'l'))
- if not self.nlp.vocab[second_working_lemma].is_oov:
- return second_working_lemma
- return working_lemma
- if (token is None or token.tag_ == 'NN') and lemma.endswith('e') and len(lemma) > 1 and \
- not lemma[-2] in self._vowels:
- # for comparability with diminutive forms, e.g. äuglein <-> auge
- return lemma[:-1]
- return None
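-
- # Examples of the derivations above (added comment; the '-ieren', '-ierung' and diminutive
- # candidates are additionally checked against the spaCy vocabulary): 'informieren' ->
- # 'information', 'regulierung' -> 'regulation', 'forschung' -> 'forschen', 'änderung' ->
- # 'ändern', 'schönheit' -> 'schön', and a diminutive such as 'häuschen' is mapped back
- # towards 'haus'.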
-
- def _perform_language_specific_tasks(self, token):
-
- # Because separable verbs are conflated into a single lemma, remove the dependency
- # from the verb to the preposition
- if token.tag_ == 'PTKVZ' and token.head.pos_ in ('VERB', 'AUX') and \
- token.head.tag_ not in ('VAINF', 'VMINF', 'VVINF', 'VVIZU'):
- token.head._.holmes.remove_dependency_with_child_index(token.i)
-
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label in ('mo', 'mnr', 'pg', 'op')):
- child = dependency.child_token(token.doc)
- for child_dependency in (
- child_dependency for child_dependency in
- child._.holmes.children if child_dependency.label == 'nk' and
- token.i != child_dependency.child_index and child.pos_ == 'ADP'):
- if dependency.label in ('mnr', 'pg', 'op') and \
- dependency.child_token(token.doc)._.holmes.lemma in ('von', 'vom'):
- token._.holmes.children.append(SemanticDependency(
- token.i, child_dependency.child_index, 'pobjo'))
- # pobjO from English 'of'
- child._.holmes.is_matchable = False
- elif dependency.label in ('mnr') and \
- dependency.child_token(token.doc)._.holmes.lemma in ('durch'):
- token._.holmes.children.append(SemanticDependency(
- token.i, child_dependency.child_index, 'pobjb'))
- # pobjB from English 'by'
- else:
- token._.holmes.children.append(SemanticDependency(
- token.i, child_dependency.child_index, 'pobjp',
- dependency.is_uncertain or child_dependency.is_uncertain))
-
- # where a 'moposs' or 'mnrposs' dependency has been added and the preposition is not
- # 'von' or 'vom', add a corresponding uncertain 'pobjp'
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label in ['moposs', 'mnrposs']):
- child = dependency.child_token(token.doc)
- for child_dependency in (
- child_dependency for child_dependency in
- child._.holmes.children if child_dependency.label == 'nk' and
- token.i != child_dependency.child_index and child._.holmes.is_matchable):
- token._.holmes.children.append(SemanticDependency(
- token.i, child_dependency.child_index, 'pobjp', True))
-
- # Loop through the structure around a dependent verb to find the lexical token at which
- # to add new dependencies, and find out whether it is active or passive so we know
- # whether to add an 'sb' or an 'oa'.
- def find_target_tokens_and_dependency_recursively(token, visited=None):
- # avoid a mutable default argument so that 'visited' does not persist between calls
- if visited is None:
- visited = []
- visited.append(token.i)
- tokens_to_return = []
- target_dependency = 'sb'
- # Loop through grammatical tokens. 'dependency.child_index + token.i != -1' would mean
- # a grammatical token were pointing to itself (should never happen!)
- if len([
- dependency for dependency in token._.holmes.children
- if dependency.child_index < 0 and dependency.child_index + token.i != -1]) > 0:
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.child_index < 0 and dependency.child_index + token.i != -1):
- # resolve the grammatical token pointer
- child_token = token.doc[0 - (dependency.child_index + 1)]
- # passive construction
- if (token._.holmes.lemma == 'werden' and child_token.tag_ not in
- ('VVINF', 'VAINF', 'VAFIN', 'VAINF')):
- target_dependency = 'oa'
- if child_token.i not in visited:
- new_tokens, new_target_dependency = \
- find_target_tokens_and_dependency_recursively(child_token, visited)
- tokens_to_return.extend(new_tokens)
- if new_target_dependency == 'oa':
- target_dependency = 'oa'
- else:
- tokens_to_return.append(token)
- else:
- # we have reached the target token
- tokens_to_return.append(token)
- return tokens_to_return, target_dependency
-
- # 'Der Mann hat xxx, es zu yyy' and similar structures
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label in ('oc', 'oa', 'mo', 're') and
- token.pos_ in ('VERB', 'AUX') and dependency.child_token(token.doc).pos_ in \
- ('VERB', 'AUX')):
- dependencies_to_add = []
- target_tokens, target_dependency = find_target_tokens_and_dependency_recursively(
- dependency.child_token(token.doc))
- # with um ... zu structures the antecedent subject is always the subject of the
- # dependent clause, unlike with 'zu' structures without the 'um'
- if len([other_dependency for other_dependency in target_tokens[0]._.holmes.children
- if other_dependency.child_token(token.doc)._.holmes.lemma == 'um' and
- other_dependency.child_token(token.doc).tag_ == 'KOUI']) == 0:
- # er hat ihm vorgeschlagen, etwas zu tun
- for other_dependency in (
- other_dependency for other_dependency
- in token._.holmes.children if other_dependency.label == 'da'):
- dependencies_to_add.append(other_dependency)
- if len(dependencies_to_add) == 0:
- # er hat ihn gezwungen, etwas zu tun
- # We have to distinguish this type of 'oa' relationship from dependent
- # clauses and reflexive pronouns ('er entschied sich, ...')
- for other_dependency in (
- other_dependency for other_dependency
- in token._.holmes.children if other_dependency.label == 'oa' and
- other_dependency.child_token(token.doc).pos_ not in ('VERB', 'AUX') and
- other_dependency.child_token(token.doc).tag_ != 'PRF'):
- dependencies_to_add.append(other_dependency)
- if len(dependencies_to_add) == 0:
- # We haven't found any object dependencies, so take the subject dependency
- for other_dependency in (
- other_dependency for other_dependency
- in token._.holmes.children if other_dependency.label == 'sb'):
- dependencies_to_add.append(other_dependency)
- for target_token in target_tokens:
- for other_dependency in (
- other_dependency for other_dependency in
- dependencies_to_add if target_token.i != other_dependency.child_index):
- # these dependencies are always uncertain
- target_token._.holmes.children.append(SemanticDependency(
- target_token.i, other_dependency.child_index, target_dependency, True))
-
- # 'Der Löwe bat den Hund, die Katze zu jagen' and similar structures
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label == 'oc' and token.pos_ == 'NOUN' and
- dependency.child_token(token.doc).pos_ in ('VERB', 'AUX')):
- target_tokens, target_dependency = find_target_tokens_and_dependency_recursively(
- dependency.child_token(token.doc))
- for target_token in target_tokens:
- target_token._.holmes.children.append(SemanticDependency(
- target_token.i, token.i, target_dependency, True))
-
- # 'er dachte darüber nach, es zu tun' and similar structures
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label == 'op' and dependency.child_token(token.doc).tag_ == 'PROAV'):
- child_token = dependency.child_token(token.doc)
- for child_dependency in (
- child_dependency for child_dependency in
- child_token._.holmes.children if child_dependency.label == 're' and
- child_dependency.child_token(token.doc).pos_ in ('VERB', 'AUX')):
- target_tokens, target_dependency = find_target_tokens_and_dependency_recursively(
- child_dependency.child_token(token.doc))
- for other_dependency in (
- other_dependency for other_dependency
- in token._.holmes.children if other_dependency.label == 'sb'):
- for target_token in target_tokens:
- target_token._.holmes.children.append(SemanticDependency(
- target_token.i, other_dependency.child_index, target_dependency, True))
-
- # 'er war froh, etwas zu tun'
- for dependency in (
- dependency for dependency in token._.holmes.children
- if dependency.label == 'nk' and token.pos_ in ('NOUN', 'PROPN')
- and token.dep_ == 'sb' and dependency.child_token(token.doc).pos_ == 'ADJ'):
- child_token = dependency.child_token(token.doc)
- for child_dependency in (
- child_dependency for child_dependency in
- child_token._.holmes.children if child_dependency.label in ('oc', 're') and
- child_dependency.child_token(token.doc).pos_ in ('VERB', 'AUX')):
- target_tokens, target_dependency = find_target_tokens_and_dependency_recursively(
- child_dependency.child_token(token.doc))
- for target_token in (
- target_token for target_token in target_tokens
- if target_token.i != dependency.parent_index):
- # these dependencies are always uncertain
- target_token._.holmes.children.append(SemanticDependency(
- target_token.i, dependency.parent_index, target_dependency, True))
-
- # sometimes two verb arguments are both interpreted as subjects or both as objects;
- # if this occurs, reinterpret them
-
- # find first 'sb' dependency for verb
- dependencies = [
- dependency for dependency in token._.holmes.children
- if token.pos_ == 'VERB' and dependency.label == 'sb' and not
- dependency.is_uncertain]
- if len(dependencies) > 0 and len([
- object_dependency for object_dependency
- in dependencies if object_dependency.label == 'oa' and not
- object_dependency.is_uncertain]) == 0:
- dependencies.sort(key=lambda dependency: dependency.child_index)
- first_real_subject = dependencies[0].child_token(token.doc)
- real_subject_indexes = \
- first_real_subject._.holmes.get_sibling_indexes(token.doc)
- # the first subject and its conjoined siblings remain subjects; any other
- # subject dependencies are reinterpreted as objects
- dependencies = [
- dependency for dependency in dependencies
- if dependency.child_index not in real_subject_indexes]
- for dependency in dependencies:
- dependency.label = 'oa'
-
- dependencies = [
- dependency for dependency in token._.holmes.children
- if token.pos_ == 'VERB' and dependency.label == 'oa' and not
- dependency.is_uncertain]
- if len(dependencies) > 0 and len([
- object_dependency for object_dependency
- in dependencies if object_dependency.label == 'sb' and not
- object_dependency.is_uncertain]) == 0:
- dependencies.sort(key=lambda dependency: dependency.child_index)
- first_real_subject = dependencies[0].child_token(token.doc)
- real_subject_indexes = first_real_subject._.holmes.get_sibling_indexes(token.doc)
- if len(dependencies) > len(real_subject_indexes):
- for dependency in (
- dependency for dependency in dependencies if
- dependency.child_index in real_subject_indexes):
- dependency.label = 'sb'
diff --git a/holmes_extractor/structural_matching.py b/holmes_extractor/structural_matching.py
deleted file mode 100644
index 083d9f3..0000000
--- a/holmes_extractor/structural_matching.py
+++ /dev/null
@@ -1,2118 +0,0 @@
-import copy
-from threading import Lock
-from functools import total_ordering
-from spacy.tokens.token import Token
-from .errors import *
-from .semantics import SemanticDependency, Subword
-
-ONTOLOGY_DEPTHS_TO_NAMES = {
- -4: 'an ancestor', -3: 'a great-grandparent', -2: 'a grandparent', -1: 'a parent',
- 0: 'a synonym', 1: 'a child', 2: 'a grandchild', 3: 'a great-grandchild', 4: 'a descendant'}
-
-class WordMatch:
- """A match between a searched phrase word and a document word.
-
- Properties:
-
- search_phrase_token -- the spaCy token from the search phrase.
- search_phrase_word -- the word that matched from the search phrase.
- document_token -- the spaCy token from the document.
- first_document_token -- the first token that matched from the document, which will equal
- *document_token* except with multiword matches.
- last_document_token -- the last token that matched from the document, which will equal
- *document_token* except with multiword matches.
- document_subword -- the subword from the token that matched, or *None* if the match was
- with the whole token.
- document_word -- the word or subword that matched structurally from the document.
- type -- *direct*, *entity*, *embedding*, *ontology* or *derivation*.
- similarity_measure -- for type *embedding*, the similarity between the two tokens,
- otherwise 1.0.
- is_negated -- *True* if this word match leads to a match of which it
- is a part being negated.
- is_uncertain -- *True* if this word match leads to a match of which it
- is a part being uncertain.
- structurally_matched_document_token -- the spaCy token from the document that matched
- the dependency structure, which may be different from *document_token* if coreference
- resolution is active.
- involves_coreference -- *True* if *document_token* and *structurally_matched_document_token*
- are different.
- extracted_word -- within the coreference chain, the most specific term that corresponded to
- document_word in the ontology.
- depth -- the number of hyponym relationships linking *search_phrase_word* and
- *extracted_word*, or *0* if ontology-based matching is not active.
- """
-
- def __init__(
- self, search_phrase_token, search_phrase_word, document_token,
- first_document_token, last_document_token, document_subword, document_word,
- type, similarity_measure, is_negated, is_uncertain,
- structurally_matched_document_token, extracted_word, depth):
-
- self.search_phrase_token = search_phrase_token
- self.search_phrase_word = search_phrase_word
- self.document_token = document_token
- self.first_document_token = first_document_token
- self.last_document_token = last_document_token
- self.document_subword = document_subword
- self.document_word = document_word
- self.type = type
- self.similarity_measure = similarity_measure
- self.is_negated = is_negated
- self.is_uncertain = is_uncertain
- self.structurally_matched_document_token = structurally_matched_document_token
- self.extracted_word = extracted_word
- self.depth = depth
-
- @property
- def involves_coreference(self):
- return self.document_token != self.structurally_matched_document_token
-
- def get_document_index(self):
- if self.document_subword is not None:
- subword_index = self.document_subword.index
- else:
- subword_index = None
- return Index(self.document_token.i, subword_index)
-
- def explain(self):
- """ Creates a human-readable explanation of the word match from the perspective of the
- document word (e.g. to be used as a tooltip over it)."""
- search_phrase_display_word = self.search_phrase_token._.holmes.lemma.upper()
- if self.type == 'direct':
- return ''.join(("Matches ", search_phrase_display_word, " directly."))
- elif self.type == 'derivation':
- return ''.join(("Has a common stem with ", search_phrase_display_word, "."))
- elif self.type == 'entity':
- return ''.join(("Matches the ", search_phrase_display_word, " placeholder."))
- elif self.type == 'embedding':
- printable_similarity = str(int(self.similarity_measure * 100))
- return ''.join((
- "Has a word embedding that is ", printable_similarity,
- "% similar to ", search_phrase_display_word, "."))
- elif self.type == 'ontology':
- working_depth = self.depth
- if working_depth > 4:
- working_depth = 4
- elif working_depth < -4:
- working_depth = -4
- return ''.join((
- "Is ", ONTOLOGY_DEPTHS_TO_NAMES[working_depth], " of ",
- search_phrase_display_word, " in the ontology."))
- else:
- raise RuntimeError(' '.join(('Unrecognized type', self.type)))
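- # Illustrative example (hypothetical input): for a search-phrase token whose Holmes lemma
- # is 'implement', matched in the document via derivation, explain() would return
- # "Has a common stem with IMPLEMENT.".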
-
-class Match:
- """A match between a search phrase and a document.
-
- Properties:
-
- word_matches -- a list of *WordMatch* objects.
- is_negated -- *True* if this match is negated.
- is_uncertain -- *True* if this match is uncertain.
- involves_coreference -- *True* if this match was found using coreference resolution.
- search_phrase_label -- the label of the search phrase that matched.
- document_label -- the label of the document that matched.
- from_single_word_phraselet -- *True* if this is a match against a single-word
- phraselet.
- from_topic_match_phraselet_created_without_matching_tags -- *True* or *False*
- from_reverse_only_topic_match_phraselet -- *True* or *False*
- overall_similarity_measure -- the overall similarity of the match, or *1.0* if the embedding
- strategy was not involved in the match.
- index_within_document -- the index of the document token that matched the search phrase
- root token.
- """
-
- def __init__(
- self, search_phrase_label, document_label, from_single_word_phraselet,
- from_topic_match_phraselet_created_without_matching_tags,
- from_reverse_only_topic_match_phraselet):
- self.word_matches = []
- self.is_negated = False
- self.is_uncertain = False
- self.search_phrase_label = search_phrase_label
- self.document_label = document_label
- self.from_single_word_phraselet = from_single_word_phraselet
- self.from_topic_match_phraselet_created_without_matching_tags = \
- from_topic_match_phraselet_created_without_matching_tags
- self.from_reverse_only_topic_match_phraselet = from_reverse_only_topic_match_phraselet
- self.index_within_document = None
- self.overall_similarity_measure = '1.0'
-
- @property
- def involves_coreference(self):
- for word_match in self.word_matches:
- if word_match.involves_coreference:
- return True
- return False
-
- def __copy__(self):
- match_to_return = Match(
- self.search_phrase_label, self.document_label, self.from_single_word_phraselet,
- self.from_topic_match_phraselet_created_without_matching_tags,
- self.from_reverse_only_topic_match_phraselet)
- match_to_return.word_matches = self.word_matches.copy()
- match_to_return.is_negated = self.is_negated
- match_to_return.is_uncertain = self.is_uncertain
- match_to_return.index_within_document = self.index_within_document
- return match_to_return
-
- def get_subword_index(self):
- if self.word_matches[0].document_subword is None:
- return None
- return self.word_matches[0].document_subword.index
-
- def get_subword_index_for_sorting(self):
- # returns *-1* rather than *None* in the absence of a subword
- if self.word_matches[0].document_subword is None:
- return -1
- return self.word_matches[0].document_subword.index
-
-@total_ordering
-class Index:
- """ The position of a word or subword within a document. """
-
- def __init__(self, token_index, subword_index):
- self.token_index = token_index
- self.subword_index = subword_index
-
- def is_subword(self):
- return self.subword_index is not None
-
- def __eq__(self, other):
- return isinstance(other, Index) and \
- self.token_index == other.token_index and self.subword_index == other.subword_index
-
- def __lt__(self, other):
- if not isinstance(other, Index):
- raise RuntimeError('Comparison between Index and another type.')
- if self.token_index < other.token_index:
- return True
- if self.token_index > other.token_index:
- return False
- # the token indexes are equal: a whole-token index sorts before a subword index
- if not self.is_subword() and other.is_subword():
- return True
- if self.is_subword() and other.is_subword() and self.subword_index < other.subword_index:
- return True
- return False
-
- def __hash__(self):
- return hash((self.token_index, self.subword_index))
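- # Illustrative ordering: Index(5, None) < Index(5, 0) < Index(5, 1) < Index(6, None);
- # a whole-token index sorts before the subword indexes of the same token.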
-
-class PhraseletInfo:
- """Information describing a topic matching phraselet.
-
- Parameters:
-
- label -- the phraselet label.
- template_label -- the value of 'PhraseletTemplate.label'.
- parent_lemma -- the parent lemma, or the lemma for single-word phraselets.
- parent_derived_lemma -- the parent derived lemma, or the derived lemma for single-word
- phraselets.
- parent_pos -- the part of speech tag of the token that supplied the parent word.
- child_lemma -- the child lemma, or 'None' for single-word phraselets.
- child_derived_lemma -- the child derived lemma, or 'None' for single-word phraselets.
- child_pos -- the part of speech tag of the token that supplied the child word, or 'None'
- for single-word phraselets.
- created_without_matching_tags -- 'True' if created without matching tags.
- reverse_only_parent_lemma -- 'True' if the parent lemma is in the reverse matching list.
- """
-
- def __init__(
- self, label, template_label, parent_lemma, parent_derived_lemma, parent_pos,
- child_lemma, child_derived_lemma, child_pos, created_without_matching_tags,
- reverse_only_parent_lemma):
- self.label = label
- self.template_label = template_label
-
- self.parent_lemma = parent_lemma
- self.parent_derived_lemma = parent_derived_lemma
- self.parent_pos = parent_pos
- self.child_lemma = child_lemma
- self.child_derived_lemma = child_derived_lemma
- self.child_pos = child_pos
- self.created_without_matching_tags = created_without_matching_tags
- self.reverse_only_parent_lemma = reverse_only_parent_lemma
-
- def __eq__(self, other):
- return isinstance(other, PhraseletInfo) and \
- self.label == other.label and \
- self.template_label == other.template_label and \
- self.parent_lemma == other.parent_lemma and \
- self.parent_derived_lemma == other.parent_derived_lemma and \
- self.parent_pos == other.parent_pos and \
- self.child_lemma == other.child_lemma and \
- self.child_derived_lemma == other.child_derived_lemma and \
- self.child_pos == other.child_pos and \
- self.created_without_matching_tags == other.created_without_matching_tags and \
- self.reverse_only_parent_lemma == other.reverse_only_parent_lemma
-
- def __hash__(self):
- return hash((
- self.label, self.template_label, self.parent_lemma, self.parent_derived_lemma,
- self.parent_pos, self.child_lemma, self.child_derived_lemma,
- self.child_pos, self.created_without_matching_tags, self.reverse_only_parent_lemma))
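- # Illustrative example (assuming the English 'verb-nom' phraselet template): a relation
- # phraselet linking the verb 'chase' to its subject 'dog' could be represented as
- # PhraseletInfo('verb-nom: chase-dog', 'verb-nom', 'chase', 'chase', 'VERB',
- # 'dog', 'dog', 'NOUN', False, False).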
-
-class ThreadsafeContainer:
- """Container for search phrases and documents that are registered and maintained on the
- manager object as opposed to being supplied with an individual query.
- """
-
- def __init__(self):
- self._search_phrases = []
- # Dict from document labels to IndexedDocument objects
- self._indexed_documents = {}
- self._lock = Lock()
-
- def remove_all_search_phrases(self):
- with self._lock:
- self._search_phrases = []
-
- def remove_all_search_phrases_with_label(self, label):
- with self._lock:
- self._search_phrases = [
- search_phrase for search_phrase in self._search_phrases
- if search_phrase.label != label]
-
- def register_search_phrase(self, search_phrase):
- with self._lock:
- self._search_phrases.append(search_phrase)
-
- def list_search_phrase_labels(self):
- with self._lock:
- search_phrase_labels = sorted({
- search_phrase.label for search_phrase in self._search_phrases})
- return search_phrase_labels
-
- def register_document(self, indexed_document, label):
- with self._lock:
- if label in self._indexed_documents.keys():
- raise DuplicateDocumentError(label)
- self._indexed_documents[label] = indexed_document
-
- def remove_document(self, label):
- with self._lock:
- self._indexed_documents.pop(label)
-
- def remove_all_documents(self):
- with self._lock:
- self._indexed_documents = {}
-
- def document_labels(self):
- """Returns a list of the labels of the currently registered documents."""
-
- with self._lock:
- document_labels = self._indexed_documents.keys()
- return document_labels
-
- def get_document(self, label):
- with self._lock:
- if label in self._indexed_documents.keys():
- document = self._indexed_documents[label].doc
- else:
- document = None
- return document
-
- def get_indexed_documents(self):
- with self._lock:
- return self._indexed_documents.copy()
-
- def get_search_phrases(self):
- with self._lock:
- return self._search_phrases.copy()
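- # Minimal usage sketch (illustrative; 'indexed_doc' stands for an _IndexedDocument
- # produced by StructuralMatcher.index_document()):
- # container = ThreadsafeContainer()
- # container.register_document(indexed_doc, 'doc1')
- # container.document_labels() # -> the registered labels, here 'doc1'
- # container.register_document(indexed_doc, 'doc1') # raises DuplicateDocumentError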
-
-class StructuralMatcher:
- """The class responsible for matching search phrases with documents."""
-
- def __init__(
- self, semantic_analyzer, ontology, overall_similarity_threshold,
- embedding_based_matching_on_root_words, analyze_derivational_morphology,
- perform_coreference_resolution):
- """Args:
-
- semantic_analyzer -- the *SemanticAnalyzer* object to use
- ontology -- optionally, an *Ontology* object to use in matching
- overall_similarity_threshold -- if embedding-based matching is to be activated, a float
- value between 0 and 1. A match between a search phrase and a document is then valid
- if the geometric mean of all the similarities between search phrase tokens and
- document tokens is this value or greater. If this value is set to 1.0,
- embedding-based matching is deactivated.
- embedding_based_matching_on_root_words -- determines whether or not embedding-based
- matching should be attempted on search-phrase root tokens, which has a considerable
- performance hit. Defaults to *False*.
- analyze_derivational_morphology -- *True* if matching should be attempted between different
- words from the same word family. Defaults to *True*.
- perform_coreference_resolution -- *True* if coreference resolution should be performed.
- """
- self.semantic_analyzer = semantic_analyzer
- self.ontology = ontology
- self.overall_similarity_threshold = overall_similarity_threshold
- self.embedding_based_matching_on_root_words = embedding_based_matching_on_root_words
- self.analyze_derivational_morphology = analyze_derivational_morphology
- self.perform_coreference_resolution = perform_coreference_resolution
- self.populate_ontology_reverse_derivational_dict()
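- # Worked example (illustrative): with overall_similarity_threshold = 0.85 and a search
- # phrase containing two matchable non-entity tokens, the geometric mean of the two token
- # similarities must reach 0.85, so an individual token similarity can drop no lower than
- # 0.85 ** 2 = 0.7225 (the single_token_similarity_threshold computed in
- # create_search_phrase below).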
-
- def populate_ontology_reverse_derivational_dict(self):
- """During structural matching, a lemma or derived lemma matches any words in the ontology
- that yield the same word as their derived lemmas. This method generates a dictionary
- from derived lemmas to ontology words that yield them to facilitate such matching.
- """
- if self.analyze_derivational_morphology and self.ontology is not None:
- ontology_reverse_derivational_dict = {}
- for ontology_word in self.ontology.words:
- derived_lemmas = []
- normalized_ontology_word = \
- self.semantic_analyzer.normalize_hyphens(ontology_word)
- for textual_word in normalized_ontology_word.split():
- derived_lemma = self.semantic_analyzer.derived_holmes_lemma(
- None, textual_word.lower())
- if derived_lemma is None:
- derived_lemma = textual_word
- derived_lemmas.append(derived_lemma)
- derived_ontology_word = ' '.join(derived_lemmas)
- if derived_ontology_word != ontology_word:
- if derived_ontology_word in ontology_reverse_derivational_dict:
- ontology_reverse_derivational_dict[derived_ontology_word].append(
- ontology_word)
- else:
- ontology_reverse_derivational_dict[derived_ontology_word] = [ontology_word]
- # sort entry lists to ensure deterministic behaviour
- for derived_ontology_word in ontology_reverse_derivational_dict:
- ontology_reverse_derivational_dict[derived_ontology_word] = \
- sorted(ontology_reverse_derivational_dict[derived_ontology_word])
- self.ontology_reverse_derivational_dict = ontology_reverse_derivational_dict
- else:
- self.ontology_reverse_derivational_dict = None
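- # Illustrative example (hypothetical ontology and derivation data): if the ontology
- # contained the entry 'settlement' and its derived lemma were 'settle', the dictionary
- # would map 'settle' to ['settlement'], so that a document word sharing the derived
- # lemma 'settle' could be matched against the ontology entry.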
-
- def reverse_derived_lemmas_in_ontology(self, obj):
- """ Returns all ontology entries that point to the derived lemma of a token.
- """
- if isinstance(obj, Token):
- derived_lemma = obj._.holmes.lemma_or_derived_lemma()
- elif isinstance(obj, Subword):
- derived_lemma = obj.lemma_or_derived_lemma()
- elif isinstance(obj, self._MultiwordSpan):
- derived_lemma = obj.derived_lemma
- else:
- raise RuntimeError(': '.join(('Unsupported type', str(type(obj)))))
- derived_lemma = self.semantic_analyzer.normalize_hyphens(derived_lemma)
- if derived_lemma in self.ontology_reverse_derivational_dict:
- return self.ontology_reverse_derivational_dict[derived_lemma]
- else:
- return []
-
- class _SearchPhrase:
-
- def __init__(
- self, doc, matchable_tokens, root_token,
- matchable_non_entity_tokens_to_lexemes, single_token_similarity_threshold, label,
- ontology, topic_match_phraselet,
- topic_match_phraselet_created_without_matching_tags, reverse_only,
- structural_matcher):
- """Args:
-
- doc -- the Holmes document created for the search phrase
- matchable_tokens -- a list of tokens all of which must have counterparts in the
- document to produce a match
- root_token -- the token at which recursive matching starts
- matchable_non_entity_tokens_to_lexemes -- dictionary from token indexes to *Lexeme*
- objects. Only used when embedding matching is active.
- single_token_similarity_threshold -- the lowest similarity value that a single token
- within this search phrase could have with a matching document token to achieve
- the overall matching threshold for a match.
- label -- a label for the search phrase.
- ontology -- a reference to the ontology held by the outer *StructuralMatcher* object.
- topic_match_phraselet -- 'True' if a topic match phraselet, otherwise 'False'.
- topic_match_phraselet_created_without_matching_tags -- 'True' if a topic match
- phraselet created without matching tags (match_all_words), otherwise 'False'.
- reverse_only -- 'True' if a phraselet that should only be reverse-matched.
- structural_matcher -- the enclosing instance.
- """
- self.doc = doc
- self._matchable_token_indexes = [token.i for token in matchable_tokens]
- self._root_token_index = root_token.i
- self.matchable_non_entity_tokens_to_lexemes = matchable_non_entity_tokens_to_lexemes
- self.single_token_similarity_threshold = single_token_similarity_threshold
- self.label = label
- self.ontology = ontology
- self.topic_match_phraselet = topic_match_phraselet
- self.topic_match_phraselet_created_without_matching_tags = \
- topic_match_phraselet_created_without_matching_tags
- self.reverse_only = reverse_only
- self.treat_as_reverse_only_during_initial_relation_matching = False # this flag is
- # set on phraselets during topic matching to prevent them from being taken into
- # account during initial relation matching because the parent relation occurs too
- # frequently in the corpus. 'reverse_only' cannot be used instead because it
- # has an effect on scoring.
- self.words_matching_root_token, self.root_word_to_match_info_dict = \
- self.get_words_matching_root_token_and_match_type_dict(structural_matcher)
- self.has_single_matchable_word = len(matchable_tokens) == 1
-
- @property
- def matchable_tokens(self):
- return [self.doc[index] for index in self._matchable_token_indexes]
-
- @property
- def root_token(self):
- return self.doc[self._root_token_index]
-
- def get_words_matching_root_token_and_match_type_dict(self, structural_matcher):
- """ Create list of all words that match the root token of the search phrase,
- taking any ontology into account; create a dictionary from these words
- to match types and depths.
- """
-
- def add_word_information(word, match_type, depth):
- if word not in list_to_return:
- list_to_return.append(word)
- if not word in root_word_to_match_info_dict:
- root_word_to_match_info_dict[word] = (match_type, depth)
-
- def add_word_information_from_ontology(word):
- for entry_word, entry_depth in \
- structural_matcher.ontology.get_words_matching_and_depths(word):
- add_word_information(entry_word, 'ontology', entry_depth)
- if structural_matcher.analyze_derivational_morphology:
- working_derived_lemma = \
- structural_matcher.semantic_analyzer.derived_holmes_lemma(
- None, entry_word.lower())
- if working_derived_lemma is not None:
- add_word_information(working_derived_lemma, 'ontology', entry_depth)
-
- list_to_return = []
- root_word_to_match_info_dict = {}
-
- add_word_information(self.root_token._.holmes.lemma, 'direct', 0)
- if not self.topic_match_phraselet:
- add_word_information(self.root_token.text.lower(), 'direct', 0)
- hyphen_normalized_text = \
- structural_matcher.semantic_analyzer.normalize_hyphens(self.root_token.text)
- if self.root_token.text != hyphen_normalized_text:
- add_word_information(hyphen_normalized_text.lower(), 'direct', 0)
- if structural_matcher.analyze_derivational_morphology and \
- self.root_token._.holmes.derived_lemma is not None:
- add_word_information(self.root_token._.holmes.derived_lemma, 'derivation', 0)
- if structural_matcher.ontology is not None and not \
- structural_matcher._is_entity_search_phrase_token(
- self.root_token, self.topic_match_phraselet):
- add_word_information_from_ontology(self.root_token._.holmes.lemma)
- if structural_matcher.analyze_derivational_morphology and \
- self.root_token._.holmes.derived_lemma is not None:
- add_word_information_from_ontology(self.root_token._.holmes.derived_lemma)
- if not self.topic_match_phraselet:
- add_word_information_from_ontology(self.root_token.text.lower())
- if self.root_token.text != hyphen_normalized_text:
- add_word_information_from_ontology(hyphen_normalized_text.lower())
- if structural_matcher.analyze_derivational_morphology:
- for reverse_derived_lemma in \
- structural_matcher.reverse_derived_lemmas_in_ontology(self.root_token):
- add_word_information_from_ontology(reverse_derived_lemma)
- return list_to_return, root_word_to_match_info_dict
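- # Illustrative example (hypothetical ontology): for a root token with lemma 'buy' and an
- # ontology in which 'purchase' is a synonym of 'buy', the returned list would contain
- # 'buy' and 'purchase', and the dictionary would map 'purchase' to ('ontology', 0),
- # depth 0 corresponding to 'a synonym' in ONTOLOGY_DEPTHS_TO_NAMES.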
-
- class _IndexedDocument:
- """Args:
-
- doc -- the Holmes document
- words_to_token_info_dict -- a dictionary from words to lists of tuples, each containing:
- - the *Index* (token and optional subword) at which the word occurs in the document
- - the word representation
- - a boolean value specifying whether the match is based on derivation
- """
-
- def __init__(self, doc, words_to_token_info_dict):
- self.doc = doc
- self.words_to_token_info_dict = words_to_token_info_dict
-
- class _MultiwordSpan:
-
- def __init__(self, text, lemma, derived_lemma, tokens):
- """Args:
-
- text -- the raw text representation of the multiword span
- lemma - the lemma representation of the multiword span
- derived_lemma - the lemma representation with individual words that have derived
- lemmas replaced by those derived lemmas
- tokens -- a list of tokens that make up the multiword span
- """
- self.text = text
- self.lemma = lemma
- self.derived_lemma = derived_lemma
- self.tokens = tokens
-
- def _multiword_spans_with_head_token(self, token):
- """Generator over *_MultiwordSpan* objects with *token* at their head. Dependent phrases
- are only returned for nouns because e.g. for verbs the whole sentence would be returned.
- """
-
- if not token.pos_ in self.semantic_analyzer.noun_pos:
- return
- pointer = token.left_edge.i
- while pointer <= token.right_edge.i:
- if token.doc[pointer].pos_ in self.semantic_analyzer.noun_pos \
- and token.doc[pointer].dep_ in self.semantic_analyzer.noun_kernel_dep:
- working_text = ''
- working_lemma = ''
- working_derived_lemma = ''
- working_tokens = []
- inner_pointer = pointer
- while inner_pointer <= token.right_edge.i and \
- token.doc[inner_pointer].pos_ in self.semantic_analyzer.noun_pos:
- working_text = ' '.join((working_text, token.doc[inner_pointer].text))
- working_lemma = ' '.join((
- working_lemma, token.doc[inner_pointer]._.holmes.lemma))
- if self.analyze_derivational_morphology and \
- token.doc[inner_pointer]._.holmes.derived_lemma is not None:
- this_token_derived_lemma = token.doc[inner_pointer]._.holmes.derived_lemma
- else:
- # if derivational morphology analysis is switched off, the derived lemma
- # will be identical to the lemma and will not be yielded by
- # _loop_textual_representations().
- this_token_derived_lemma = token.doc[inner_pointer]._.holmes.lemma
- working_derived_lemma = ' '.join((
- working_derived_lemma, this_token_derived_lemma))
- working_tokens.append(token.doc[inner_pointer])
- inner_pointer += 1
- if pointer + 1 < inner_pointer and token in working_tokens:
- yield self._MultiwordSpan(
- working_text.strip(), working_lemma.strip(), working_derived_lemma.strip(),
- working_tokens)
- pointer += 1
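- # Illustrative example (assuming the three tokens are parsed as a noun compound): for
- # 'information extraction system' with head token 'system', this generator yields spans
- # for 'information extraction system' and 'extraction system', but not for the single
- # word 'system', since a span must contain more than one token and include the head.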
-
- def add_phraselets_to_dict(
- self, doc, *, phraselet_labels_to_phraselet_infos,
- replace_with_hypernym_ancestors, match_all_words,
- ignore_relation_phraselets, include_reverse_only, stop_lemmas,
- reverse_only_parent_lemmas):
- """ Creates topic matching phraselets extracted from a matching text.
-
- Properties:
-
- doc -- the Holmes-parsed document
- phraselet_labels_to_phraselet_infos -- a dictionary from labels to phraselet info objects
- that are used to generate phraselet search phrases.
- replace_with_hypernym_ancestors -- if 'True', all words present in the ontology
- are replaced with their most general (highest) ancestors.
- match_all_words -- if 'True', word phraselets are generated for all matchable words
- rather than just for words whose tags match the phraselet template; multiwords
- are not taken into account when processing single-word phraselets; and single-word
- phraselets are generated for subwords.
- ignore_relation_phraselets -- if 'True', only single-word phraselets are processed.
- include_reverse_only -- whether to generate phraselets that are only reverse-matched.
- Reverse matching is used in topic matching but not in supervised document
- classification.
- stop_lemmas -- lemmas that should prevent all types of phraselet production.
- reverse_only_parent_lemmas -- lemma / part-of-speech combinations that, when present at
- the parent pole of a relation phraselet, should cause that phraselet to be
- reverse-matched.
- """
-
- index_to_lemmas_cache = {}
- def get_lemmas_from_index(index):
- """ Returns the lemma and the derived lemma. Phraselets form a special case where
- the derived lemma is set even if it is identical to the lemma. This is necessary
- because the lemma may be set to a different value during the lifecycle of the
- object. The property getter in the SemanticDictionary class ensures that *None* is
- always returned for derived_lemma where the two strings are identical.
- """
- if index in index_to_lemmas_cache:
- return index_to_lemmas_cache[index]
- token = doc[index.token_index]
- if self._is_entity_search_phrase_token(token, False):
- # False in order to get text rather than lemma
- index_to_lemmas_cache[index] = token.text, token.text
- return token.text, token.text
- # keep the text, because the lemma will be lowercase
- if index.is_subword():
- lemma = token._.holmes.subwords[index.subword_index].lemma
- if self.analyze_derivational_morphology:
- derived_lemma = token._.holmes.subwords[index.subword_index].\
- lemma_or_derived_lemma()
- else:
- derived_lemma = lemma
- if self.ontology is not None and self.analyze_derivational_morphology:
- for reverse_derived_word in self.reverse_derived_lemmas_in_ontology(
- token._.holmes.subwords[index.subword_index]):
- derived_lemma = reverse_derived_word.lower()
- break
- else:
- lemma = token._.holmes.lemma
- if self.analyze_derivational_morphology:
- derived_lemma = token._.holmes.lemma_or_derived_lemma()
- else:
- derived_lemma = lemma
- if self.ontology is not None and not self.ontology.contains(lemma):
- if self.ontology.contains(token.text.lower()):
- lemma = derived_lemma = token.text.lower()
- # ontology contains text but not lemma, so return text
- if self.ontology is not None and self.analyze_derivational_morphology:
- for reverse_derived_word in self.reverse_derived_lemmas_in_ontology(token):
- derived_lemma = reverse_derived_word.lower()
- break
- # ontology contains a word pointing to the same derived lemma,
- # so return that. Note that if there are several such words the same
- # one will always be returned.
- index_to_lemmas_cache[index] = lemma, derived_lemma
- return lemma, derived_lemma
-
- def replace_lemmas_with_most_general_ancestor(lemma, derived_lemma):
- new_derived_lemma = self.ontology.get_most_general_hypernym_ancestor(
- derived_lemma).lower()
- if derived_lemma != new_derived_lemma:
- lemma = derived_lemma = new_derived_lemma
- return lemma, derived_lemma
-
- def lemma_replacement_indicated(existing_lemma, existing_pos, new_lemma, new_pos):
- if existing_lemma is None:
- return False
- if not existing_pos in self.semantic_analyzer.preferred_phraselet_pos and \
- new_pos in self.semantic_analyzer.preferred_phraselet_pos:
- return True
- if existing_pos in self.semantic_analyzer.preferred_phraselet_pos and \
- new_pos not in self.semantic_analyzer.preferred_phraselet_pos:
- return False
- return len(new_lemma) < len(existing_lemma)
-
- def add_new_phraselet_info(
- phraselet_label, phraselet_template, created_without_matching_tags,
- is_reverse_only_parent_lemma, parent_lemma, parent_derived_lemma, parent_pos,
- child_lemma, child_derived_lemma, child_pos):
- if phraselet_label not in phraselet_labels_to_phraselet_infos:
- phraselet_labels_to_phraselet_infos[phraselet_label] = PhraseletInfo(
- phraselet_label, phraselet_template.label, parent_lemma,
- parent_derived_lemma, parent_pos, child_lemma, child_derived_lemma,
- child_pos, created_without_matching_tags,
- is_reverse_only_parent_lemma)
- else:
- existing_phraselet = phraselet_labels_to_phraselet_infos[phraselet_label]
- if lemma_replacement_indicated(
- existing_phraselet.parent_lemma, existing_phraselet.parent_pos,
- parent_lemma, parent_pos):
- existing_phraselet.parent_lemma = parent_lemma
- existing_phraselet.parent_pos = parent_pos
- if lemma_replacement_indicated(
- existing_phraselet.child_lemma, existing_phraselet.child_pos, child_lemma,
- child_pos):
- existing_phraselet.child_lemma = child_lemma
- existing_phraselet.child_pos = child_pos
-
- def process_single_word_phraselet_templates(
- token, subword_index, checking_tags, token_indexes_to_multiword_lemmas):
- for phraselet_template in (
- phraselet_template for phraselet_template in
- self.semantic_analyzer.phraselet_templates if
- phraselet_template.single_word() and (
- token._.holmes.is_matchable or subword_index is not None)):
- # see note below for explanation
- if not checking_tags or token.tag_ in phraselet_template.parent_tags:
- phraselet_doc = self.semantic_analyzer.parse(
- phraselet_template.template_sentence)
- if token.i in token_indexes_to_multiword_lemmas and not match_all_words:
- lemma = derived_lemma = token_indexes_to_multiword_lemmas[token.i]
- else:
- lemma, derived_lemma = get_lemmas_from_index(Index(token.i, subword_index))
- if self.ontology is not None and replace_with_hypernym_ancestors:
- lemma, derived_lemma = replace_lemmas_with_most_general_ancestor(
- lemma, derived_lemma)
- phraselet_doc[phraselet_template.parent_index]._.holmes.lemma = lemma
- phraselet_doc[phraselet_template.parent_index]._.holmes.derived_lemma = \
- derived_lemma
- phraselet_label = ''.join((phraselet_template.label, ': ', derived_lemma))
- if derived_lemma not in stop_lemmas and derived_lemma != 'ENTITYNOUN':
- # ENTITYNOUN has to be excluded as a single word although it is still
- # permitted as the child of a relation phraselet template
- add_new_phraselet_info(
- phraselet_label, phraselet_template, not checking_tags,
- None, lemma, derived_lemma, token.pos_, None, None, None)
-
- def add_head_subwords_to_token_list_and_remove_words_with_subword_conjunction(index_list):
- # for each token in the list, find out whether it has subwords and if so add the
- # head subword to the list
- for index in index_list.copy():
- token = doc[index.token_index]
- for subword in (
- subword for subword in token._.holmes.subwords if
- subword.is_head and subword.containing_token_index == token.i):
- index_list.append(Index(token.i, subword.index))
- # if one or more subwords do not belong to this token, it is a hyphenated word
- # within a conjunction and the whole word should not be used to build relation phraselets.
- if len([
- subword for subword in token._.holmes.subwords if
- subword.containing_token_index != token.i]) > 0:
- index_list.remove(index)
-
- self._redefine_multiwords_on_head_tokens(doc)
- token_indexes_to_multiword_lemmas = {}
- token_indexes_within_multiwords_to_ignore = []
- for token in (token for token in doc if len(token._.holmes.lemma.split()) == 1):
- entity_defined_multiword, indexes = \
- self.semantic_analyzer.get_entity_defined_multiword(token)
- if entity_defined_multiword is not None:
- for index in indexes:
- if index == token.i:
- token_indexes_to_multiword_lemmas[token.i] = entity_defined_multiword
- else:
- token_indexes_within_multiwords_to_ignore.append(index)
- for token in doc:
- if token.i in token_indexes_within_multiwords_to_ignore:
- if match_all_words:
- process_single_word_phraselet_templates(
- token, None, False, token_indexes_to_multiword_lemmas)
- continue
- if len([
- subword for subword in token._.holmes.subwords if
- subword.containing_token_index != token.i]) == 0:
- # tokens involved in subword conjunction are not processed as whole single words
- # because they are partial words containing hyphens.
- process_single_word_phraselet_templates(
- token, None, not match_all_words, token_indexes_to_multiword_lemmas)
- if match_all_words:
- for subword in (
- subword for subword in token._.holmes.subwords if
- token.i == subword.containing_token_index):
- process_single_word_phraselet_templates(
- token, subword.index, False, token_indexes_to_multiword_lemmas)
- if ignore_relation_phraselets:
- continue
- if self.perform_coreference_resolution:
- parents = [
- Index(token_index, None) for token_index in
- token._.holmes.token_and_coreference_chain_indexes]
- else:
- parents = [Index(token.i, None)]
- add_head_subwords_to_token_list_and_remove_words_with_subword_conjunction(parents)
- for parent in parents:
- for dependency in (
- dependency for dependency in doc[parent.token_index]._.holmes.children
- if dependency.child_index not in token_indexes_within_multiwords_to_ignore):
- if self.perform_coreference_resolution:
- children = [
- Index(token_index, None) for token_index in
- dependency.child_token(doc)._.holmes.
- token_and_coreference_chain_indexes]
- else:
- children = [Index(dependency.child_token(doc).i, None)]
- add_head_subwords_to_token_list_and_remove_words_with_subword_conjunction(
- children)
- for child in children:
- for phraselet_template in (
- phraselet_template for phraselet_template in
- self.semantic_analyzer.phraselet_templates if not
- phraselet_template.single_word() and (
- not phraselet_template.reverse_only or include_reverse_only)):
- if dependency.label in \
- phraselet_template.dependency_labels and \
- doc[parent.token_index].tag_ in phraselet_template.parent_tags\
- and doc[child.token_index].tag_ in \
- phraselet_template.child_tags and \
- doc[parent.token_index]._.holmes.is_matchable and \
- doc[child.token_index]._.holmes.is_matchable:
- phraselet_doc = self.semantic_analyzer.parse(
- phraselet_template.template_sentence)
- if parent.token_index in token_indexes_to_multiword_lemmas:
- parent_lemma = parent_derived_lemma = \
- token_indexes_to_multiword_lemmas[parent.token_index]
- else:
- parent_lemma, parent_derived_lemma = \
- get_lemmas_from_index(parent)
- if self.ontology is not None and replace_with_hypernym_ancestors:
- parent_lemma, parent_derived_lemma = \
- replace_lemmas_with_most_general_ancestor(
- parent_lemma, parent_derived_lemma)
- if child.token_index in token_indexes_to_multiword_lemmas:
- child_lemma = child_derived_lemma = \
- token_indexes_to_multiword_lemmas[child.token_index]
- else:
- child_lemma, child_derived_lemma = get_lemmas_from_index(child)
- if self.ontology is not None and replace_with_hypernym_ancestors:
- child_lemma, child_derived_lemma = \
- replace_lemmas_with_most_general_ancestor(
- child_lemma, child_derived_lemma)
- phraselet_doc[phraselet_template.parent_index]._.holmes.lemma = \
- parent_lemma
- phraselet_doc[phraselet_template.parent_index]._.holmes.\
- derived_lemma = parent_derived_lemma
- phraselet_doc[phraselet_template.child_index]._.holmes.lemma = \
- child_lemma
- phraselet_doc[phraselet_template.child_index]._.holmes.\
- derived_lemma = child_derived_lemma
- phraselet_label = ''.join((
- phraselet_template.label, ': ', parent_derived_lemma,
- '-', child_derived_lemma))
- is_reverse_only_parent_lemma = False
- if reverse_only_parent_lemmas is not None:
- for entry in reverse_only_parent_lemmas:
- if entry[0] == doc[parent.token_index]._.holmes.lemma \
- and entry[1] == doc[parent.token_index].pos_:
- is_reverse_only_parent_lemma = True
- if parent_lemma not in stop_lemmas and child_lemma not in \
- stop_lemmas and not (
- is_reverse_only_parent_lemma
- and not include_reverse_only):
- add_new_phraselet_info(
- phraselet_label, phraselet_template, match_all_words,
- is_reverse_only_parent_lemma,
- parent_lemma, parent_derived_lemma,
- doc[parent.token_index].pos_,
- child_lemma, child_derived_lemma,
- doc[child.token_index].pos_)
-
- # We do not check for matchability in order to catch pos_='X', tag_='TRUNC'. This
- # is not a problem as only a limited range of parts of speech receive subwords in
- # the first place.
- for subword in (
- subword for subword in token._.holmes.subwords if
- subword.dependent_index is not None):
- parent_subword_index = subword.index
- child_subword_index = subword.dependent_index
- if token._.holmes.subwords[parent_subword_index].containing_token_index != \
- token.i and \
- token._.holmes.subwords[child_subword_index].containing_token_index != \
- token.i:
- continue
- for phraselet_template in (
- phraselet_template for phraselet_template in
- self.semantic_analyzer.phraselet_templates if not
- phraselet_template.single_word() and (
- not phraselet_template.reverse_only or include_reverse_only)
- and subword.dependency_label in phraselet_template.dependency_labels and
- token.tag_ in phraselet_template.parent_tags):
- phraselet_doc = self.semantic_analyzer.parse(
- phraselet_template.template_sentence)
- parent_lemma, parent_derived_lemma = get_lemmas_from_index(Index(
- token.i, parent_subword_index))
- if self.ontology is not None and replace_with_hypernym_ancestors:
- parent_lemma, parent_derived_lemma = \
- replace_lemmas_with_most_general_ancestor(
- parent_lemma, parent_derived_lemma)
- child_lemma, child_derived_lemma = get_lemmas_from_index(Index(
- token.i, child_subword_index))
- if self.ontology is not None and replace_with_hypernym_ancestors:
- child_lemma, child_derived_lemma = \
- replace_lemmas_with_most_general_ancestor(
- child_lemma, child_derived_lemma)
- phraselet_doc[phraselet_template.parent_index]._.holmes.lemma = \
- parent_lemma
- phraselet_doc[phraselet_template.parent_index]._.holmes.derived_lemma = \
- parent_derived_lemma
- phraselet_doc[phraselet_template.child_index]._.holmes.lemma = \
- child_lemma
- phraselet_doc[phraselet_template.child_index]._.holmes.derived_lemma = \
- child_derived_lemma
- phraselet_label = ''.join((
- phraselet_template.label, ': ', parent_derived_lemma, '-',
- child_derived_lemma))
- add_new_phraselet_info(
- phraselet_label, phraselet_template, match_all_words,
- False, parent_lemma, parent_derived_lemma, token.pos_, child_lemma,
- child_derived_lemma, token.pos_)
- if len(phraselet_labels_to_phraselet_infos) == 0 and not match_all_words:
- for token in doc:
- process_single_word_phraselet_templates(
- token, None, False, token_indexes_to_multiword_lemmas)
-
- def create_search_phrases_from_phraselet_infos(self, phraselet_infos):
- """ Creates search phrases from phraselet info objects, returning a dictionary from
- phraselet labels to the created search phrases.
- """
-
- def create_phraselet_label(phraselet_info):
- if phraselet_info.child_lemma is not None:
- return ''.join((
- phraselet_info.template_label, ': ', phraselet_info.parent_derived_lemma, '-',
- phraselet_info.child_derived_lemma))
- else:
- return ''.join((
- phraselet_info.template_label, ': ', phraselet_info.parent_derived_lemma))
-
- def create_search_phrase_from_phraselet(phraselet_info):
- for phraselet_template in self.semantic_analyzer.phraselet_templates:
- if phraselet_info.template_label == phraselet_template.label:
- phraselet_doc = self.semantic_analyzer.parse(
- phraselet_template.template_sentence)
- phraselet_doc[phraselet_template.parent_index]._.holmes.lemma = \
- phraselet_info.parent_lemma
- phraselet_doc[phraselet_template.parent_index]._.holmes.derived_lemma = \
- phraselet_info.parent_derived_lemma
- if phraselet_info.child_lemma is not None:
- phraselet_doc[phraselet_template.child_index]._.holmes.lemma = \
- phraselet_info.child_lemma
- phraselet_doc[phraselet_template.child_index]._.holmes.derived_lemma = \
- phraselet_info.child_derived_lemma
- return self.create_search_phrase(
- 'topic match phraselet', phraselet_doc,
- create_phraselet_label(phraselet_info), phraselet_template,
- phraselet_info.created_without_matching_tags,
- phraselet_info.reverse_only_parent_lemma)
- raise RuntimeError(' '.join((
- 'Phraselet template', phraselet_info.template_label, 'not found.')))
-
- return {
- create_phraselet_label(phraselet_info) :
- create_search_phrase_from_phraselet(phraselet_info) for phraselet_info in
- phraselet_infos}
-
- def _redefine_multiwords_on_head_tokens(self, doc):
-
- def loop_textual_representations(multiword_span):
- for representation, _ in self._loop_textual_representations(multiword_span):
- yield representation, multiword_span.derived_lemma
- if self.analyze_derivational_morphology:
- for reverse_derived_lemma in \
- self.reverse_derived_lemmas_in_ontology(multiword_span):
- yield reverse_derived_lemma, multiword_span.derived_lemma
-
- if self.ontology is not None:
- for token in (token for token in doc if len(token._.holmes.lemma.split()) == 1):
- matched = False
- for multiword_span in self._multiword_spans_with_head_token(token):
- for representation, derived_lemma in \
- loop_textual_representations(multiword_span):
- if self.ontology.contains_multiword(representation):
- matched = True
- token._.holmes.lemma = representation.lower()
- token._.holmes.derived_lemma = derived_lemma
- # mark the dependent tokens as grammatical and non-matchable
- for multiword_token in (
- multiword_token for multiword_token in multiword_span.tokens
- if multiword_token.i != token.i):
- multiword_token._.holmes.children = [SemanticDependency(
- multiword_token.i, 0 - (token.i + 1), None)]
- multiword_token._.holmes.is_matchable = False
- break
- if matched:
- break
-
- def create_search_phrase(
- self, search_phrase_text, search_phrase_doc,
- label, phraselet_template, topic_match_phraselet_created_without_matching_tags,
- is_reverse_only_parent_lemma=False):
- """phraselet_template -- 'None' if this search phrase is not a topic match phraselet"""
-
- def replace_grammatical_root_token_recursively(token):
- """Where the syntactic root of a search phrase document is a grammatical token or is
- marked as non-matchable, loop through the semantic dependencies to find the
- semantic root.
- """
- for dependency in token._.holmes.children:
- if dependency.child_index < 0:
- return replace_grammatical_root_token_recursively(
- token.doc[(0 - dependency.child_index) - 1])
- if not token._.holmes.is_matchable:
- for dependency in token._.holmes.children:
- if dependency.child_index >= 0 and \
- dependency.child_token(token.doc)._.holmes.is_matchable:
- return replace_grammatical_root_token_recursively(
- token.doc[dependency.child_index])
- return token
-
- if phraselet_template is None:
- self._redefine_multiwords_on_head_tokens(search_phrase_doc)
- # where a multiword exists as an ontology entry, the multiword should be used for
- # matching rather than the individual words. Not relevant for topic matching
- # phraselets because the multiword will already have been set as the Holmes
- # lemma of the word.
-
- for token in search_phrase_doc:
- if len(token._.holmes.righthand_siblings) > 0:
- # SearchPhrases may not themselves contain conjunctions like 'and'
- # because then the matching becomes too complicated
- raise SearchPhraseContainsConjunctionError(search_phrase_text)
- if token._.holmes.is_negated:
- # SearchPhrases may not themselves contain negation
- # because then the matching becomes too complicated
- raise SearchPhraseContainsNegationError(search_phrase_text)
- if self.perform_coreference_resolution and token.pos_ == 'PRON' and \
- self.semantic_analyzer.is_involved_in_coreference(token):
- # SearchPhrases may not themselves contain coreferring pronouns
- # because then the matching becomes too complicated
- raise SearchPhraseContainsCoreferringPronounError(search_phrase_text)
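- # Illustrative examples of the restrictions above: a search phrase like 'A dog and a cat'
- # would raise SearchPhraseContainsConjunctionError, while 'A dog that does not bark'
- # would raise SearchPhraseContainsNegationError.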
-
- root_tokens = []
- tokens_to_match = []
- matchable_non_entity_tokens_to_lexemes = {}
- for token in search_phrase_doc:
- # check whether grammatical token
- if phraselet_template is not None and phraselet_template.parent_index != token.i and \
- phraselet_template.child_index != token.i:
- token._.holmes.is_matchable = False
- if phraselet_template is not None and phraselet_template.parent_index == token.i and \
- not phraselet_template.single_word() and \
- phraselet_template.assigned_dependency_label is not None:
- for dependency in (
- dependency for dependency in token._.holmes.children if \
- dependency.child_index == phraselet_template.child_index):
- dependency.label = phraselet_template.assigned_dependency_label
- if token._.holmes.is_matchable and not (
- len(token._.holmes.children) > 0 and
- token._.holmes.children[0].child_index < 0):
- tokens_to_match.append(token)
- if self.overall_similarity_threshold < 1.0 and not \
- self._is_entity_search_phrase_token(token, phraselet_template is not None):
- if phraselet_template is None and len(token._.holmes.lemma.split()) > 1:
- matchable_non_entity_tokens_to_lexemes[token.i] = \
- self.semantic_analyzer.nlp.vocab[token.lemma_]
- else:
- matchable_non_entity_tokens_to_lexemes[token.i] = \
- self.semantic_analyzer.nlp.vocab[token._.holmes.lemma]
- if token.dep_ == 'ROOT': # syntactic root
- root_tokens.append(replace_grammatical_root_token_recursively(token))
- if len(tokens_to_match) == 0:
- raise SearchPhraseWithoutMatchableWordsError(search_phrase_text)
- if len(root_tokens) > 1:
- raise SearchPhraseContainsMultipleClausesError(search_phrase_text)
- single_token_similarity_threshold = 1.0
- if self.overall_similarity_threshold < 1.0 and \
- len(matchable_non_entity_tokens_to_lexemes) > 0:
- single_token_similarity_threshold = \
- self.overall_similarity_threshold ** len(matchable_non_entity_tokens_to_lexemes)
- if phraselet_template is None:
- reverse_only = False
- else:
- reverse_only = is_reverse_only_parent_lemma or phraselet_template.reverse_only
- return self._SearchPhrase(
- search_phrase_doc, tokens_to_match, root_tokens[0],
- matchable_non_entity_tokens_to_lexemes, single_token_similarity_threshold, label,
- self.ontology, phraselet_template is not None,
- topic_match_phraselet_created_without_matching_tags, reverse_only, self)
-
- def index_document(self, parsed_document):
-
- def add_dict_entry(dictionary, word, token_index, subword_index, match_type):
- index = Index(token_index, subword_index)
- if match_type == 'entity':
- key_word = word
- else:
- key_word = word.lower()
- if key_word in dictionary.keys():
- if index not in dictionary[key_word]:
- dictionary[key_word].append((index, word, match_type == 'derivation'))
- else:
- dictionary[key_word] = [(index, word, match_type == 'derivation')]
-
- def get_ontology_defined_multiword(token):
- for multiword_span in self._multiword_spans_with_head_token(token):
- if self.ontology.contains_multiword(multiword_span.text):
- return multiword_span.text, 'direct'
- hyphen_normalized_text = self.semantic_analyzer.normalize_hyphens(
- multiword_span.text)
- if self.ontology.contains_multiword(hyphen_normalized_text):
- return hyphen_normalized_text, 'direct'
- elif self.ontology.contains_multiword(multiword_span.lemma):
- return multiword_span.lemma, 'direct'
- elif self.ontology.contains_multiword(multiword_span.derived_lemma):
- return multiword_span.derived_lemma, 'derivation'
- if self.analyze_derivational_morphology and self.ontology is not None:
- for reverse_lemma in self.reverse_derived_lemmas_in_ontology(
- multiword_span):
- return reverse_lemma, 'derivation'
- return None, None
-
- words_to_token_info_dict = {}
- for token in parsed_document:
-
- # the parent check is necessary so that each multiword entity is only registered
- # once, at its root token. sibling_marker_deps applies to siblings which would
- # otherwise be excluded because the main sibling would normally also match the
- # entity root word.
- if len(token.ent_type_) > 0 and (
- token.dep_ == 'ROOT' or token.dep_ in self.semantic_analyzer.sibling_marker_deps
- or token.ent_type_ != token.head.ent_type_):
- entity_label = ''.join(('ENTITY', token.ent_type_))
- add_dict_entry(words_to_token_info_dict, entity_label, token.i, None, 'entity')
- if self.ontology is not None:
- ontology_defined_multiword, match_type = get_ontology_defined_multiword(token)
- if ontology_defined_multiword is not None:
- add_dict_entry(
- words_to_token_info_dict, ontology_defined_multiword, token.i, None,
- match_type)
- continue
- entity_defined_multiword, _ = self.semantic_analyzer.get_entity_defined_multiword(token)
- if entity_defined_multiword is not None:
- add_dict_entry(
- words_to_token_info_dict, entity_defined_multiword, token.i, None, 'direct')
- for representation, match_type in self._loop_textual_representations(token):
- add_dict_entry(
- words_to_token_info_dict, representation, token.i, None, match_type)
- for subword in token._.holmes.subwords:
- for representation, match_type in self._loop_textual_representations(subword):
- add_dict_entry(
- words_to_token_info_dict, representation, token.i, subword.index,
- match_type)
- return self._IndexedDocument(parsed_document, words_to_token_info_dict)
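- # Illustrative example (hypothetical document): if token 7 is 'dogs' with the Holmes lemma
- # 'dog', words_to_token_info_dict would contain entries such as
- # 'dogs': [(Index(7, None), 'dogs', False)] and 'dog': [(Index(7, None), 'dog', False)],
- # plus an entry keyed on 'ENTITY' followed by the entity type if the token carries a
- # named-entity label.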
-
- def _match_type(self, search_phrase_and_document_derived_lemmas_identical, *match_types):
- if 'ontology' in match_types and search_phrase_and_document_derived_lemmas_identical:
- # an ontology entry happens to have created a derivation word match before the
- # derivation match itself was processed, so mark the type as 'derivation'.
- return 'derivation'
- elif 'ontology' in match_types:
- return 'ontology'
- elif 'derivation' in match_types:
- return 'derivation'
- else:
- return 'direct'
-
- def _match_recursively(
- self, *, search_phrase, search_phrase_token, document, document_token,
- document_subword_index, search_phrase_tokens_to_word_matches,
- search_phrase_and_document_visited_table, is_uncertain,
- structurally_matched_document_token, compare_embeddings_on_non_root_words):
- """Called whenever matching is attempted between a search phrase token and a document
- token."""
-
- def handle_match(
- search_phrase_word, document_word, match_type, depth,
- *, similarity_measure=1.0, first_document_token=document_token,
- last_document_token=document_token):
- """Most of the variables are set from the outer call.
-
- Args:
-
- search_phrase_word -- the textual representation of the search phrase word that matched.
- document_word -- the textual representation of the document word that matched.
- match_type -- *direct*, *entity*, *embedding*, *ontology* or *derivation*
- depth -- the number of hyponym relationships linking *search_phrase_word* and
- *document_word*, or *0* if ontology-based matching is not involved.
- similarity_measure -- the similarity between the two tokens. Defaults to 1.0 if the
- match did not involve embeddings.
- """
- for dependency in (
- dependency for dependency in search_phrase_token._.holmes.children
- if dependency.child_token(search_phrase_token.doc)._.holmes.is_matchable):
- at_least_one_document_dependency_tried = False
- at_least_one_document_dependency_matched = False
- # Loop through this token and any tokens linked to it by coreference
- if self.perform_coreference_resolution and document_subword_index is None:
- parents = [
- Index(token_index, None) for token_index in
- document_token._.holmes.token_and_coreference_chain_indexes]
- else:
- parents = [Index(document_token.i, document_subword_index)]
- for working_document_parent_index in parents:
- working_document_child_indexes = []
- document_parent_token = document_token.doc[
- working_document_parent_index.token_index]
- if not working_document_parent_index.is_subword() or \
- document_parent_token._.holmes.subwords[
- working_document_parent_index.subword_index].is_head:
- # is_head: e.g. 'Polizeiinformation über Kriminelle' should match
- # 'Information über Kriminelle'
- for document_dependency in (
- document_dependency for document_dependency in
- document_parent_token._.holmes.children if
- self.semantic_analyzer.dependency_labels_match(
- search_phrase_dependency_label=dependency.label,
- document_dependency_label=document_dependency.label)):
- document_child = document_dependency.child_token(document_token.doc)
- if self.perform_coreference_resolution:
- # wherever a dependency is found, loop through any tokens linked
- # to the child by coreference
- working_document_child_indexes = [
- Index(token_index, None) for token_index in
- document_child._.holmes.token_and_coreference_chain_indexes
- if document_token.doc[token_index].pos_ != 'PRON' or not
- self.semantic_analyzer.is_involved_in_coreference(
- document_token.doc[token_index])]
- # otherwise where matching starts with a noun and there is
- # a dependency pointing back to the noun, matching will be
- # attempted against the pronoun only and will then fail.
- else:
- working_document_child_indexes = \
- [Index(document_dependency.child_index, None)]
- # Where a dependency points to an entire word that has subwords, check
- # the head subword as well as the entire word
- for working_document_child_index in \
- working_document_child_indexes.copy():
- working_document_child = \
- document_token.doc[working_document_child_index.token_index]
- for subword in (
- subword for subword in
- working_document_child._.holmes.subwords
- if subword.is_head):
- working_document_child_indexes.append(Index(
- working_document_child.i, subword.index))
- # Loop through the dependencies from each token
- for working_document_child_index in (
- working_index for working_index
- in working_document_child_indexes if working_index not in
- search_phrase_and_document_visited_table[dependency.child_index]
- ):
- at_least_one_document_dependency_tried = True
- if self._match_recursively(
- search_phrase=search_phrase,
- search_phrase_token=dependency.child_token(
- search_phrase_token.doc),
- document=document,
- document_token=document[
- working_document_child_index.token_index],
- document_subword_index=
- working_document_child_index.subword_index,
- search_phrase_tokens_to_word_matches=
- search_phrase_tokens_to_word_matches,
- search_phrase_and_document_visited_table=
- search_phrase_and_document_visited_table,
- is_uncertain=(
- document_dependency.is_uncertain and not
- dependency.is_uncertain),
- structurally_matched_document_token=document_child,
- compare_embeddings_on_non_root_words=
- compare_embeddings_on_non_root_words):
- at_least_one_document_dependency_matched = True
- if working_document_parent_index.is_subword():
- # examine relationship to dependent subword in the same word
- document_parent_subword = document_token.doc[
- working_document_parent_index.token_index]._.holmes.\
- subwords[working_document_parent_index.subword_index]
- if document_parent_subword.dependent_index is not None and \
- self.semantic_analyzer.dependency_labels_match(
- search_phrase_dependency_label=dependency.label,
- document_dependency_label=
- document_parent_subword.dependency_label):
- at_least_one_document_dependency_tried = True
- if self._match_recursively(
- search_phrase=search_phrase,
- search_phrase_token=dependency.child_token(
- search_phrase_token.doc),
- document=document,
- document_token=document_token,
- document_subword_index=
- document_parent_subword.dependent_index,
- search_phrase_tokens_to_word_matches=
- search_phrase_tokens_to_word_matches,
- search_phrase_and_document_visited_table=
- search_phrase_and_document_visited_table,
- is_uncertain=False,
- structurally_matched_document_token=document_token,
- compare_embeddings_on_non_root_words=
- compare_embeddings_on_non_root_words):
- at_least_one_document_dependency_matched = True
- if at_least_one_document_dependency_tried and not \
- at_least_one_document_dependency_matched:
- # it is already clear that the search phrase has not matched, so
- # there is no point in pursuing things any further
- return
- # store the word match
- if document_subword_index is None:
- document_subword = None
- else:
- document_subword = document_token._.holmes.subwords[document_subword_index]
- search_phrase_tokens_to_word_matches[search_phrase_token.i].append(WordMatch(
- search_phrase_token, search_phrase_word, document_token,
- first_document_token, last_document_token, document_subword,
- document_word, match_type, similarity_measure, is_negated, is_uncertain,
- structurally_matched_document_token, document_word, depth))
-
- def loop_search_phrase_word_representations():
- yield search_phrase_token._.holmes.lemma, 'direct', \
- search_phrase_token._.holmes.lemma_or_derived_lemma()
- hyphen_normalized_word = self.semantic_analyzer.normalize_hyphens(
- search_phrase_token._.holmes.lemma)
- if hyphen_normalized_word != search_phrase_token._.holmes.lemma:
- yield hyphen_normalized_word, 'direct', \
- search_phrase_token._.holmes.lemma_or_derived_lemma()
- if self.analyze_derivational_morphology and \
- search_phrase_token._.holmes.derived_lemma is not None:
- yield search_phrase_token._.holmes.derived_lemma, 'derivation', \
- search_phrase_token._.holmes.lemma_or_derived_lemma()
- if not search_phrase.topic_match_phraselet and \
- search_phrase_token._.holmes.lemma == search_phrase_token.lemma_ and \
- search_phrase_token._.holmes.lemma != search_phrase_token.text:
-                    # the search phrase word is not a multiword, phrasal or separable verb, so we
-                    # can match against its text as well as its lemma
- yield search_phrase_token.text, 'direct', \
- search_phrase_token._.holmes.lemma_or_derived_lemma()
- if self.analyze_derivational_morphology and self.ontology is not None:
- for reverse_lemma in self.reverse_derived_lemmas_in_ontology(
- search_phrase_token):
- yield reverse_lemma, 'ontology', \
- search_phrase_token._.holmes.lemma_or_derived_lemma()
-
- def document_word_representations():
- list_to_return = []
- if document_subword_index is not None:
- working_document_subword = document_token._.holmes.subwords[document_subword_index]
- list_to_return.append((
- working_document_subword.text, 'direct',
- working_document_subword.lemma_or_derived_lemma()))
- hyphen_normalized_word = self.semantic_analyzer.normalize_hyphens(
- working_document_subword.text)
- if hyphen_normalized_word != working_document_subword.text:
- list_to_return.append((
- hyphen_normalized_word, 'direct',
- working_document_subword.lemma_or_derived_lemma()))
- if working_document_subword.lemma != working_document_subword.text:
- list_to_return.append((
- working_document_subword.lemma, 'direct',
- working_document_subword.lemma_or_derived_lemma()))
- if self.analyze_derivational_morphology and \
- working_document_subword.derived_lemma is not None:
- list_to_return.append((
- working_document_subword.derived_lemma,
- 'derivation', working_document_subword.lemma_or_derived_lemma()))
- if self.analyze_derivational_morphology and self.ontology is not None:
- for reverse_lemma in self.reverse_derived_lemmas_in_ontology(
- working_document_subword):
- list_to_return.append((
- reverse_lemma, 'ontology',
- working_document_subword.lemma_or_derived_lemma()))
- else:
- list_to_return.append((
- document_token.text, 'direct',
- document_token._.holmes.lemma_or_derived_lemma()))
- hyphen_normalized_word = self.semantic_analyzer.normalize_hyphens(
- document_token.text)
- if hyphen_normalized_word != document_token.text:
- list_to_return.append((
- hyphen_normalized_word, 'direct',
- document_token._.holmes.lemma_or_derived_lemma()))
- if document_token._.holmes.lemma != document_token.text:
- list_to_return.append((
- document_token._.holmes.lemma, 'direct',
- document_token._.holmes.lemma_or_derived_lemma()))
- if self.analyze_derivational_morphology:
- if document_token._.holmes.derived_lemma is not None:
- list_to_return.append((
- document_token._.holmes.derived_lemma,
- 'derivation', document_token._.holmes.lemma_or_derived_lemma()))
- if self.analyze_derivational_morphology and self.ontology is not None:
- for reverse_lemma in self.reverse_derived_lemmas_in_ontology(document_token):
- list_to_return.append((
- reverse_lemma, 'ontology',
- document_token._.holmes.lemma_or_derived_lemma()))
- return list_to_return
-
- def loop_document_multiword_representations(multiword_span):
- yield multiword_span.text, 'direct', multiword_span.derived_lemma
- hyphen_normalized_word = self.semantic_analyzer.normalize_hyphens(multiword_span.text)
- if hyphen_normalized_word != multiword_span.text:
- yield hyphen_normalized_word, 'direct', multiword_span.derived_lemma
- if multiword_span.text != multiword_span.lemma:
- yield multiword_span.lemma, 'direct', multiword_span.derived_lemma
- if multiword_span.derived_lemma != multiword_span.lemma:
- yield multiword_span.derived_lemma, 'derivation', multiword_span.derived_lemma
- if self.analyze_derivational_morphology and self.ontology is not None:
- for reverse_lemma in self.reverse_derived_lemmas_in_ontology(multiword_span):
- yield reverse_lemma, 'ontology', multiword_span.derived_lemma
-
- index = Index(document_token.i, document_subword_index)
- search_phrase_and_document_visited_table[search_phrase_token.i].add(index)
- is_negated = document_token._.holmes.is_negated
- if document_token._.holmes.is_uncertain:
- is_uncertain = True
-
- if self._is_entity_search_phrase_token(
- search_phrase_token, search_phrase.topic_match_phraselet) and \
- document_subword_index is None:
- if self._entity_search_phrase_token_matches(
- search_phrase_token, search_phrase.topic_match_phraselet, document_token):
- for multiword_span in self._multiword_spans_with_head_token(document_token):
-                    # only use the multiword span if every token within it matches the entity
-                    if not all(self._entity_search_phrase_token_matches(
-                            search_phrase_token, search_phrase.topic_match_phraselet,
-                            working_token) for working_token in multiword_span.tokens):
-                        continue
- for working_token in multiword_span.tokens:
- search_phrase_and_document_visited_table[search_phrase_token.i].add(
- working_token.i)
- handle_match(
- search_phrase_token.text, multiword_span.text, 'entity', 0,
- first_document_token=multiword_span.tokens[0],
- last_document_token=multiword_span.tokens[-1])
- return True
- search_phrase_and_document_visited_table[search_phrase_token.i].add(
- document_token.i)
- handle_match(search_phrase_token.text, document_token.text, 'entity', 0)
- return True
- return False
-
- document_word_representations = document_word_representations()
- for search_phrase_word_representation, search_phrase_match_type, \
- search_phrase_derived_lemma in loop_search_phrase_word_representations():
- # multiword matches
- if document_subword_index is None:
- for multiword_span in self._multiword_spans_with_head_token(document_token):
- for multiword_span_representation, document_match_type, \
- multispan_derived_lemma in \
- loop_document_multiword_representations(multiword_span):
- if search_phrase_word_representation.lower() == \
- multiword_span_representation.lower():
- for working_token in multiword_span.tokens:
- search_phrase_and_document_visited_table[search_phrase_token.i].add(
- working_token.i)
- handle_match(
- search_phrase_token._.holmes.lemma,
- multiword_span_representation,
- self._match_type(
- search_phrase_derived_lemma == multispan_derived_lemma,
- search_phrase_match_type, document_match_type),
- 0, first_document_token=multiword_span.tokens[0],
- last_document_token=multiword_span.tokens[-1])
- return True
- if self.ontology is not None:
- entry = self.ontology.matches(
- search_phrase_word_representation.lower(),
- multiword_span_representation.lower())
- if entry is not None:
- for working_token in multiword_span.tokens:
- search_phrase_and_document_visited_table[
- search_phrase_token.i].add(working_token.i)
- handle_match(
- search_phrase_word_representation, entry.word,
- 'ontology', entry.depth,
- first_document_token=multiword_span.tokens[0],
- last_document_token=multiword_span.tokens[-1])
- return True
- for document_word_representation, document_match_type, document_derived_lemma in \
- document_word_representations:
- if search_phrase_word_representation.lower() == \
- document_word_representation.lower():
- handle_match(
- search_phrase_word_representation, document_word_representation,
- self._match_type(
- search_phrase_derived_lemma == document_derived_lemma,
-                            search_phrase_match_type, document_match_type),
-                        0)
- return True
- if self.ontology is not None:
- entry = self.ontology.matches(
- search_phrase_word_representation.lower(),
- document_word_representation.lower())
- if entry is not None:
- handle_match(
- search_phrase_word_representation, entry.word, 'ontology', entry.depth)
- return True
-
- if self.overall_similarity_threshold < 1.0 and (
- compare_embeddings_on_non_root_words or search_phrase.root_token.i ==
- search_phrase_token.i) and search_phrase_token.i in \
- search_phrase.matchable_non_entity_tokens_to_lexemes.keys() and \
- self.semantic_analyzer.embedding_matching_permitted(search_phrase_token):
- search_phrase_lexeme = \
- search_phrase.matchable_non_entity_tokens_to_lexemes[search_phrase_token.i]
- if document_subword_index is not None:
- if not self.semantic_analyzer.embedding_matching_permitted(
- document_token._.holmes.subwords[document_subword_index]):
- return False
- document_lemma = document_token._.holmes.subwords[document_subword_index].lemma
- else:
- if not self.semantic_analyzer.embedding_matching_permitted(document_token):
- return False
- if len(document_token._.holmes.lemma.split()) > 1:
- document_lemma = document_token.lemma_
- else:
- document_lemma = document_token._.holmes.lemma
-
- document_lexeme = self.semantic_analyzer.nlp.vocab[document_lemma]
- if search_phrase_lexeme.vector_norm > 0 and document_lexeme.vector_norm > 0:
- similarity_measure = search_phrase_lexeme.similarity(document_lexeme)
- if similarity_measure > search_phrase.single_token_similarity_threshold:
- if not search_phrase.topic_match_phraselet and \
- len(search_phrase_token._.holmes.lemma.split()) > 1:
- search_phrase_word_to_use = search_phrase_token.lemma_
- else:
- search_phrase_word_to_use = search_phrase_token._.holmes.lemma
- handle_match(
- search_phrase_word_to_use, document_token.lemma_, 'embedding', 0,
- similarity_measure=similarity_measure)
- return True
- return False
-
- def _is_entity_search_phrase_token(
- self, search_phrase_token, examine_lemma_rather_than_text):
- if examine_lemma_rather_than_text:
- word_to_check = search_phrase_token._.holmes.lemma
- else:
- word_to_check = search_phrase_token.text
- return word_to_check[:6] == 'ENTITY' and len(word_to_check) > 6
-
- def _is_entitynoun_search_phrase_token(
- self, search_phrase_token, examine_lemma_rather_than_text):
- if examine_lemma_rather_than_text:
- word_to_check = search_phrase_token._.holmes.lemma
- else:
- word_to_check = search_phrase_token.text
- return word_to_check == 'ENTITYNOUN'
-
- def _entity_search_phrase_token_matches(
- self, search_phrase_token, topic_match_phraselet, document_token):
- if topic_match_phraselet:
- word_to_check = search_phrase_token._.holmes.lemma
- else:
- word_to_check = search_phrase_token.text
- return (
- document_token.ent_type_ == word_to_check[6:] and
- len(document_token._.holmes.lemma.strip()) > 0) or (
- word_to_check == 'ENTITYNOUN' and
- document_token.pos_ in self.semantic_analyzer.noun_pos)
- # len(document_token._.holmes.lemma.strip()) > 0: in German spaCy sometimes
- # classifies whitespace as entities.
-
- def _loop_textual_representations(self, object):
- if isinstance(object, Token):
- yield object.text, 'direct'
- hyphen_normalized_text = self.semantic_analyzer.normalize_hyphens(object.text)
- if hyphen_normalized_text != object.text:
- yield hyphen_normalized_text, 'direct'
- if object._.holmes.lemma != object.text:
- yield object._.holmes.lemma, 'direct'
- if self.analyze_derivational_morphology and object._.holmes.derived_lemma is not None:
- yield object._.holmes.derived_lemma, 'derivation'
- elif isinstance(object, Subword):
- yield object.text, 'direct'
- hyphen_normalized_text = self.semantic_analyzer.normalize_hyphens(object.text)
- if hyphen_normalized_text != object.text:
- yield hyphen_normalized_text, 'direct'
- if object.text != object.lemma:
- yield object.lemma, 'direct'
- if self.analyze_derivational_morphology and object.derived_lemma is not None:
- yield object.derived_lemma, 'derivation'
- elif isinstance(object, self._MultiwordSpan):
- yield object.text, 'direct'
- hyphen_normalized_text = self.semantic_analyzer.normalize_hyphens(object.text)
- if hyphen_normalized_text != object.text:
- yield hyphen_normalized_text, 'direct'
- if object.text != object.lemma:
- yield object.lemma, 'direct'
- if object.lemma != object.derived_lemma:
- yield object.derived_lemma, 'derivation'
- else:
- raise RuntimeError(': '.join(('Unsupported type', str(type(object)))))
-
- def _build_matches(
- self, *, search_phrase, document, search_phrase_tokens_to_word_matches, document_label):
- """Investigate possible matches when recursion is complete."""
-
- def mention_root_or_token_index(token):
- mri = token._.holmes.mention_root_index
- if mri is not None:
- return mri
- else:
- return token.i
-
- def filter_word_matches_based_on_coreference_resolution(word_matches):
- """ When coreference resolution is active, additional matches are sometimes
- returned that are filtered out again using this method. The use of
- mention_root_index means that only the first cluster is taken into account.
- """
-            structural_index_to_word_matches = {}
-            # Find the structurally matching document tokens for this list of word matches
-            for word_match in word_matches:
-                structural_index = \
-                    mention_root_or_token_index(word_match.structurally_matched_document_token)
-                if structural_index in structural_index_to_word_matches:
-                    structural_index_to_word_matches[structural_index].append(word_match)
-                else:
-                    structural_index_to_word_matches[structural_index] = [word_match]
-            new_word_matches = []
-            for structural_index in structural_index_to_word_matches:
-                # For each structural token, find the best matching coreference mention
-                relevant_word_matches = structural_index_to_word_matches[structural_index]
- structurally_matched_document_token = \
- relevant_word_matches[0].document_token.doc[structural_index]
- already_added_document_token_indexes = set()
- if self.semantic_analyzer.is_involved_in_coreference(
- structurally_matched_document_token):
- working_index = -1
- for relevant_word_match in relevant_word_matches:
- this_index = mention_root_or_token_index(relevant_word_match.document_token)
- # The best mention should be as close to the structural
- # index as possible; if they are the same distance, the preceding mention
- # wins.
- if working_index == -1 or (
- abs(structural_index - this_index) <
- abs(structural_index - working_index)) or \
- ((abs(structural_index - this_index) ==
- abs(structural_index - working_index)) and
- this_index < working_index):
- working_index = this_index
- # Filter out any matches from mentions other than the best mention
- for relevant_word_match in relevant_word_matches:
- if working_index == \
- mention_root_or_token_index(relevant_word_match.document_token) \
- and relevant_word_match.document_token.i not in \
- already_added_document_token_indexes:
- already_added_document_token_indexes.add(
- relevant_word_match.document_token.i)
- new_word_matches.append(relevant_word_match)
- else:
- new_word_matches.extend(relevant_word_matches)
- return new_word_matches
-
- def revise_extracted_words_based_on_coreference_resolution(word_matches):
- """ When coreference resolution and ontology-based matching are both active,
- there may be a more specific piece of information elsewhere in the coreference
- chain of a token that has been matched, in which case this piece of information
- should be recorded in *word_match.extracted_word*.
-
- If and when subwords and coreference resolution are analyzed together (at present
-            subwords are available only for German and coreference resolution only for
- English), this method will need to be updated to handle this.
- """
-
- for word_match in (
- word_match for word_match in word_matches
- if word_match.type in ('direct', 'derivation', 'ontology')):
- working_entries = []
- # First loop through getting ontology entries for all mentions in the cluster
- for search_phrase_representation, _ in \
- self._loop_textual_representations(word_match.search_phrase_token):
- for mention in word_match.document_token._.holmes.mentions:
- mention_root_token = word_match.document_token.doc[mention.root_index]
- for mention_representation, _ in \
- self._loop_textual_representations(mention_root_token):
- working_entries.append(
- self.ontology.matches(
- search_phrase_representation, mention_representation))
- for multiword_span in \
- self._multiword_spans_with_head_token(mention_root_token):
- for multiword_representation, _ in \
- self._loop_textual_representations(multiword_span):
- working_entries.append(
- self.ontology.matches(
- search_phrase_representation, multiword_representation))
-
- # Now loop through the ontology entries to see if any are more specific than
- # the current value of *extracted_word*.
- for working_entry in working_entries:
- if working_entry is None:
- continue
- if working_entry.is_individual:
- word_match.extracted_word = working_entry.word
- break
- elif working_entry.depth > word_match.depth:
- word_match.extracted_word = working_entry.word
- return word_matches
-
- def match_already_contains_structurally_matched_document_token(
- match, document_token, document_subword_index):
- """Ensure that the same document token or subword does not match multiple search phrase
- tokens.
- """
- for word_match in match.word_matches:
- if document_token.i == word_match.structurally_matched_document_token.i:
- if word_match.document_subword is not None and document_subword_index == \
- word_match.document_subword.index:
- return True
- if word_match.document_subword is None and document_subword_index is None:
- return True
- return False
-
- def check_document_tokens_are_linked_by_dependency(
- parent_token, parent_subword, child_token, child_subword):
- """ The recursive nature of the main matching algorithm can mean that all the tokens
- in the search phrase have matched but that two of them are linked by a dependency
- that is absent from the document, which invalidates the match.
- """
- if parent_subword is not None:
- if child_subword is not None and parent_subword.dependent_index == \
- child_subword.index and parent_token.i == child_token.i:
- return True
- elif parent_subword.is_head and (child_subword is None or (
- child_subword.is_head and parent_subword.containing_token_index !=
- child_subword.containing_token_index)):
- return True
- else:
- return False
- if child_subword is not None and not child_subword.is_head:
- return False
- if self.perform_coreference_resolution and parent_subword is None:
- parents = parent_token._.holmes.token_and_coreference_chain_indexes
- children = child_token._.holmes.token_and_coreference_chain_indexes
- else:
- parents = [parent_token.i]
- children = [child_token.i]
- for parent in parents:
- for child in children:
- if parent_token.doc[parent]._.holmes.has_dependency_with_child_index(child):
- return True
- return False
-
- def match_with_subwords_involves_all_containing_document_tokens(word_matches):
- """ Where a match involves subwords and the subwords are involved in conjunction,
- we need to make sure there are no tokens involved in the match merely because they
- supply subwords to another token, as this would lead to double matching. An example
- is search phrase 'Extraktion der Information' and document
- 'Informationsextraktionsüberlegungen und -probleme'.
- """
- token_indexes = []
- containing_subword_token_indexes = []
- for word_match in word_matches:
- if word_match.document_subword is not None:
- token_indexes.append(word_match.document_token.i)
- containing_subword_token_indexes.append(
- word_match.document_subword.containing_token_index)
- return len([
-                token_index for token_index in token_indexes if token_index not in
- containing_subword_token_indexes]) == 0
-
- matches = [Match(
- search_phrase.label, document_label,
- search_phrase.topic_match_phraselet and search_phrase.has_single_matchable_word,
- search_phrase.topic_match_phraselet_created_without_matching_tags,
- search_phrase.reverse_only)]
- for search_phrase_token in search_phrase.matchable_tokens:
- word_matches = search_phrase_tokens_to_word_matches[search_phrase_token.i]
- if len(word_matches) == 0:
- # if there is any search phrase token without a matching document token,
- # we have no match and can return
- return []
- if self.perform_coreference_resolution:
- word_matches = filter_word_matches_based_on_coreference_resolution(word_matches)
- if self.ontology is not None:
- word_matches = revise_extracted_words_based_on_coreference_resolution(
- word_matches)
- # handle any conjunction by distributing the matches amongst separate match objects
- working_matches = []
- for word_match in word_matches:
- for match in matches:
- working_match = copy.copy(match)
- if word_match.document_subword is None:
- subword_index = None
- else:
- subword_index = word_match.document_subword.index
- if not match_already_contains_structurally_matched_document_token(
- working_match, word_match.structurally_matched_document_token,
- subword_index):
- working_match.word_matches.append(word_match)
- if word_match.is_negated:
- working_match.is_negated = True
- if word_match.is_uncertain:
- working_match.is_uncertain = True
- if search_phrase_token.i == search_phrase.root_token.i:
- working_match.index_within_document = word_match.document_token.i
- working_matches.append(working_match)
- matches = working_matches
-
- matches_to_return = []
- for match in matches:
- failed = False
- not_normalized_overall_similarity_measure = 1.0
-            # now carry out the coherence check; if there are two or fewer word matches (which
-            # is the case during topic matching), no check is necessary
- if len(match.word_matches) > 2:
- for parent_word_match in match.word_matches:
- for search_phrase_dependency in \
- parent_word_match.search_phrase_token._.holmes.children:
- for child_word_match in (
- cwm for cwm in match.word_matches if cwm.search_phrase_token.i ==
- search_phrase_dependency.child_index):
- if not check_document_tokens_are_linked_by_dependency(
- parent_word_match.document_token,
- parent_word_match.document_subword,
- child_word_match.document_token,
- child_word_match.document_subword):
- failed = True
- if failed:
- break
- if failed:
- break
- if failed:
- continue
-
- if not match_with_subwords_involves_all_containing_document_tokens(match.word_matches):
- continue
-
- for word_match in match.word_matches:
- not_normalized_overall_similarity_measure *= word_match.similarity_measure
- if not_normalized_overall_similarity_measure < 1.0:
- overall_similarity_measure = \
- round(not_normalized_overall_similarity_measure ** \
- (1 / len(search_phrase.matchable_non_entity_tokens_to_lexemes)), 8)
- else:
- overall_similarity_measure = 1.0
- if overall_similarity_measure == 1.0 or \
- overall_similarity_measure >= self.overall_similarity_threshold:
- match.overall_similarity_measure = str(
- overall_similarity_measure)
- matches_to_return.append(match)
- return matches_to_return
-
- def _get_matches_starting_at_root_word_match(
- self, search_phrase, document, document_token, document_subword_index, document_label,
- compare_embeddings_on_non_root_words):
- """Begin recursive matching where a search phrase root token has matched a document
- token.
- """
-
- matches_to_return = []
-        # list of lists where each entry corresponds to a search phrase token and is itself a
-        # list of WordMatch instances
- search_phrase_tokens_to_word_matches = [[] for token in search_phrase.doc]
-        # list of sets used to guard against endless looping during recursion. Each set
-        # corresponds to the search phrase token with the same index and contains the Index
-        # objects of the document words against which matching to that search phrase token has
-        # already been attempted.
- search_phrase_and_document_visited_table = [set() for token in search_phrase.doc]
- self._match_recursively(
- search_phrase=search_phrase,
- search_phrase_token=search_phrase.root_token,
- document=document,
- document_token=document_token,
- document_subword_index=document_subword_index,
- search_phrase_tokens_to_word_matches=search_phrase_tokens_to_word_matches,
- search_phrase_and_document_visited_table=search_phrase_and_document_visited_table,
- is_uncertain=document_token._.holmes.is_uncertain,
- structurally_matched_document_token=document_token,
- compare_embeddings_on_non_root_words=compare_embeddings_on_non_root_words)
- working_matches = self._build_matches(
- search_phrase=search_phrase,
- document=document,
- search_phrase_tokens_to_word_matches=search_phrase_tokens_to_word_matches,
- document_label=document_label)
- matches_to_return.extend(working_matches)
- return matches_to_return
-
- def match(
- self, *, indexed_documents, search_phrases,
- output_document_matching_message_to_console,
- match_depending_on_single_words,
- compare_embeddings_on_root_words,
- compare_embeddings_on_non_root_words,
- document_labels_to_indexes_for_reverse_matching_sets,
- document_labels_to_indexes_for_embedding_reverse_matching_sets,
- document_label_filter=None):
- """Finds and returns matches between search phrases and documents.
- match_depending_on_single_words -- 'True' to match only single word search phrases,
- 'False' to match only non-single-word search phrases and 'None' to match both.
- compare_embeddings_on_root_words -- if 'True', embeddings on root words are compared
- even if embedding_based_matching_on_root_words==False as long as
- overall_similarity_threshold < 1.0.
- compare_embeddings_on_non_root_words -- if 'False', embeddings on non-root words are not
- compared even if overall_similarity_threshold < 1.0.
- document_labels_to_indexes_for_reverse_matching_sets -- indexes for non-embedding
- reverse matching only.
- document_labels_to_indexes_for_embedding_reverse_matching_sets -- indexes for embedding
- and non-embedding reverse matching.
- document_label_filter -- a string with which the label of a document must begin for that
- document to be considered for matching, or 'None' if no filter is in use.
- """
-
- def get_indexes_to_consider(dictionary, document_label):
- if dictionary is None or document_label not in dictionary:
- return set()
- else:
- return dictionary[document_label]
-
- if self.embedding_based_matching_on_root_words:
- compare_embeddings_on_root_words = True
- if self.overall_similarity_threshold == 1.0:
- compare_embeddings_on_root_words = False
- compare_embeddings_on_non_root_words = False
- match_specific_indexes = document_labels_to_indexes_for_reverse_matching_sets is not None \
- or document_labels_to_indexes_for_embedding_reverse_matching_sets is not None
-
- if len(indexed_documents) == 0:
- raise NoSearchedDocumentError(
- 'At least one searched document is required to match.')
- if len(search_phrases) == 0:
- raise NoSearchPhraseError('At least one search_phrase is required to match.')
- matches = []
- for document_label, registered_document in indexed_documents.items():
- if document_label_filter is not None and document_label is not None and not \
- document_label.startswith(str(document_label_filter)):
- continue
- if output_document_matching_message_to_console:
- print('Processing document', document_label)
- doc = registered_document.doc
- # Dictionary used to improve performance when embedding-based matching for root tokens
- # is active and there are multiple search phrases with the same root token word: the
- # same indexes in the document will then match all the search phrase root tokens.
- root_lexeme_to_indexes_to_match_dict = {}
- if match_specific_indexes:
- reverse_matching_indexes = get_indexes_to_consider(
- document_labels_to_indexes_for_reverse_matching_sets, document_label)
- embedding_reverse_matching_indexes = get_indexes_to_consider(
- document_labels_to_indexes_for_embedding_reverse_matching_sets,
- document_label)
-
- for search_phrase in search_phrases:
- if not search_phrase.has_single_matchable_word and match_depending_on_single_words:
- continue
- if search_phrase.has_single_matchable_word and \
-                        match_depending_on_single_words is False:
- continue
- if not match_specific_indexes and (search_phrase.reverse_only or \
- search_phrase.treat_as_reverse_only_during_initial_relation_matching):
- continue
- if search_phrase.has_single_matchable_word and \
- not compare_embeddings_on_root_words and \
- not self._is_entity_search_phrase_token(
- search_phrase.root_token, search_phrase.topic_match_phraselet):
- # We are only matching a single word without embedding, so to improve
- # performance we avoid entering the subgraph matching code.
- search_phrase_token = [
- token for token in search_phrase.doc if token._.holmes.is_matchable][0]
- existing_minimal_match_indexes = []
- for word_matching_root_token in search_phrase.words_matching_root_token:
- if word_matching_root_token in \
- registered_document.words_to_token_info_dict.keys():
- search_phrase_match_type, depth = \
- search_phrase.root_word_to_match_info_dict[
- word_matching_root_token]
- for index, document_word_representation, \
- document_match_type_is_derivation in \
- registered_document.words_to_token_info_dict[
- word_matching_root_token]:
- if index in existing_minimal_match_indexes:
- continue
- if document_match_type_is_derivation:
- document_match_type = 'derivation'
- else:
- document_match_type = 'direct'
- match_type = self._match_type(
- False, search_phrase_match_type, document_match_type)
- minimal_match = Match(
- search_phrase.label, document_label, True,
- search_phrase.
- topic_match_phraselet_created_without_matching_tags,
- search_phrase.reverse_only)
- minimal_match.index_within_document = index.token_index
- matched = False
- if len(word_matching_root_token.split()) > 1:
- for multiword_span in self._multiword_spans_with_head_token(
- doc[index.token_index]):
- for textual_representation, _ in \
- self._loop_textual_representations(multiword_span):
- if textual_representation == \
- word_matching_root_token:
- matched = True
- minimal_match.word_matches.append(WordMatch(
- search_phrase_token,
- search_phrase_token._.holmes.lemma,
- doc[index.token_index],
- multiword_span.tokens[0],
- multiword_span.tokens[-1],
- None,
- document_word_representation,
- match_type,
- 1.0, False, False, doc[index.token_index],
- document_word_representation, depth))
- break
- if matched:
- break
- if not matched:
- token = doc[index.token_index]
- if index.is_subword():
- subword = token._.holmes.subwords[index.subword_index]
- else:
- subword = None
- minimal_match.word_matches.append(WordMatch(
- search_phrase_token,
- search_phrase_token._.holmes.lemma,
- token,
- token,
- token,
- subword,
- document_word_representation,
- match_type,
- 1.0, token._.holmes.is_negated, False, token,
- document_word_representation, depth))
- if token._.holmes.is_negated:
- minimal_match.is_negated = True
- existing_minimal_match_indexes.append(index)
- matches.append(minimal_match)
- continue
- direct_matching_indexes = []
- if self._is_entitynoun_search_phrase_token(
- search_phrase.root_token,
- search_phrase.topic_match_phraselet): # phraselets are not generated for
- # ENTITYNOUN roots
- for token in doc:
- if token.pos_ in self.semantic_analyzer.noun_pos:
- matches.extend(
- self._get_matches_starting_at_root_word_match(
- search_phrase, doc, token, None, document_label,
- compare_embeddings_on_non_root_words))
- continue
- else:
- matched_indexes_set = set()
- if self._is_entity_search_phrase_token(
- search_phrase.root_token, search_phrase.topic_match_phraselet):
- if search_phrase.topic_match_phraselet:
- entity_label = search_phrase.root_token._.holmes.lemma
- else:
- entity_label = search_phrase.root_token.text
- if entity_label in registered_document.words_to_token_info_dict.keys():
- entity_matching_indexes = [
- index for index, _, _ in
- registered_document.words_to_token_info_dict[entity_label]]
- if match_specific_indexes:
- entity_matching_indexes = [
- index for index in entity_matching_indexes
- if index in reverse_matching_indexes
- or index in embedding_reverse_matching_indexes
- and not index.is_subword()]
- matched_indexes_set.update(entity_matching_indexes)
- else:
- for word_matching_root_token in search_phrase.words_matching_root_token:
- if word_matching_root_token in \
- registered_document.words_to_token_info_dict.keys():
- direct_matching_indexes = [
- index for index, _, _ in
- registered_document.words_to_token_info_dict[
- word_matching_root_token]]
- if match_specific_indexes:
- direct_matching_indexes = [
- index for index in direct_matching_indexes
- if index in reverse_matching_indexes
- or index in embedding_reverse_matching_indexes]
- matched_indexes_set.update(direct_matching_indexes)
- if compare_embeddings_on_root_words and not \
- self._is_entity_search_phrase_token(
- search_phrase.root_token, search_phrase.topic_match_phraselet) \
- and not search_phrase.reverse_only and \
- self.semantic_analyzer.embedding_matching_permitted(
- search_phrase.root_token):
- if not search_phrase.topic_match_phraselet and \
- len(search_phrase.root_token._.holmes.lemma.split()) > 1:
- root_token_lemma_to_use = search_phrase.root_token.lemma_
- else:
- root_token_lemma_to_use = search_phrase.root_token._.holmes.lemma
- if root_token_lemma_to_use in root_lexeme_to_indexes_to_match_dict:
- matched_indexes_set.update(
- root_lexeme_to_indexes_to_match_dict[root_token_lemma_to_use])
- else:
- working_indexes_to_match_for_cache_set = set()
- for document_word in registered_document.words_to_token_info_dict.keys():
- indexes_to_match = [
- index for index, _, _ in
- registered_document.words_to_token_info_dict[document_word]]
- if match_specific_indexes:
- indexes_to_match = [
- index for index in indexes_to_match
- if index in embedding_reverse_matching_indexes
- and index not in direct_matching_indexes]
- if len(indexes_to_match) == 0:
- continue
- search_phrase_lexeme = \
- search_phrase.matchable_non_entity_tokens_to_lexemes[
- search_phrase.root_token.i]
- example_index = indexes_to_match[0]
- example_document_token = doc[example_index.token_index]
- if example_index.is_subword():
- if not self.semantic_analyzer.embedding_matching_permitted(
- example_document_token._.holmes.subwords[
- example_index.subword_index]):
- continue
- document_lemma = example_document_token._.holmes.subwords[
- example_index.subword_index].lemma
- else:
- if not self.semantic_analyzer.embedding_matching_permitted(
- example_document_token):
- continue
- if len(example_document_token._.holmes.lemma.split()) > 1:
- document_lemma = example_document_token.lemma_
- else:
- document_lemma = example_document_token._.holmes.lemma
- document_lexeme = self.semantic_analyzer.nlp.vocab[document_lemma]
- if search_phrase_lexeme.vector_norm > 0 and \
- document_lexeme.vector_norm > 0:
- similarity_measure = search_phrase_lexeme.similarity(
- document_lexeme)
- if similarity_measure >= \
- search_phrase.single_token_similarity_threshold:
- matched_indexes_set.update(indexes_to_match)
- working_indexes_to_match_for_cache_set.update(indexes_to_match)
- root_lexeme_to_indexes_to_match_dict[root_token_lemma_to_use] = \
- working_indexes_to_match_for_cache_set
- for index_to_match in sorted(matched_indexes_set):
- matches.extend(self._get_matches_starting_at_root_word_match(
- search_phrase, doc, doc[index_to_match.token_index],
- index_to_match.subword_index, document_label,
- compare_embeddings_on_non_root_words))
- return sorted(matches, key=lambda match: 1 - float(match.overall_similarity_measure))
diff --git a/holmes_extractor/tests/common/test_multiprocessing.py b/holmes_extractor/tests/common/test_multiprocessing.py
deleted file mode 100644
index c203674..0000000
--- a/holmes_extractor/tests/common/test_multiprocessing.py
+++ /dev/null
@@ -1,203 +0,0 @@
-import unittest
-import holmes_extractor as holmes
-import os
-import time
-from threading import Thread
-from queue import Queue
-from time import sleep
-
-NUMBER_OF_THREADS = 50
-
-script_directory = os.path.dirname(os.path.realpath(__file__))
-ontology = holmes.Ontology(os.sep.join(
- (script_directory, 'test_ontology.owl')))
-
-
-class MultiprocessingTest(unittest.TestCase):
- # We use en_core_web_sm to prevent memory exhaustion during the tests.
-
- def test_workers_specified(self):
- m = holmes.MultiprocessingManager('en_core_web_sm', ontology=ontology, number_of_workers=2,
- verbose=False)
- m.parse_and_register_documents({'specific': "I saw a dog. It was chasing a cat",
- 'exact': "The dog chased the animal",
- 'specific-reversed': "The cat chased the dog",
- 'exact-reversed': "The animal chased the dog"})
- self.assertEqual(m.document_labels(), ['exact', 'exact-reversed', 'specific',
- 'specific-reversed'])
- self.assertEqual(m.topic_match_documents_returning_dictionaries_against(
- "A dog chases an animal"),
- [{'document_label': 'exact', 'text': 'The dog chased the animal', 'text_to_match': 'A dog chases an animal', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 99.34666666666668, 'word_infos': [[4, 7, 'overlapping_relation', False, "Matches DOG directly."], [8, 14, 'overlapping_relation', False, "Matches CHASE directly."], [19, 25, 'overlapping_relation', True, "Matches ANIMAL directly."]]}, {'document_label': 'specific', 'text': 'I saw a dog. It was chasing a cat', 'text_to_match': 'A dog chases an animal', 'rank': '2', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 33, 'score': 81.94686666666668, 'word_infos': [[8, 11, 'overlapping_relation', False, "Matches DOG directly."], [20, 27, 'overlapping_relation', False, "Is a synonym of CHASE in the ontology."], [30, 33, 'overlapping_relation', True, "Is a child of ANIMAL in the ontology."]]}, {'document_label': 'exact-reversed', 'text': 'The animal chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 35.39866666666667, 'word_infos': [[4, 10, 'single', False, "Matches ANIMAL directly."], [11, 17, 'relation', False, "Matches CHASE directly."], [22, 25, 'relation', True, "Is a child of ANIMAL in the ontology."]]}, {'document_label': 'specific-reversed', 'text': 'The cat chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 22, 'score': 34.486666666666665, 'word_infos': [[4, 7, 'single', False, "Is a child of ANIMAL in the ontology."], [8, 14, 'relation', False, "Matches CHASE directly."], [19, 22, 'relation', True, "Is a child of ANIMAL in the ontology."]]}])
- m.close()
-
- def test_workers_not_specified(self):
- m = holmes.MultiprocessingManager('en_core_web_sm', ontology=ontology)
- m.parse_and_register_documents({'specific': "I saw a dog. It was chasing a cat",
- 'exact': "The dog chased the animal",
- 'specific-reversed': "The cat chased the dog",
- 'exact-reversed': "The animal chased the dog"})
- self.assertEqual(m.document_labels(), ['exact', 'exact-reversed', 'specific',
- 'specific-reversed'])
- self.assertEqual(m.topic_match_documents_returning_dictionaries_against(
- "A dog chases an animal"),
- [{'document_label': 'exact', 'text': 'The dog chased the animal', 'text_to_match': 'A dog chases an animal', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 99.34666666666668, 'word_infos': [[4, 7, 'overlapping_relation', False, "Matches DOG directly."], [8, 14, 'overlapping_relation', False, "Matches CHASE directly."], [19, 25, 'overlapping_relation', True, "Matches ANIMAL directly."]]}, {'document_label': 'specific', 'text': 'I saw a dog. It was chasing a cat', 'text_to_match': 'A dog chases an animal', 'rank': '2', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 33, 'score': 81.94686666666668, 'word_infos': [[8, 11, 'overlapping_relation', False, "Matches DOG directly."], [20, 27, 'overlapping_relation', False, "Is a synonym of CHASE in the ontology."], [30, 33, 'overlapping_relation', True, "Is a child of ANIMAL in the ontology."]]}, {'document_label': 'exact-reversed', 'text': 'The animal chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 35.39866666666667, 'word_infos': [[4, 10, 'single', False, "Matches ANIMAL directly."], [11, 17, 'relation', False, "Matches CHASE directly."], [22, 25, 'relation', True, "Is a child of ANIMAL in the ontology."]]}, {'document_label': 'specific-reversed', 'text': 'The cat chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 22, 'score': 34.486666666666665, 'word_infos': [[4, 7, 'single', False, "Is a child of ANIMAL in the ontology."], [8, 14, 'relation', False, "Matches CHASE directly."], [19, 22, 'relation', True, "Is a child of ANIMAL in the ontology."]]}])
- m.close()
-
- def test_deserialized_documents(self):
- normal_manager = holmes.Manager(
- 'en_core_web_sm', perform_coreference_resolution=False)
- normal_manager.parse_and_register_document(
- "I saw a dog. It was chasing a cat", 'specific')
- normal_manager.parse_and_register_document(
- "The dog chased the animal", 'exact')
- normal_manager.parse_and_register_document(
- "The cat chased the dog", 'specific-reversed')
- normal_manager.parse_and_register_document(
- "The animal chased the dog", 'exact-reversed')
- specific = normal_manager.serialize_document('specific')
- exact = normal_manager.serialize_document('exact')
- specific_reversed = normal_manager.serialize_document(
- 'specific-reversed')
- exact_reversed = normal_manager.serialize_document('exact-reversed')
- m = holmes.MultiprocessingManager('en_core_web_sm', ontology=ontology, number_of_workers=2,
- verbose=False, perform_coreference_resolution=False)
- m.deserialize_and_register_documents({'specific': specific,
- 'exact': exact,
- 'specific-reversed': specific_reversed,
- 'exact-reversed': exact_reversed})
- self.assertEqual(m.document_labels(), ['exact', 'exact-reversed', 'specific',
- 'specific-reversed'])
- self.assertEqual(m.topic_match_documents_returning_dictionaries_against(
- "A dog chases an animal"),
- [{'document_label': 'exact', 'text': 'The dog chased the animal', 'text_to_match': 'A dog chases an animal', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 99.34666666666668, 'word_infos': [[4, 7, 'overlapping_relation', False, "Matches DOG directly."], [8, 14, 'overlapping_relation', False, "Matches CHASE directly."], [19, 25, 'overlapping_relation', True, "Matches ANIMAL directly."]]}, {'document_label': 'exact-reversed', 'text': 'The animal chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '2=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 35.39866666666667, 'word_infos': [[4, 10, 'single', False, "Matches ANIMAL directly."], [11, 17, 'relation', False, "Matches CHASE directly."], [22, 25, 'relation', True, "Is a child of ANIMAL in the ontology."]]}, {'document_label': 'specific-reversed', 'text': 'The cat chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '2=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 22, 'score': 34.486666666666665, 'word_infos': [[4, 7, 'single', False, "Is a child of ANIMAL in the ontology."], [8, 14, 'relation', False, "Matches CHASE directly."], [19, 22, 'relation', True, "Is a child of ANIMAL in the ontology."]]}, {'document_label': 'specific', 'text': 'I saw a dog. It was chasing a cat', 'text_to_match': 'A dog chases an animal', 'rank': '2=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 33, 'score': 31.88346666666667, 'word_infos': [[8, 11, 'single', False, "Matches DOG directly."], [20, 27, 'relation', False, "Is a synonym of CHASE in the ontology."], [30, 33, 'relation', True, "Is a child of ANIMAL in the ontology."]]}])
- m.close()
-
- def test_number_of_results(self):
- m = holmes.MultiprocessingManager('en_core_web_sm', ontology=ontology, number_of_workers=2,
- verbose=False)
- m.parse_and_register_documents({'specific': "I saw a dog. It was chasing a cat",
- 'exact': "The dog chased the animal",
- 'specific-reversed': "The cat chased the dog",
- 'exact-reversed': "The animal chased the dog"})
- self.assertEqual(m.document_labels(), ['exact', 'exact-reversed', 'specific',
- 'specific-reversed'])
- self.assertEqual(m.topic_match_documents_returning_dictionaries_against(
- "A dog chases an animal", number_of_results=3),
- [{'document_label': 'exact', 'text': 'The dog chased the animal', 'text_to_match': 'A dog chases an animal', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 99.34666666666668, 'word_infos': [[4, 7, 'overlapping_relation', False, "Matches DOG directly."], [8, 14, 'overlapping_relation', False, "Matches CHASE directly."], [19, 25, 'overlapping_relation', True, "Matches ANIMAL directly."]]}, {'document_label': 'specific', 'text': 'I saw a dog. It was chasing a cat', 'text_to_match': 'A dog chases an animal', 'rank': '2', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 33, 'score': 81.94686666666668, 'word_infos': [[8, 11, 'overlapping_relation', False, "Matches DOG directly."], [20, 27, 'overlapping_relation', False, "Is a synonym of CHASE in the ontology."], [30, 33, 'overlapping_relation', True, "Is a child of ANIMAL in the ontology."]]}, {'document_label': 'exact-reversed', 'text': 'The animal chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 35.39866666666667, 'word_infos': [[4, 10, 'single', False, "Matches ANIMAL directly."], [11, 17, 'relation', False, "Matches CHASE directly."], [22, 25, 'relation', True, "Is a child of ANIMAL in the ontology."]]}])
- m.close()
-
- def test_parsed_document_registration_multithreaded(self):
-
- def add_document(counter):
- m.parse_and_register_documents({' '.join(('Irrelevant', str(counter))):
- "People discuss irrelevancies"})
-
- m = holmes.MultiprocessingManager(
- 'en_core_web_sm', number_of_workers=4)
-
- for i in range(NUMBER_OF_THREADS):
- t = Thread(target=add_document, args=(i,))
- t.start()
-
- last_number_of_matches = 0
- for counter in range(50):
- document_labels = m.document_labels()
- for label in document_labels:
- self.assertTrue(label.startswith("Irrelevant"))
- if len(document_labels) == NUMBER_OF_THREADS:
- break
- self.assertFalse(counter == 49)
- sleep(0.5)
-
- def test_deserialized_document_registration_multithreaded(self):
-
- def add_document(counter):
- m.deserialize_and_register_documents({' '.join(('Irrelevant', str(counter))):
- irrelevant_doc})
-
- normal_m = holmes.Manager(
- 'en_core_web_sm', perform_coreference_resolution=False)
- normal_m.parse_and_register_document(
- "People discuss irrelevancies", 'irrelevant')
- irrelevant_doc = normal_m.serialize_document('irrelevant')
- m = holmes.MultiprocessingManager('en_core_web_sm', number_of_workers=4,
- perform_coreference_resolution=False)
-
- for i in range(NUMBER_OF_THREADS):
- t = Thread(target=add_document, args=(i,))
- t.start()
-
- last_number_of_matches = 0
- for counter in range(50):
- document_labels = m.document_labels()
- for label in document_labels:
- self.assertTrue(label.startswith("Irrelevant"))
- if len(document_labels) == NUMBER_OF_THREADS:
- break
- self.assertFalse(counter == 49)
- sleep(0.5)
-
- def _internal_test_multithreading_topic_matching(self, number_of_workers):
-
- def topic_match_within_thread():
- normal_dict = m.topic_match_documents_returning_dictionaries_against(
- "A dog chases an animal")
- reversed_dict = m.topic_match_documents_returning_dictionaries_against(
- "The animal chased the dog")
- queue.put((normal_dict, reversed_dict))
-
- m = holmes.MultiprocessingManager('en_core_web_sm', ontology=ontology,
- number_of_workers=number_of_workers, verbose=False)
- m.parse_and_register_documents({'specific': "I saw a dog. It was chasing a cat",
- 'exact': "The dog chased the animal",
- 'specific-reversed': "The cat chased the dog",
- 'exact-reversed': "The animal chased the dog"})
- queue = Queue()
- for i in range(NUMBER_OF_THREADS):
- t = Thread(target=topic_match_within_thread)
- t.start()
- for i in range(NUMBER_OF_THREADS):
- normal_dict, reversed_dict = queue.get(True, 60)
- self.assertEqual(normal_dict, [{'document_label': 'exact', 'text': 'The dog chased the animal', 'text_to_match': 'A dog chases an animal', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 99.34666666666668, 'word_infos': [[4, 7, 'overlapping_relation', False, "Matches DOG directly."], [8, 14, 'overlapping_relation', False, "Matches CHASE directly."], [19, 25, 'overlapping_relation', True, "Matches ANIMAL directly."]]}, {'document_label': 'specific', 'text': 'I saw a dog. It was chasing a cat', 'text_to_match': 'A dog chases an animal', 'rank': '2', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 33, 'score': 81.94686666666668, 'word_infos': [[8, 11, 'overlapping_relation', False, "Matches DOG directly."], [20, 27, 'overlapping_relation', False, "Is a synonym of CHASE in the ontology."], [30, 33, 'overlapping_relation', True, "Is a child of ANIMAL in the ontology."]]}, {
- 'document_label': 'exact-reversed', 'text': 'The animal chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 35.39866666666667, 'word_infos': [[4, 10, 'single', False, "Matches ANIMAL directly."], [11, 17, 'relation', False, "Matches CHASE directly."], [22, 25, 'relation', True, "Is a child of ANIMAL in the ontology."]]}, {'document_label': 'specific-reversed', 'text': 'The cat chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 22, 'score': 34.486666666666665, 'word_infos': [[4, 7, 'single', False, "Is a child of ANIMAL in the ontology."], [8, 14, 'relation', False, "Matches CHASE directly."], [19, 22, 'relation', True, "Is a child of ANIMAL in the ontology."]]}])
- self.assertEqual(reversed_dict, [{'document_label': 'exact-reversed', 'text': 'The animal chased the dog', 'text_to_match': 'The animal chased the dog', 'rank': '1=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 96.93333333333334, 'word_infos': [[4, 10, 'overlapping_relation', False, "Matches ANIMAL directly."], [11, 17, 'overlapping_relation', True, "Matches CHASE directly."], [22, 25, 'overlapping_relation', False, "Matches DOG directly."]]}, {'document_label': 'specific-reversed', 'text': 'The cat chased the dog', 'text_to_match': 'The animal chased the dog', 'rank': '1=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 22, 'score': 87.446, 'word_infos': [[4, 7, 'overlapping_relation', False, "Is a child of ANIMAL in the ontology."], [8, 14, 'overlapping_relation', True, "Matches CHASE directly."], [19, 22, 'overlapping_relation',
- False, "Matches DOG directly."]]}, {'document_label': 'exact', 'text': 'The dog chased the animal', 'text_to_match': 'The animal chased the dog', 'rank': '3=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 30.598666666666666, 'word_infos': [[4, 7, 'relation', False, "Is a child of ANIMAL in the ontology."], [8, 14, 'relation', False, "Matches CHASE directly."], [19, 25, 'single', True, "Matches ANIMAL directly."]]}, {'document_label': 'specific', 'text': 'I saw a dog. It was chasing a cat', 'text_to_match': 'The animal chased the dog', 'rank': '3=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 33, 'score': 27.704, 'word_infos': [[8, 11, 'relation', False, "Is a child of ANIMAL in the ontology."], [20, 27, 'relation', True, "Is a synonym of CHASE in the ontology."], [30, 33, 'single', False, "Is a child of ANIMAL in the ontology."]]}])
-
- def test_multithreading_topic_matching_with_2_workers(self):
- self._internal_test_multithreading_topic_matching(2)
-
- def test_multithreading_topic_matching_with_4_workers(self):
- self._internal_test_multithreading_topic_matching(4)
-
- def test_multithreading_topic_matching_with_8_workers(self):
- self._internal_test_multithreading_topic_matching(8)
-
- def test_multithreading_filtering_with_topic_match_dictionaries(self):
- m = holmes.MultiprocessingManager('en_core_web_sm', number_of_workers=2,
- ontology=ontology, verbose=False)
-
- m.parse_and_register_documents({'T11': "The dog chased the cat",
- 'T12': "The dog chased the cat",
- 'T21': "The dog chased the cat",
- 'T22': "The dog chased the cat"})
- topic_match_dictionaries = \
- m.topic_match_documents_returning_dictionaries_against(
- "The dog chased the cat")
- self.assertEqual(len(topic_match_dictionaries), 4)
- topic_match_dictionaries = \
- m.topic_match_documents_returning_dictionaries_against(
- "The dog chased the cat", document_label_filter="T")
- self.assertEqual(len(topic_match_dictionaries), 4)
- topic_match_dictionaries = \
- m.topic_match_documents_returning_dictionaries_against(
- "The dog chased the cat", document_label_filter="T1")
- self.assertEqual(len(topic_match_dictionaries), 2)
- topic_match_dictionaries = \
- m.topic_match_documents_returning_dictionaries_against(
- "The dog chased the cat", document_label_filter="T22")
- self.assertEqual(len(topic_match_dictionaries), 1)
- topic_match_dictionaries = \
- m.topic_match_documents_returning_dictionaries_against(
- "The dog chased the cat", document_label_filter="X")
- self.assertEqual(len(topic_match_dictionaries), 0)
- m.close()
diff --git a/holmes_extractor/tests/common/test_multithreading.py b/holmes_extractor/tests/common/test_multithreading.py
deleted file mode 100644
index fae4470..0000000
--- a/holmes_extractor/tests/common/test_multithreading.py
+++ /dev/null
@@ -1,195 +0,0 @@
-import unittest
-import holmes_extractor as holmes
-import os
-import json
-from threading import Thread
-from queue import Queue
-
-NUMBER_OF_THREADS = 50
-
-script_directory = os.path.dirname(os.path.realpath(__file__))
-ontology = holmes.Ontology(os.sep.join(
- (script_directory, 'test_ontology.owl')))
-manager = holmes.Manager(
- 'en_core_web_lg', ontology=ontology, overall_similarity_threshold=0.90)
-manager.parse_and_register_document(
- "The hungry lion chased the angry gnu.", 'lion')
-manager.parse_and_register_document(
- "The hungry tiger chased the angry gnu.", 'tiger')
-manager.parse_and_register_document(
- "The hungry panther chased the angry gnu.", 'panther')
-manager.parse_and_register_document(
- "I saw a donkey. It was chasing the angry gnu.", 'donkey')
-manager.parse_and_register_document("A foal", 'foal')
-manager.register_search_phrase('A gnu is chased')
-manager.register_search_phrase('An angry gnu')
-manager.register_search_phrase('A tiger chases')
-manager.register_search_phrase('I discussed various things with ENTITYPERSON')
-manager.register_search_phrase("A horse")
-sttb = manager.get_supervised_topic_training_basis(classification_ontology=ontology,
- oneshot=False, verbose=False)
-sttb.parse_and_register_training_document("A puppy", 'puppy', 'd0')
-sttb.parse_and_register_training_document("A pussy", 'cat', 'd1')
-sttb.parse_and_register_training_document("A dog on a lead", 'dog', 'd2')
-sttb.parse_and_register_training_document("Mimi Momo", 'Mimi Momo', 'd3')
-sttb.parse_and_register_training_document("An animal", 'animal', 'd4')
-sttb.parse_and_register_training_document("A computer", 'computers', 'd5')
-sttb.parse_and_register_training_document("A robot", 'computers', 'd6')
-sttb.register_additional_classification_label('parrot')
-sttb.register_additional_classification_label('hound')
-sttb.prepare()
-trainer = sttb.train(minimum_occurrences=0, cv_threshold=0, mlp_max_iter=10000)
-stc = trainer.classifier()
-
-
-class MultithreadingTest(unittest.TestCase):
-
- def _process_threads(self, method, first_argument, expected_output):
- queue = Queue()
- for i in range(NUMBER_OF_THREADS):
- t = Thread(target=method,
- args=(first_argument, queue))
- t.start()
- for i in range(NUMBER_OF_THREADS):
- output = queue.get(True, 5)
- self.assertEqual(output, expected_output)
-
- def _match_against_documents_within_thread(self, search_phrase, queue):
- queue.put(manager.match_documents_against(search_phrase))
-
- def _inner_match_against_documents(self, search_phrase, expected_output):
- self._process_threads(self._match_against_documents_within_thread,
- search_phrase, expected_output)
-
- def _match_against_search_phrases_within_thread(self, document, queue):
- queue.put(manager.match_search_phrases_against(document))
-
- def _inner_match_against_search_phrases(self, document, expected_output):
- self._process_threads(self._match_against_search_phrases_within_thread,
- document, expected_output)
-
- def _inner_classify(self, documents, expected_output):
- self._process_threads(self._classify_within_thread,
- documents, expected_output)
-
- def _classify_within_thread(self, documents, queue):
- output = []
- for document in documents:
- output.append(stc.parse_and_classify(document))
- queue.put(output)
-
- def test_multithreading_matching_against_documents_general(self):
- self._inner_match_against_documents("A gnu is chased",
- [{'search_phrase': 'A gnu is chased', 'document': 'lion', 'index_within_document': 3, 'sentences_within_document': 'The hungry lion chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}, {'search_phrase_word': 'chase', 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'explanation': "Matches CHASE directly."}]}, {'search_phrase': 'A gnu is chased', 'document': 'tiger', 'index_within_document': 3, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}, {'search_phrase_word': 'chase', 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'explanation': "Matches CHASE directly."}]}, {'search_phrase': 'A gnu is chased', 'document': 'panther', 'index_within_document': 3, 'sentences_within_document': 'The hungry panther chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}, {'search_phrase_word': 'chase', 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'explanation': "Matches CHASE directly."}]}, {'search_phrase': 'A gnu is chased', 'document': 'donkey', 'index_within_document': 7, 'sentences_within_document': 'It was chasing the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}, {'search_phrase_word': 'chase', 'document_word': 'chasing', 'document_phrase': 'chasing', 'match_type': 'ontology', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chasing', 'explanation': "Is a synonym of CHASE in the ontology."}]}])
- self._inner_match_against_documents("An angry gnu",
- [{'search_phrase': 'An angry gnu', 'document': 'lion', 'index_within_document': 6, 'sentences_within_document': 'The hungry lion chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'angry', 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'explanation': "Matches ANGRY directly."}, {'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}]}, {'search_phrase': 'An angry gnu', 'document': 'tiger', 'index_within_document': 6, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'angry', 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'explanation': "Matches ANGRY directly."}, {'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}]}, {'search_phrase': 'An angry gnu', 'document': 'panther', 'index_within_document': 6, 'sentences_within_document': 'The hungry panther chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'angry', 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'explanation': "Matches ANGRY directly."}, {'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}]}, {'search_phrase': 'An angry gnu', 'document': 'donkey', 'index_within_document': 10, 'sentences_within_document': 'It was chasing the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'angry', 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'explanation': "Matches ANGRY directly."}, {'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}]}])
-
- def test_multithreading_matching_against_documents_coreference(self):
- self._inner_match_against_documents("A donkey chases",
- [{'search_phrase': 'A donkey chases', 'document': 'donkey', 'index_within_document': 7, 'sentences_within_document': 'I saw a donkey. It was chasing the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': True, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'donkey', 'document_word': 'donkey', 'document_phrase': 'a donkey', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': True, 'extracted_word': 'donkey', 'explanation': "Matches DONKEY directly."}, {'search_phrase_word': 'chase', 'document_word': 'chasing', 'document_phrase': 'chasing', 'match_type': 'ontology', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chasing', 'explanation': "Is a synonym of CHASE in the ontology."}]}])
-
- def test_multithreading_matching_against_documents_embedding_matching(self):
- self._inner_match_against_documents("A tiger chases a gnu",
- [{'search_phrase': 'A tiger chases a gnu', 'document': 'tiger', 'index_within_document': 3, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'tiger', 'document_word': 'tiger', 'document_phrase': 'The hungry tiger', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'tiger', 'explanation': "Matches TIGER directly."}, {'search_phrase_word': 'chase', 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'explanation': "Matches CHASE directly."}, {'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}]}, {'search_phrase': 'A tiger chases a gnu', 'document': 'lion', 'index_within_document': 3, 'sentences_within_document': 'The hungry lion chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '0.90286449', 'word_matches': [{'search_phrase_word': 'tiger', 'document_word': 'lion', 'document_phrase': 'The hungry lion', 'match_type': 'embedding', 'similarity_measure': '0.7359829', 'involves_coreference': False, 'extracted_word': 'lion', 'explanation': "Has a word embedding that is 73% similar to TIGER."}, {'search_phrase_word': 'chase', 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'explanation': "Matches CHASE directly."}, {'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}]}])
-
- def test_multithreading_matching_against_documents_ontology_matching(self):
- self._inner_match_against_documents("A horse",
- [{'search_phrase': 'A horse', 'document': 'foal', 'index_within_document': 1, 'sentences_within_document': 'A foal', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'horse', 'document_word': 'foal', 'document_phrase': 'A foal', 'match_type': 'ontology', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'foal', 'explanation': "Is a child of HORSE in the ontology."}]}])
-
- def test_multithreading_matching_against_search_phrases_general(self):
- self._inner_match_against_search_phrases("The hungry lion chased the angry gnu.",
- [{'search_phrase': 'A gnu is chased', 'document': '', 'index_within_document': 3, 'sentences_within_document': 'The hungry lion chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}, {'search_phrase_word': 'chase', 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'explanation': "Matches CHASE directly."}]}, {'search_phrase': 'An angry gnu', 'document': '', 'index_within_document': 6, 'sentences_within_document': 'The hungry lion chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'angry', 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'explanation': "Matches ANGRY directly."}, {'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}]}])
- self._inner_match_against_search_phrases("The hungry tiger chased the angry gnu.",
- [{'search_phrase': 'A gnu is chased', 'document': '', 'index_within_document': 3, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}, {'search_phrase_word': 'chase', 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'explanation': "Matches CHASE directly."}]}, {'search_phrase': 'An angry gnu', 'document': '', 'index_within_document': 6, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'angry', 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'explanation': "Matches ANGRY directly."}, {'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}]}, {'search_phrase': 'A tiger chases', 'document': '', 'index_within_document': 3, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'tiger', 'document_word': 'tiger', 'document_phrase': 'The hungry tiger', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'tiger', 'explanation': "Matches TIGER directly."}, {'search_phrase_word': 'chase', 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'explanation': "Matches CHASE directly."}]}])
- self._inner_match_against_search_phrases(
- "I saw a hungry panther. It was chasing an angry gnu.",
- [{'search_phrase': 'A gnu is chased', 'document': '', 'index_within_document': 8, 'sentences_within_document': 'It was chasing an angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'an angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}, {'search_phrase_word': 'chase', 'document_word': 'chasing', 'document_phrase': 'chasing', 'match_type': 'ontology', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chasing', 'explanation': "Is a synonym of CHASE in the ontology."}]}, {'search_phrase': 'An angry gnu', 'document': '', 'index_within_document': 11, 'sentences_within_document': 'It was chasing an angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'angry', 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'explanation': "Matches ANGRY directly."}, {'search_phrase_word': 'gnu', 'document_word': 'gnu', 'document_phrase': 'an angry gnu', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'explanation': "Matches GNU directly."}]}])
-
- def test_multithreading_matching_against_search_phrases_entity_matching(self):
- self._inner_match_against_search_phrases(
- "I discussed various things with Richard Hudson.",
- [{'search_phrase': 'I discussed various things with ENTITYPERSON', 'document': '', 'index_within_document': 1, 'sentences_within_document': 'I discussed various things with Richard Hudson.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'discuss', 'document_word': 'discuss', 'document_phrase': 'discussed', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'discuss', 'explanation': "Matches DISCUSS directly."}, {'search_phrase_word': 'various', 'document_word': 'various', 'document_phrase': 'various', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'various', 'explanation': "Matches VARIOUS directly."}, {'search_phrase_word': 'thing', 'document_word': 'thing', 'document_phrase': 'various things', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'thing', 'explanation': "Matches THING directly."}, {'search_phrase_word': 'with', 'document_word': 'with', 'document_phrase': 'with', 'match_type': 'direct', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'with', 'explanation': "Matches WITH directly."}, {'search_phrase_word': 'ENTITYPERSON', 'document_word': 'Richard Hudson', 'document_phrase': 'Richard Hudson', 'match_type': 'entity', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'Richard Hudson', 'explanation': 'Matches the ENTITYPERSON placeholder.'}]}])
-
- def test_multithreading_matching_against_search_phrases_ontology_matching(self):
- self._inner_match_against_search_phrases(
- "I saw a foal.",
- [{'search_phrase': 'A horse', 'document': '', 'index_within_document': 3, 'sentences_within_document': 'I saw a foal.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_word': 'horse', 'document_word': 'foal', 'document_phrase': 'a foal', 'match_type': 'ontology', 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'foal', 'explanation': "Is a child of HORSE in the ontology."}]}])
-
- def test_multithreading_supervised_document_classification(self):
-
- self._inner_classify(["You are a robot.", "You are a cat",
- "My name is Charles and I like sewing.",
- "Your dog appears to be on a lead."],
- [['computers'], ['animal'], [], ['animal', 'dog', 'hound']])
-
- def test_multithreading_topic_matching(self):
-
- def topic_match_within_thread():
- topic_matches = manager.topic_match_documents_against(
- "Once upon a time a foal chased a hungry panther")
- output = [topic_matches[0].document_label, topic_matches[0].text,
- topic_matches[1].document_label, topic_matches[1].text]
- queue.put(output)
-
- queue = Queue()
- for i in range(NUMBER_OF_THREADS):
- t = Thread(target=topic_match_within_thread)
- t.start()
- for i in range(NUMBER_OF_THREADS):
- output = queue.get(True, 60)
- self.assertEqual(output, ['panther', 'The hungry panther chased the angry gnu.',
- 'foal', 'A foal'])
-
- def test_document_and_search_phrase_registration(self):
-
- def add_document_and_search_phrase(counter):
- manager.parse_and_register_document("People discuss irrelevancies",
- ' '.join(('Irrelevant', str(counter))))
- manager.register_search_phrase("People discuss irrelevancies")
-
- for i in range(NUMBER_OF_THREADS):
- t = Thread(target=add_document_and_search_phrase, args=(i,))
- t.start()
-
- last_number_of_matches = 0
- for counter in range(100):
- matches = [match for match in manager.match() if
- match.search_phrase_label == "People discuss irrelevancies"]
- for match in matches:
- self.assertTrue(match.document_label.startswith('Irrelevant'))
- self.assertFalse(match.is_negated)
- self.assertFalse(match.is_uncertain)
- self.assertFalse(match.involves_coreference)
- self.assertFalse(match.from_single_word_phraselet)
- self.assertEqual(match.overall_similarity_measure, '1.0')
- self.assertEqual(match.index_within_document, 1)
- self.assertEqual(match.word_matches[0].document_word, 'People')
- self.assertEqual(
- match.word_matches[0].search_phrase_word, 'people')
- self.assertEqual(match.word_matches[0].type, 'direct')
- self.assertEqual(match.word_matches[0].document_token.i, 0)
- self.assertEqual(
- match.word_matches[0].search_phrase_token.i, 0)
- self.assertEqual(
- match.word_matches[1].document_word, 'discuss')
- self.assertEqual(
- match.word_matches[1].search_phrase_word, 'discuss')
- self.assertEqual(match.word_matches[1].type, 'direct')
- self.assertEqual(match.word_matches[1].document_token.i, 1)
- self.assertEqual(
- match.word_matches[1].search_phrase_token.i, 1)
- self.assertEqual(
- match.word_matches[2].document_word, 'irrelevancy')
- self.assertEqual(
- match.word_matches[2].search_phrase_word, 'irrelevancy')
- self.assertEqual(match.word_matches[2].type, 'direct')
- self.assertEqual(match.word_matches[2].document_token.i, 2)
- self.assertEqual(
- match.word_matches[2].search_phrase_token.i, 2)
-
- this_number_of_matches = len(matches)
- self.assertFalse(this_number_of_matches < last_number_of_matches)
- last_number_of_matches = this_number_of_matches
- if this_number_of_matches == NUMBER_OF_THREADS * NUMBER_OF_THREADS:
- break
- self.assertFalse(counter == 99)
diff --git a/holmes_extractor/tests/common/test_serialization.py b/holmes_extractor/tests/common/test_serialization.py
deleted file mode 100644
index 168a96b..0000000
--- a/holmes_extractor/tests/common/test_serialization.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import unittest
-import os
-import holmes_extractor as holmes
-
-script_directory = os.path.dirname(os.path.realpath(__file__))
-ontology = holmes.Ontology(os.sep.join(
- (script_directory, 'test_ontology.owl')))
-nocoref_holmes_manager = holmes.Manager('en_core_web_lg',
- perform_coreference_resolution=False)
-nocoref_holmes_manager.register_search_phrase("A dog chases a cat")
-german_holmes_manager = holmes.Manager('de_core_news_md')
-
-
-class SerializationTest(unittest.TestCase):
-
- def test_matching_with_nocoref_holmes_manager_document_after_serialization(self):
- nocoref_holmes_manager.remove_all_documents()
- nocoref_holmes_manager.parse_and_register_document(
- "The cat was chased by the dog", 'pets')
- serialized_doc = nocoref_holmes_manager.serialize_document('pets')
- self.assertEqual(len(nocoref_holmes_manager.match()), 1)
-
- def test_matching_with_reserialized_nocoref_holmes_manager_document(self):
- nocoref_holmes_manager.remove_all_documents()
- nocoref_holmes_manager.parse_and_register_document(
- "The cat was chased by the dog", 'pets')
- serialized_doc = nocoref_holmes_manager.serialize_document('pets')
- nocoref_holmes_manager.remove_all_documents()
- nocoref_holmes_manager.deserialize_and_register_document(
- serialized_doc, 'pets')
- self.assertEqual(len(nocoref_holmes_manager.match()), 1)
-
- def test_matching_with_both_documents(self):
- nocoref_holmes_manager.remove_all_documents()
- nocoref_holmes_manager.parse_and_register_document(
- "The cat was chased by the dog", 'pets')
- serialized_doc = nocoref_holmes_manager.serialize_document('pets')
- nocoref_holmes_manager.deserialize_and_register_document(
- serialized_doc, 'pets2')
- self.assertEqual(len(nocoref_holmes_manager.match()), 2)
-
- def test_document_to_serialize_does_not_exist(self):
- nocoref_holmes_manager.remove_all_documents()
- serialized_doc = nocoref_holmes_manager.serialize_document('pets')
- self.assertEqual(serialized_doc, None)
-
- def test_matching_with_both_documents(self):
- nocoref_holmes_manager.remove_all_documents()
- nocoref_holmes_manager.parse_and_register_document(
- "The cat was chased by the dog", 'pets')
- serialized_doc = nocoref_holmes_manager.serialize_document('pets')
- nocoref_holmes_manager.deserialize_and_register_document(
- serialized_doc, 'pets2')
- self.assertEqual(len(nocoref_holmes_manager.match()), 2)
-
- def test_parent_token_indexes(self):
- nocoref_holmes_manager.remove_all_documents()
- nocoref_holmes_manager.parse_and_register_document(
- "Houses in the village.", 'village')
- serialized_doc = nocoref_holmes_manager.serialize_document('village')
- nocoref_holmes_manager.deserialize_and_register_document(
- serialized_doc, 'village2')
- old_doc = nocoref_holmes_manager.threadsafe_container.get_document(
- 'village')
- new_doc = nocoref_holmes_manager.threadsafe_container.get_document(
- 'village2')
- self.assertEqual(old_doc[0]._.holmes.string_representation_of_children(),
- '1:prep; 3:pobjp')
- self.assertEqual(old_doc[3]._.holmes.parent_dependencies, [
- [0, 'pobjp'], [1, 'pobj']])
- self.assertEqual(new_doc[0]._.holmes.string_representation_of_children(),
- '1:prep; 3:pobjp')
- self.assertEqual(new_doc[3]._.holmes.parent_dependencies, [
- [0, 'pobjp'], [1, 'pobj']])
-
- def test_subwords(self):
- german_holmes_manager.remove_all_documents()
- german_holmes_manager.parse_and_register_document(
- "Bundesoberbehörde.", 'bo')
- serialized_doc = german_holmes_manager.serialize_document('bo')
- german_holmes_manager.deserialize_and_register_document(
- serialized_doc, 'bo2')
- old_doc = german_holmes_manager.threadsafe_container.get_document('bo')
- new_doc = german_holmes_manager.threadsafe_container.get_document(
- 'bo2')
- self.assertEqual(old_doc[0]._.holmes.subwords[0].text, 'Bundes')
- self.assertEqual(old_doc[0]._.holmes.subwords[0].lemma, 'bund')
- self.assertEqual(old_doc[0]._.holmes.subwords[1].text, 'oberbehörde')
- self.assertEqual(old_doc[0]._.holmes.subwords[1].lemma, 'oberbehörde')
- self.assertEqual(new_doc[0]._.holmes.subwords[0].text, 'Bundes')
- self.assertEqual(new_doc[0]._.holmes.subwords[0].lemma, 'bund')
- self.assertEqual(new_doc[0]._.holmes.subwords[1].text, 'oberbehörde')
- self.assertEqual(new_doc[0]._.holmes.subwords[1].lemma, 'oberbehörde')
-
- def test_derived_lemma(self):
- nocoref_holmes_manager.remove_all_documents()
- nocoref_holmes_manager.parse_and_register_document(
- "A lot of information.", 'information')
- serialized_doc = nocoref_holmes_manager.serialize_document(
- 'information')
- nocoref_holmes_manager.deserialize_and_register_document(
- serialized_doc, 'information2')
- old_doc = nocoref_holmes_manager.threadsafe_container.get_document(
- 'information')
- new_doc = nocoref_holmes_manager.threadsafe_container.get_document(
- 'information2')
- self.assertEqual(old_doc[3]._.holmes.derived_lemma, 'inform')
- self.assertEqual(new_doc[3]._.holmes.derived_lemma, 'inform')
diff --git a/holmes_extractor/topic_matching.py b/holmes_extractor/topic_matching.py
new file mode 100644
index 0000000..68d0e81
--- /dev/null
+++ b/holmes_extractor/topic_matching.py
@@ -0,0 +1,968 @@
+from .parsing import Index, CorpusWordPosition
+
+class TopicMatch:
+ """A topic match between some text and part of a document. Note that the end indexes refer
+ to the token in question rather than to the following token.
+
+ Properties:
+
+ document_label -- the document label.
+ index_within_document -- the index of the token within the document where 'score' was achieved.
+ subword_index -- the index of the subword within the token within the document where 'score'
+ was achieved, or *None* if the match involved the whole word.
+ start_index -- the start index of the topic match within the document.
+ end_index -- the end index of the topic match within the document.
+    sentences_start_index -- the start index within the document of the sentence that contains
+        'start_index'.
+    sentences_end_index -- the end index within the document of the sentence that contains
+        'end_index'.
+    score -- the similarity score of the topic match.
+    text -- the text between 'sentences_start_index' and 'sentences_end_index'.
+ structural_matches -- a list of `Match` objects that were used to derive this object.
+ """
+
+ def __init__(
+ self, document_label, index_within_document, subword_index, start_index, end_index,
+ sentences_start_index, sentences_end_index, score, text, structural_matches):
+ self.document_label = document_label
+ self.index_within_document = index_within_document
+ self.subword_index = subword_index
+ self.start_index = start_index
+ self.end_index = end_index
+ self.sentences_start_index = sentences_start_index
+ self.sentences_end_index = sentences_end_index
+ self.score = score
+ self.text = text
+ self.structural_matches = structural_matches
+
+ @property
+ def relative_start_index(self):
+ return self.start_index - self.sentences_start_index
+
+ @property
+ def relative_end_index(self):
+ return self.end_index - self.sentences_start_index
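+
+    # Example with hypothetical values: start_index=12, end_index=15 and
+    # sentences_start_index=10 give relative_start_index=2 and relative_end_index=5,
+    # i.e. token offsets relative to the first token of the sentences covered by 'text'.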
+
+class PhraseletActivationTracker:
+ """ Tracks the activation for a specific phraselet - the most recent score
+ and the position within the document at which that score was calculated.
+ """
+ def __init__(self, position, score):
+ self.position = position
+ self.score = score
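+
+    # Example with hypothetical values: PhraseletActivationTracker(position=40, score=20.0)
+    # records that the phraselet last scored 20.0 at token index 40; the activation then
+    # decays linearly with distance from that position (see
+    # get_current_activation_for_phraselet() within TopicMatcher.perform_activation_scoring()).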
+
+class PhraseletWordMatchInfo:
+ def __init__(self):
+ self.single_word_match_corpus_words = set()
+ # The indexes at which the single word phraselet for this word was matched.
+
+ self.phraselet_labels_to_parent_match_corpus_words = {}
+ # Dictionary from phraselets with this word as the parent to indexes where the
+ # phraselet was matched.
+
+ self.phraselet_labels_to_child_match_corpus_words = {}
+ # Dictionary from phraselets with this word as the child to indexes where the
+ # phraselet was matched.
+
+ self.parent_match_corpus_words_to_matches = {}
+ # Dictionary from indexes where phraselets with this word as the parent were matched
+ # to the match objects.
+
+ self.child_match_corpus_words_to_matches = {}
+ # Dictionary from indexes where phraselets with this word as the child were matched
+ # to the match objects.
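+
+        # Taken together, these fields record, for a single lemma or derived lemma, every
+        # corpus position at which phraselets involving that word were matched, split by
+        # whether the word played the single-word, parent or child role.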
+
+class TopicMatcher:
+ """A topic matcher object. See manager.py for details of the properties."""
+
+ def __init__(
+ self, *, structural_matcher, document_labels_to_documents, corpus_index_dict,
+ text_to_match, phraselet_labels_to_phraselet_infos, phraselet_labels_to_search_phrases,
+ maximum_activation_distance, overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold, relation_score,
+ reverse_only_relation_score, single_word_score, single_word_any_tag_score,
+ initial_question_word_answer_score, initial_question_word_behaviour,
+ different_match_cutoff_score, overlapping_relation_multiplier, embedding_penalty,
+ ontology_penalty, relation_matching_frequency_threshold,
+ embedding_matching_frequency_threshold, sideways_match_extent,
+ only_one_result_per_document, number_of_results, document_label_filter,
+ use_frequency_factor):
+ self.structural_matcher = structural_matcher
+ self.semantic_matching_helper = structural_matcher.semantic_matching_helper
+ self.document_labels_to_documents = document_labels_to_documents
+ self.corpus_index_dict = corpus_index_dict
+ self.text_to_match = text_to_match
+ self.phraselet_labels_to_phraselet_infos = phraselet_labels_to_phraselet_infos
+ self.phraselet_labels_to_search_phrases = phraselet_labels_to_search_phrases
+ self.maximum_activation_distance = maximum_activation_distance
+ self.overall_similarity_threshold = overall_similarity_threshold
+ self.initial_question_word_overall_similarity_threshold = \
+ initial_question_word_overall_similarity_threshold
+ self.relation_score = relation_score
+ self.reverse_only_relation_score = reverse_only_relation_score
+ self.single_word_score = single_word_score
+ self.single_word_any_tag_score = single_word_any_tag_score
+ self.initial_question_word_answer_score = initial_question_word_answer_score
+ self.initial_question_word_behaviour = initial_question_word_behaviour
+ self.different_match_cutoff_score = different_match_cutoff_score
+ self.overlapping_relation_multiplier = overlapping_relation_multiplier
+ self.embedding_penalty = embedding_penalty
+ self.ontology_penalty = ontology_penalty
+ self.relation_matching_frequency_threshold = relation_matching_frequency_threshold
+ self.embedding_matching_frequency_threshold = embedding_matching_frequency_threshold
+ self.sideways_match_extent = sideways_match_extent
+ self.only_one_result_per_document = only_one_result_per_document
+ self.number_of_results = number_of_results
+ self.document_label_filter = document_label_filter
+ self.use_frequency_factor = use_frequency_factor
+ self.words_to_phraselet_word_match_infos = {}
+
+ process_initial_question_words = initial_question_word_behaviour in ('process', 'exclusive')
+
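+        # The matching pipeline below runs in stages: (1) single-word phraselet matches,
+        # (2) relation phraselet matches, (3) reverse-matching and embedding-based retries
+        # at corpus word positions derived from the earlier stages, followed by duplicate
+        # filtering, activation scoring and generation of the final topic matches.
+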
+ # First get single-word matches
+ structural_matches = self.structural_matcher.match(
+ document_labels_to_documents=self.document_labels_to_documents,
+ corpus_index_dict=self.corpus_index_dict,
+ search_phrases=phraselet_labels_to_search_phrases.values(),
+ match_depending_on_single_words=True,
+ compare_embeddings_on_root_words=False,
+ compare_embeddings_on_non_root_words=False,
+ reverse_matching_corpus_word_positions=None,
+ embedding_reverse_matching_corpus_word_positions=None,
+ process_initial_question_words=process_initial_question_words,
+ overall_similarity_threshold=overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=
+ initial_question_word_overall_similarity_threshold,
+ document_label_filter=self.document_label_filter)
+
+ # Now get normally matched relations
+ structural_matches.extend(self.structural_matcher.match(
+ document_labels_to_documents=self.document_labels_to_documents,
+ corpus_index_dict=self.corpus_index_dict,
+ search_phrases=phraselet_labels_to_search_phrases.values(),
+ match_depending_on_single_words=False,
+ compare_embeddings_on_root_words=False,
+ compare_embeddings_on_non_root_words=False,
+ reverse_matching_corpus_word_positions=None,
+ embedding_reverse_matching_corpus_word_positions=None,
+ process_initial_question_words=process_initial_question_words,
+ overall_similarity_threshold=overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=
+ initial_question_word_overall_similarity_threshold,
+ document_label_filter=self.document_label_filter))
+
+ self.rebuild_document_info_dict(structural_matches, phraselet_labels_to_phraselet_infos)
+ parent_direct_retry_corpus_word_positions = set()
+ parent_embedding_retry_corpus_word_positions = set()
+ child_embedding_retry_corpus_word_positions = set()
+ for phraselet in (
+ phraselet_labels_to_search_phrases[phraselet_info.label] for
+ phraselet_info in phraselet_labels_to_phraselet_infos.values() if
+ phraselet_info.child_lemma is not None):
+ self.get_indexes_for_reverse_matching(
+ phraselet=phraselet,
+ phraselet_info=phraselet_labels_to_phraselet_infos[phraselet.label],
+ parent_direct_retry_corpus_word_positions=
+ parent_direct_retry_corpus_word_positions,
+ parent_embedding_retry_corpus_word_positions=
+ parent_embedding_retry_corpus_word_positions,
+ child_embedding_retry_corpus_word_positions=
+ child_embedding_retry_corpus_word_positions)
+ if len(parent_embedding_retry_corpus_word_positions) > 0 or \
+ len(parent_direct_retry_corpus_word_positions) > 0:
+
+ # Perform reverse matching at selected indexes
+ structural_matches.extend(self.structural_matcher.match(
+ document_labels_to_documents=self.document_labels_to_documents,
+ corpus_index_dict=self.corpus_index_dict,
+ search_phrases=phraselet_labels_to_search_phrases.values(),
+ match_depending_on_single_words=False,
+ compare_embeddings_on_root_words=True,
+ compare_embeddings_on_non_root_words=False,
+ reverse_matching_corpus_word_positions=
+ parent_direct_retry_corpus_word_positions,
+ embedding_reverse_matching_corpus_word_positions=
+ parent_embedding_retry_corpus_word_positions,
+ process_initial_question_words=process_initial_question_words,
+ overall_similarity_threshold=overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=
+ initial_question_word_overall_similarity_threshold,
+ document_label_filter=self.document_label_filter))
+
+ if len(child_embedding_retry_corpus_word_positions) > 0:
+ # Retry normal matching at selected indexes with embedding-based matching on children
+ structural_matches.extend(self.structural_matcher.match(
+ document_labels_to_documents=self.document_labels_to_documents,
+ corpus_index_dict=self.corpus_index_dict,
+ search_phrases=phraselet_labels_to_search_phrases.values(),
+ match_depending_on_single_words=False,
+ compare_embeddings_on_root_words=False,
+ compare_embeddings_on_non_root_words=True,
+ reverse_matching_corpus_word_positions=None,
+ embedding_reverse_matching_corpus_word_positions=
+ child_embedding_retry_corpus_word_positions,
+ process_initial_question_words=process_initial_question_words,
+ overall_similarity_threshold=overall_similarity_threshold,
+ initial_question_word_overall_similarity_threshold=
+ initial_question_word_overall_similarity_threshold,
+ document_label_filter=self.document_label_filter))
+ if len(parent_direct_retry_corpus_word_positions) > 0 or \
+ len(parent_embedding_retry_corpus_word_positions) > 0 or \
+ len(child_embedding_retry_corpus_word_positions) > 0:
+ self.rebuild_document_info_dict(structural_matches, phraselet_labels_to_phraselet_infos)
+ structural_matches = list(filter(self.filter_superfluous_matches, structural_matches))
+ phraselet_labels_to_frequency_factors = {info.label: info.frequency_factor for info
+ in phraselet_labels_to_phraselet_infos.values()}
+ position_sorted_structural_matches = sorted(
+ structural_matches, key=lambda match:
+ (
+ match.document_label, match.index_within_document,
+ match.get_subword_index_for_sorting(),
+ len([1 for wm in match.word_matches if
+ wm.search_phrase_token._.holmes.is_initial_question_word]) == 0,
+ match.from_single_word_phraselet))
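+        # Sorting by document and position places matches covering the same document
+        # tokens next to each other, so remove_duplicates() below only needs to compare
+        # each match with its immediate predecessor.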
+ position_sorted_structural_matches = self.remove_duplicates(
+ position_sorted_structural_matches)
+
+ # Read through the documents measuring the activation based on where
+ # in the document structural matches were found
+ score_sorted_structural_matches = self.perform_activation_scoring(
+ position_sorted_structural_matches, phraselet_labels_to_frequency_factors)
+ self.topic_matches = self.generate_topic_matches(
+ score_sorted_structural_matches, position_sorted_structural_matches)
+
+ def get_phraselet_word_match_info(self, word):
+ if word in self.words_to_phraselet_word_match_infos:
+ return self.words_to_phraselet_word_match_infos[word]
+ else:
+ phraselet_word_match_info = PhraseletWordMatchInfo()
+ self.words_to_phraselet_word_match_infos[word] = phraselet_word_match_info
+ return phraselet_word_match_info
+
+ def get_indexes_for_reverse_matching(
+ self, *, phraselet, phraselet_info,
+ parent_direct_retry_corpus_word_positions,
+ parent_embedding_retry_corpus_word_positions,
+ child_embedding_retry_corpus_word_positions):
+ """
+ parent_direct_retry_corpus_word_positions -- indexes where matching against a reverse
+ matching phraselet should be attempted. These are ascertained by examining the child
+ words.
+ parent_embedding_retry_corpus_word_positions -- indexes where matching
+ against a phraselet should be attempted with embedding-based matching on the
+ parent (root) word. These are ascertained by examining the child words.
+ child_embedding_retry_corpus_word_positions -- indexes where matching
+ against a phraselet should be attempted with embedding-based matching on the
+ child (non-root) word. These are ascertained by examining the parent words.
+ """
+
+ parent_token = phraselet.root_token
+ parent_word = parent_token._.holmes.lemma_or_derived_lemma()
+ child_token = [token for token in phraselet.matchable_tokens if token.i !=
+ parent_token.i][0]
+ child_word = child_token._.holmes.lemma_or_derived_lemma()
+ if parent_word in self.words_to_phraselet_word_match_infos and ((not \
+ phraselet.reverse_only and not \
+ phraselet.treat_as_reverse_only_during_initial_relation_matching)
+ or child_token._.holmes.has_initial_question_word_in_phrase):
+ parent_phraselet_word_match_info = self.words_to_phraselet_word_match_infos[
+ parent_word]
+ parent_single_word_match_corpus_words = \
+ parent_phraselet_word_match_info.single_word_match_corpus_words
+ if phraselet.label in parent_phraselet_word_match_info.\
+ phraselet_labels_to_parent_match_corpus_words:
+ parent_relation_match_corpus_words = \
+ parent_phraselet_word_match_info.\
+ phraselet_labels_to_parent_match_corpus_words[phraselet.label]
+ else:
+ parent_relation_match_corpus_words = []
+ if phraselet_info.parent_frequency_factor >= \
+ self.embedding_matching_frequency_threshold or \
+ child_token._.holmes.has_initial_question_word_in_phrase:
+ child_embedding_retry_corpus_word_positions.update(cwp for cwp in
+ parent_single_word_match_corpus_words.difference(
+ parent_relation_match_corpus_words))
+ if child_word in self.words_to_phraselet_word_match_infos:
+ child_phraselet_word_match_info = \
+ self.words_to_phraselet_word_match_infos[child_word]
+ child_single_word_match_corpus_words = \
+ child_phraselet_word_match_info.single_word_match_corpus_words
+ if phraselet.label in child_phraselet_word_match_info.\
+ phraselet_labels_to_child_match_corpus_words:
+ child_relation_match_corpus_words = child_phraselet_word_match_info.\
+ phraselet_labels_to_child_match_corpus_words[phraselet.label]
+ else:
+ child_relation_match_corpus_words = []
+
+ if phraselet_info.child_frequency_factor >= self.embedding_matching_frequency_threshold\
+ or parent_token._.holmes.has_initial_question_word_in_phrase:
+ set_to_add_to = parent_embedding_retry_corpus_word_positions
+ elif phraselet_info.child_frequency_factor >= \
+ self.relation_matching_frequency_threshold \
+ and (phraselet.reverse_only or
+ phraselet.treat_as_reverse_only_during_initial_relation_matching):
+ set_to_add_to = parent_direct_retry_corpus_word_positions
+ else:
+ return
+ linking_dependency = parent_token._.holmes.get_label_of_dependency_with_child_index(
+ child_token.i)
+ for corpus_word_position in child_single_word_match_corpus_words.difference(
+ child_relation_match_corpus_words):
+ doc = self.document_labels_to_documents[corpus_word_position.document_label]
+ working_index = corpus_word_position.index
+ working_token = doc[working_index.token_index]
+ if not working_index.is_subword() or \
+ working_token._.holmes.subwords[working_index.subword_index].is_head:
+ for parent_dependency in \
+ working_token._.holmes.coreference_linked_parent_dependencies:
+ if self.semantic_matching_helper.dependency_labels_match(
+ search_phrase_dependency_label=linking_dependency,
+ document_dependency_label=parent_dependency[1],
+ inverse_polarity=False):
+ working_index = Index(parent_dependency[0], None)
+ working_cwp = CorpusWordPosition(corpus_word_position.document_label,
+ working_index)
+ set_to_add_to.add(working_cwp)
+ for child_dependency in \
+ working_token._.holmes.coreference_linked_child_dependencies:
+ if self.structural_matcher.use_reverse_dependency_matching and \
+ self.semantic_matching_helper.dependency_labels_match(
+ search_phrase_dependency_label=linking_dependency,
+ document_dependency_label=child_dependency[1],
+ inverse_polarity=True):
+ working_index = Index(child_dependency[0], None)
+ working_cwp = CorpusWordPosition(corpus_word_position.document_label,
+ working_index)
+ set_to_add_to.add(working_cwp)
+ else:
+ working_subword = \
+ working_token._.holmes.subwords[working_index.subword_index]
+ if self.semantic_matching_helper.dependency_labels_match(
+ search_phrase_dependency_label=linking_dependency,
+ document_dependency_label=
+ working_subword.governing_dependency_label,
+ inverse_polarity=False):
+ working_index = Index(working_index.token_index,
+ working_subword.governor_index)
+ working_cwp = CorpusWordPosition(corpus_word_position.document_label,
+ working_index)
+ set_to_add_to.add(working_cwp)
+ if self.structural_matcher.use_reverse_dependency_matching and \
+ self.semantic_matching_helper.dependency_labels_match(
+ search_phrase_dependency_label=linking_dependency,
+ document_dependency_label=
+ working_subword.dependency_label,
+ inverse_polarity=True):
+ working_index = Index(working_index.token_index,
+ working_subword.dependent_index)
+ working_cwp = CorpusWordPosition(corpus_word_position.document_label,
+ working_index)
+ set_to_add_to.add(working_cwp)
+
+ def rebuild_document_info_dict(self, matches, phraselet_labels_to_phraselet_infos):
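+        # Rebuilds self.words_to_phraselet_word_match_infos from scratch: for each lemma or
+        # derived lemma, records the corpus positions where its single-word phraselet matched
+        # and, for relation phraselets, the positions and matches where the word played the
+        # parent or child role.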
+
+ def process_word_match(match, parent): # 'True' -> parent, 'False' -> child
+ word_match = self.get_word_match_from_match(match, parent)
+ word = word_match.search_phrase_token._.holmes.lemma_or_derived_lemma()
+ phraselet_word_match_info = self.get_phraselet_word_match_info(word)
+ corpus_word_position = CorpusWordPosition(
+ match.document_label, word_match.get_document_index())
+ if parent:
+ self.add_to_dict_list(
+ phraselet_word_match_info.parent_match_corpus_words_to_matches,
+ corpus_word_position, match)
+ self.add_to_dict_list(
+ phraselet_word_match_info.phraselet_labels_to_parent_match_corpus_words,
+ match.search_phrase_label, corpus_word_position)
+ else:
+ self.add_to_dict_list(
+ phraselet_word_match_info.child_match_corpus_words_to_matches,
+ corpus_word_position, match)
+ self.add_to_dict_list(
+ phraselet_word_match_info.phraselet_labels_to_child_match_corpus_words,
+ match.search_phrase_label, corpus_word_position)
+
+ self.words_to_phraselet_word_match_infos = {}
+ for match in matches:
+ if match.from_single_word_phraselet:
+ phraselet_info = phraselet_labels_to_phraselet_infos[match.search_phrase_label]
+ word = phraselet_info.parent_derived_lemma
+ phraselet_word_match_info = self.get_phraselet_word_match_info(word)
+ word_match = match.word_matches[0]
+ phraselet_word_match_info.single_word_match_corpus_words.add(
+ CorpusWordPosition(match.document_label, word_match.get_document_index()))
+ else:
+ process_word_match(match, True)
+ process_word_match(match, False)
+
+ def filter_superfluous_matches(self, match):
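+        # Predicate used with filter(): returns False when another structural match at the
+        # same corpus position makes this one redundant, e.g. because it shows a higher
+        # similarity at the other pole, matched the whole word rather than a subword, or
+        # involves a closer coreferring token from the same phraselet.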
+
+ def get_other_matches_at_same_word(match, parent): # 'True' -> parent, 'False' -> child
+ word_match = self.get_word_match_from_match(match, parent)
+ word = word_match.search_phrase_token._.holmes.lemma_or_derived_lemma()
+ phraselet_word_match_info = self.get_phraselet_word_match_info(word)
+ corpus_word_position = CorpusWordPosition(
+ match.document_label, word_match.get_document_index())
+ if parent:
+ match_dict = phraselet_word_match_info.parent_match_corpus_words_to_matches
+ else:
+ match_dict = phraselet_word_match_info.child_match_corpus_words_to_matches
+ return match_dict[corpus_word_position]
+
+ def check_for_sibling_match_with_higher_similarity(
+ match, other_match, word_match, other_word_match):
+ # We do not want the same phraselet to match multiple siblings, so choose
+ # the sibling that is most similar to the search phrase token.
+ if self.overall_similarity_threshold == 1.0:
+ return True
+ if word_match.document_token.i == other_word_match.document_token.i:
+ return True
+ working_sibling = word_match.document_token.doc[
+ word_match.document_token._.holmes.token_or_lefthand_sibling_index]
+ for sibling in \
+ working_sibling._.holmes.loop_token_and_righthand_siblings(
+ word_match.document_token.doc):
+ if match.search_phrase_label == other_match.search_phrase_label and \
+ other_word_match.document_token.i == sibling.i and \
+ other_word_match.similarity_measure > word_match.similarity_measure:
+ return False
+ return True
+
+ def perform_checks_at_pole(match, parent): # pole is 'True' -> parent, 'False' -> child
+ this_this_pole_word_match = self.get_word_match_from_match(match, parent)
+ this_pole_index = this_this_pole_word_match.document_token.i
+ this_other_pole_word_match = self.get_word_match_from_match(match, not parent)
+ for other_this_pole_match in get_other_matches_at_same_word(match, parent):
+ other_other_pole_word_match = \
+ self.get_word_match_from_match(other_this_pole_match, not parent)
+ if this_other_pole_word_match.document_subword is not None:
+ this_other_pole_subword_index = this_other_pole_word_match.\
+ document_subword.index
+ else:
+ this_other_pole_subword_index = None
+ if other_other_pole_word_match.document_subword is not None:
+ other_other_pole_subword_index = other_other_pole_word_match.\
+ document_subword.index
+ else:
+ other_other_pole_subword_index = None
+ if this_other_pole_word_match.document_token.i == other_other_pole_word_match.\
+ document_token.i and this_other_pole_subword_index == \
+ other_other_pole_subword_index and \
+ other_other_pole_word_match.similarity_measure > \
+ this_other_pole_word_match.similarity_measure:
+ # The other match has a higher similarity measure at the other pole than
+ # this match. The matched tokens are the same. The matching phraselets
+ # must be different.
+ return False
+ if this_other_pole_word_match.document_token.i == other_other_pole_word_match.\
+ document_token.i and this_other_pole_subword_index is not None \
+ and other_other_pole_subword_index is None:
+ # This match is with a subword where the other match has matched the entire
+ # word, so this match should be removed.
+ return False
+ # Check unnecessary if parent==True as it has then already
+ # been carried out during structural matching.
+ if not parent and this_other_pole_word_match.document_token.i != \
+ other_other_pole_word_match.document_token.i and \
+ other_other_pole_word_match.document_token.i in \
+ this_other_pole_word_match.document_token._.\
+ holmes.token_and_coreference_chain_indexes and \
+ match.search_phrase_label == other_this_pole_match.search_phrase_label \
+ and (
+ (
+ abs(this_pole_index -
+ this_other_pole_word_match.document_token.i) >
+ abs(this_pole_index -
+ other_other_pole_word_match.document_token.i)
+ )
+ or
+ (
+ abs(this_pole_index -
+ this_other_pole_word_match.document_token.i) ==
+ abs(this_pole_index -
+ other_other_pole_word_match.document_token.i) and
+ this_other_pole_word_match.document_token.i >
+ other_other_pole_word_match.document_token.i
+ )
+ ):
+ # The document tokens at the other poles corefer with each other and
+ # the other match's token is closer to the second document token (the
+ # one at this pole). Both matches are from the same phraselet.
+ # If the tokens from the two matches are the same distance from the document
+ # token at this pole but on opposite sides of it, the preceding one beats
+ # the succeeding one simply because we have to choose one or the other.
+ return False
+
+ if not check_for_sibling_match_with_higher_similarity(
+ match, other_this_pole_match, this_other_pole_word_match,
+ other_other_pole_word_match):
+ return False
+ return True
+
+ if match.from_single_word_phraselet:
+ return True
+ if not perform_checks_at_pole(match, True):
+ return False
+ if not perform_checks_at_pole(match, False):
+ return False
+ return True
+
+ def remove_duplicates(self, matches):
+ # Situations where the same document tokens have been matched by multiple phraselets
+ matches_to_return = []
+ if len(matches) == 0:
+ return matches_to_return
+ else:
+ matches_to_return.append(matches[0])
+ if len(matches) > 1:
+ previous_whole_word_single_word_match = None
+ for counter in range(1, len(matches)):
+ this_match = matches[counter]
+ previous_match = matches[counter-1]
+ if this_match.index_within_document == previous_match.index_within_document:
+ if previous_match.from_single_word_phraselet and \
+ previous_match.get_subword_index() is None:
+ previous_whole_word_single_word_match = previous_match
+ if this_match.get_subword_index() is not None and \
+ previous_whole_word_single_word_match is not None and \
+ this_match.index_within_document == \
+ previous_whole_word_single_word_match.index_within_document:
+ # This match is against a subword where the whole word has also been
+ # matched, so reject it
+ continue
+ if this_match.document_label != previous_match.document_label:
+ matches_to_return.append(this_match)
+ elif len(this_match.word_matches) != len(previous_match.word_matches):
+ matches_to_return.append(this_match)
+ else:
+ this_word_matches_indexes = [
+ word_match.get_document_index() for word_match in
+ this_match.word_matches]
+ previous_word_matches_indexes = [
+ word_match.get_document_index() for word_match in
+ previous_match.word_matches]
+ # In some circumstances the two phraselets may have matched the same
+ # tokens the opposite way round
+ if sorted(this_word_matches_indexes) != \
+ sorted(previous_word_matches_indexes):
+ matches_to_return.append(this_match)
+ return matches_to_return
+
+ def get_word_match_from_match(self, match, parent):
+        # Returns the word match at the parent pole if parent==True, otherwise the word match
+        # at the child pole.
+ for word_match in match.word_matches:
+ if parent and word_match.search_phrase_token.dep_ == 'ROOT':
+ return word_match
+ if not parent and word_match.search_phrase_token.dep_ != 'ROOT':
+ return word_match
+ raise RuntimeError(''.join(('Word match not found with parent==', str(parent))))
+
+ def add_to_dict_list(self, dictionary, key, value):
+ if key in dictionary:
+ dictionary[key].append(value)
+ else:
+ dictionary[key] = [value]
+
+ def add_to_dict_set(self, dictionary, key, value):
+        if key not in dictionary:
+ dictionary[key] = set()
+ dictionary[key].add(value)
+
+ def perform_activation_scoring(self, position_sorted_structural_matches,
+ phraselet_labels_to_frequency_factors):
+ """
+ Read through the documents measuring the activation based on where
+ in the document structural matches were found.
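+
+        Each phraselet maintains its own activation: whenever the phraselet matches, its
+        activation is set to the higher of the new match score and its current, linearly
+        decayed, value. The topic score assigned to each match is the sum of the current
+        activations of all phraselets at that point in the document.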
+ """
+ def get_set_from_dict(dictionary, key):
+ if key in dictionary:
+ return dictionary[key]
+ else:
+ return set()
+
+ def is_intcompound_match_within_same_document_word(match):
+            # Where a relationship match involves subwords of the same word on both the search
+            # text side and the document side, it should receive the same activation as a
+            # single-word match.
+ return (match.search_phrase_label.startswith('intcompound') and
+ len({wm.document_token.i for wm in match.word_matches}) == 1)
+
+ def get_current_activation_for_phraselet(phraselet_activation_tracker, current_index):
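+            # The activation tails off linearly with the distance from the last match for this
+            # phraselet, reaching zero once the distance equals self.maximum_activation_distance.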
+ distance_to_last_match = current_index - phraselet_activation_tracker.position
+ tailoff_quotient = distance_to_last_match / self.maximum_activation_distance
+ tailoff_quotient = min(tailoff_quotient, 1.0)
+ return (1-tailoff_quotient) * phraselet_activation_tracker.score
+
+ document_labels_to_indexes_to_phraselet_labels = {}
+ for match in (
+ match for match in position_sorted_structural_matches if not
+ match.from_single_word_phraselet and
+ not is_intcompound_match_within_same_document_word(match)):
+ if match.document_label in document_labels_to_indexes_to_phraselet_labels:
+ inner_dict = document_labels_to_indexes_to_phraselet_labels[match.document_label]
+ else:
+ inner_dict = {}
+ document_labels_to_indexes_to_phraselet_labels[match.document_label] = inner_dict
+ parent_word_match = self.get_word_match_from_match(match, True)
+ self.add_to_dict_set(
+ inner_dict, parent_word_match.get_document_index(), match.search_phrase_label)
+ child_word_match = self.get_word_match_from_match(match, False)
+ self.add_to_dict_set(
+ inner_dict, child_word_match.get_document_index(), match.search_phrase_label)
+ current_document_label = None
+ for pssm_index, match in enumerate(position_sorted_structural_matches):
+ match.original_index_within_list = pssm_index # store for later use after resorting
+ if match.document_label != current_document_label or pssm_index == 0:
+ current_document_label = match.document_label
+ phraselet_labels_to_phraselet_activation_trackers = {}
+ indexes_to_phraselet_labels = document_labels_to_indexes_to_phraselet_labels.get(
+ current_document_label, {})
+ match.is_overlapping_relation = False
+ if match.from_single_word_phraselet or \
+ is_intcompound_match_within_same_document_word(match):
+ if match.from_topic_match_phraselet_created_without_matching_tags:
+ this_match_score = self.single_word_any_tag_score
+ else:
+ this_match_score = self.single_word_score
+ else:
+ if match.from_reverse_only_topic_match_phraselet:
+ this_match_score = self.reverse_only_relation_score
+ else:
+ this_match_score = self.relation_score
+                for word_match in match.word_matches:
+                    if word_match.search_phrase_initial_question_word:
+                        this_match_score = self.initial_question_word_answer_score
+                this_match_parent_word_match = self.get_word_match_from_match(match, True)
+                this_match_parent_index = this_match_parent_word_match.get_document_index()
+                this_match_child_word_match = self.get_word_match_from_match(match, False)
+                this_match_child_index = this_match_child_word_match.get_document_index()
+                other_relevant_phraselet_labels = get_set_from_dict(
+                    indexes_to_phraselet_labels,
+                    this_match_parent_index) | \
+                    get_set_from_dict(indexes_to_phraselet_labels, this_match_child_index)
+                other_relevant_phraselet_labels.remove(match.search_phrase_label)
+                # A relation match whose parent or child token is also involved in a match from
+                # another relation phraselet is an overlapping relation and receives a boosted
+                # score.
+                if len(other_relevant_phraselet_labels) > 0:
+                    match.is_overlapping_relation = True
+                    this_match_score *= self.overlapping_relation_multiplier
+
+ if self.use_frequency_factor:
+ # multiply the score by the frequency factor
+ this_match_score *= phraselet_labels_to_frequency_factors[match.search_phrase_label]
+
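+            # Penalize matches that rely on embedding-based similarity or on ontology traversal.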
+ overall_similarity_measure = float(match.overall_similarity_measure)
+ if overall_similarity_measure < 1.0:
+ this_match_score *= self.embedding_penalty * overall_similarity_measure
+ for word_match in (word_match for word_match in match.word_matches \
+ if word_match.word_match_type == 'ontology'):
+ this_match_score *= (self.ontology_penalty ** (abs(word_match.depth) + 1))
+ if match.search_phrase_label in phraselet_labels_to_phraselet_activation_trackers:
+ phraselet_activation_tracker = phraselet_labels_to_phraselet_activation_trackers[
+ match.search_phrase_label]
+ current_score = get_current_activation_for_phraselet(
+ phraselet_activation_tracker, match.index_within_document)
+ if this_match_score > current_score:
+ phraselet_activation_tracker.score = this_match_score
+ else:
+ phraselet_activation_tracker.score = current_score
+ phraselet_activation_tracker.position = match.index_within_document
+ else:
+ phraselet_labels_to_phraselet_activation_trackers[match.search_phrase_label] =\
+ PhraseletActivationTracker(match.index_within_document, this_match_score)
+ match.topic_score = 0
+ for phraselet_label in list(phraselet_labels_to_phraselet_activation_trackers):
+ phraselet_activation_tracker = phraselet_labels_to_phraselet_activation_trackers[
+ phraselet_label]
+ current_activation = get_current_activation_for_phraselet(
+ phraselet_activation_tracker, match.index_within_document)
+ if current_activation <= 0:
+ del phraselet_labels_to_phraselet_activation_trackers[phraselet_label]
+ else:
+ match.topic_score += current_activation
+ return sorted(position_sorted_structural_matches, key=lambda match: 0-match.topic_score)
+
+ def generate_topic_matches(
+ self, score_sorted_structural_matches, position_sorted_structural_matches):
+ """Resort the matches starting with the highest (most active) and
+ create topic match objects with information about the surrounding sentences.
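+
+        Starting from each high-scoring match not already covered by an existing topic match,
+        the match is extended sideways through neighbouring matches in the same document for as
+        long as their scores stay above self.different_match_cutoff_score and they lie within
+        self.sideways_match_extent tokens of the central match.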
+ """
+
+ def match_contained_within_existing_topic_match(topic_matches, match):
+ for topic_match in topic_matches:
+ if match.document_label == topic_match.document_label and \
+ match.index_within_document >= topic_match.start_index and \
+ match.index_within_document <= topic_match.end_index:
+ return True
+ return False
+
+ def alter_start_and_end_indexes_for_match(start_index, end_index, match):
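+            # Widens the token window [start_index, end_index] so that it covers every document
+            # token, and every subword's containing token, involved in 'match'.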
+ for word_match in match.word_matches:
+ if word_match.first_document_token.i < start_index:
+ start_index = word_match.first_document_token.i
+ if word_match.document_subword is not None and \
+ word_match.document_subword.containing_token_index < start_index:
+ start_index = word_match.document_subword.containing_token_index
+ if word_match.last_document_token.i > end_index:
+ end_index = word_match.last_document_token.i
+ if word_match.document_subword is not None and \
+ word_match.document_subword.containing_token_index > end_index:
+ end_index = word_match.document_subword.containing_token_index
+ return start_index, end_index
+
+ if self.only_one_result_per_document:
+ existing_document_labels = []
+ topic_matches = []
+ counter = 0
+ for score_sorted_match in score_sorted_structural_matches:
+ if counter >= self.number_of_results:
+ break
+ if match_contained_within_existing_topic_match(topic_matches, score_sorted_match):
+ continue
+ if self.only_one_result_per_document and score_sorted_match.document_label \
+ in existing_document_labels:
+ continue
+ start_index, end_index = alter_start_and_end_indexes_for_match(
+ score_sorted_match.index_within_document,
+ score_sorted_match.index_within_document,
+ score_sorted_match)
+ previous_index_within_list = score_sorted_match.original_index_within_list
+ while previous_index_within_list > 0 and position_sorted_structural_matches[
+ previous_index_within_list-1].document_label == \
+ score_sorted_match.document_label and position_sorted_structural_matches[
+ previous_index_within_list].topic_score > self.different_match_cutoff_score:
+                # The score at previous_index_within_list rather than at
+                # previous_index_within_list - 1 is checked because, when a complex structure is
+                # matched, it will often begin with a single noun that should be included within
+                # the topic match indexes.
+ if match_contained_within_existing_topic_match(
+ topic_matches, position_sorted_structural_matches[
+ previous_index_within_list-1]):
+ break
+ if score_sorted_match.index_within_document - position_sorted_structural_matches[
+ previous_index_within_list-1].index_within_document > \
+ self.sideways_match_extent:
+ break
+ previous_index_within_list -= 1
+ start_index, end_index = alter_start_and_end_indexes_for_match(
+ start_index, end_index,
+ position_sorted_structural_matches[previous_index_within_list])
+ next_index_within_list = score_sorted_match.original_index_within_list
+ while next_index_within_list + 1 < len(score_sorted_structural_matches) and \
+ position_sorted_structural_matches[next_index_within_list+1].document_label == \
+ score_sorted_match.document_label and \
+ position_sorted_structural_matches[next_index_within_list+1].topic_score >= \
+ self.different_match_cutoff_score:
+ if match_contained_within_existing_topic_match(
+ topic_matches, position_sorted_structural_matches[
+ next_index_within_list+1]):
+ break
+ if position_sorted_structural_matches[
+ next_index_within_list+1].index_within_document - \
+ score_sorted_match.index_within_document > self.sideways_match_extent:
+ break
+ next_index_within_list += 1
+ start_index, end_index = alter_start_and_end_indexes_for_match(
+ start_index, end_index,
+ position_sorted_structural_matches[next_index_within_list])
+ working_document = self.document_labels_to_documents[score_sorted_match.document_label]
+ relevant_sentences = [
+ sentence for sentence in working_document.sents
+ if sentence.end > start_index and sentence.start <= end_index]
+ sentences_start_index = relevant_sentences[0].start
+ sentences_end_index = relevant_sentences[-1].end
+ text = working_document[sentences_start_index: sentences_end_index].text
+ topic_matches.append(
+ TopicMatch(
+ score_sorted_match.document_label,
+ score_sorted_match.index_within_document,
+ score_sorted_match.get_subword_index(),
+ start_index, end_index, sentences_start_index, sentences_end_index - 1,
+ score_sorted_match.topic_score, text, position_sorted_structural_matches[
+ previous_index_within_list:next_index_within_list+1]))
+ if self.only_one_result_per_document:
+ existing_document_labels.append(score_sorted_match.document_label)
+ counter += 1
+ # If two matches have the same score, order them by length
+ return sorted(
+ topic_matches, key=lambda topic_match: (
+ 0-topic_match.score, topic_match.start_index - topic_match.end_index))
+
+ def get_topic_match_dictionaries(self):
+
+ class WordInfo:
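+            # Character indexes are relative to the start of the sentences covering the
+            # topic match.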
+
+ def __init__(self, relative_start_index, relative_end_index, typ, explanation):
+ self.relative_start_index = relative_start_index
+ self.relative_end_index = relative_end_index
+ self.word_match_type = typ
+ self.explanation = explanation
+ self.is_highest_activation = False
+
+ def __eq__(self, other):
+ return isinstance(other, WordInfo) and \
+ self.relative_start_index == other.relative_start_index and \
+ self.relative_end_index == other.relative_end_index
+
+ def __hash__(self):
+ return hash((self.relative_start_index, self.relative_end_index))
+
+ def get_containing_word_info_key(word_infos_to_word_infos, this_word_info):
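+            # Returns an existing word info whose character span contains that of this_word_info
+            # and extends beyond it on at least one side, or None if there is no such word info.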
+ for other_word_info in word_infos_to_word_infos:
+ if this_word_info.relative_start_index > other_word_info.relative_start_index and \
+ this_word_info.relative_end_index <= other_word_info.relative_end_index:
+ return other_word_info
+ if this_word_info.relative_start_index >= other_word_info.relative_start_index and\
+ this_word_info.relative_end_index < other_word_info.relative_end_index:
+ return other_word_info
+ return None
+
+ topic_match_dicts = []
+ for topic_match_counter, topic_match in enumerate(self.topic_matches):
+ doc = self.document_labels_to_documents[topic_match.document_label]
+ sentences_character_start_index_in_document = doc[topic_match.sentences_start_index].idx
+ sentences_character_end_index_in_document = doc[topic_match.sentences_end_index].idx + \
+ len(doc[topic_match.sentences_end_index].text)
+ word_infos_to_word_infos = {}
+ answers_set = set()
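+            # Collects the character spans to highlight (word_infos_to_word_infos) as well as,
+            # where the search text contained an initial question word, the spans containing the
+            # corresponding answers (answers_set).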
+ for match in topic_match.structural_matches:
+ for word_match in match.word_matches:
+ if word_match.document_subword is not None:
+ subword = word_match.document_subword
+ relative_start_index = doc[subword.containing_token_index].idx + \
+ subword.char_start_index - \
+ sentences_character_start_index_in_document
+ relative_end_index = relative_start_index + len(subword.text)
+ else:
+ relative_start_index = word_match.first_document_token.idx - \
+ sentences_character_start_index_in_document
+ relative_end_index = word_match.last_document_token.idx + \
+ len(word_match.last_document_token.text) - \
+ sentences_character_start_index_in_document
+ if match.is_overlapping_relation:
+ word_info = WordInfo(
+ relative_start_index, relative_end_index, 'overlapping_relation',
+ word_match.explain())
+ elif match.from_single_word_phraselet: # two subwords within word:
+ word_info = WordInfo(
+ relative_start_index, relative_end_index, 'single',
+ word_match.explain())
+ else:
+ word_info = WordInfo(
+ relative_start_index, relative_end_index, 'relation',
+ word_match.explain())
+ if word_match.search_phrase_initial_question_word:
+ if word_match.document_subword is not None:
+ answer_relative_start_index = word_match.document_token.idx - \
+ sentences_character_start_index_in_document
+ answer_relative_end_index = relative_end_index
+ else:
+ subtree_without_conjunction = \
+ self.semantic_matching_helper.get_subtree_list_for_question_answer(
+ word_match.document_token)
+ answer_relative_start_index = subtree_without_conjunction[0].idx - \
+ sentences_character_start_index_in_document
+ answer_relative_end_index = subtree_without_conjunction[-1].idx + \
+ len(subtree_without_conjunction[-1].text) - \
+ sentences_character_start_index_in_document
+ answers_set.add((answer_relative_start_index, answer_relative_end_index))
+ if word_info in word_infos_to_word_infos:
+ existing_word_info = word_infos_to_word_infos[word_info]
+                        if existing_word_info.word_match_type != 'overlapping_relation':
+ if match.is_overlapping_relation:
+ existing_word_info.word_match_type = 'overlapping_relation'
+ elif not match.from_single_word_phraselet:
+ existing_word_info.word_match_type = 'relation'
+ else:
+ word_infos_to_word_infos[word_info] = word_info
+ for word_info in list(word_infos_to_word_infos.keys()):
+ if get_containing_word_info_key(word_infos_to_word_infos, word_info) is not None:
+ del word_infos_to_word_infos[word_info]
+ if self.initial_question_word_behaviour != 'exclusive' or len(answers_set) > 0:
+ if topic_match.subword_index is not None:
+ subword = doc[topic_match.index_within_document]._.holmes.subwords\
+ [topic_match.subword_index]
+ highest_activation_relative_start_index = \
+ doc[subword.containing_token_index].idx + \
+ subword.char_start_index - \
+ sentences_character_start_index_in_document
+ highest_activation_relative_end_index = \
+ highest_activation_relative_start_index + len(subword.text)
+ else:
+ highest_activation_relative_start_index = \
+ doc[topic_match.index_within_document].idx - \
+ sentences_character_start_index_in_document
+ highest_activation_relative_end_index = \
+ doc[topic_match.index_within_document].idx \
+ + len(doc[topic_match.index_within_document].text) - \
+ sentences_character_start_index_in_document
+ highest_activation_word_info = WordInfo(
+ highest_activation_relative_start_index, highest_activation_relative_end_index,
+ 'temp', 'temp')
+ containing_word_info = get_containing_word_info_key(
+ word_infos_to_word_infos, highest_activation_word_info)
+ if containing_word_info is not None:
+ highest_activation_word_info = containing_word_info
+ word_infos_to_word_infos[highest_activation_word_info].is_highest_activation = True
+ word_infos = sorted(
+ word_infos_to_word_infos.values(), key=lambda word_info: (
+ word_info.relative_start_index, word_info.relative_end_index))
+ answers = list(answers_set)
+            answers.sort(key=lambda answer: (answer[0], answer[1]))
+ for answer in answers.copy():
+ if len([1 for other_answer in answers if other_answer[0] < answer[0] and
+ other_answer[1] >= answer[1]]) > 0:
+ answers.remove(answer)
+ elif len([1 for other_answer in answers if other_answer[0] == answer[0] and
+ other_answer[1] > answer[1]]) > 0:
+ answers.remove(answer)
+ topic_match_dict = {
+ 'document_label': topic_match.document_label,
+ 'text': topic_match.text,
+ 'text_to_match': self.text_to_match,
+ 'rank': str(topic_match_counter + 1), # ties are corrected by
+ # TopicMatchDictionaryOrderer
+ 'index_within_document': topic_match.index_within_document,
+ 'subword_index': topic_match.subword_index,
+ 'start_index': topic_match.start_index,
+ 'end_index': topic_match.end_index,
+ 'sentences_start_index': topic_match.sentences_start_index,
+ 'sentences_end_index': topic_match.sentences_end_index,
+ 'sentences_character_start_index': sentences_character_start_index_in_document,
+ 'sentences_character_end_index': sentences_character_end_index_in_document,
+ 'score': topic_match.score,
+ 'word_infos': [
+ [
+ word_info.relative_start_index, word_info.relative_end_index,
+ word_info.word_match_type, word_info.is_highest_activation,
+ word_info.explanation]
+ for word_info in word_infos],
+                # The word infos are labelled by array index alone to prevent the JSON from
+                # becoming too bloated.
+ 'answers': [[answer[0], answer[1]] for answer in answers]
+ }
+ topic_match_dicts.append(topic_match_dict)
+ return topic_match_dicts
+
+class TopicMatchDictionaryOrderer:
+    # This class is separate because it is called from the main process rather than from the
+    # workers.
+
+ def order(self, topic_match_dicts, number_of_results, tied_result_quotient):
+
+ topic_match_dicts = sorted(
+            topic_match_dicts, key=lambda topic_match_dict: (
+                0-topic_match_dict['score'], 0-len(topic_match_dict['text'].split()),
+                topic_match_dict['document_label'], topic_match_dict['word_infos'][0][0]))
+ topic_match_dicts = topic_match_dicts[0:number_of_results]
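+        # Neighbouring results whose score ratio is greater than tied_result_quotient are
+        # treated as tied and share a rank suffixed with '='.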
+ topic_match_counter = 0
+ while topic_match_counter < len(topic_match_dicts):
+ topic_match_dicts[topic_match_counter]['rank'] = str(topic_match_counter + 1)
+ following_topic_match_counter = topic_match_counter + 1
+ while following_topic_match_counter < len(topic_match_dicts) and \
+ topic_match_dicts[following_topic_match_counter]['score'] / topic_match_dicts[
+ topic_match_counter]['score'] > tied_result_quotient:
+ working_rank = ''.join((str(topic_match_counter + 1), '='))
+ topic_match_dicts[topic_match_counter]['rank'] = working_rank
+ topic_match_dicts[following_topic_match_counter]['rank'] = working_rank
+ following_topic_match_counter += 1
+ topic_match_counter = following_topic_match_counter
+ return topic_match_dicts
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..8d91941
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+ "setuptools",
+ "wheel"
+]
+build-backend = "setuptools.build_meta"
diff --git a/setup.cfg b/setup.cfg
index 96c4273..e881c16 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = holmes-extractor
-version = 2.2.1
+version = 3.0.0
description = Information extraction from English and German texts based on predicate logic
long_description = file: SHORTREADME.md
long_description_content_type = text/markdown
@@ -22,7 +22,25 @@ classifiers =
License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
Natural Language :: English
Natural Language :: German
- Programming Language :: Python :: 3.7
+ Programming Language :: Python :: 3.9
Topic :: Scientific/Engineering :: Artificial Intelligence
Topic :: Scientific/Engineering :: Information Analysis
Topic :: Text Processing :: Linguistic
+
+[options]
+include_package_data = True
+packages = find:
+python_requires = >=3.9
+install_requires =
+ spacy>=3.1.0,<3.2.0
+ coreferee~=1.1.0
+ scipy
+ sklearn
+ bs4
+ rdflib
+ jsonpickle
+ msgpack-numpy
+ falcon
+ torch
+[options.package_data]
+* = *.cfg, *.csv
diff --git a/setup.py b/setup.py
deleted file mode 100644
index ec5e670..0000000
--- a/setup.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from setuptools import setup, find_packages
-
-setup(
- packages=find_packages(),
- package_data={
- "holmes_extractor": ["data/*"]
- },
- # versions of spaCy > 2.1.0 do not currently work with neuralcoref
- install_requires=[
- 'spacy==2.1.0', 'neuralcoref==4.0.0', 'numpy', 'scipy', 'sklearn', 'bs4',
- 'rdflib', 'jsonpickle', 'msgpack-numpy', 'falcon']
-)
diff --git a/holmes_extractor/tests/common/test_errors.py b/tests/common/test_errors.py
similarity index 62%
rename from holmes_extractor/tests/common/test_errors.py
rename to tests/common/test_errors.py
index 805ffdd..7d1be56 100644
--- a/holmes_extractor/tests/common/test_errors.py
+++ b/tests/common/test_errors.py
@@ -3,11 +3,11 @@
from holmes_extractor.errors import *
import jsonpickle
-nocoref_holmes_manager = holmes.Manager('en_core_web_lg', analyze_derivational_morphology=False,
- perform_coreference_resolution=False)
+nocoref_holmes_manager = holmes.Manager('en_core_web_trf', analyze_derivational_morphology=False,
+ perform_coreference_resolution=False, number_of_workers=2)
coref_holmes_manager = holmes.Manager(
- 'en_core_web_lg', perform_coreference_resolution=True)
-german_holmes_manager = holmes.Manager('de_core_news_md')
+ 'en_core_web_trf', perform_coreference_resolution=True, number_of_workers=1)
+german_holmes_manager = holmes.Manager('de_core_news_lg', number_of_workers=1)
class ErrorsTest(unittest.TestCase):
@@ -22,24 +22,14 @@ def test_embedding_based_matching_on_root_node_where_no_embedding_based_matching
holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=1.0,
embedding_based_matching_on_root_words=True)
- def test_model_does_not_support_embeddings(self):
+ def test_number_of_workers_out_of_range(self):
with self.assertRaises(ValueError) as context:
holmes.Manager(model='en_core_web_sm',
- overall_similarity_threshold=0.85)
+ number_of_workers=0)
def test_language_not_supported(self):
with self.assertRaises(ValueError) as context:
- holmes.Manager(model='fr_core_news_sm')
-
- def test_coreference_resolution_not_supported_error(self):
- with self.assertRaises(ValueError) as context:
- holmes.Manager(model='de_core_news_md',
- perform_coreference_resolution=True)
-
- def test_coreference_resolution_not_supported_multiprocessing_manager_error(self):
- with self.assertRaises(ValueError) as context:
- holmes.MultiprocessingManager(model='de_core_news_md',
- perform_coreference_resolution=True)
+ holmes.Manager(model='pl_core_news_md')
def test_search_phrase_contains_conjunction(self):
with self.assertRaises(SearchPhraseContainsConjunctionError) as context:
@@ -53,21 +43,16 @@ def test_search_phrase_contains_negation(self):
nocoref_holmes_manager.register_search_phrase(
"A dog does not chase a cat")
- def test_search_phrase_contains_non_coreferring_pronoun(self):
- nocoref_holmes_manager.remove_all_search_phrases()
- nocoref_holmes_manager.register_search_phrase(
- "A cat has a dog chasing it")
-
def test_search_phrase_contains_pronoun_coreference_switched_off(self):
nocoref_holmes_manager.remove_all_search_phrases()
nocoref_holmes_manager.register_search_phrase(
- "A cat has a dog chasing it")
+ "A dog has a cat chasing it")
def test_search_phrase_contains_coreferring_pronoun(self):
with self.assertRaises(SearchPhraseContainsCoreferringPronounError) as context:
coref_holmes_manager.remove_all_search_phrases()
coref_holmes_manager.register_search_phrase(
- "A cat has a dog chasing it")
+ "A dog has a cat chasing it")
def test_search_phrase_contains_only_generic_pronoun(self):
with self.assertRaises(SearchPhraseWithoutMatchableWordsError) as context:
@@ -82,7 +67,7 @@ def test_search_phrase_contains_only_interrogative_pronoun(self):
def test_search_phrase_contains_only_grammatical_word(self):
with self.assertRaises(SearchPhraseWithoutMatchableWordsError) as context:
nocoref_holmes_manager.remove_all_search_phrases()
- nocoref_holmes_manager.register_search_phrase("A")
+ nocoref_holmes_manager.register_search_phrase("the")
def test_search_phrase_contains_two_normal_clauses(self):
with self.assertRaises(SearchPhraseContainsMultipleClausesError) as context:
@@ -108,77 +93,36 @@ def test_duplicate_document_with_parse_and_register_document(self):
nocoref_holmes_manager.parse_and_register_document("A", "A")
nocoref_holmes_manager.parse_and_register_document("A", "A")
- def test_duplicate_document_with_register_parsed_document(self):
- with self.assertRaises(DuplicateDocumentError) as context:
- nocoref_holmes_manager.remove_all_documents()
- holmes_doc = nocoref_holmes_manager.semantic_analyzer.parse("A")
- holmes_doc2 = nocoref_holmes_manager.semantic_analyzer.parse("B")
- nocoref_holmes_manager.register_parsed_document(holmes_doc, 'C')
- nocoref_holmes_manager.register_parsed_document(holmes_doc2, 'C')
-
- def test_duplicate_document_with_deserialize_and_register_document(self):
+ def test_duplicate_document_with_register_serialized_document(self):
with self.assertRaises(DuplicateDocumentError) as context:
nocoref_holmes_manager.remove_all_documents()
nocoref_holmes_manager.parse_and_register_document("A", '')
deserialized_doc = nocoref_holmes_manager.serialize_document('')
- nocoref_holmes_manager.deserialize_and_register_document(
+ nocoref_holmes_manager.register_serialized_document(
deserialized_doc, '')
- def test_duplicate_document_with_parse_and_register_documents_multiprocessing(self):
+ def test_duplicate_document_with_register_serialized_documents(self):
with self.assertRaises(DuplicateDocumentError) as context:
- m = holmes.MultiprocessingManager(
- 'en_core_web_sm', number_of_workers=2)
- m.parse_and_register_documents({'A': "A"})
- m.parse_and_register_documents({'A': "A"})
-
- def test_duplicate_document_with_deserialize_and_register_document_multiprocessing(self):
- with self.assertRaises(DuplicateDocumentError) as context:
- m_normal = holmes.Manager(
- 'en_core_web_sm', perform_coreference_resolution=False)
- m_normal.remove_all_documents()
- m_normal.parse_and_register_document("A", '')
- deserialized_doc = m_normal.serialize_document('')
- m = holmes.MultiprocessingManager('en_core_web_sm',
- perform_coreference_resolution=False, number_of_workers=2)
- m.deserialize_and_register_documents({'A': deserialized_doc})
- m.deserialize_and_register_documents({'A': deserialized_doc})
-
- def test_serialization_not_supported_on_serialization(self):
- with self.assertRaises(SerializationNotSupportedError) as context:
- coref_holmes_manager.remove_all_documents()
- coref_holmes_manager.parse_and_register_document("A", '')
- deserialized_doc = coref_holmes_manager.serialize_document('')
-
- def test_serialization_not_supported_on_serialization_multiprocessing(self):
- with self.assertRaises(SerializationNotSupportedError) as context:
- m_normal = holmes.Manager(
- 'en_core_web_sm', perform_coreference_resolution=False)
- m_normal.remove_all_documents()
- m_normal.parse_and_register_document("A", '')
- deserialized_doc = m_normal.serialize_document('')
- m = holmes.MultiprocessingManager(
- 'en_core_web_sm', number_of_workers=2)
- m.deserialize_and_register_documents({'A': deserialized_doc})
-
- def test_serialization_not_supported_on_deserialization(self):
- with self.assertRaises(SerializationNotSupportedError) as context:
nocoref_holmes_manager.remove_all_documents()
- coref_holmes_manager.remove_all_documents()
- coref_holmes_manager.deserialize_and_register_document("A", '')
nocoref_holmes_manager.parse_and_register_document("A", '')
deserialized_doc = nocoref_holmes_manager.serialize_document('')
- nocoref_holmes_manager.deserialize_and_register_document(
- deserialized_doc, '')
+ nocoref_holmes_manager.register_serialized_documents(
+ {'': deserialized_doc})
def test_no_search_phrase_error(self):
with self.assertRaises(NoSearchPhraseError) as context:
nocoref_holmes_manager.remove_all_search_phrases()
- nocoref_holmes_manager.match_search_phrases_against("Try this")
+ nocoref_holmes_manager.match(document_text="Try this")
+
+ def test_no_document_error_structural_match(self):
+ with self.assertRaises(NoDocumentError) as context:
+ nocoref_holmes_manager.remove_all_documents()
+ nocoref_holmes_manager.match(search_phrase_text="Try this")
- def test_no_document_error(self):
- with self.assertRaises(NoSearchedDocumentError) as context:
+ def test_no_document_error_topic_match(self):
+ with self.assertRaises(NoDocumentError) as context:
nocoref_holmes_manager.remove_all_documents()
- nocoref_holmes_manager.match_documents_against("Try this")
+ nocoref_holmes_manager.topic_match_documents_against(text_to_match="Try this")
def test_wrong_model_deserialization_error_documents(self):
with self.assertRaises(WrongModelDeserializationError) as context:
@@ -186,20 +130,18 @@ def test_wrong_model_deserialization_error_documents(self):
doc = nocoref_holmes_manager.parse_and_register_document(
"The cat was chased by the dog", 'pets')
serialized_doc = nocoref_holmes_manager.serialize_document('pets')
- german_holmes_manager.deserialize_and_register_document(
+ german_holmes_manager.register_serialized_document(
serialized_doc, 'pets')
def test_wrong_version_deserialization_error_documents(self):
with self.assertRaises(WrongVersionDeserializationError) as context:
nocoref_holmes_manager.remove_all_documents()
- doc = nocoref_holmes_manager.parse_and_register_document(
+ nocoref_holmes_manager.parse_and_register_document(
"The cat was chased by the dog", 'pets')
- serialized_doc = nocoref_holmes_manager.serialize_document('pets')
- document = jsonpickle.decode(serialized_doc)
- document._version = 1
- serialized_doc = jsonpickle.encode(document)
- nocoref_holmes_manager.deserialize_and_register_document(
- serialized_doc, 'pets2')
+ doc = nocoref_holmes_manager.get_document('pets')
+ doc._.holmes_document_info.serialized_document_version = 1
+ nocoref_holmes_manager.register_serialized_document(
+ doc.to_bytes(), 'pets2')
def test_wrong_model_deserialization_error_supervised_models(self):
with self.assertRaises(WrongModelDeserializationError) as context:
@@ -262,19 +204,58 @@ def test_no_phraselets_after_filtering_error(self):
sttb.prepare()
sttb.train()
- def test_embedding_threshold_higher_than_relation_threshold_normal_manager(self):
- with self.assertRaises(EmbeddingThresholdGreaterThanRelationThresholdError) as context:
- m = holmes.Manager('en_core_web_sm')
+ def test_embedding_threshold_too_high(self):
+ with self.assertRaises(ValueError) as context:
+ m = holmes.Manager('en_core_web_sm', number_of_workers=1)
+ m.parse_and_register_document("a")
+ coref_holmes_manager.topic_match_documents_against("b",
+ relation_matching_frequency_threshold=0.75, embedding_matching_frequency_threshold=1.5)
+
+ def test_embedding_threshold_too_low(self):
+ with self.assertRaises(ValueError) as context:
+ m = holmes.Manager('en_core_web_sm', number_of_workers=1)
+ m.parse_and_register_document("a")
+ coref_holmes_manager.topic_match_documents_against("b",
+ relation_matching_frequency_threshold=0.75, embedding_matching_frequency_threshold=-1.5)
+
+ def test_relation_threshold_too_high(self):
+ with self.assertRaises(ValueError) as context:
+ m = holmes.Manager('en_core_web_sm', number_of_workers=1)
+ m.parse_and_register_document("a")
+ coref_holmes_manager.topic_match_documents_against("b",
+ relation_matching_frequency_threshold=1.75, embedding_matching_frequency_threshold=0.5)
+
+ def test_relation_threshold_too_low(self):
+ with self.assertRaises(ValueError) as context:
+ m = holmes.Manager('en_core_web_sm', number_of_workers=1)
+ m.parse_and_register_document("a")
+ coref_holmes_manager.topic_match_documents_against("b",
+ relation_matching_frequency_threshold=-0.75, embedding_matching_frequency_threshold=-0.5)
+
+ def test_embedding_threshold_less_than_relation_threshold(self):
+ with self.assertRaises(EmbeddingThresholdLessThanRelationThresholdError) as context:
+ m = holmes.Manager('en_core_web_sm', number_of_workers=1)
+ m.parse_and_register_document("a")
+ coref_holmes_manager.topic_match_documents_against("b",
+ relation_matching_frequency_threshold=0.75, embedding_matching_frequency_threshold=0.5)
+
+ def test_word_embedding_match_threshold_out_of_range(self):
+ with self.assertRaises(ValueError) as context:
+ m = holmes.Manager('en_core_web_sm', number_of_workers=1)
+ m.parse_and_register_document("a")
+ coref_holmes_manager.topic_match_documents_against("b",
+ word_embedding_match_threshold=1.2)
+
+ def test_initial_question_word_embedding_match_threshold_out_of_range(self):
+ with self.assertRaises(ValueError) as context:
+ m = holmes.Manager('en_core_web_sm', number_of_workers=1)
+ m.parse_and_register_document("a")
+ coref_holmes_manager.topic_match_documents_against("b",
+ initial_question_word_embedding_match_threshold=-1.2)
+
+ def test_unrecognized_initial_question_word_behaviour(self):
+ with self.assertRaises(ValueError) as context:
+ m = holmes.Manager('en_core_web_sm', number_of_workers=1)
m.parse_and_register_document("a")
- coref_holmes_manager.topic_match_documents_returning_dictionaries_against("b",
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=2)
-
- def test_embedding_threshold_higher_than_relation_threshold_multiprocessing_manager(self):
- with self.assertRaises(EmbeddingThresholdGreaterThanRelationThresholdError) as context:
- m = holmes.MultiprocessingManager(
- 'en_core_web_sm', number_of_workers=1)
- m.parse_and_register_documents({'': "a"})
- m.topic_match_documents_returning_dictionaries_against("b",
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=2)
+ coref_holmes_manager.topic_match_documents_against("b",
+ initial_question_word_behaviour='r')
diff --git a/holmes_extractor/tests/common/test_matching_modes.py b/tests/common/test_manager.py
similarity index 58%
rename from holmes_extractor/tests/common/test_matching_modes.py
rename to tests/common/test_manager.py
index 981fb37..3832602 100644
--- a/holmes_extractor/tests/common/test_matching_modes.py
+++ b/tests/common/test_manager.py
@@ -2,10 +2,9 @@
import holmes_extractor as holmes
holmes_manager = holmes.Manager(
- 'en_core_web_lg', perform_coreference_resolution=False)
+ 'en_core_web_trf', perform_coreference_resolution=False, number_of_workers=2)
-
-class MatchingModesTest(unittest.TestCase):
+class ManagerTest(unittest.TestCase):
def _register_multiple_documents_and_search_phrases(self):
holmes_manager.remove_all_search_phrases()
@@ -18,47 +17,66 @@ def _register_multiple_documents_and_search_phrases(self):
"A dog chases a cat", label="test")
holmes_manager.register_search_phrase(
"A lion eats a gnu", label="test")
+ holmes_manager.register_search_phrase(
+ "irrelevancy", label="alpha")
return
def test_multiple(self):
self._register_multiple_documents_and_search_phrases()
- self.assertEqual(len(holmes_manager.match_returning_dictionaries()), 2)
+ self.assertEqual(len(holmes_manager.match()), 2)
def test_remove_all_search_phrases(self):
self._register_multiple_documents_and_search_phrases()
holmes_manager.remove_all_search_phrases()
holmes_manager.register_search_phrase("A dog chases a cat")
- self.assertEqual(len(holmes_manager.match_returning_dictionaries()), 1)
+ self.assertEqual(len(holmes_manager.match()), 1)
def test_remove_all_documents(self):
self._register_multiple_documents_and_search_phrases()
holmes_manager.remove_all_documents()
holmes_manager.parse_and_register_document(
document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets')
- self.assertEqual(len(holmes_manager.match_returning_dictionaries()), 1)
+ self.assertEqual(len(holmes_manager.match()), 1)
def test_remove_document(self):
self._register_multiple_documents_and_search_phrases()
+ holmes_manager.parse_and_register_document(
+ document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets2')
+ self.assertEqual(len(holmes_manager.match()), 3)
holmes_manager.remove_document(label='pets')
holmes_manager.remove_document(label='safari')
- holmes_manager.parse_and_register_document(
- document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets')
- self.assertEqual(len(holmes_manager.match_returning_dictionaries()), 1)
+ matches = holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self.assertEqual(matches[0]['document'], 'pets2')
def test_match_search_phrases_against(self):
self._register_multiple_documents_and_search_phrases()
- self.assertEqual(len(holmes_manager.match_search_phrases_against(
+ self.assertEqual(len(holmes_manager.match(document_text=
"All the time I am testing here, dogs keep on chasing cats.")), 1)
def test_match_documents_against(self):
self._register_multiple_documents_and_search_phrases()
- self.assertEqual(len(holmes_manager.match_documents_against(
+ self.assertEqual(len(holmes_manager.match(search_phrase_text=
"A lion eats a gnu.")), 1)
+ def test_match_documents_and_search_phrases_against(self):
+ self._register_multiple_documents_and_search_phrases()
+ self.assertEqual(len(holmes_manager.match(search_phrase_text= "burn",
+ document_text="Burn. Everything I know suggests that lions enjoy eating gnu")), 1)
+ holmes_manager.remove_all_documents()
+ holmes_manager.remove_all_search_phrases()
+ self.assertEqual(len(holmes_manager.match(search_phrase_text= "burn",
+ document_text="Burn. Everything I know suggests that lions enjoy eating gnu")), 1)
+
def test_get_labels(self):
self._register_multiple_documents_and_search_phrases()
- self.assertEqual(holmes_manager.threadsafe_container.list_search_phrase_labels(),
- ['test'])
+ self.assertEqual(holmes_manager.list_search_phrase_labels(),
+ ['alpha', 'test'])
+
+ def test_get_document(self):
+ self._register_multiple_documents_and_search_phrases()
+ self.assertEqual(holmes_manager.get_document('safari')[5]._.holmes.lemma,
+ 'lion')
def test_remove_all_search_phrases_with_label(self):
holmes_manager.remove_all_search_phrases()
@@ -68,13 +86,13 @@ def test_remove_all_search_phrases_with_label(self):
holmes_manager.register_search_phrase("testd", label="test2")
holmes_manager.remove_all_search_phrases_with_label("test2")
holmes_manager.remove_all_search_phrases_with_label("testb")
- self.assertEqual(holmes_manager.threadsafe_container.list_search_phrase_labels(),
+ self.assertEqual(holmes_manager.list_search_phrase_labels(),
['test1'])
- self.assertEqual(len(holmes_manager.match_search_phrases_against(
+ self.assertEqual(len(holmes_manager.match(document_text=
"testa")), 1)
- self.assertEqual(len(holmes_manager.match_search_phrases_against(
+ self.assertEqual(len(holmes_manager.match(document_text=
"testb")), 1)
- self.assertEqual(len(holmes_manager.match_search_phrases_against(
+ self.assertEqual(len(holmes_manager.match(document_text=
"testc")), 0)
- self.assertEqual(len(holmes_manager.match_search_phrases_against(
+ self.assertEqual(len(holmes_manager.match(document_text=
"testd")), 0)
diff --git a/tests/common/test_multithreading.py b/tests/common/test_multithreading.py
new file mode 100644
index 0000000..354b0f8
--- /dev/null
+++ b/tests/common/test_multithreading.py
@@ -0,0 +1,263 @@
+import unittest
+import holmes_extractor as holmes
+import os
+import json
+from threading import Thread
+from queue import Queue
+
+NUMBER_OF_THREADS = 50
+
+script_directory = os.path.dirname(os.path.realpath(__file__))
+ontology = holmes.Ontology(os.sep.join(
+ (script_directory, 'test_ontology.owl')))
+manager = holmes.Manager(
+ 'en_core_web_trf', ontology=ontology, overall_similarity_threshold=0.90,
+ number_of_workers=2)
+manager.parse_and_register_document(
+ "The hungry lion chased the angry gnu.", 'lion')
+manager.parse_and_register_document(
+ "The hungry tiger chased the angry gnu.", 'tiger')
+manager.parse_and_register_document(
+ "The hungry panther chased the angry gnu.", 'panther')
+manager.parse_and_register_document(
+ "I saw a donkey. It was chasing the angry gnu.", 'donkey')
+manager.parse_and_register_document("A foal", 'foal')
+manager.register_search_phrase('A gnu is chased')
+manager.register_search_phrase('An angry gnu')
+manager.register_search_phrase('A tiger chases')
+manager.register_search_phrase('I discussed various things with ENTITYPERSON')
+manager.register_search_phrase("A horse")
+sttb = manager.get_supervised_topic_training_basis(classification_ontology=ontology,
+ oneshot=False, verbose=False)
+sttb.parse_and_register_training_document("A puppy", 'puppy', 'd0')
+sttb.parse_and_register_training_document("A pussy", 'cat', 'd1')
+sttb.parse_and_register_training_document("A dog on a lead", 'dog', 'd2')
+sttb.parse_and_register_training_document("Mimi Momo", 'Mimi Momo', 'd3')
+sttb.parse_and_register_training_document("An animal", 'animal', 'd4')
+sttb.parse_and_register_training_document("A computer", 'computers', 'd5')
+sttb.parse_and_register_training_document("A robot", 'computers', 'd6')
+sttb.register_additional_classification_label('parrot')
+sttb.register_additional_classification_label('hound')
+sttb.prepare()
+trainer = sttb.train(minimum_occurrences=0, cv_threshold=0, mlp_max_iter=10000)
+stc = trainer.classifier()
+
+
+class MultithreadingTest(unittest.TestCase):
+
+ def _process_threads(self, method, first_argument, expected_output):
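+        # Runs 'method' concurrently in NUMBER_OF_THREADS threads and checks that each thread
+        # places 'expected_output' on the queue.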
+ queue = Queue()
+ for i in range(NUMBER_OF_THREADS):
+ t = Thread(target=method,
+ args=(first_argument, queue))
+ t.start()
+ for i in range(NUMBER_OF_THREADS):
+ output = queue.get(True, 20)
+ self.assertEqual(output, expected_output)
+
+ def _match_against_documents_within_thread(self, search_phrase, queue):
+ queue.put(manager.match(search_phrase_text=search_phrase))
+
+ def _inner_match_against_documents(self, search_phrase, expected_output):
+ self._process_threads(self._match_against_documents_within_thread,
+ search_phrase, expected_output)
+
+ def _match_against_search_phrases_within_thread(self, document_text, queue):
+ queue.put(manager.match(document_text=document_text))
+
+ def _inner_match_against_search_phrases(self, document_text, expected_output):
+ self._process_threads(self._match_against_search_phrases_within_thread,
+ document_text, expected_output)
+
+ def _inner_classify(self, documents, expected_output):
+ self._process_threads(self._classify_within_thread,
+ documents, expected_output)
+
+ def _classify_within_thread(self, documents, queue):
+ output = []
+ for document in documents:
+ output.append(stc.parse_and_classify(document))
+ queue.put(output)
+
+ def test_multithreading_matching_against_documents_general(self):
+ self._inner_match_against_documents("A gnu is chased",
+ [{'search_phrase_label': '', 'search_phrase_text': 'A gnu is chased', 'document': 'donkey', 'index_within_document': 7, 'sentences_within_document': 'It was chasing the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'gnu', 'document_token_index': 10, 'first_document_token_index': 10, 'last_document_token_index': 10, 'structurally_matched_document_token_index': 10, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}, {'search_phrase_token_index': 3, 'search_phrase_word': 'chase', 'document_token_index': 7, 'first_document_token_index': 7, 'last_document_token_index': 7, 'structurally_matched_document_token_index': 7, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chasing', 'document_phrase': 'chasing', 'match_type': 'ontology', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chasing', 'depth': 0, 'explanation': 'Is a synonym of CHASE in the ontology.'}]}, {'search_phrase_label': '', 'search_phrase_text': 'A gnu is chased', 'document': 'lion', 'index_within_document': 3, 'sentences_within_document': 'The hungry lion chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}, {'search_phrase_token_index': 3, 'search_phrase_word': 'chase', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 3, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'depth': 0, 'explanation': 'Matches CHASE directly.'}]}, {'search_phrase_label': '', 'search_phrase_text': 'A gnu is chased', 'document': 'panther', 'index_within_document': 3, 'sentences_within_document': 'The hungry panther chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 
'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}, {'search_phrase_token_index': 3, 'search_phrase_word': 'chase', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 3, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'depth': 0, 'explanation': 'Matches CHASE directly.'}]}, {'search_phrase_label': '', 'search_phrase_text': 'A gnu is chased', 'document': 'tiger', 'index_within_document': 3, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}, {'search_phrase_token_index': 3, 'search_phrase_word': 'chase', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 3, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'depth': 0, 'explanation': 'Matches CHASE directly.'}]}])
+ self._inner_match_against_documents("An angry gnu",
+ [{'search_phrase_label': '', 'search_phrase_text': 'An angry gnu', 'document': 'donkey', 'index_within_document': 10, 'sentences_within_document': 'It was chasing the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'angry', 'document_token_index': 9, 'first_document_token_index': 9, 'last_document_token_index': 9, 'structurally_matched_document_token_index': 9, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'depth': 0, 'explanation': 'Matches ANGRY directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'gnu', 'document_token_index': 10, 'first_document_token_index': 10, 'last_document_token_index': 10, 'structurally_matched_document_token_index': 10, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}]}, {'search_phrase_label': '', 'search_phrase_text': 'An angry gnu', 'document': 'lion', 'index_within_document': 6, 'sentences_within_document': 'The hungry lion chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'angry', 'document_token_index': 5, 'first_document_token_index': 5, 'last_document_token_index': 5, 'structurally_matched_document_token_index': 5, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'depth': 0, 'explanation': 'Matches ANGRY directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}]}, {'search_phrase_label': '', 'search_phrase_text': 'An angry gnu', 'document': 'panther', 'index_within_document': 6, 'sentences_within_document': 'The hungry panther chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'angry', 'document_token_index': 5, 'first_document_token_index': 5, 'last_document_token_index': 5, 'structurally_matched_document_token_index': 5, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 
'involves_coreference': False, 'extracted_word': 'angry', 'depth': 0, 'explanation': 'Matches ANGRY directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}]}, {'search_phrase_label': '', 'search_phrase_text': 'An angry gnu', 'document': 'tiger', 'index_within_document': 6, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'angry', 'document_token_index': 5, 'first_document_token_index': 5, 'last_document_token_index': 5, 'structurally_matched_document_token_index': 5, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'depth': 0, 'explanation': 'Matches ANGRY directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}]}])
+
+ def test_multithreading_matching_against_documents_coreference(self):
+ self._inner_match_against_documents("A donkey chases",
+ [{'search_phrase_label': '', 'search_phrase_text': 'A donkey chases', 'document': 'donkey', 'index_within_document': 7, 'sentences_within_document': 'I saw a donkey. It was chasing the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': True, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'donkey', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 5, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'donkey', 'document_phrase': 'a donkey', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': True, 'extracted_word': 'donkey', 'depth': 0, 'explanation': 'Matches DONKEY directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'chase', 'document_token_index': 7, 'first_document_token_index': 7, 'last_document_token_index': 7, 'structurally_matched_document_token_index': 7, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chasing', 'document_phrase': 'chasing', 'match_type': 'ontology', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chasing', 'depth': 0, 'explanation': 'Is a synonym of CHASE in the ontology.'}]}])
+
+ def test_multithreading_matching_against_documents_embedding_matching(self):
+ self._inner_match_against_documents("A tiger chases a gnu",
+ [{'search_phrase_label': '', 'search_phrase_text': 'A tiger chases a gnu', 'document': 'tiger', 'index_within_document': 3, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'tiger', 'document_token_index': 2, 'first_document_token_index': 2, 'last_document_token_index': 2, 'structurally_matched_document_token_index': 2, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'tiger', 'document_phrase': 'The hungry tiger', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'tiger', 'depth': 0, 'explanation': 'Matches TIGER directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'chase', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 3, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'depth': 0, 'explanation': 'Matches CHASE directly.'}, {'search_phrase_token_index': 4, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}]}, {'search_phrase_label': '', 'search_phrase_text': 'A tiger chases a gnu', 'document': 'lion', 'index_within_document': 3, 'sentences_within_document': 'The hungry lion chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '0.90286449', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'tiger', 'document_token_index': 2, 'first_document_token_index': 2, 'last_document_token_index': 2, 'structurally_matched_document_token_index': 2, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'lion', 'document_phrase': 'The hungry lion', 'match_type': 'embedding', 'negated': False, 'uncertain': False, 'similarity_measure': '0.7359829', 'involves_coreference': False, 'extracted_word': 'lion', 'depth': 0, 'explanation': 'Has a word embedding that is 73% similar to TIGER.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'chase', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 3, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'depth': 0, 'explanation': 'Matches CHASE directly.'}, {'search_phrase_token_index': 4, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 
'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}]}])
+
+ def test_multithreading_matching_against_documents_ontology_matching(self):
+ self._inner_match_against_documents("A horse",
+ [{'search_phrase_label': '', 'search_phrase_text': 'A horse', 'document': 'foal', 'index_within_document': 1, 'sentences_within_document': 'A foal', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'horse', 'document_token_index': 1, 'first_document_token_index': 1, 'last_document_token_index': 1, 'structurally_matched_document_token_index': 1, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'foal', 'document_phrase': 'A foal', 'match_type': 'ontology', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'foal', 'depth': 1, 'explanation': 'Is a child of HORSE in the ontology.'}]}])
+
+ def test_multithreading_matching_against_search_phrases_general(self):
+ self._inner_match_against_search_phrases("The hungry lion chased the angry gnu.",
+ [{'search_phrase_label': 'A gnu is chased', 'search_phrase_text': 'A gnu is chased', 'document': '', 'index_within_document': 3, 'sentences_within_document': 'The hungry lion chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}, {'search_phrase_token_index': 3, 'search_phrase_word': 'chase', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 3, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'depth': 0, 'explanation': 'Matches CHASE directly.'}]}, {'search_phrase_label': 'An angry gnu', 'search_phrase_text': 'An angry gnu', 'document': '', 'index_within_document': 6, 'sentences_within_document': 'The hungry lion chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'angry', 'document_token_index': 5, 'first_document_token_index': 5, 'last_document_token_index': 5, 'structurally_matched_document_token_index': 5, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'depth': 0, 'explanation': 'Matches ANGRY directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}]}])
+ self._inner_match_against_search_phrases("The hungry tiger chased the angry gnu.",
+ [{'search_phrase_label': 'A gnu is chased', 'search_phrase_text': 'A gnu is chased', 'document': '', 'index_within_document': 3, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}, {'search_phrase_token_index': 3, 'search_phrase_word': 'chase', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 3, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'depth': 0, 'explanation': 'Matches CHASE directly.'}]}, {'search_phrase_label': 'A tiger chases', 'search_phrase_text': 'A tiger chases', 'document': '', 'index_within_document': 3, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'tiger', 'document_token_index': 2, 'first_document_token_index': 2, 'last_document_token_index': 2, 'structurally_matched_document_token_index': 2, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'tiger', 'document_phrase': 'The hungry tiger', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'tiger', 'depth': 0, 'explanation': 'Matches TIGER directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'chase', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 3, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chase', 'document_phrase': 'chased', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chase', 'depth': 0, 'explanation': 'Matches CHASE directly.'}]}, {'search_phrase_label': 'An angry gnu', 'search_phrase_text': 'An angry gnu', 'document': '', 'index_within_document': 6, 'sentences_within_document': 'The hungry tiger chased the angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'angry', 'document_token_index': 5, 'first_document_token_index': 5, 'last_document_token_index': 5, 'structurally_matched_document_token_index': 5, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'negated': False, 'uncertain': False, 
'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'depth': 0, 'explanation': 'Matches ANGRY directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'gnu', 'document_token_index': 6, 'first_document_token_index': 6, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'the angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}]}])
+ self._inner_match_against_search_phrases(
+ "I saw a hungry panther. It was chasing an angry gnu.",
+ [{'search_phrase_label': 'A gnu is chased', 'search_phrase_text': 'A gnu is chased', 'document': '', 'index_within_document': 8, 'sentences_within_document': 'It was chasing an angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'gnu', 'document_token_index': 11, 'first_document_token_index': 11, 'last_document_token_index': 11, 'structurally_matched_document_token_index': 11, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'an angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}, {'search_phrase_token_index': 3, 'search_phrase_word': 'chase', 'document_token_index': 8, 'first_document_token_index': 8, 'last_document_token_index': 8, 'structurally_matched_document_token_index': 8, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'chasing', 'document_phrase': 'chasing', 'match_type': 'ontology', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'chasing', 'depth': 0, 'explanation': 'Is a synonym of CHASE in the ontology.'}]}, {'search_phrase_label': 'An angry gnu', 'search_phrase_text': 'An angry gnu', 'document': '', 'index_within_document': 11, 'sentences_within_document': 'It was chasing an angry gnu.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'angry', 'document_token_index': 10, 'first_document_token_index': 10, 'last_document_token_index': 10, 'structurally_matched_document_token_index': 10, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'angry', 'document_phrase': 'angry', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'angry', 'depth': 0, 'explanation': 'Matches ANGRY directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'gnu', 'document_token_index': 11, 'first_document_token_index': 11, 'last_document_token_index': 11, 'structurally_matched_document_token_index': 11, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'gnu', 'document_phrase': 'an angry gnu', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'gnu', 'depth': 0, 'explanation': 'Matches GNU directly.'}]}])
+
+ def test_multithreading_matching_against_search_phrases_entity_matching(self):
+ self._inner_match_against_search_phrases(
+ "I discussed various things with Richard Hudson.",
+ [{'search_phrase_label': 'I discussed various things with ENTITYPERSON', 'search_phrase_text': 'I discussed various things with ENTITYPERSON', 'document': '', 'index_within_document': 1, 'sentences_within_document': 'I discussed various things with Richard Hudson.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'discuss', 'document_token_index': 1, 'first_document_token_index': 1, 'last_document_token_index': 1, 'structurally_matched_document_token_index': 1, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'discuss', 'document_phrase': 'discussed', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'discuss', 'depth': 0, 'explanation': 'Matches DISCUSS directly.'}, {'search_phrase_token_index': 2, 'search_phrase_word': 'various', 'document_token_index': 2, 'first_document_token_index': 2, 'last_document_token_index': 2, 'structurally_matched_document_token_index': 2, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'various', 'document_phrase': 'various', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'various', 'depth': 0, 'explanation': 'Matches VARIOUS directly.'}, {'search_phrase_token_index': 3, 'search_phrase_word': 'thing', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 3, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'thing', 'document_phrase': 'various things', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'thing', 'depth': 0, 'explanation': 'Matches THING directly.'}, {'search_phrase_token_index': 4, 'search_phrase_word': 'with', 'document_token_index': 4, 'first_document_token_index': 4, 'last_document_token_index': 4, 'structurally_matched_document_token_index': 4, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'with', 'document_phrase': 'with', 'match_type': 'direct', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'with', 'depth': 0, 'explanation': 'Matches WITH directly.'}, {'search_phrase_token_index': 5, 'search_phrase_word': 'ENTITYPERSON', 'document_token_index': 6, 'first_document_token_index': 5, 'last_document_token_index': 6, 'structurally_matched_document_token_index': 6, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'Richard Hudson', 'document_phrase': 'Richard Hudson', 'match_type': 'entity', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'Richard Hudson', 'depth': 0, 'explanation': 'Has an entity label matching ENTITYPERSON.'}]}])
+
+ def test_multithreading_matching_against_search_phrases_ontology_matching(self):
+ self._inner_match_against_search_phrases(
+ "I saw a foal.",
+ [{'search_phrase_label': 'A horse', 'search_phrase_text': 'A horse', 'document': '', 'index_within_document': 3, 'sentences_within_document': 'I saw a foal.', 'negated': False, 'uncertain': False, 'involves_coreference': False, 'overall_similarity_measure': '1.0', 'word_matches': [{'search_phrase_token_index': 1, 'search_phrase_word': 'horse', 'document_token_index': 3, 'first_document_token_index': 3, 'last_document_token_index': 3, 'structurally_matched_document_token_index': 3, 'document_subword_index': None, 'document_subword_containing_token_index': None, 'document_word': 'foal', 'document_phrase': 'a foal', 'match_type': 'ontology', 'negated': False, 'uncertain': False, 'similarity_measure': '1.0', 'involves_coreference': False, 'extracted_word': 'foal', 'depth': 1, 'explanation': 'Is a child of HORSE in the ontology.'}]}])
+
+ def test_multithreading_supervised_document_classification(self):
+
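+        # _inner_classify (presumably a helper defined earlier in this test class) classifies each text and
+        # compares the results with the expected label lists, which correspond position-by-position to the inputs.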
+ self._inner_classify(["You are a robot.", "You are a cat",
+ "My name is Charles and I like sewing.",
+ "Your dog appears to be on a lead."],
+ [['computers'], ['animal'], [], ['animal', 'dog', 'hound']])
+
+ def test_multithreading_topic_matching(self):
+
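+        # Each thread runs the same topic match and enqueues the document labels and texts of its top two
+        # matches so the main thread can verify them below.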
+ def topic_match_within_thread():
+ topic_matches = manager.topic_match_documents_against(
+ "Once upon a time a foal chased a hungry panther")
+ output = [topic_matches[0]['document_label'], topic_matches[0]['text'],
+ topic_matches[1]['document_label'], topic_matches[1]['text']]
+ queue.put(output)
+
+ queue = Queue()
+ for i in range(NUMBER_OF_THREADS):
+ t = Thread(target=topic_match_within_thread)
+ t.start()
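+        # Collect one result per thread, allowing a generous timeout, and check that every thread saw the same ranking.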
+ for i in range(NUMBER_OF_THREADS):
+ output = queue.get(True, 180)
+ self.assertEqual(output, ['panther', 'The hungry panther chased the angry gnu.',
+ 'foal', 'A foal'])
+
+ def test_parsed_document_and_search_phrase_registration(self):
+
+ def add_document_and_search_phrase(counter):
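+            # Each thread registers a uniquely labelled copy of the document together with an identically worded search phrase.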
+ manager.parse_and_register_document("People discuss relevancies",
+ ' '.join(('Relevant', str(counter))))
+ manager.register_search_phrase("People discuss relevancies")
+
+ manager.remove_all_documents()
+ manager.parse_and_register_document('something')
+
+ for i in range(NUMBER_OF_THREADS):
+ t = Thread(target=add_document_and_search_phrase, args=(i,))
+ t.start()
+
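+        # Poll until every thread's document has been matched by every thread's search phrase
+        # (NUMBER_OF_THREADS ** 2 matches in total); the match count must never decrease between polls.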
+ last_number_of_matches = 0
+ for counter in range(500):
+ matches = [match for match in manager.match() if
+ match['search_phrase_label'] == "People discuss relevancies"]
+ for match in matches:
+ self.assertTrue(match['document'].startswith('Relevant'))
+ self.assertFalse(match['negated'])
+ self.assertFalse(match['uncertain'])
+ self.assertFalse(match['involves_coreference'])
+ self.assertEqual(match['overall_similarity_measure'], '1.0')
+ self.assertEqual(match['index_within_document'], 1)
+ self.assertEqual(match['word_matches'][0]['document_word'], 'People')
+ self.assertEqual(
+ match['word_matches'][0]['search_phrase_word'], 'people')
+ self.assertEqual(match['word_matches'][0]['match_type'], 'direct')
+ self.assertEqual(match['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(
+ match['word_matches'][0]['search_phrase_token_index'], 0)
+ self.assertEqual(
+ match['word_matches'][1]['document_word'], 'discuss')
+ self.assertEqual(
+ match['word_matches'][1]['search_phrase_word'], 'discuss')
+ self.assertEqual(match['word_matches'][1]['match_type'], 'direct')
+ self.assertEqual(match['word_matches'][1]['document_token_index'], 1)
+ self.assertEqual(
+ match['word_matches'][1]['search_phrase_token_index'], 1)
+ self.assertEqual(
+ match['word_matches'][2]['document_word'], 'relevancy')
+ self.assertEqual(
+ match['word_matches'][2]['search_phrase_word'], 'relevancy')
+ self.assertEqual(match['word_matches'][2]['match_type'], 'direct')
+ self.assertEqual(match['word_matches'][2]['document_token_index'], 2)
+ self.assertEqual(
+ match['word_matches'][2]['search_phrase_token_index'], 2)
+
+ this_number_of_matches = len(matches)
+ self.assertFalse(this_number_of_matches < last_number_of_matches)
+ last_number_of_matches = this_number_of_matches
+ if this_number_of_matches == NUMBER_OF_THREADS * NUMBER_OF_THREADS:
+ break
+ self.assertFalse(counter == 499)
+ dictionary, maximum = manager.get_corpus_frequency_information()
+ self.assertEqual(dictionary['people'], NUMBER_OF_THREADS)
+ self.assertEqual(dictionary['discuss'], NUMBER_OF_THREADS)
+ self.assertEqual(dictionary['relevancy'], NUMBER_OF_THREADS)
+ self.assertEqual(maximum, NUMBER_OF_THREADS)
+
+ def test_serialized_document_and_search_phrase_registration(self):
+
+ def add_document_and_search_phrase(counter):
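+            # Each thread registers the same pre-serialized document under a unique label together with an
+            # identically worded search phrase.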
+ manager.register_serialized_document(serialized_document,
+ ' '.join(('Irrelevant', str(counter))))
+ manager.register_search_phrase("People discuss irrelevancies")
+
+ serialized_document = manager.nlp('People discuss irrelevancies').to_bytes()
+
+ manager.remove_all_documents()
+ manager.parse_and_register_document('something')
+ for i in range(NUMBER_OF_THREADS):
+ t = Thread(target=add_document_and_search_phrase, args=(i,))
+ t.start()
+
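+        # As in the previous test, poll until all NUMBER_OF_THREADS ** 2 matches have appeared,
+        # checking that the count never decreases between polls.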
+ last_number_of_matches = 0
+ for counter in range(500):
+ matches = [match for match in manager.match() if
+ match['search_phrase_label'] == "People discuss irrelevancies"]
+ for match in matches:
+ self.assertTrue(match['document'].startswith('Irrelevant'))
+ self.assertFalse(match['negated'])
+ self.assertFalse(match['uncertain'])
+ self.assertFalse(match['involves_coreference'])
+ self.assertEqual(match['overall_similarity_measure'], '1.0')
+ self.assertEqual(match['index_within_document'], 1)
+ self.assertEqual(match['word_matches'][0]['document_word'], 'People')
+ self.assertEqual(
+ match['word_matches'][0]['search_phrase_word'], 'people')
+ self.assertEqual(match['word_matches'][0]['match_type'], 'direct')
+ self.assertEqual(match['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(
+ match['word_matches'][0]['search_phrase_token_index'], 0)
+ self.assertEqual(
+ match['word_matches'][1]['document_word'], 'discuss')
+ self.assertEqual(
+ match['word_matches'][1]['search_phrase_word'], 'discuss')
+ self.assertEqual(match['word_matches'][1]['match_type'], 'direct')
+ self.assertEqual(match['word_matches'][1]['document_token_index'], 1)
+ self.assertEqual(
+ match['word_matches'][1]['search_phrase_token_index'], 1)
+ self.assertEqual(
+ match['word_matches'][2]['document_word'], 'irrelevancy')
+ self.assertEqual(
+ match['word_matches'][2]['search_phrase_word'], 'irrelevancy')
+ self.assertEqual(match['word_matches'][2]['match_type'], 'direct')
+ self.assertEqual(match['word_matches'][2]['document_token_index'], 2)
+ self.assertEqual(
+ match['word_matches'][2]['search_phrase_token_index'], 2)
+
+ this_number_of_matches = len(matches)
+ self.assertFalse(this_number_of_matches < last_number_of_matches)
+ last_number_of_matches = this_number_of_matches
+ if this_number_of_matches == NUMBER_OF_THREADS * NUMBER_OF_THREADS:
+ break
+ self.assertFalse(counter == 499)
+ dictionary, maximum = manager.get_corpus_frequency_information()
+ self.assertEqual(dictionary['irrelevancy'], NUMBER_OF_THREADS)
+ self.assertEqual(maximum, NUMBER_OF_THREADS)
diff --git a/holmes_extractor/tests/common/test_ontology.owl b/tests/common/test_ontology.owl
similarity index 100%
rename from holmes_extractor/tests/common/test_ontology.owl
rename to tests/common/test_ontology.owl
diff --git a/holmes_extractor/tests/common/test_ontology.py b/tests/common/test_ontology.py
similarity index 100%
rename from holmes_extractor/tests/common/test_ontology.py
rename to tests/common/test_ontology.py
diff --git a/holmes_extractor/tests/common/test_ontology2.owl b/tests/common/test_ontology2.owl
similarity index 98%
rename from holmes_extractor/tests/common/test_ontology2.owl
rename to tests/common/test_ontology2.owl
index c288439..0de73cc 100644
--- a/holmes_extractor/tests/common/test_ontology2.owl
+++ b/tests/common/test_ontology2.owl
@@ -1,23 +1,23 @@
 [The OWL/XML markup of this hunk was not preserved in the extracted diff.]
diff --git a/tests/common/test_serialization.py b/tests/common/test_serialization.py
new file mode 100644
index 0000000..a92daa9
--- /dev/null
+++ b/tests/common/test_serialization.py
@@ -0,0 +1,122 @@
+import unittest
+import os
+import holmes_extractor as holmes
+
+script_directory = os.path.dirname(os.path.realpath(__file__))
+ontology = holmes.Ontology(os.sep.join(
+ (script_directory, 'test_ontology.owl')))
+holmes_manager = holmes.Manager('en_core_web_trf', number_of_workers=2)
+holmes_manager.register_search_phrase("A dog chases a cat")
+german_holmes_manager = holmes.Manager('de_core_news_lg', number_of_workers=2)
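+# Both managers above are shared across the test cases below; each uses two worker processes.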
+
+
+class SerializationTest(unittest.TestCase):
+
+ def test_matching_with_holmes_manager_document_after_serialization(self):
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "The cat was chased by the dog", 'pets')
+ serialized_doc = holmes_manager.serialize_document('pets')
+ self.assertEqual(len(holmes_manager.match()), 1)
+
+ def test_matching_with_reserialized_holmes_manager_document(self):
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "The cat was chased by the dog", 'pets')
+ serialized_doc = holmes_manager.serialize_document('pets')
+ holmes_manager.remove_all_documents()
+ holmes_manager.register_serialized_document(
+ serialized_doc, 'pets')
+ self.assertEqual(len(holmes_manager.match()), 1)
+
+ def test_matching_with_multiple_reserialized_holmes_manager_document(self):
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "The cat was chased by the dog", 'pets')
+ serialized_doc = holmes_manager.serialize_document('pets')
+ working_dict = {'pets': serialized_doc, 'pets2': serialized_doc}
+ holmes_manager.remove_all_documents()
+ holmes_manager.register_serialized_documents(working_dict)
+ self.assertEqual(len(holmes_manager.match()), 2)
+
+ def test_serialization_with_coreference(self):
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "I saw a cat. It was chased by the dog", 'pets')
+ serialized_doc = holmes_manager.serialize_document('pets')
+ holmes_manager.remove_all_documents()
+ holmes_manager.register_serialized_document(
+ serialized_doc, 'pets')
+ self.assertEqual(len(holmes_manager.match()), 1)
+
+ def test_matching_with_both_documents(self):
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "The cat was chased by the dog", 'pets')
+ serialized_doc = holmes_manager.serialize_document('pets')
+ holmes_manager.register_serialized_document(
+ serialized_doc, 'pets2')
+ self.assertEqual(len(holmes_manager.match()), 2)
+
+ def test_document_to_serialize_does_not_exist(self):
+ holmes_manager.remove_all_documents()
+ serialized_doc = holmes_manager.serialize_document('pets')
+ self.assertEqual(serialized_doc, None)
+
+ def test_parent_token_indexes(self):
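+        # Checks that the Holmes dependency annotations (children, parents and coreference-linked parent
+        # dependencies) survive a serialization round trip.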
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "Houses in the village.", 'village')
+ serialized_doc = holmes_manager.serialize_document('village')
+ holmes_manager.register_serialized_document(
+ serialized_doc, 'village2')
+ old_doc = holmes_manager.get_document(
+ 'village')
+ new_doc = holmes_manager.get_document(
+ 'village2')
+ self.assertEqual(old_doc[0]._.holmes.string_representation_of_children(),
+ '1:prep; 3:pobjp')
+ self.assertEqual(old_doc[3]._.holmes.string_representation_of_parents(),
+ '0:pobjp; 1:pobj')
+ self.assertEqual(old_doc[3]._.holmes.coreference_linked_parent_dependencies, [
+ [0, 'pobjp'], [1, 'pobj']])
+ self.assertEqual(new_doc[0]._.holmes.string_representation_of_children(),
+ '1:prep; 3:pobjp')
+ self.assertEqual(new_doc[3]._.holmes.coreference_linked_parent_dependencies, [
+ [0, 'pobjp'], [1, 'pobj']])
+ self.assertEqual(new_doc[3]._.holmes.string_representation_of_parents(),
+ '0:pobjp; 1:pobj')
+
+ def test_subwords(self):
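+        # Checks that German compound subwords and their lemmas survive a serialization round trip.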
+ german_holmes_manager.remove_all_documents()
+ german_holmes_manager.parse_and_register_document(
+ "Bundesoberbehörde.", 'bo')
+ serialized_doc = german_holmes_manager.serialize_document('bo')
+ german_holmes_manager.register_serialized_document(
+ serialized_doc, 'bo2')
+ old_doc = german_holmes_manager.get_document('bo')
+ new_doc = german_holmes_manager.get_document(
+ 'bo2')
+ self.assertEqual(old_doc[0]._.holmes.subwords[0].text, 'Bundes')
+ self.assertEqual(old_doc[0]._.holmes.subwords[0].lemma, 'bund')
+ self.assertEqual(old_doc[0]._.holmes.subwords[1].text, 'oberbehörde')
+ self.assertEqual(old_doc[0]._.holmes.subwords[1].lemma, 'oberbehörde')
+ self.assertEqual(new_doc[0]._.holmes.subwords[0].text, 'Bundes')
+ self.assertEqual(new_doc[0]._.holmes.subwords[0].lemma, 'bund')
+ self.assertEqual(new_doc[0]._.holmes.subwords[1].text, 'oberbehörde')
+ self.assertEqual(new_doc[0]._.holmes.subwords[1].lemma, 'oberbehörde')
+
+ def test_derived_lemma(self):
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "A lot of information.", 'information')
+ serialized_doc = holmes_manager.serialize_document(
+ 'information')
+ holmes_manager.register_serialized_document(
+ serialized_doc, 'information2')
+ old_doc = holmes_manager.get_document(
+ 'information')
+ new_doc = holmes_manager.get_document(
+ 'information2')
+ self.assertEqual(old_doc[3]._.holmes.derived_lemma, 'inform')
+ self.assertEqual(new_doc[3]._.holmes.derived_lemma, 'inform')
diff --git a/holmes_extractor/tests/common/test_word_level_matching.py b/tests/common/test_word_level_matching.py
similarity index 70%
rename from holmes_extractor/tests/common/test_word_level_matching.py
rename to tests/common/test_word_level_matching.py
index 33dab05..68f96fa 100644
--- a/holmes_extractor/tests/common/test_word_level_matching.py
+++ b/tests/common/test_word_level_matching.py
@@ -4,15 +4,15 @@
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join((script_directory,'test_ontology.owl')))
-holmes_manager_coref = holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=0.85,
+holmes_manager_coref = holmes.Manager(model='en_core_web_trf', overall_similarity_threshold=0.82,
embedding_based_matching_on_root_words=True, ontology=ontology,
- perform_coreference_resolution=False)
+ perform_coreference_resolution=False, number_of_workers=2)
holmes_manager_coref.register_search_phrase('A dog chases a cat')
holmes_manager_coref.register_search_phrase('An ENTITYPERSON chases a horse')
holmes_manager_coref.register_search_phrase('A king wakes up')
holmes_manager_coref.register_search_phrase('A cat creature jumps')
holmes_manager_coref.register_search_phrase('cat creature')
-holmes_manager_coref.register_search_phrase('An industrious king loved by all')
+holmes_manager_coref.register_search_phrase('An industrious king loved by all.')
holmes_manager_coref.register_search_phrase('A narcissistic king')
holmes_manager_coref.register_search_phrase('A splendid king')
holmes_manager_coref.register_search_phrase('A kind king')
@@ -21,23 +21,24 @@
holmes_manager_coref.register_search_phrase("A strong attraction")
symmetric_ontology = holmes.Ontology(os.sep.join((script_directory,'test_ontology.owl')),
symmetric_matching=True)
-second_holmes_manager_coref = holmes.Manager(model='en_core_web_lg', overall_similarity_threshold=0.85,
+second_holmes_manager_coref = holmes.Manager(model='en_core_web_trf', overall_similarity_threshold=0.82,
embedding_based_matching_on_root_words=False, ontology=symmetric_ontology,
- perform_coreference_resolution=False)
+ perform_coreference_resolution=False, number_of_workers=1)
second_holmes_manager_coref.register_search_phrase('A narcissistic king')
second_holmes_manager_coref.register_search_phrase('A king wakes up')
second_holmes_manager_coref.register_search_phrase('A kitten goes to bed')
second_holmes_manager_coref.register_search_phrase('Mimi Momo goes to bed')
second_holmes_manager_coref.register_search_phrase('A dog goes to bed')
+second_holmes_manager_coref.register_search_phrase('A man makes an announcement')
second_holmes_manager_coref.register_search_phrase('unouno')
second_holmes_manager_coref.register_search_phrase('sześć')
class WordMatchingTest(unittest.TestCase):
def test_direct_matching(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry='The dog chased the cat')
+ text_matches = holmes_manager_coref.match(document_text='The dog chased the cat')
self.assertEqual(len(text_matches), 2)
- self.assertEqual(text_matches[0]['search_phrase'], 'A dog chases a cat')
+ self.assertEqual(text_matches[0]['search_phrase_label'], 'A dog chases a cat')
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'direct')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Matches DOG directly.")
@@ -45,21 +46,21 @@ def test_direct_matching(self):
self.assertEqual(text_matches[0]['word_matches'][2]['match_type'], 'direct')
def test_entity_matching(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry='Richard Hudson chased the horse')
+ text_matches = holmes_manager_coref.match(document_text='Richard Hudson chased the horse')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'entity')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
- "Matches the ENTITYPERSON placeholder.")
+ "Has an entity label matching ENTITYPERSON.")
def test_ontology_matching(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry='The dog chased the kitten')
+ text_matches = holmes_manager_coref.match(document_text='The dog chased the kitten')
self.assertEqual(len(text_matches), 2)
self.assertEqual(text_matches[0]['word_matches'][2]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][2]['explanation'],
"Is a child of CAT in the ontology.")
def test_embedding_matching(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry='The queen woke up')
+ text_matches = holmes_manager_coref.match(document_text='The queen woke up')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'embedding')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
@@ -68,7 +69,7 @@ def test_embedding_matching(self):
"Matches WAKE UP directly.")
def test_embedding_matching_on_root_node(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry='An industrious queen loved by all')
+ text_matches = holmes_manager_coref.match(document_text='An industrious queen loved by all')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][1]['match_type'], 'embedding')
@@ -82,19 +83,19 @@ def test_embedding_matching_on_root_node_with_multiple_templates(self):
label='narcissistic toolbox')
holmes_manager_coref.parse_and_register_document('A splendid toolbox', label='splendid toolbox')
holmes_manager_coref.parse_and_register_document('A kind toolbox', label='kind toolbox')
- text_matches = holmes_manager_coref.match_returning_dictionaries()
+ text_matches = holmes_manager_coref.match()
self.assertEqual(len(text_matches), 3)
for text_match in text_matches:
self.assertTrue(text_match['document'].endswith('queen'))
def test_multiword_matching_multiword_in_document(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry='Fido chased Mimi Momo')
+ text_matches = holmes_manager_coref.match(document_text='Fido chased Mimi Momo')
self.assertEqual(len(text_matches), 2)
self.assertEqual(text_matches[0]['word_matches'][2]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][2]['document_word'], 'Mimi Momo')
def test_multiword_matching_multiword_in_search_phrase(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry='The cat jumped')
+ text_matches = holmes_manager_coref.match(document_text='The cat jumped')
self.assertEqual(len(text_matches), 2)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['document_word'], 'cat')
@@ -104,7 +105,7 @@ def test_multiword_matching_multiword_in_search_phrase(self):
self.assertEqual(text_matches[1]['word_matches'][0]['search_phrase_word'], 'cat creature')
def test_multiword_matching_multiword_in_document_and_search_phrase(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry='Mimi Momo jumped')
+ text_matches = holmes_manager_coref.match(document_text='Mimi Momo jumped')
self.assertEqual(len(text_matches), 2)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['document_word'], 'Mimi Momo')
@@ -114,90 +115,90 @@ def test_multiword_matching_multiword_in_document_and_search_phrase(self):
self.assertEqual(text_matches[1]['word_matches'][0]['search_phrase_word'], 'cat creature')
def test_search_phrase_with_entity_root_single_word(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry=
- 'Peter went to Mallorca')
+ text_matches = holmes_manager_coref.match(document_text=
+ 'Mallorca is a large municipality.')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'entity')
self.assertEqual(text_matches[0]['word_matches'][0]['document_word'], 'Mallorca')
def test_search_phrase_with_entity_root_multiword(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry=
- 'Peter went to New York')
+ text_matches = holmes_manager_coref.match(document_text=
+ 'New York is a large municipality.')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'entity')
self.assertEqual(text_matches[0]['word_matches'][0]['document_word'], 'New York')
def test_ontology_multiword_matches_exactly(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(entry='a cat creature')
+ text_matches = holmes_manager_coref.match(document_text='a cat creature')
self.assertEqual(len(text_matches), 2)
self.assertEqual(text_matches[0]['word_matches'][0]['document_word'], 'cat')
self.assertEqual(text_matches[1]['word_matches'][0]['document_word'], 'cat creature')
def test_embedding_matching_on_root_node_when_inactive(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='A narcissistic queen')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='A narcissistic queen')
self.assertEqual(len(text_matches), 0)
def test_embedding_matching_when_embedding_root_node_inactive(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(entry='The queen woke up')
+ text_matches = second_holmes_manager_coref.match(document_text='The queen woke up')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'embedding')
def test_symmetric_ontology_single_word_match(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='an animal goes to bed')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='an animal goes to bed')
self.assertEqual(len(text_matches), 3)
- self.assertEqual(text_matches[0]['search_phrase'], 'A kitten goes to bed')
- self.assertEqual(text_matches[1]['search_phrase'], 'Mimi Momo goes to bed')
- self.assertEqual(text_matches[2]['search_phrase'], 'A dog goes to bed')
+ self.assertEqual(text_matches[0]['search_phrase_label'], 'A kitten goes to bed')
+ self.assertEqual(text_matches[1]['search_phrase_label'], 'Mimi Momo goes to bed')
+ self.assertEqual(text_matches[2]['search_phrase_label'], 'A dog goes to bed')
def test_symmetric_ontology_multiword_word_match(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='a cat creature goes to bed')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='a cat creature goes to bed')
self.assertEqual(len(text_matches), 2)
- self.assertEqual(text_matches[0]['search_phrase'], 'A kitten goes to bed')
- self.assertEqual(text_matches[1]['search_phrase'], 'Mimi Momo goes to bed')
+ self.assertEqual(text_matches[0]['search_phrase_label'], 'A kitten goes to bed')
+ self.assertEqual(text_matches[1]['search_phrase_label'], 'Mimi Momo goes to bed')
def test_symmetric_ontology_same_word_match_on_normal_word(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='a kitten goes to bed')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='a kitten goes to bed')
self.assertEqual(len(text_matches), 2)
- self.assertEqual(text_matches[0]['search_phrase'], 'A kitten goes to bed')
+ self.assertEqual(text_matches[0]['search_phrase_label'], 'A kitten goes to bed')
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'direct')
- self.assertEqual(text_matches[1]['search_phrase'], 'A dog goes to bed')
+ self.assertEqual(text_matches[1]['search_phrase_label'], 'A dog goes to bed')
self.assertEqual(text_matches[1]['word_matches'][0]['match_type'], 'embedding')
def test_symmetric_ontology_same_word_match_on_individual(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='Mimi Momo goes to bed')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='Mimi Momo goes to bed')
self.assertEqual(len(text_matches), 1)
- self.assertEqual(text_matches[0]['search_phrase'], 'Mimi Momo goes to bed')
+ self.assertEqual(text_matches[0]['search_phrase_label'], 'Mimi Momo goes to bed')
def test_symmetric_ontology_hyponym_match_on_normal_word(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='A puppy goes to bed')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='A puppy goes to bed')
self.assertEqual(len(text_matches), 2)
- self.assertEqual(text_matches[0]['search_phrase'], 'A dog goes to bed')
+ self.assertEqual(text_matches[0]['search_phrase_label'], 'A dog goes to bed')
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
- self.assertEqual(text_matches[1]['search_phrase'], 'A kitten goes to bed')
+ self.assertEqual(text_matches[1]['search_phrase_label'], 'A kitten goes to bed')
self.assertEqual(text_matches[1]['word_matches'][0]['match_type'], 'embedding')
def test_symmetric_ontology_hyponym_match_on_individual(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='Fido goes to bed')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='Fido goes to bed')
self.assertEqual(len(text_matches), 1)
- self.assertEqual(text_matches[0]['search_phrase'], 'A dog goes to bed')
+ self.assertEqual(text_matches[0]['search_phrase_label'], 'A dog goes to bed')
def test_index_within_document(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(
- entry='Last week a dog chased a cat')
+ text_matches = holmes_manager_coref.match(
+ document_text='Last week a dog chased a cat')
self.assertEqual(len(text_matches), 2)
self.assertEqual(text_matches[0]['index_within_document'], 4)
self.assertEqual(text_matches[1]['index_within_document'], 6)
def test_derivation_matching_1(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(
- entry='A strong belief')
+ text_matches = holmes_manager_coref.match(
+ document_text='A strong belief')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'derivation')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
@@ -205,96 +206,104 @@ def test_derivation_matching_1(self):
self.assertEqual(text_matches[0]['word_matches'][1]['match_type'], 'derivation')
def test_derivation_matching_2(self):
- text_matches = holmes_manager_coref.match_search_phrases_against(
- entry='Someone is strongly attracted')
+ text_matches = holmes_manager_coref.match(
+ document_text='Someone is strongly attracted')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'derivation')
self.assertEqual(text_matches[0]['word_matches'][1]['match_type'], 'derivation')
def test_ontology_matching_depth_0(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='oans')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='oans')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is a synonym of UNOUNO in the ontology.")
def test_ontology_matching_depth_1(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='dos')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='dos')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is a child of UNOUNO in the ontology.")
def test_ontology_matching_depth_2(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='tres')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='tres')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is a grandchild of UNOUNO in the ontology.")
def test_ontology_matching_depth_3(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='cuatro')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='cuatro')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is a great-grandchild of UNOUNO in the ontology.")
def test_ontology_matching_depth_4(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='cinco')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='cinco')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is a descendant of UNOUNO in the ontology.")
def test_ontology_matching_depth_5(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='seis')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='seis')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is a descendant of UNOUNO in the ontology.")
def test_ontology_matching_depth_minus_1(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='pięć')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='pięć')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is a parent of SZEŚĆ in the ontology.")
def test_ontology_matching_depth_minus_2(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='cztery')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='cztery')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is a grandparent of SZEŚĆ in the ontology.")
def test_ontology_matching_depth_minus_3(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='trzy')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='trzy')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is a great-grandparent of SZEŚĆ in the ontology.")
def test_ontology_matching_depth_minus_4(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='dwa')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='dwa')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is an ancestor of SZEŚĆ in the ontology.")
def test_ontology_matching_depth_minus_5(self):
- text_matches = second_holmes_manager_coref.match_search_phrases_against(
- entry='jeden')
+ text_matches = second_holmes_manager_coref.match(
+ document_text='jeden')
self.assertEqual(len(text_matches), 1)
self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'ontology')
self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
"Is an ancestor of SZEŚĆ in the ontology.")
+
+ def test_entity_embedding_matching(self):
+ text_matches = second_holmes_manager_coref.match(
+ document_text='Richard Hudson made an announcement')
+ self.assertEqual(len(text_matches), 1)
+ self.assertEqual(text_matches[0]['word_matches'][0]['match_type'], 'entity_embedding')
+ self.assertEqual(text_matches[0]['word_matches'][0]['explanation'],
+ "Has an entity label that is 55% similar to the word embedding corresponding to MAN.")
diff --git a/holmes_extractor/tests/de/test_ontology.owl b/tests/de/test_ontology.owl
similarity index 100%
rename from holmes_extractor/tests/de/test_ontology.owl
rename to tests/de/test_ontology.owl
diff --git a/holmes_extractor/tests/de/test_phraselet_production_DE.py b/tests/de/test_phraselet_production_DE.py
similarity index 79%
rename from holmes_extractor/tests/de/test_phraselet_production_DE.py
rename to tests/de/test_phraselet_production_DE.py
index 08e24ef..bc765d7 100644
--- a/holmes_extractor/tests/de/test_phraselet_production_DE.py
+++ b/tests/de/test_phraselet_production_DE.py
@@ -5,119 +5,139 @@
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
(script_directory, 'test_ontology.owl')))
-holmes_manager = holmes.Manager('de_core_news_md', ontology=ontology)
+holmes_manager = holmes.Manager('de_core_news_lg', ontology=ontology, number_of_workers=1)
class GermanPhraseletProductionTest(unittest.TestCase):
def _check_equals(self, text_to_match, phraselet_labels, match_all_words=False,
- include_reverse_only=False, replace_with_hypernym_ancestors=False):
+ include_reverse_only=False, replace_with_hypernym_ancestors=False,
+ process_initial_question_words=False):
doc = holmes_manager.semantic_analyzer.parse(text_to_match)
phraselet_labels_to_phraselet_infos = {}
- holmes_manager.structural_matcher.add_phraselets_to_dict(doc,
+ holmes_manager.linguistic_object_factory.add_phraselets_to_dict(doc,
phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
replace_with_hypernym_ancestors=replace_with_hypernym_ancestors,
match_all_words=match_all_words,
ignore_relation_phraselets=False,
include_reverse_only=include_reverse_only,
- stop_lemmas=holmes_manager.semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=holmes_manager.semantic_analyzer.
- topic_matching_reverse_only_parent_lemmas)
+ stop_lemmas=holmes_manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ stop_tags=holmes_manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=holmes_manager.semantic_matching_helper.
+ topic_matching_reverse_only_parent_lemmas,
+ words_to_corpus_frequencies=None,
+ maximum_corpus_frequency=None,
+ process_initial_question_words=process_initial_question_words)
self.assertEqual(
set(phraselet_labels_to_phraselet_infos.keys()),
set(phraselet_labels))
self.assertEqual(len(phraselet_labels_to_phraselet_infos.keys()),
len(phraselet_labels))
- def _get_phraselet_dict(self, manager, text_to_match):
+ def _get_phraselet_dict(self, manager, text_to_match, words_to_corpus_frequencies=None,
+ maximum_corpus_frequency=None):
manager.remove_all_search_phrases()
doc = manager.semantic_analyzer.parse(text_to_match)
phraselet_labels_to_phraselet_infos = {}
- manager.structural_matcher.add_phraselets_to_dict(doc,
+ manager.linguistic_object_factory.add_phraselets_to_dict(doc,
phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
replace_with_hypernym_ancestors=False,
match_all_words=True,
ignore_relation_phraselets=False,
include_reverse_only=True,
- stop_lemmas=manager.semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=manager.semantic_analyzer.
- topic_matching_reverse_only_parent_lemmas)
+ stop_lemmas=manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ stop_tags=manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=manager.semantic_matching_helper.
+            topic_matching_reverse_only_parent_lemmas, words_to_corpus_frequencies=words_to_corpus_frequencies, maximum_corpus_frequency=maximum_corpus_frequency,
+ process_initial_question_words=False)
return phraselet_labels_to_phraselet_infos
def test_verb_nom(self):
self._check_equals("Eine Pflanze wächst", [
- 'verb-nom: wachsen-pflanzen', 'word: pflanzen'])
+ 'verb-nom: wachsen-pflanz', 'word: pflanz'])
def test_separable_verb_nom(self):
self._check_equals("Eine Pflanze wächst auf",
- ['verb-nom: aufwachsen-pflanzen', 'word: pflanzen'])
+ ['verb-nom: aufwachsen-pflanz', 'word: pflanz'])
def test_verb_acc(self):
- self._check_equals("Eine Pflanze wird gepflanzt", ['verb-acc: pflanzen-pflanzen',
- 'word: pflanzen'])
+ self._check_equals("Eine Pflanze wird gepflanzt", ['verb-acc: pflanzen-pflanz',
+ 'word: pflanz'])
def test_verb_dat(self):
- self._check_equals("Jemand gibt einer Pflanze etwas", ['verb-dat: gabe-pflanzen',
- 'word: pflanzen'])
+ self._check_equals("Jemand gibt einer Pflanze etwas", ['verb-dat: gabe-pflanz',
+ 'word: pflanz'])
def test_noun_dependent_adjective(self):
- self._check_equals("Eine gesunde Pflanze", ['noun-dependent: pflanzen-gesund',
- 'word: pflanzen'])
+ self._check_equals("Eine gesunde Pflanze", ['noun-dependent: pflanz-gesund',
+ 'word: pflanz'])
def test_noun_dependent_noun(self):
- self._check_equals("Die Pflanze eines Gärtners", ['verb-acc: pflanzen-gärtner',
- 'word: gärtner', 'word: pflanzen'])
+ self._check_equals("Die Pflanze eines Gärtners", ['verb-acc: pflanz-gärtner',
+ 'word: gärtner', 'word: pflanz'])
def test_verb_adverb(self):
self._check_equals("lange schauen", ['verb-adverb: schau-lang'])
def test_combination(self):
self._check_equals("Der Gärtner gibt der netten Frau ihr Mittagessen",
- ['verb-nom: gabe-gärtnern', 'verb-acc: gabe-mittagessen',
+ ['verb-nom: gabe-gärtner', 'verb-acc: gabe-mittagessen',
'verb-dat: gabe-frau', 'noun-dependent: frau-nett',
- 'word: gärtnern', 'word: frau', 'word: mittagessen'])
+ 'noun-dependent: mittagessen-frau', 'word: gärtner', 'word: frau',
+ 'word: mittagessen'])
def test_phraselet_labels(self):
doc = holmes_manager.semantic_analyzer.parse(
"Der Gärtner gibt der netten Frau ihr Mittagessen")
phraselet_labels_to_phraselet_infos = {}
- holmes_manager.structural_matcher.add_phraselets_to_dict(doc,
+ holmes_manager.linguistic_object_factory.add_phraselets_to_dict(doc,
phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
replace_with_hypernym_ancestors=False,
match_all_words=False,
include_reverse_only=True,
ignore_relation_phraselets=False,
- stop_lemmas=holmes_manager.semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=holmes_manager.semantic_analyzer.
- topic_matching_reverse_only_parent_lemmas)
+ stop_lemmas=holmes_manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ stop_tags=holmes_manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=holmes_manager.semantic_matching_helper.
+ topic_matching_reverse_only_parent_lemmas,
+ words_to_corpus_frequencies=None,
+ maximum_corpus_frequency=None,
+ process_initial_question_words=False)
self.assertEqual(set(phraselet_labels_to_phraselet_infos.keys()),
- set(['verb-nom: gabe-gärtnern', 'verb-acc: gabe-mittagessen',
+ set(['verb-nom: gabe-gärtner', 'verb-acc: gabe-mittagessen',
'verb-dat: gabe-frau', 'noun-dependent: frau-nett',
- 'word: gärtnern', 'word: frau', 'word: mittagessen']))
+ 'noun-dependent: mittagessen-frau', 'word: gärtner', 'word: frau',
+ 'word: mittagessen']))
def test_phraselet_labels_with_intcompound(self):
doc = holmes_manager.semantic_analyzer.parse(
"Der Landschaftsgärtner gibt der netten Frau ihr Mittagessen")
phraselet_labels_to_phraselet_infos = {}
- holmes_manager.structural_matcher.add_phraselets_to_dict(doc,
+ holmes_manager.linguistic_object_factory.add_phraselets_to_dict(doc,
phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
replace_with_hypernym_ancestors=False,
match_all_words=False,
include_reverse_only=True,
ignore_relation_phraselets=False,
- stop_lemmas=holmes_manager.semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=holmes_manager.semantic_analyzer.
- topic_matching_reverse_only_parent_lemmas)
+ stop_lemmas=holmes_manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ stop_tags=holmes_manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=holmes_manager.semantic_matching_helper.
+ topic_matching_reverse_only_parent_lemmas,
+ words_to_corpus_frequencies=None,
+ maximum_corpus_frequency=None,
+ process_initial_question_words=False)
+
self.assertEqual(set(phraselet_labels_to_phraselet_infos.keys()),
set(['verb-nom: gabe-landschaftsgärtner', 'verb-acc: gabe-mittagessen',
'verb-dat: gabe-frau', 'noun-dependent: frau-nett',
- 'word: landschaftsgärtner', 'word: frau', 'word: mittagessen',
- 'intcompound: gärtnern-landschaft', 'verb-nom: gabe-gärtnern']))
+ 'noun-dependent: mittagessen-frau', 'word: landschaftsgärtner',
+ 'word: frau', 'word: mittagessen',
+ 'intcompound: gärtner-landschaft', 'verb-nom: gabe-gärtner']))
intcompound_phraselet_info = phraselet_labels_to_phraselet_infos[
- 'intcompound: gärtnern-landschaft']
- self.assertEqual(intcompound_phraselet_info.parent_lemma, 'gärtnern')
+ 'intcompound: gärtner-landschaft']
+ self.assertEqual(intcompound_phraselet_info.parent_lemma, 'gärtner')
self.assertEqual(
- intcompound_phraselet_info.parent_derived_lemma, 'gärtnern')
+ intcompound_phraselet_info.parent_derived_lemma, 'gärtner')
self.assertEqual(intcompound_phraselet_info.child_lemma, 'landschaft')
self.assertEqual(
intcompound_phraselet_info.child_derived_lemma, 'landschaft')
@@ -127,7 +147,7 @@ def test_reverse_only_parent_lemma(self):
['verb-adverb: haben-immer'], include_reverse_only=True)
def test_reverse_only_parent_lemma_auxiliary(self):
- self._check_equals("Immer hat er es gehabt",
+ self._check_equals("Er hat es immer gehabt",
['verb-adverb: haben-immer'], include_reverse_only=True)
def test_reverse_only_parent_lemma_modal(self):
@@ -148,12 +168,12 @@ def test_reverse_only_parent_lemma_suppressed_modal(self):
def test_phraselet_stop_words_governed(self):
self._check_equals("Dann tat er es zu Hause",
- ['word: hausen', 'prepgovernor-noun: tat-hausen',
- 'prep-noun: zu-hausen'], include_reverse_only=True)
+ ['word: haus', 'prepgovernor-noun: tat-haus',
+ 'prep-noun: zu-haus'], include_reverse_only=True)
def test_phraselet_stop_words_governed_suppressed(self):
self._check_equals("Dann tat er es zu Hause",
- ['word: hausen'], include_reverse_only=False)
+ ['word: haus'], include_reverse_only=False)
def test_only_verb(self):
self._check_equals("springen", ['word: sprung'])
@@ -163,10 +183,10 @@ def test_only_preposition(self):
def test_match_all_words(self):
self._check_equals("Der Gärtner gibt der netten Frau ihr Mittagessen",
- ['word: gärtnern', 'word: frau', 'word: mittagessen',
- 'word: gabe', 'word: nett', 'verb-nom: gabe-gärtnern',
+ ['word: gärtner', 'word: frau', 'word: mittagessen',
+ 'word: gabe', 'word: nett', 'verb-nom: gabe-gärtner',
'verb-dat: gabe-frau', 'verb-acc: gabe-mittagessen',
- 'noun-dependent: frau-nett'], True)
+ 'noun-dependent: frau-nett', 'noun-dependent: mittagessen-frau'], True)
def test_moposs(self):
self._check_equals("Er braucht eine Versicherung für fünf Jahre",
@@ -256,14 +276,14 @@ def test_subwords_replace_with_hypernym_ancestors_is_true_match_all_words(self):
match_all_words=True)
def test_subwords_with_conjunction_match_all_words(self):
- self._check_equals("Der König der Informationsinteressen-, -beschaffungs- und -problemmaßnahmen der Wettersituation",
+ self._check_equals("Der König von den Informationsinteressen-, -beschaffungs- und -problemmaßnahmen der Wettersituation",
['word: wettersituation',
'intcompound: beschaffen-information',
'word: könig',
'verb-acc: könig-maßnahm',
'intcompound: problem-information',
'verb-acc: maßnahm-wettersituation',
- 'intcompound: situation-wettern',
+ 'intcompound: situation-wetter',
'verb-acc: maßnahm-situation',
'intcompound: maßnahm-problem',
'intcompound: maßnahm-beschaffen',
@@ -273,21 +293,21 @@ def test_subwords_with_conjunction_match_all_words(self):
'word: information',
'word: interesse',
'word: beschaffen',
- 'word: wettern',
+ 'word: wetter',
'word: situation',
'word: maßnahm'
],
match_all_words=True)
def test_subwords_with_conjunction_not_match_all_words(self):
- self._check_equals("Der König der Informationsinteressen-, -beschaffungs- und -problemmaßnahmen der Wettersituation",
+ self._check_equals("Der König von den Informationsinteressen-, -beschaffungs- und -problemmaßnahmen der Wettersituation",
['word: wettersituation',
'intcompound: beschaffen-information',
'word: könig',
'verb-acc: könig-maßnahm',
'intcompound: problem-information',
'verb-acc: maßnahm-wettersituation',
- 'intcompound: situation-wettern',
+ 'intcompound: situation-wetter',
'verb-acc: maßnahm-situation',
'intcompound: maßnahm-problem',
'intcompound: maßnahm-beschaffen',
@@ -302,7 +322,7 @@ def test_subwords_with_conjunction_one_not_hyphenated_not_match_all_words(self):
'word: könig',
'verb-acc: könig-maßnahm',
'verb-acc: maßnahm-wettersituation',
- 'intcompound: situation-wettern',
+ 'intcompound: situation-wetter',
'verb-acc: maßnahm-situation',
'intcompound: maßnahm-beschaffen',
'word: beschaffungsmaßnahmen',
@@ -319,7 +339,7 @@ def test_subwords_with_conjunction_one_not_hyphenated_match_all_words(self):
'word: könig',
'verb-acc: könig-maßnahm',
'verb-acc: maßnahm-wettersituation',
- 'intcompound: situation-wettern',
+ 'intcompound: situation-wetter',
'verb-acc: maßnahm-situation',
'intcompound: maßnahm-beschaffen',
'word: beschaffungsmaßnahmen',
@@ -329,12 +349,24 @@ def test_subwords_with_conjunction_one_not_hyphenated_match_all_words(self):
'verb-acc: könig-beschaffungsmaßnahmen',
'word: information',
'word: beschaffen',
- 'word: wettern',
+ 'word: wetter',
'word: situation',
'word: maßnahm'
],
match_all_words=True)
+ def test_question_word(self):
+ self._check_equals("Wer kam?",
+ ['head-WHnom: kommen-wer'
+ ],
+ match_all_words=False, process_initial_question_words=True)
+
+ def test_question_word_control(self):
+ self._check_equals("Wer kam?",
+ ['word: kommen'
+ ],
+ match_all_words=True, process_initial_question_words=False)
+
def test_noun_lemmas_preferred_noun_lemma_first(self):
dict = self._get_phraselet_dict(holmes_manager,
"Sie besprachen die Amputation. Sie hatten ein Amputieren vor")
@@ -454,3 +486,16 @@ def test_intcompound_when_reverse_derived_lemma_in_ontology(self):
"Sammelabflug.")
self.assertEqual(set(dict.keys()), {'word: sammelabflug', 'word: sammel', 'word: abfliegen',
'intcompound: abfliegen-sammel'})
+
+ def test_frequency_factors_with_subwords(self):
+ dict = self._get_phraselet_dict(holmes_manager,
+ "Sprachwissenschaft",
+ words_to_corpus_frequencies={'sprach': 3,
+ 'sprachwissenschaft': 5, 'wissenschaft': 1},
+ maximum_corpus_frequency=5)
+ sprach_phraselet = dict['word: sprach']
+ self.assertEqual(str(sprach_phraselet.frequency_factor), '0.5693234419266069')
+ wissenschaft_phraselet = dict['word: wissenschaft']
+ self.assertEqual(str(wissenschaft_phraselet.frequency_factor), '1.0')
+ sprachwissenschaft_phraselet = dict['word: sprachwissenschaft']
+ self.assertEqual(str(sprachwissenschaft_phraselet.frequency_factor), '0.1386468838532139')
diff --git a/tests/de/test_questions_DE.py b/tests/de/test_questions_DE.py
new file mode 100644
index 0000000..5fa210a
--- /dev/null
+++ b/tests/de/test_questions_DE.py
@@ -0,0 +1,173 @@
+import unittest
+import holmes_extractor as holmes
+from holmes_extractor.topic_matching import TopicMatcher
+
+manager = holmes.Manager(model='de_core_news_lg', number_of_workers=1)
+
+class GermanInitialQuestionsTest(unittest.TestCase):
+
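+    # Helper: registers document_text as the only document, topic-matches text_to_match
+    # against it, and checks the top match's truncated score and, where an answer is
+    # expected, the character offsets of its first answer.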
+ def _check_equals(self, text_to_match, document_text, highest_score, answer_start, answer_end,
+ word_embedding_match_threshold=0.42, initial_question_word_embedding_match_threshold=0.42,
+ use_frequency_factor=True):
+ manager.remove_all_documents()
+ manager.parse_and_register_document(document_text)
+ topic_matches = manager.topic_match_documents_against(text_to_match,
+ word_embedding_match_threshold=
+ word_embedding_match_threshold,
+ initial_question_word_embedding_match_threshold=initial_question_word_embedding_match_threshold,
+ initial_question_word_answer_score=40,
+ relation_score=20,
+ reverse_only_relation_score=15, single_word_score=10, single_word_any_tag_score=5,
+ different_match_cutoff_score=10,
+ relation_matching_frequency_threshold=0.0,
+ embedding_matching_frequency_threshold=0.0,
+ use_frequency_factor=use_frequency_factor)
+ self.assertEqual(int(topic_matches[0]['score']), highest_score)
+ if answer_start is not None:
+ self.assertEqual(topic_matches[0]['answers'][0][0], answer_start)
+ self.assertEqual(topic_matches[0]['answers'][0][1], answer_end)
+ else:
+ self.assertEqual(len(topic_matches[0]['answers']), 0)
+
+ def test_basic_matching_with_subword(self):
+ self._check_equals("Was betrachtet man?", 'Informationsbetrachtung', 45, 0, 11)
+
+ def test_governed_interrogative_pronoun_with_subword(self):
+ self._check_equals("Welche Information betrachtet man?", 'Informationsbetrachtung', 55, 0, 11)
+
+ def test_governed_interrogative_pronoun_with_subword_control(self):
+ self._check_equals("Die Information betrachtet man.", 'Informationsbetrachtung', 35, None, None)
+
+ def test_governed_interrogative_pronoun_with_complex_subword(self):
+ self._check_equals("Welche Information betrachtet man?",
+ 'Extraktionsinformationsbetrachtung', 55, 0, 22)
+
+ def test_governed_interrogative_pronoun_with_complex_subword_control(self):
+ self._check_equals("Die Information betrachtet man.",
+ 'Extraktionsinformationsbetrachtung', 35, None, None)
+
+ def test_governed_interrogative_pronoun_with_subword_and_coreference(self):
+ self._check_equals("Welchen Löwen betrachten wir.", 'Es gab einen Extraktionslöwen. Leute haben ihn betrachtet', 54, 13, 29)
+
+ def test_governed_interrogative_pronoun_with_subword_and_coreference_control(self):
+ self._check_equals("Den Löwen betrachten wir.", 'Es gab einen Extraktionslöwen. Leute haben ihn betrachtet', 34, None, None)
+
+ def test_governed_interrogative_pronoun_with_subword_and_embedding_matching(self):
+ self._check_equals("Welchen Hund betrachten wir?", 'Leute betrachteten die Informationskatze', 25, 23, 40)
+
+ def test_governed_interrogative_pronoun_with_subword_and_embedding_matching_control(self):
+        self._check_equals("Den Hund betrachten wir.", 'Leute betrachteten die Informationskatze', 15, None, None)
+
+ def test_check_was_predicate_positive_case(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("Das ist ein Haus.", 'q')
+ topic_matches = manager.topic_match_documents_against("Was ist das?")
+ self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'Das ist ein Haus.', 'text_to_match': 'Was ist das?', 'rank': '1', 'index_within_document': 1, 'subword_index': None, 'start_index': 0, 'end_index': 3, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 17, 'score': 620.0, 'word_infos': [[0, 3, 'relation', False, 'Matches the question word WAS.'], [4, 7, 'relation', True, 'Matches SEIN directly.'], [12, 16, 'relation', False, 'Matches the question word WAS.']], 'answers': [[0, 3], [8, 16]]}])
+
+ def test_check_wer_positive_case(self):
+ self._check_equals('Wer schaute in die Sonne?', 'Die Person schaute in die Sonne', 127, 0, 10)
+
+ def test_check_wer_wrong_syntax(self):
+ self._check_equals('Wer schaute in die Sonne?', 'Die Sonne schaute in den Mann', 19, None, None)
+
+ def test_check_wer_wrong_noun(self):
+ self._check_equals('Wer schaute in die Sonne?', 'Das Gebäude schaute in die Sonne', 70, None, None)
+
+ def test_check_wen_positive_case(self):
+ self._check_equals('Wen sah das Gebäude?', 'Das Gebäude sah die Person', 54, 16, 26)
+
+ def test_check_wen_wrong_syntax(self):
+ self._check_equals('Wen sah das Gebäude?', 'Das Gebäude sah das Gebäude', 34, None, None)
+
+ def test_check_was_acc(self):
+ self._check_equals('Was sah das Gebäude?', 'Das Gebäude sah das Gebäude', 104, 16, 27)
+
+ def test_check_wem_positive_case(self):
+ self._check_equals('Wem hilfst du?', 'Ich helfe der Person', 45, 10, 20)
+
+ def test_check_wo_positive_case(self):
+ self._check_equals('Wo wohnst du?', 'Ich wohne in einem Haus', 45, 10, 23)
+
+ def test_check_wo_positive_case_definite_preposition(self):
+ self._check_equals('Wo wohnst du?', 'Ich wohne im Haus', 45, 10, 17)
+
+ def test_check_wo_wrong_case_definite_preposition(self):
+ self._check_equals('Wo wohnst du?', 'Ich wohne ins Haus', 5, None, None)
+
+ def test_check_wo_wrong_case(self):
+ self._check_equals('Wo wohnst du?', 'Ich wohne in ein Haus', 5, None, None)
+
+ def test_check_wohin_positive_case(self):
+ self._check_equals('Wohin fährst du?', 'Ich fahre in ein Haus', 45, 10, 21)
+
+ def test_check_wohin_positive_case_definite_preposition(self):
+ self._check_equals('Wohin fährst du?', 'Ich fahre ins Haus', 45, 10, 18)
+
+ def test_check_wohin_wrong_case_definite_preposition(self):
+ self._check_equals('Wohin fährst du?', 'Ich fahre im Haus', 5, None, None)
+
+ def test_check_womit_positive_case(self):
+ self._check_equals('Womit fährst du?', 'Ich fahre mit meinem Auto', 45, 10, 25)
+
+ def test_check_womit_other_preposition(self):
+ self._check_equals('Womit fährst du?', 'Ich fahre ohne mein Auto', 5, None, None)
+
+ def test_check_wann_noun(self):
+ self._check_equals('Wann fährst du?', 'Ich fahre nächste Woche', 45, 10, 23)
+
+ def test_check_wann_preposition(self):
+ self._check_equals('Wann fährst du?', 'Ich fahre in zwei Wochen', 45, 10, 24)
+
+ def test_check_wann_wrong_preposition(self):
+ self._check_equals('Wann fährst du?', 'Ich fahre wegen des Problems', 5, None, None)
+
+ def test_check_wann_adverb(self):
+ self._check_equals('Wann fährst du?', 'Ich fahre morgen', 45, 10, 16)
+
+ def test_check_wann_verb_phrase(self):
+ self._check_equals('Wann fährst du?', 'Ich fahre, wenn du mitkommst.', 45, 11, 28)
+
+ def test_check_wie_preposition(self):
+ self._check_equals('Wie fährst du?', 'Ich fahre mit dem Auto', 45, 10, 22)
+
+ def test_check_wie_wrong_preposition(self):
+ self._check_equals('Wie fährst du?', 'Ich fahre wegen des Problems', 5, None, None)
+
+ def test_check_wie_adverb(self):
+ self._check_equals('Wie fährst du?', 'Ich fahre langsam', 45, 10, 17)
+
+ def test_check_wie_indem_phrase(self):
+ self._check_equals('Wie fährst du?', 'Ich fahre, indem ich per Anhalter fahre', 45, 11, 39)
+
+ def test_check_wie_other_phrase(self):
+ self._check_equals('Wie fährst du?', 'Ich fahre, weil ich per Anhalter fahre', 5, None, None)
+
+ def test_check_woher_preposition(self):
+ self._check_equals('Woher denkst Du es?', 'Ich denke es wegen der Evidenz', 45, 13, 30)
+
+ def test_check_woher_wrong_preposition(self):
+ self._check_equals('Woher denkst Du es?', 'Ich denke es trotz der Evidenz', 5, None, None)
+
+ def test_check_woher_weil(self):
+ self._check_equals('Woher denkst Du es?', 'Ich denke es, weil es stimmt', 45, 14, 28)
+
+ def test_check_woher_wrong_conjunction(self):
+ self._check_equals('Woher denkst Du es?', 'Ich denke es, obwohl es nicht stimmt', 5, None, None)
+
+ def test_check_warum_preposition(self):
+ self._check_equals('Warum machst Du es?', 'Ich mache es wegen der Evidenz', 45, 13, 30)
+
+ def test_check_warum_wrong_preposition(self):
+ self._check_equals('Warum machst Du es?', 'Ich mache es trotz der Evidenz', 5, None, None)
+
+ def test_check_warum_weil(self):
+ self._check_equals('Warum machst Du es?', 'Ich mache es, weil es stimmt', 45, 14, 28)
+
+ def test_check_warum_weil_sein(self):
+ self._check_equals('Warum machst Du es?', 'Ich mache es, weil es gut ist', 45, 14, 29)
+
+ def test_check_warum_damit(self):
+ self._check_equals('Wieso machst Du es?', 'Ich mache es, damit Du kommst', 45, 14, 29)
+
+ def test_check_warum_wrong_conjunction(self):
+        self._check_equals('Warum machst Du es?', 'Ich mache es, obwohl es nicht stimmt', 5, None, None)
diff --git a/holmes_extractor/tests/de/test_semantics_DE.py b/tests/de/test_semantics_DE.py
similarity index 88%
rename from holmes_extractor/tests/de/test_semantics_DE.py
rename to tests/de/test_semantics_DE.py
index ba3056c..5ccb4df 100644
--- a/holmes_extractor/tests/de/test_semantics_DE.py
+++ b/tests/de/test_semantics_DE.py
@@ -1,14 +1,16 @@
import unittest
-from holmes_extractor.semantics import SemanticAnalyzerFactory
-
-analyzer = SemanticAnalyzerFactory().semantic_analyzer(model='de_core_news_md',
- perform_coreference_resolution=False, debug=False)
+import spacy
+import coreferee
+import holmes_extractor
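+# Build the German pipeline used throughout these tests: coreference resolution via
+# coreferee plus the Holmes component, which attaches the ._.holmes token analyses
+# that the assertions below inspect.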
+nlp = spacy.load('de_core_news_lg')
+nlp.add_pipe('coreferee')
+nlp.add_pipe('holmes')
class GermanSemanticAnalyzerTest(unittest.TestCase):
def test_initialize_semantic_dependencies(self):
- doc = analyzer.parse("Der Hund jagte die Katze.")
+ doc = nlp("Der Hund jagte die Katze.")
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '1:sb; 4:oa')
self.assertEqual(
@@ -19,14 +21,14 @@ def test_initialize_semantic_dependencies(self):
doc[5]._.holmes.string_representation_of_children(), '')
def test_one_righthand_sibling_with_and_conjunction(self):
- doc = analyzer.parse("Der Hund und der Löwe jagten die Katze")
+ doc = nlp("Der Hund und der Löwe jagten die Katze")
self.assertEqual(doc[1]._.holmes.righthand_siblings, [4])
self.assertFalse(doc[1]._.holmes.is_involved_in_or_conjunction)
self.assertFalse(doc[4]._.holmes.is_involved_in_or_conjunction)
self.assertEqual(doc[4]._.holmes.righthand_siblings, [])
def test_many_righthand_siblings_with_and_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Hund, der Hund und der Löwe jagten die Katze")
self.assertEqual(doc[1]._.holmes.righthand_siblings, [4, 7])
self.assertFalse(doc[1]._.holmes.is_involved_in_or_conjunction)
@@ -36,14 +38,14 @@ def test_many_righthand_siblings_with_and_conjunction(self):
self.assertEqual(doc[7]._.holmes.righthand_siblings, [])
def test_one_righthand_sibling_with_or_conjunction(self):
- doc = analyzer.parse("Der Hund oder der Löwe jagten die Katze")
+ doc = nlp("Der Hund oder der Löwe jagten die Katze")
self.assertEqual(doc[1]._.holmes.righthand_siblings, [4])
self.assertTrue(doc[1]._.holmes.is_involved_in_or_conjunction)
self.assertTrue(doc[4]._.holmes.is_involved_in_or_conjunction)
self.assertEqual(doc[4]._.holmes.righthand_siblings, [])
def test_many_righthand_siblings_with_or_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Maus, der Hund oder der Löwe jagten die Katze")
self.assertEqual(doc[1]._.holmes.righthand_siblings, [4, 7])
self.assertTrue(doc[1]._.holmes.is_involved_in_or_conjunction)
@@ -53,20 +55,20 @@ def test_many_righthand_siblings_with_or_conjunction(self):
self.assertEqual(doc[7]._.holmes.righthand_siblings, [])
def test_righthand_siblings_of_semantic_children_two(self):
- doc = analyzer.parse("Der große und starke Hund kam heim")
+ doc = nlp("Der große und starke Hund kam heim")
self.assertEqual(
doc[4]._.holmes.string_representation_of_children(), '1:nk; 3:nk')
self.assertEqual(doc[1]._.holmes.righthand_siblings, [3])
def test_righthand_siblings_of_semantic_children_many(self):
- doc = analyzer.parse("Der große, starke und scharfe Hund kam heim")
+ doc = nlp("Der große, starke und scharfe Hund kam heim")
self.assertEqual(
doc[6]._.holmes.string_representation_of_children(), '1:nk; 3:nk; 5:nk')
self.assertEqual(doc[1]._.holmes.righthand_siblings, [3, 5])
self.assertEqual(doc[3]._.holmes.righthand_siblings, [])
def test_semantic_children_of_righthand_siblings_two(self):
- doc = analyzer.parse("Der große Hund und Löwe")
+ doc = nlp("Der große Hund und Löwe")
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '1:nk; 3:cd')
self.assertEqual(doc[2]._.holmes.righthand_siblings, [4])
@@ -74,7 +76,7 @@ def test_semantic_children_of_righthand_siblings_two(self):
doc[4]._.holmes.string_representation_of_children(), '1:nk')
def test_semantic_children_of_righthand_siblings_many(self):
- doc = analyzer.parse("Der große Hund, Löwe und Elefant")
+ doc = nlp("Der große Hund, Löwe und Elefant")
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '1:nk; 4:cj')
self.assertEqual(
@@ -83,7 +85,7 @@ def test_semantic_children_of_righthand_siblings_many(self):
doc[6]._.holmes.string_representation_of_children(), '1:nk')
def test_predicative_adjective(self):
- doc = analyzer.parse("Der Hund war groß")
+ doc = nlp("Der Hund war groß")
self.assertEqual(
doc[1]._.holmes.string_representation_of_children(), '3:nk')
self.assertEqual(
@@ -91,71 +93,71 @@ def test_predicative_adjective(self):
self.assertTrue(doc[2]._.holmes.is_matchable)
def test_predicative_adjective_with_conjunction(self):
- doc = analyzer.parse("Der Hund und die Katze waren groß und stark")
+ doc = nlp("Der Hund und die Katze waren groß und stark")
self.assertEqual(
doc[1]._.holmes.string_representation_of_children(), '2:cd; 6:nk; 8:nk')
self.assertEqual(
doc[4]._.holmes.string_representation_of_children(), '6:nk; 8:nk')
def test_negator_negation_within_clause(self):
- doc = analyzer.parse("Der Hund jagte die Katze nicht")
+ doc = nlp("Der Hund jagte die Katze nicht")
self.assertEqual(doc[2]._.holmes.is_negated, True)
def test_operator_negation_within_clause(self):
- doc = analyzer.parse("Kein Hund hat irgendeine Katze gejagt")
+ doc = nlp("Kein Hund hat irgendeine Katze gejagt")
self.assertEqual(doc[1]._.holmes.is_negated, True)
self.assertEqual(doc[2]._.holmes.is_negated, False)
self.assertFalse(doc[2]._.holmes.is_matchable)
def test_negator_negation_within_parent_clause(self):
- doc = analyzer.parse(
+ doc = nlp(
"Er meinte nicht, dass der Hund die Katze gejagt hätte")
self.assertEqual(doc[9]._.holmes.is_negated, True)
self.assertFalse(doc[10]._.holmes.is_matchable)
def test_operator_negation_within_parent_clause(self):
- doc = analyzer.parse(
+ doc = nlp(
"Keiner behauptete, dass der Hund die Katze jagte")
self.assertEqual(doc[5]._.holmes.is_negated, True)
def test_negator_negation_within_child_clause(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Hund jagte die Katze, die nicht glücklich war")
self.assertEqual(doc[2]._.holmes.is_negated, False)
def test_operator_negation_within_child_clause(self):
- doc = analyzer.parse("Der Hund jagte die Katze die es keinem erzählte")
+ doc = nlp("Der Hund jagte die Katze die es keinem erzählte")
self.assertEqual(doc[2]._.holmes.is_negated, False)
def test_dass_clause(self):
- doc = analyzer.parse(
+ doc = nlp(
"Er ist zuversichtlich, dass der Hund die Katze jagen wird")
self.assertEqual(
doc[9]._.holmes.string_representation_of_children(), '4:cp; 6:sb; 8:oa')
def test_active_perfect(self):
- doc = analyzer.parse("Der Hund hat die Katze gejagt")
+ doc = nlp("Der Hund hat die Katze gejagt")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:sb; 4:oa')
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '-6:None')
def test_active_pluperfect(self):
- doc = analyzer.parse("Der Hund hatte die Katze gejagt")
+ doc = nlp("Der Hund hatte die Katze gejagt")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:sb; 4:oa')
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '-6:None')
def test_active_future(self):
- doc = analyzer.parse("Der Hund wird die Katze jagen")
+ doc = nlp("Der Hund wird die Katze jagen")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:sb; 4:oa')
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '-6:None')
def test_active_future_perfect(self):
- doc = analyzer.parse("Der Hund wird die Katze gejagt haben")
+ doc = nlp("Der Hund wird die Katze gejagt haben")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:sb; 4:oa')
self.assertEqual(
@@ -166,7 +168,7 @@ def test_active_future_perfect(self):
self.assertFalse(doc[6]._.holmes.is_matchable)
def test_von_passive_perfect(self):
- doc = analyzer.parse("Die Katze ist vom Hund gejagt worden")
+ doc = nlp("Die Katze ist vom Hund gejagt worden")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:oa; 4:sb')
self.assertEqual(
@@ -175,7 +177,7 @@ def test_von_passive_perfect(self):
doc[6]._.holmes.string_representation_of_children(), '-6:None')
def test_von_passive_pluperfect(self):
- doc = analyzer.parse("Die Katze war vom Hund gejagt worden")
+ doc = nlp("Die Katze war vom Hund gejagt worden")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:oa; 4:sb')
self.assertEqual(
@@ -184,7 +186,7 @@ def test_von_passive_pluperfect(self):
doc[6]._.holmes.string_representation_of_children(), '-6:None')
def test_von_passive_future(self):
- doc = analyzer.parse("Die Katze wird vom Hund gejagt werden")
+ doc = nlp("Die Katze wird vom Hund gejagt werden")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:oa; 4:sb')
self.assertEqual(
@@ -193,7 +195,7 @@ def test_von_passive_future(self):
doc[6]._.holmes.string_representation_of_children(), '-6:None')
def test_von_passive_future_perfect(self):
- doc = analyzer.parse("Die Katze wird vom Hund gejagt worden sein")
+ doc = nlp("Die Katze wird vom Hund gejagt worden sein")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:oa; 4:sb')
self.assertEqual(
@@ -204,51 +206,83 @@ def test_von_passive_future_perfect(self):
doc[7]._.holmes.string_representation_of_children(), '-7:None')
def test_complex_tense_noun_conjunction_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Hund und der Löwe haben die Katze und die Maus gejagt")
self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
'1:sb; 4:sb; 7:oa; 10:oa')
def test_complex_tense_noun_conjunction_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze und die Maus werden vom Hund und Löwen gejagt werden")
self.assertEqual(doc[10]._.holmes.string_representation_of_children(),
'1:oa; 4:oa; 7:sb; 9:sb')
- def test_complex_tense_verb_conjunction_active(self):
- doc = analyzer.parse(
+ def test_complex_tense_verb_conjunction_active_1(self):
+ doc = nlp(
"Der Hund wird die Katze gejagt und gefressen haben")
self.assertEqual(
- doc[5]._.holmes.string_representation_of_children(), '1:sb; 4:oa; 6:cd')
+ doc[5]._.holmes.string_representation_of_children(), '1:sb; 4:oa; 6:cd; 8:oc')
+ self.assertEqual(
+ doc[7]._.holmes.string_representation_of_children(), '1:sb; 4:oa; 8:oc')
+
+ def test_complex_tense_verb_conjunction_active_2(self):
+ doc = nlp(
+ "Die Katze wird der Hund gejagt und gefressen haben")
+ self.assertEqual(
+ doc[5]._.holmes.string_representation_of_children(), '1:oa; 4:sb; 6:cd; 8:oc')
+ self.assertEqual(
+ doc[7]._.holmes.string_representation_of_children(), '1:oa; 4:sb; 8:oc')
+
+ def test_complex_tense_verb_conjunction_active_3(self):
+ doc = nlp(
+ "Den Hund wird die Katze gejagt und gefressen haben")
+ self.assertEqual(
+ doc[5]._.holmes.string_representation_of_children(), '1:oa; 4:sb; 6:cd; 8:oc')
self.assertEqual(
- doc[7]._.holmes.string_representation_of_children(), '1:sb')
+ doc[7]._.holmes.string_representation_of_children(), '1:oa; 4:sb; 8:oc')
def test_complex_tense_verb_conjunction_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze wird vom Hund gejagt und gefressen werden")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:oa; 4:sb; 6:cd')
self.assertEqual(
doc[7]._.holmes.string_representation_of_children(), '1:oa; 4:sb')
- def test_conjunction_everywhere_active(self):
- doc = analyzer.parse(
+ def test_conjunction_everywhere_active_1(self):
+ doc = nlp(
"Der Hund und der Löwe werden die Katze und die Maus jagen und fressen")
self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
'1:sb; 4:sb; 7:oa; 10:oa; 12:cd')
self.assertEqual(doc[13]._.holmes.string_representation_of_children(),
- '7:oa; 10:oa')
+ '1:sb; 4:sb; 7:oa; 10:oa')
+
+ def test_conjunction_everywhere_active_2(self):
+ doc = nlp(
+ "Die Katze und die Maus werden der Hund und der Löwe jagen und fressen")
+ self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
+ '1:oa; 4:oa; 7:sb; 10:sb; 12:cd')
+ self.assertEqual(doc[13]._.holmes.string_representation_of_children(),
+ '1:oa; 4:oa; 7:sb; 10:sb')
+
+ def test_conjunction_everywhere_active_3(self):
+ doc = nlp(
+ "Den Hund und den Löwen werden die Katze und die Maus jagen und fressen")
+ self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
+ '1:oa; 4:oa; 7:sb; 10:sb; 12:cd')
+ self.assertEqual(doc[13]._.holmes.string_representation_of_children(),
+ '1:oa; 4:oa; 7:sb; 10:sb')
def test_conjunction_everywhere_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze und die Maus werden durch den Hund und den Löwen gejagt und gefressen werden")
self.assertEqual(doc[12]._.holmes.string_representation_of_children(),
- '1:oa; 4:oa; 8:sb; 11:sb; 13:cd')
+ '1:oa; 4:oa; 8:sb; 11:sb; 13:cd; 15:oc')
self.assertEqual(doc[14]._.holmes.string_representation_of_children(),
- '1:oa; 4:oa; 8:sb; 11:sb')
+ '1:oa; 4:oa; 8:sb; 11:sb; 15:oc')
def test_simple_modal_verb_active(self):
- doc = analyzer.parse("Der Hund soll die Katze jagen")
+ doc = nlp("Der Hund soll die Katze jagen")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:sb(U); 4:oa(U)')
self.assertEqual(
@@ -256,54 +290,49 @@ def test_simple_modal_verb_active(self):
self.assertFalse(doc[2]._.holmes.is_matchable)
def test_simple_modal_verb_passive(self):
- doc = analyzer.parse("Die Katze kann vom Hund gejagt werden")
+ doc = nlp("Die Katze kann vom Hund gejagt werden")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:oa(U); 4:sb(U)')
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '-7:None')
def test_negated_modal_verb(self):
- doc = analyzer.parse("Der Hund soll die Katze nicht jagen")
+ doc = nlp("Der Hund soll die Katze nicht jagen")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'1:sb(U); 4:oa(U); 5:ng(U)')
self.assertTrue(doc[6]._.holmes.is_negated)
def test_modal_verb_with_conjunction(self):
- doc = analyzer.parse(
- "Der Hund und der Löwe können die Katze und die Maus jagen")
+ doc = nlp(
+ "Die Katze und die Maus können den Hund und den Löwen jagen")
self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
'1:sb(U); 4:sb(U); 7:oa(U); 10:oa(U)')
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '-12:None')
def test_relative_pronoun_nominative(self):
- doc = analyzer.parse("Der Hund, der die Katze jagte, war müde")
- self.assertEqual(
- doc[6]._.holmes.string_representation_of_children(), '1:sb; 5:oa')
-
- def test_relative_pronoun_welcher(self):
- doc = analyzer.parse("Der Hund, welcher die Katze jagte, war müde")
+ doc = nlp("Der Hund, der die Katze jagte, war müde")
self.assertEqual(
doc[6]._.holmes.string_representation_of_children(), '1:sb; 5:oa')
def test_relative_pronoun_nominative_with_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Hund, der die Katze und die Maus jagte, war müde")
self.assertEqual(
doc[9]._.holmes.string_representation_of_children(), '1:sb; 5:oa; 8:oa')
def test_relative_pronoun_nominative_with_passive(self):
- doc = analyzer.parse("Die Katze, die vom Hund gejagt wurde, war müde")
+ doc = nlp("Die Katze, die vom Hund gejagt wurde, war müde")
self.assertEqual(
doc[6]._.holmes.string_representation_of_children(), '1:oa; 5:sb')
def test_relative_pronoun_accusative(self):
- doc = analyzer.parse("Der Bär, den der Hund jagte, war müde")
+ doc = nlp("Der Bär, den der Hund jagte, war müde")
self.assertEqual(
doc[6]._.holmes.string_representation_of_children(), '1:oa; 5:sb')
def test_relative_pronoun_conjunction_everywhere_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Hund, der Elefant und der Bär, die die Katze und die Maus gejagt und gefressen haben, waren müde")
self.assertEqual(doc[15]._.holmes.string_representation_of_children(),
'1:sb(U); 4:sb(U); 7:sb; 11:oa; 14:oa; 16:cd')
@@ -311,7 +340,7 @@ def test_relative_pronoun_conjunction_everywhere_active(self):
'1:sb(U); 4:sb(U); 7:sb; 11:oa; 14:oa')
def test_relative_pronoun_conjunction_everywhere_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze, die Maus und der Vogel, die vom Bären, Löwen und Hund gejagt und gefressen worden sind, waren tot")
self.assertEqual(doc[16]._.holmes.string_representation_of_children(),
'1:oa(U); 4:oa(U); 7:oa; 11:sb; 13:sb; 15:sb; 17:cd')
@@ -319,34 +348,34 @@ def test_relative_pronoun_conjunction_everywhere_passive(self):
'1:oa(U); 4:oa(U); 7:oa; 11:sb; 13:sb; 15:sb')
def test_separable_verb(self):
- doc = analyzer.parse("Er nimmt die Situation auf")
+ doc = nlp("Er nimmt die Situation auf")
self.assertEqual(doc[1]._.holmes.lemma, 'aufnehmen')
self.assertEqual(
doc[1]._.holmes.string_representation_of_children(), '0:sb; 3:oa')
def test_separable_verb_in_main_clause_but_infinitive_in_dependent_clause(self):
- doc = analyzer.parse("Der Mitarbeiter hatte vor, dies zu tun")
+ doc = nlp("Der Mitarbeiter hatte vor, dies zu tun")
self.assertEqual(doc[7]._.holmes.lemma, 'tun')
def test_separable_verb_in_main_clause_but_separable_infinitive_in_dependent_clause(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Mitarbeiter hatte vor, eine Versicherung abzuschließen")
self.assertEqual(doc[7]._.holmes.lemma, 'abschließen')
def test_apprart(self):
- doc = analyzer.parse("Er geht zur Party")
+ doc = nlp("Er geht zur Party")
self.assertEqual(
- doc[1]._.holmes.string_representation_of_children(), '0:sb; 2:mo; 3:pobjp')
+ doc[1]._.holmes.string_representation_of_children(), '0:sb; 2:op; 3:pobjp')
self.assertEqual(doc[2].lemma_, 'zur')
self.assertEqual(doc[2]._.holmes.lemma, 'zu')
- def test_von_phrase(self):
- doc = analyzer.parse("Der Abschluss von einer Versicherung")
+ def test_von_phrase_1(self):
+ doc = nlp("Der Abschluss von einer Versicherung")
self.assertEqual(
- doc[1]._.holmes.string_representation_of_children(), '2:mnr; 4:pobjo')
+ doc[1]._.holmes.string_representation_of_children(), '2:pg; 4:pobjo')
def test_von_phrase_with_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Abschluss und Aufrechterhaltung von einer Versicherung und einem Vertrag")
self.assertEqual(doc[1]._.holmes.string_representation_of_children(),
'2:cd; 4:mnr; 6:pobjo; 9:pobjo')
@@ -354,99 +383,99 @@ def test_von_phrase_with_conjunction(self):
'4:mnr; 6:pobjo; 9:pobjo')
def test_von_and_durch_phrase(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Abschluss von einer Versicherung durch einen Makler")
self.assertEqual(doc[1]._.holmes.string_representation_of_children(),
- '2:mnr; 4:pobjo; 5:mnr; 7:pobjb')
+ '2:pg; 4:pobjo; 5:mnr; 7:pobjb')
def test_genitive_and_durch_phrase(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Abschluss einer Versicherung durch einen Makler")
self.assertEqual(doc[1]._.holmes.string_representation_of_children(),
'3:ag; 4:mnr; 6:pobjb')
def test_subjective_zu_clause_complement_simple_active(self):
- doc = analyzer.parse("Der Hund überlegte, eine Katze zu jagen")
+ doc = nlp("Der Hund überlegte, eine Katze zu jagen")
self.assertEqual(
doc[7]._.holmes.string_representation_of_children(), '1:sb(U); 5:oa; 6:pm')
def test_subjective_zu_clause_complement_with_conjunction_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Hund und der Löwe entschlossen sich, eine Katze und eine Maus zu jagen")
self.assertEqual(doc[14]._.holmes.string_representation_of_children(),
'1:sb(U); 4:sb(U); 9:oa; 12:oa; 13:pm')
def test_subjective_zu_clause_complement_with_relative_clause_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Hund, der überlegte, eine Katze zu jagen, kam nach Hause")
self.assertEqual(
doc[9]._.holmes.string_representation_of_children(), '1:sb(U); 7:oa; 8:pm')
def test_adjective_complement_simple_active(self):
- doc = analyzer.parse("Der Hund war darüber froh, eine Katze zu jagen")
+ doc = nlp("Der Hund war darüber froh, eine Katze zu jagen")
self.assertEqual(
doc[9]._.holmes.string_representation_of_children(), '1:sb(U); 7:oa; 8:pm')
def test_adjective_complement_with_conjunction_active(self):
- doc = analyzer.parse(
- "Der Hund war darüber besorgt, eine Katze und eine Maus zu jagen")
+ doc = nlp(
+ "Der Hund war darüber froh, eine Katze und eine Maus zu jagen")
self.assertEqual(doc[12]._.holmes.string_representation_of_children(),
'1:sb(U); 7:oa; 10:oa; 11:pm')
def test_objective_zu_clause_complement_simple_active(self):
- doc = analyzer.parse("Der Löwe bat den Hund, eine Katze zu jagen")
+ doc = nlp("Der Löwe bat den Hund, eine Katze zu jagen")
self.assertEqual(
doc[9]._.holmes.string_representation_of_children(), '4:sb(U); 7:oa; 8:pm')
def test_objective_zu_clause_complement_with_conjunction_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Elefant schlag dem Hund und dem Löwen vor, eine Katze und eine Maus zu jagen")
self.assertEqual(doc[16]._.holmes.string_representation_of_children(),
'4:sb(U); 7:sb(U); 11:oa; 14:oa; 15:pm')
def test_passive_governing_clause_zu_clause_complement_simple_active(self):
- doc = analyzer.parse("Der Hund wurde gebeten, eine Katze zu jagen")
+ doc = nlp("Der Hund wurde gebeten, eine Katze zu jagen")
self.assertEqual(
doc[8]._.holmes.string_representation_of_children(), '1:sb(U); 6:oa; 7:pm')
def test_passive_governing_clause_zu_clause_complement_with_conjunction_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"Dem Hund und dem Löwen wurde vorgeschlagen, eine Katze und eine Maus zu jagen")
self.assertEqual(doc[14]._.holmes.string_representation_of_children(),
'1:sb(U); 4:sb(U); 9:oa; 12:oa; 13:pm')
def test_um_zu_clause_complement_simple_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Löwe benutzte den Hund, um eine Katze zu jagen")
self.assertEqual(doc[10]._.holmes.string_representation_of_children(),
'1:sb(U); 6:cp; 8:oa; 9:pm')
def test_um_zu_clause_complement_with_conjunction_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Elefant benutzte den Hund und den Löwen, um eine Katze und eine Maus zu jagen")
self.assertEqual(doc[16]._.holmes.string_representation_of_children(),
'1:sb(U); 9:cp; 11:oa; 14:oa; 15:pm')
def test_verb_complement_simple_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze dachte darüber nach, von einem Hund gejagt zu werden")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:oa(U); 8:sb; 10:pm')
def test_verb_complement_with_conjunction_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze und die Maus dachten darüber nach, von einem Hund und einem Löwen gejagt zu werden")
self.assertEqual(doc[15]._.holmes.string_representation_of_children(),
'1:oa(U); 4:oa(U); 11:sb; 14:sb; 16:pm')
def test_verb_complement_with_conjunction_passive_second_pronominal_adverb(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze und die Maus dachten darüber und darüber nach, von einem Hund und einem Löwen gejagt zu werden")
self.assertEqual(doc[17]._.holmes.string_representation_of_children(),
'1:oa(U); 4:oa(U); 13:sb; 16:sb; 18:pm')
def test_verb_complement_with_conjunction_passive_second_dependent_clause(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze und die Maus dachten darüber nach, von einem Hund gejagt zu werden und von einem Löwen gejagt zu werden")
self.assertEqual(doc[12]._.holmes.string_representation_of_children(),
'1:oa(U); 4:oa(U); 11:sb; 13:pm; 15:cd')
@@ -454,66 +483,66 @@ def test_verb_complement_with_conjunction_passive_second_dependent_clause(self):
'1:oa(U); 4:oa(U); 18:sb; 20:pm')
def test_adjective_complement_simple_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze war darüber froh, von einem Hund gejagt zu werden")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:oa(U); 8:sb; 10:pm')
def test_adjective_complement_with_conjunction_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze war darüber froh, von einem Hund und einem Löwen gejagt zu werden")
self.assertEqual(doc[12]._.holmes.string_representation_of_children(),
'1:oa(U); 8:sb; 11:sb; 13:pm')
def test_subjective_zu_clause_complement_simple_passive(self):
- doc = analyzer.parse("Die Katze entschied, vom Hund gejagt zu werden")
+ doc = nlp("Die Katze entschied, vom Hund gejagt zu werden")
self.assertEqual(
doc[6]._.holmes.string_representation_of_children(), '1:oa(U); 5:sb; 7:pm')
def test_subjective_zu_clause_complement_with_conjunction_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze und die Maus entschlossen sich, vom Hund und Löwen gejagt zu werden")
self.assertEqual(doc[12]._.holmes.string_representation_of_children(),
'1:oa(U); 4:oa(U); 9:sb; 11:sb; 13:pm')
def test_objective_zu_clause_complement_simple_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Löwe bat die Katze, vom Hund gejagt zu werden")
self.assertEqual(
doc[8]._.holmes.string_representation_of_children(), '4:oa(U); 7:sb; 9:pm')
def test_objective_zu_clause_complement_with_conjunction_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Elefant schlag der Katze und der Maus vor, vom Hund und Löwen gejagt zu werden")
self.assertEqual(doc[14]._.holmes.string_representation_of_children(),
'4:oa(U); 7:oa(U); 11:sb; 13:sb; 15:pm')
def test_passive_governing_clause_zu_clause_complement_simple_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze wurde gebeten, von einem Hund gejagt zu werden")
self.assertEqual(
doc[8]._.holmes.string_representation_of_children(), '1:oa(U); 7:sb; 9:pm')
def test_passive_governing_clause_zu_clause_complement_with_conjunction_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Katze und der Maus wurde vorgeschlagen, von einem Löwen gejagt zu werden")
self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
'1:oa(U); 4:oa(U); 10:sb; 12:pm')
def test_um_zu_clause_complement_simple_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Löwe benutzte die Katze, um vom Hund gejagt zu werden")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:oa(U); 6:cp; 8:sb; 10:pm')
def test_um_zu_clause_complement_with_conjunction_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Elefant benutzte die Katze und die Maus, um vom Hund und Löwen gejagt zu werden")
self.assertEqual(doc[14]._.holmes.string_representation_of_children(),
'1:oa(U); 9:cp; 11:sb; 13:sb; 15:pm')
def test_verb_complement_with_conjunction_of_dependent_verb(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze und die Maus haben entschieden, zu singen und zu schreien")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:sb(U); 4:sb(U); 8:pm; 10:cd')
@@ -521,7 +550,7 @@ def test_verb_complement_with_conjunction_of_dependent_verb(self):
'1:sb(U); 4:sb(U); 11:pm')
def test_subjective_zu_clause_complement_with_conjunction_of_dependent_verb(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze und die Maus entschlossen sich, zu singen und zu schreien")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:sb(U); 4:sb(U); 8:pm; 10:cd')
@@ -529,7 +558,7 @@ def test_subjective_zu_clause_complement_with_conjunction_of_dependent_verb(self
'1:sb(U); 4:sb(U); 11:pm')
def test_objective_zu_clause_complement_with_conjunction_of_dependent_verb(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze und die Maus baten den Löwen, zu singen und zu schreien")
self.assertEqual(doc[10]._.holmes.string_representation_of_children(),
'7:sb(U); 9:pm; 11:cd')
@@ -537,7 +566,7 @@ def test_objective_zu_clause_complement_with_conjunction_of_dependent_verb(self)
'7:sb(U); 12:pm')
def test_um_zu_clause_complement_with_conjunction_of_dependent_verb(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Katze und die Maus benutzen den Löwen, um zu singen und zu schreien")
self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
'1:sb(U); 4:sb(U); 9:cp; 10:pm; 12:cd')
@@ -545,7 +574,7 @@ def test_um_zu_clause_complement_with_conjunction_of_dependent_verb(self):
'1:sb(U); 4:sb(U); 9:cp; 13:pm')
def test_single_preposition_dependency_added_to_verb(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Mitarbeiter braucht eine Versicherung für die nächsten fünf Jahre")
self.assertEqual(doc[2]._.holmes.string_representation_of_children(),
'1:sb; 4:oa; 5:moposs(U); 9:pobjp(U)')
@@ -553,95 +582,95 @@ def test_single_preposition_dependency_added_to_verb(self):
doc[4]._.holmes.string_representation_of_children(), '5:mnr; 9:pobjp')
def test_multiple_preposition_dependencies_added_to_noun(self):
- doc = analyzer.parse(
- "Der Mitarbeiter braucht eine Versicherung für die nächsten fünf Jahre und in Europa")
- self.assertEqual(doc[2]._.holmes.string_representation_of_children(),
+ doc = nlp(
+ "Der Mitarbeiter wird eine Versicherung für die nächsten fünf Jahre und in Europa brauchen")
+ self.assertEqual(doc[13]._.holmes.string_representation_of_children(),
'1:sb; 4:oa; 5:moposs(U); 9:pobjp(U); 11:moposs(U); 12:pobjp(U)')
self.assertEqual(doc[4]._.holmes.string_representation_of_children(
), '5:mnr; 9:pobjp; 11:mnr; 12:pobjp')
def test_no_exception_thrown_when_preposition_dependency_is_righthand_sibling(self):
- doc = analyzer.parse(
+ doc = nlp(
"Diese Funktionalität erreichen Sie über Datei/Konfiguration für C")
def test_phrase_in_parentheses_no_exception_thrown(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Tilgung beginnt in der Auszahlungsphase (d.h. mit der zweiten Auszahlung)")
def test_von_preposition_in_von_clause_unmatchable(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Kündigung von einer Versicherung")
self.assertFalse(doc[2]._.holmes.is_matchable)
def test_self_referring_dependencies_no_exception_thrown_1(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Version ist dabei mit der dieser Bug bereits gefixt sein sollte und nur noch nicht produktiv eingespielt.")
def test_self_referring_dependencies_no_exception_thrown_2(self):
- doc = analyzer.parse(
+ doc = nlp(
"Es sind Papiere, von denen SCD in den Simulationen dann eines auswählt.")
def test_stripping_adjectival_inflections(self):
- doc = analyzer.parse(
+ doc = nlp(
"Eine interessante Überlegung über gesunde Mittagessen.")
self.assertEqual(doc[1].lemma_, 'interessante')
self.assertEqual(doc[1]._.holmes.lemma, 'interessant')
self.assertEqual(
- doc[2]._.holmes.string_representation_of_children(), '1:nk; 3:mnr; 5:pobjp')
+ doc[2]._.holmes.string_representation_of_children(), '1:nk; 3:op; 5:pobjp')
self.assertEqual(doc[4].lemma_, 'gesunden')
self.assertEqual(doc[4]._.holmes.lemma, 'gesund')
def test_adjective_complement_proper_name(self):
- doc = analyzer.parse("Richard war froh, es zu verstehen.")
+ doc = nlp("Richard war froh, es zu verstehen.")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'0:sb(U); 4:oa; 5:pm')
def test_adjective_verb_clause_with_zu_subjective_zu_separate_simple(self):
- doc = analyzer.parse("Richard war froh zu verstehen.")
+ doc = nlp("Richard war froh zu verstehen.")
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
'0:arg(U); 2:mo; 3:pm')
def test_adjective_verb_clause_with_zu_subjective_zu_separate_compound(self):
- doc = analyzer.parse(
+ doc = nlp(
"Richard und Thomas waren froh und erleichtert zu verstehen und zu begreifen.")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
- '0:arg(U); 2:arg(U); 4:mo; 6:mo; 7:pm; 9:cd')
+ '0:arg(U); 2:arg(U); 4:pd; 6:pd; 7:pm; 9:cd')
self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
- '0:arg(U); 2:arg(U); 4:mo; 6:mo; 10:pm')
+ '0:arg(U); 2:arg(U); 4:pd; 6:pd; 10:pm')
def test_adjective_verb_clause_with_zu_objective_zu_separate_simple(self):
- doc = analyzer.parse("Richard war schwer zu erreichen.")
+ doc = nlp("Richard war schwer zu erreichen.")
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
'0:arg(U); 2:mo; 3:pm')
def test_adjective_verb_clause_with_zu_objective_zu_separate_compound(self):
- doc = analyzer.parse(
+ doc = nlp(
"Richard und Thomas war schwer und schwierig zu erreichen und zu bekommen.")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
- '0:arg(U); 2:arg(U); 4:mo; 6:mo; 7:pm; 9:cd')
+ '0:arg(U); 2:arg(U); 4:pd; 6:pd; 7:pm; 9:cd')
self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
- '0:arg(U); 2:arg(U); 4:mo; 6:mo; 10:pm')
+ '0:arg(U); 2:arg(U); 4:pd; 6:pd; 10:pm')
def test_adjective_verb_clause_with_zu_subjective_zu_integrated_simple(self):
- doc = analyzer.parse("Richard war froh hineinzugehen.")
+ doc = nlp("Richard war froh hineinzugehen.")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
- '0:sb; 2:mo')
+ '0:arg(U); 2:mo')
def test_adjective_verb_clause_with_zu_subjective_zu_integrated_compound(self):
- doc = analyzer.parse(
+ doc = nlp(
"Richard und Thomas waren froh hineinzugehen und hinzugehen.")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
- '0:sb; 2:sb; 4:mo; 6:cd')
+ '0:arg(U); 2:arg(U); 4:mo; 6:cd')
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
- '0:sb; 2:sb; 4:mo')
+ '0:arg(U); 2:arg(U); 4:mo')
def test_adjective_verb_clause_with_zu_objective_zu_integrated_simple(self):
- doc = analyzer.parse("Richard war leicht einzubinden.")
+ doc = nlp("Richard war leicht einzubinden.")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'0:arg(U); 2:mo')
def test_adjective_verb_clause_with_zu_objective_zu_integrated_compound(self):
- doc = analyzer.parse(
+ doc = nlp(
"Richard und Thomas waren leicht einzubinden und aufzugleisen.")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'0:arg(U); 2:arg(U); 4:mo; 6:cd')
@@ -649,18 +678,18 @@ def test_adjective_verb_clause_with_zu_objective_zu_integrated_compound(self):
'0:arg(U); 2:arg(U); 4:mo')
def test_ungrammatical_two_nominatives(self):
- doc = analyzer.parse("Der Hund jagt der Hund")
+ doc = nlp("Der Hund jagt der Hund")
self.assertEqual(doc[2]._.holmes.string_representation_of_children(),
'1:sb; 4:oa')
def test_ungrammatical_two_nominatives_with_noun_phrase_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Hund und der Hund jagen der Hund und der Hund")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'1:sb; 4:sb; 7:oa; 10:oa')
def test_ungrammatical_two_nominatives_with_noun_phrase_and_verb_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Hund und der Hund jagen und fressen der Hund und der Hund")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'1:sb; 4:sb; 6:cd')
@@ -668,85 +697,87 @@ def test_ungrammatical_two_nominatives_with_noun_phrase_and_verb_conjunction(sel
'1:sb; 4:sb; 9:oa; 12:oa')
def test_ungrammatical_two_accusatives(self):
- doc = analyzer.parse("Den Hund jagt den Hund")
+ doc = nlp("Den Hund jagt den Hund")
self.assertEqual(doc[2]._.holmes.string_representation_of_children(),
'1:sb; 4:oa')
def test_ungrammatical_two_accusatives_with_noun_phrase_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"Den Hund und den Hund jagen den Hund und den Hund")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'1:sb; 4:sb; 7:oa; 10:oa')
def test_ungrammatical_two_accusatives_with_noun_phrase_and_verb_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"Den Hund und den Hund jagen und fressen den Hund und den Hund")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'1:oa; 4:oa; 6:cd')
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'9:oa; 12:oa')
- def test_uncertain_subject_and_subject(self):
- doc = analyzer.parse("Ich glaube, dass eine Pflanze wächst")
+ def test_subjects_in_main_and_subordinate_clauses(self):
+ doc = nlp("Ich glaube, dass eine Pflanze wächst")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
- '0:sb(U); 3:cp; 5:sb')
+ '3:cp; 5:sb')
def test_moposs_before_governing_verb(self):
- doc = analyzer.parse("Ich möchte ein Konto für mein Kind eröffnen")
+ doc = nlp("Ich möchte ein Konto für mein Kind eröffnen")
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'0:sb(U); 3:oa(U); 4:moposs(U); 6:pobjp(U)')
def test_hat_vor_clause(self):
- doc = analyzer.parse("Ich habe vor, ein Konto zu eröffnen")
+ doc = nlp("Ich habe vor, ein Konto zu eröffnen")
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'0:sb(U); 5:oa; 6:pm')
def test_simple_relative_prepositional_phrase(self):
- doc = analyzer.parse("Der Tisch, von welchem wir aßen.")
+ doc = nlp("Der Tisch, von welchem wir aßen.")
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
'-2:None')
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'1:pobjp; 3:mo; 5:sb')
def test_conjunction_relative_prepositional_phrase(self):
- doc = analyzer.parse(
+ doc = nlp(
"Der Tisch und der Stuhl, von denen du und ich aßen und tranken.")
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'-5:None')
self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
- '1:pobjp(U); 4:pobjp; 6:mo; 8:sb; 10:sb; 12:cd')
+ '1:pobjo; 4:pobjo; 6:op; 8:sb; 10:sb; 12:cd')
self.assertEqual(doc[13]._.holmes.string_representation_of_children(),
- '1:pobjp(U); 4:pobjp; 6:mo; 8:sb; 10:sb')
+ '1:pobjo; 4:pobjo; 6:op; 8:sb; 10:sb')
def test_conjunction_with_subject_object_and_verb_further_right(self):
- doc = analyzer.parse("Der Mann aß das Fleisch und trank.")
+ doc = nlp("Der Mann aß das Fleisch und trank.")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'1:sb')
- def test_conjunction_with_subject_object_modal_and_verb_further_Right(self):
- doc = analyzer.parse(
+ def test_conjunction_with_subject_object_modal_and_verb_further_right(self):
+ doc = nlp(
"Der Mann hat das Fleisch gegessen und getrunken.")
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'1:sb; 4:oa')
def test_conjunction_with_prepositional_phrase_and_noun_further_right(self):
- doc = analyzer.parse(
+ doc = nlp(
"Eine Versicherung für die nächsten fünf Jahre und eine Police")
self.assertEqual(
doc[9]._.holmes.string_representation_of_children(), '')
def test_parent_token_indexes(self):
- doc = analyzer.parse("Häuser im Dorf.")
- self.assertEqual(doc[2]._.holmes.parent_dependencies, [
+ doc = nlp("Häuser im Dorf.")
+ self.assertEqual(doc[2]._.holmes.coreference_linked_parent_dependencies, [
[0, 'pobjp'], [1, 'nk']])
+ self.assertEqual(doc[2]._.holmes.string_representation_of_parents(),
+ '0:pobjp; 1:nk')
- def test_von_phrase_with_op(self):
- doc = analyzer.parse("Die Verwandlung von einem Mädchen")
+ def test_von_phrase_2(self):
+ doc = nlp("Die Verwandlung von einem Mädchen")
self.assertEqual(doc[1]._.holmes.string_representation_of_children(),
- '2:op; 4:pobjo')
+ '2:pg; 4:pobjo')
def test_subwords_without_fugen_s(self):
- doc = analyzer.parse("Telefaxnummer.")
+ doc = nlp("Telefaxnummer.")
self.assertEqual(len(doc[0]._.holmes.subwords), 2)
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Telefax')
@@ -762,11 +793,11 @@ def test_subwords_without_fugen_s(self):
self.assertEqual(doc[0]._.holmes.subwords[1].char_start_index, 7)
def test_subwords_with_fugen_s(self):
- doc = analyzer.parse("Widerrufsbelehrung")
+ doc = nlp("Widerrufsbelehrung")
self.assertEqual(len(doc[0]._.holmes.subwords), 2)
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Widerruf')
- self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'widerrufen')
+ self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'widerruf')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].char_start_index, 0)
@@ -778,34 +809,34 @@ def test_subwords_with_fugen_s(self):
self.assertEqual(doc[0]._.holmes.subwords[1].char_start_index, 9)
def test_no_subwords_without_s(self):
- doc = analyzer.parse("Lappalie")
+ doc = nlp("Lappalie")
self.assertEqual(len(doc[0]._.holmes.subwords), 0)
def test_no_subwords_with_s(self):
- doc = analyzer.parse("Datenschutz")
+ doc = nlp("Datenschutz")
self.assertEqual(len(doc[0]._.holmes.subwords), 0)
def test_no_subwords_because_of_extra_letter_after_valid_subwords(self):
- doc = analyzer.parse("ZahlungsverkehrX")
+ doc = nlp("ZahlungsverkehrX")
self.assertEqual(len(doc[0]._.holmes.subwords), 0)
def test_durch_phrase_simple(self):
- doc = analyzer.parse("Die Jagd durch den Hund")
+ doc = nlp("Die Jagd durch den Hund")
self.assertEqual(doc[1]._.holmes.string_representation_of_children(),
'2:mnr; 4:pobjb')
def test_durch_phrase_with_conjunction(self):
- doc = analyzer.parse("Die Jagd durch den Hund und die Katze")
+ doc = nlp("Die Jagd durch den Hund und die Katze")
self.assertEqual(doc[1]._.holmes.string_representation_of_children(),
'2:mnr; 4:pobjb; 7:pobjb')
def test_subwords_word_twice_in_document(self):
- doc = analyzer.parse(
+ doc = nlp(
"Widerrufsbelehrung und die widerrufsbelehrung waren interessant")
self.assertEqual(len(doc[0]._.holmes.subwords), 2)
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Widerruf')
- self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'widerrufen')
+ self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'widerruf')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].char_start_index, 0)
@@ -819,7 +850,7 @@ def test_subwords_word_twice_in_document(self):
self.assertEqual(len(doc[3]._.holmes.subwords), 2)
self.assertEqual(doc[3]._.holmes.subwords[0].text, 'widerruf')
- self.assertEqual(doc[3]._.holmes.subwords[0].lemma, 'widerrufen')
+ self.assertEqual(doc[3]._.holmes.subwords[0].lemma, 'widerruf')
self.assertEqual(doc[3]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[3]._.holmes.subwords[0].containing_token_index, 3)
self.assertEqual(doc[3]._.holmes.subwords[0].char_start_index, 0)
@@ -832,30 +863,7 @@ def test_subwords_word_twice_in_document(self):
def test_three_subwords_with_non_whitelisted_fugen_s(self):
- doc = analyzer.parse("Inhaltsverzeichnisanlage")
- self.assertEqual(len(doc[0]._.holmes.subwords), 3)
-
- self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Inhalt')
- self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'inhalt')
- self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
- self.assertEqual(doc[0]._.holmes.subwords[0].containing_token_index, 0)
- self.assertEqual(doc[0]._.holmes.subwords[0].char_start_index, 0)
-
- self.assertEqual(doc[0]._.holmes.subwords[1].text, 'verzeichnis')
- self.assertEqual(doc[0]._.holmes.subwords[1].lemma, 'verzeichnis')
- self.assertEqual(doc[0]._.holmes.subwords[1].index, 1)
- self.assertEqual(doc[0]._.holmes.subwords[1].containing_token_index, 0)
- self.assertEqual(doc[0]._.holmes.subwords[1].char_start_index, 7)
-
- self.assertEqual(doc[0]._.holmes.subwords[2].text, 'anlage')
- self.assertEqual(doc[0]._.holmes.subwords[2].lemma, 'anlage')
- self.assertEqual(doc[0]._.holmes.subwords[2].index, 1)
- self.assertEqual(doc[0]._.holmes.subwords[2].containing_token_index, 0)
- self.assertEqual(doc[0]._.holmes.subwords[2].char_start_index, 18)
-
- def test_three_subwords_with_non_whitelisted_fugen_s(self):
-
- doc = analyzer.parse("Inhaltsverzeichnisanlage")
+ doc = nlp("Inhaltsverzeichnisanlage")
self.assertEqual(len(doc[0]._.holmes.subwords), 3)
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Inhalt')
@@ -878,7 +886,7 @@ def test_three_subwords_with_non_whitelisted_fugen_s(self):
def test_four_subwords_with_whitelisted_fugen_s(self):
- doc = analyzer.parse("Finanzdienstleistungsaufsicht")
+ doc = nlp("Finanzdienstleistungsaufsicht")
self.assertEqual(len(doc[0]._.holmes.subwords), 4)
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Finanz')
@@ -892,13 +900,13 @@ def test_four_subwords_with_whitelisted_fugen_s(self):
def test_inflected_main_word(self):
- doc = analyzer.parse("Verbraucherstreitbeilegungsgesetzes")
+ doc = nlp("Verbraucherstreitbeilegungsgesetzes")
self.assertEqual(len(doc[0]._.holmes.subwords), 4)
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Verbraucher')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'verbraucher')
self.assertEqual(doc[0]._.holmes.subwords[1].text, 'streit')
- self.assertEqual(doc[0]._.holmes.subwords[1].lemma, 'streiten')
+ self.assertEqual(doc[0]._.holmes.subwords[1].lemma, 'streit')
self.assertEqual(doc[0]._.holmes.subwords[2].text, 'beilegung')
self.assertEqual(doc[0]._.holmes.subwords[2].lemma, 'beilegung')
self.assertEqual(doc[0]._.holmes.subwords[3].text, 'gesetzes')
@@ -906,7 +914,7 @@ def test_inflected_main_word(self):
def test_inflected_subword_other_than_fugen_s(self):
- doc = analyzer.parse("Bundesoberbehörde")
+ doc = nlp("Bundesoberbehörde")
self.assertEqual(len(doc[0]._.holmes.subwords), 2)
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Bundes')
@@ -916,7 +924,7 @@ def test_inflected_subword_other_than_fugen_s(self):
def test_initial_short_word(self):
- doc = analyzer.parse("Vorversicherung")
+ doc = nlp("Vorversicherung")
self.assertEqual(len(doc[0]._.holmes.subwords), 2)
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Vor')
@@ -926,48 +934,46 @@ def test_initial_short_word(self):
def test_subwords_score_too_high(self):
- doc = analyzer.parse("Requalifizierung")
+ doc = nlp("Requalifizierung")
self.assertEqual(len(doc[0]._.holmes.subwords), 0)
def test_final_blacklisted_subword(self):
- doc = analyzer.parse("Gemütlichkeit")
+ doc = nlp("Gemütlichkeit")
self.assertEqual(len(doc[0]._.holmes.subwords), 0)
def test_subword_rejected_because_of_bigraphs(self):
- doc = analyzer.parse("Verantwortung")
+ doc = nlp("Verantwortung")
self.assertEqual(len(doc[0]._.holmes.subwords), 0)
def test_nonsense_word(self):
- doc = analyzer.parse("WiderrufsbelehrungWiderrufsrechtSie")
- self.assertEqual(len(doc[0]._.holmes.subwords), 5)
+ doc = nlp("WiderrufsbelehrungWiderrufsrecht.")
+ self.assertEqual(len(doc[0]._.holmes.subwords), 4)
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Widerruf')
- self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'widerrufen')
+ self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'widerruf')
self.assertEqual(doc[0]._.holmes.subwords[1].text, 'belehrung')
self.assertEqual(doc[0]._.holmes.subwords[1].lemma, 'belehrung')
self.assertEqual(doc[0]._.holmes.subwords[2].text, 'Widerruf')
- self.assertEqual(doc[0]._.holmes.subwords[2].lemma, 'widerrufen')
+ self.assertEqual(doc[0]._.holmes.subwords[2].lemma, 'widerruf')
self.assertEqual(doc[0]._.holmes.subwords[3].text, 'recht')
self.assertEqual(doc[0]._.holmes.subwords[3].lemma, 'recht')
- self.assertEqual(doc[0]._.holmes.subwords[4].text, 'Sie')
- self.assertEqual(doc[0]._.holmes.subwords[4].lemma, 'ich')
def test_nonsense_word_with_number(self):
- doc = analyzer.parse("Widerrufs3belehrungWiderrufsrechtSie")
+ doc = nlp("Widerrufs3belehrungWiderrufsrechtSie")
self.assertEqual(len(doc[0]._.holmes.subwords), 0)
def test_nonsense_word_with_underscore(self):
- doc = analyzer.parse("Widerrufs_belehrungWiderrufsrechtSie")
+ doc = nlp("Widerrufs_belehrungWiderrufsrechtSie")
self.assertEqual(len(doc[0]._.holmes.subwords), 0)
def test_negated_subword_with_caching(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Nichtbeachtung der Regeln. Die Nichtbeachtung der Regeln")
self.assertTrue(doc[1]._.holmes.is_negated)
self.assertFalse(doc[0]._.holmes.is_negated)
@@ -982,7 +988,7 @@ def test_negated_subword_with_caching(self):
def test_subword_conjunction_two_words_single_subwords_first_word_hyphenated(self):
- doc = analyzer.parse("Die Haupt- und Seiteneingänge")
+ doc = nlp("Die Haupt- und Seiteneingänge")
self.assertEqual(doc[1]._.holmes.subwords[0].text, 'Haupt')
self.assertEqual(doc[1]._.holmes.subwords[0].lemma, 'haupt')
self.assertEqual(doc[1]._.holmes.subwords[0].index, 0)
@@ -1009,7 +1015,7 @@ def test_subword_conjunction_two_words_single_subwords_first_word_hyphenated(sel
def test_caching(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Haupt- und Seiteneingänge. Die Haupt- und Seiteneingänge")
self.assertEqual(doc[6]._.holmes.subwords[0].text, 'Haupt')
self.assertEqual(doc[6]._.holmes.subwords[0].lemma, 'haupt')
@@ -1037,7 +1043,7 @@ def test_caching(self):
def test_subword_conjunction_three_words_single_subwords_first_word_hyphenated(self):
- doc = analyzer.parse("Die Haupt-, Neben- und Seiteneingänge")
+ doc = nlp("Die Haupt-, Neben- und Seiteneingänge")
self.assertEqual(doc[1]._.holmes.subwords[0].text, 'Haupt')
self.assertEqual(doc[1]._.holmes.subwords[0].lemma, 'haupt')
self.assertEqual(doc[1]._.holmes.subwords[0].index, 0)
@@ -1076,7 +1082,7 @@ def test_subword_conjunction_three_words_single_subwords_first_word_hyphenated(s
def test_subword_conjunction_two_words_multiple_subwords_first_word_hyphenated(self):
- doc = analyzer.parse("Die Haupt- und Seiteneingangsbeschränkungen")
+ doc = nlp("Die Haupt- und Seiteneingangsbeschränkungen")
self.assertEqual(doc[1]._.holmes.subwords[0].text, 'Haupt')
self.assertEqual(doc[1]._.holmes.subwords[0].lemma, 'haupt')
self.assertEqual(doc[1]._.holmes.subwords[0].index, 0)
@@ -1115,7 +1121,7 @@ def test_subword_conjunction_two_words_multiple_subwords_first_word_hyphenated(s
def test_subword_conjunction_three_words_multiple_subwords_first_word_hyphenated(self):
- doc = analyzer.parse(
+ doc = nlp(
"Die Haupt-, Neben- und Seiteneingangsbeschränkungen")
self.assertEqual(doc[1]._.holmes.subwords[0].text, 'Haupt')
self.assertEqual(doc[1]._.holmes.subwords[0].lemma, 'haupt')
@@ -1173,7 +1179,7 @@ def test_subword_conjunction_three_words_multiple_subwords_first_word_hyphenated
def test_subword_conjunction_adjectives(self):
- doc = analyzer.parse("Das Essen war vitamin- und eiweißhaltig")
+ doc = nlp("Das Essen war vitamin- und eiweißhaltig")
self.assertEqual(doc[3]._.holmes.subwords[0].text, 'vitamin')
self.assertEqual(doc[3]._.holmes.subwords[0].lemma, 'vitamin')
self.assertEqual(doc[3]._.holmes.subwords[0].index, 0)
@@ -1200,9 +1206,9 @@ def test_subword_conjunction_adjectives(self):
def test_subword_conjunction_two_words_single_subwords_last_word_hyphenated(self):
- doc = analyzer.parse("Verkehrslenkung und -überwachung")
+ doc = nlp("Verkehrslenkung und -überwachung")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Verkehr')
- self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'verkehren')
+ self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'verkehr')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].char_start_index, 0)
@@ -1214,7 +1220,7 @@ def test_subword_conjunction_two_words_single_subwords_last_word_hyphenated(self
self.assertEqual(doc[0]._.holmes.subwords[1].char_start_index, 8)
self.assertEqual(doc[2]._.holmes.subwords[0].text, 'Verkehr')
- self.assertEqual(doc[2]._.holmes.subwords[0].lemma, 'verkehren')
+ self.assertEqual(doc[2]._.holmes.subwords[0].lemma, 'verkehr')
self.assertEqual(doc[2]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[2]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[2]._.holmes.subwords[0].char_start_index, 0)
@@ -1227,9 +1233,9 @@ def test_subword_conjunction_two_words_single_subwords_last_word_hyphenated(self
def test_subword_conjunction_three_words_single_subwords_last_word_hyphenated(self):
- doc = analyzer.parse("Verkehrslenkung, -überwachung und -betrachtung")
+ doc = nlp("Verkehrslenkung, -überwachung und -betrachtung")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Verkehr')
- self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'verkehren')
+ self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'verkehr')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].char_start_index, 0)
@@ -1241,7 +1247,7 @@ def test_subword_conjunction_three_words_single_subwords_last_word_hyphenated(se
self.assertEqual(doc[0]._.holmes.subwords[1].char_start_index, 8)
self.assertEqual(doc[2]._.holmes.subwords[0].text, 'Verkehr')
- self.assertEqual(doc[2]._.holmes.subwords[0].lemma, 'verkehren')
+ self.assertEqual(doc[2]._.holmes.subwords[0].lemma, 'verkehr')
self.assertEqual(doc[2]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[2]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[2]._.holmes.subwords[0].char_start_index, 0)
@@ -1253,7 +1259,7 @@ def test_subword_conjunction_three_words_single_subwords_last_word_hyphenated(se
self.assertEqual(doc[2]._.holmes.subwords[1].char_start_index, 1)
self.assertEqual(doc[4]._.holmes.subwords[0].text, 'Verkehr')
- self.assertEqual(doc[4]._.holmes.subwords[0].lemma, 'verkehren')
+ self.assertEqual(doc[4]._.holmes.subwords[0].lemma, 'verkehr')
self.assertEqual(doc[4]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[4]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[4]._.holmes.subwords[0].char_start_index, 0)
@@ -1266,10 +1272,10 @@ def test_subword_conjunction_three_words_single_subwords_last_word_hyphenated(se
def test_subword_conjunction_two_words_multiple_subwords_last_word_hyphenated(self):
- doc = analyzer.parse(
+ doc = nlp(
"Verkehrskontrolllenkung und -überwachungsprinzipien")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Verkehr')
- self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'verkehren')
+ self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'verkehr')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].char_start_index, 0)
@@ -1287,7 +1293,7 @@ def test_subword_conjunction_two_words_multiple_subwords_last_word_hyphenated(se
self.assertEqual(doc[0]._.holmes.subwords[2].char_start_index, 16)
self.assertEqual(doc[2]._.holmes.subwords[0].text, 'Verkehr')
- self.assertEqual(doc[2]._.holmes.subwords[0].lemma, 'verkehren')
+ self.assertEqual(doc[2]._.holmes.subwords[0].lemma, 'verkehr')
self.assertEqual(doc[2]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[2]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[2]._.holmes.subwords[0].char_start_index, 0)
@@ -1312,10 +1318,10 @@ def test_subword_conjunction_two_words_multiple_subwords_last_word_hyphenated(se
def test_subword_conjunction_three_words_multiple_subwords_last_word_hyphenated(self):
- doc = analyzer.parse(
+ doc = nlp(
"Verkehrskontrolllenkung, -überwachungsprinzipien und -betrachtung")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Verkehr')
- self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'verkehren')
+ self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'verkehr')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[0]._.holmes.subwords[0].char_start_index, 0)
@@ -1333,7 +1339,7 @@ def test_subword_conjunction_three_words_multiple_subwords_last_word_hyphenated(
self.assertEqual(doc[0]._.holmes.subwords[2].char_start_index, 16)
self.assertEqual(doc[2]._.holmes.subwords[0].text, 'Verkehr')
- self.assertEqual(doc[2]._.holmes.subwords[0].lemma, 'verkehren')
+ self.assertEqual(doc[2]._.holmes.subwords[0].lemma, 'verkehr')
self.assertEqual(doc[2]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[2]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[2]._.holmes.subwords[0].char_start_index, 0)
@@ -1357,7 +1363,7 @@ def test_subword_conjunction_three_words_multiple_subwords_last_word_hyphenated(
self.assertEqual(doc[2]._.holmes.subwords[3].char_start_index, 13)
self.assertEqual(doc[4]._.holmes.subwords[0].text, 'Verkehr')
- self.assertEqual(doc[4]._.holmes.subwords[0].lemma, 'verkehren')
+ self.assertEqual(doc[4]._.holmes.subwords[0].lemma, 'verkehr')
self.assertEqual(doc[4]._.holmes.subwords[0].index, 0)
self.assertEqual(doc[4]._.holmes.subwords[0].containing_token_index, 0)
self.assertEqual(doc[4]._.holmes.subwords[0].char_start_index, 0)
@@ -1376,7 +1382,7 @@ def test_subword_conjunction_three_words_multiple_subwords_last_word_hyphenated(
def test_subword_conjunction_two_words_single_subwords_first_and_last_words_hyphenated(self):
- doc = analyzer.parse("Textilgroß- und -einzelhandel")
+ doc = nlp("Textilgroß- und -einzelhandel")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Textil')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'textil')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
@@ -1390,7 +1396,7 @@ def test_subword_conjunction_two_words_single_subwords_first_and_last_words_hyph
self.assertEqual(doc[0]._.holmes.subwords[1].char_start_index, 6)
self.assertEqual(doc[0]._.holmes.subwords[2].text, 'handel')
- self.assertEqual(doc[0]._.holmes.subwords[2].lemma, 'handeln')
+ self.assertEqual(doc[0]._.holmes.subwords[2].lemma, 'handel')
self.assertEqual(doc[0]._.holmes.subwords[2].index, 2)
self.assertEqual(doc[0]._.holmes.subwords[2].containing_token_index, 2)
self.assertEqual(doc[0]._.holmes.subwords[2].char_start_index, 7)
@@ -1408,14 +1414,14 @@ def test_subword_conjunction_two_words_single_subwords_first_and_last_words_hyph
self.assertEqual(doc[2]._.holmes.subwords[1].char_start_index, 1)
self.assertEqual(doc[2]._.holmes.subwords[2].text, 'handel')
- self.assertEqual(doc[2]._.holmes.subwords[2].lemma, 'handeln')
+ self.assertEqual(doc[2]._.holmes.subwords[2].lemma, 'handel')
self.assertEqual(doc[2]._.holmes.subwords[2].index, 2)
self.assertEqual(doc[2]._.holmes.subwords[2].containing_token_index, 2)
self.assertEqual(doc[2]._.holmes.subwords[2].char_start_index, 7)
def test_subword_conjunction_two_words_multiple_subwords_first_and_last_words_hyphenated(self):
- doc = analyzer.parse("Feintextilgroß- und -einzeldetailhandel")
+ doc = nlp("Feintextilgroß- und -einzeldetailhandel")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Fein')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'fein')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
@@ -1441,7 +1447,7 @@ def test_subword_conjunction_two_words_multiple_subwords_first_and_last_words_hy
self.assertEqual(doc[0]._.holmes.subwords[3].char_start_index, 7)
self.assertEqual(doc[0]._.holmes.subwords[4].text, 'handel')
- self.assertEqual(doc[0]._.holmes.subwords[4].lemma, 'handeln')
+ self.assertEqual(doc[0]._.holmes.subwords[4].lemma, 'handel')
self.assertEqual(doc[0]._.holmes.subwords[4].index, 4)
self.assertEqual(doc[0]._.holmes.subwords[4].containing_token_index, 2)
self.assertEqual(doc[0]._.holmes.subwords[4].char_start_index, 13)
@@ -1471,14 +1477,14 @@ def test_subword_conjunction_two_words_multiple_subwords_first_and_last_words_hy
self.assertEqual(doc[2]._.holmes.subwords[3].char_start_index, 7)
self.assertEqual(doc[2]._.holmes.subwords[4].text, 'handel')
- self.assertEqual(doc[2]._.holmes.subwords[4].lemma, 'handeln')
+ self.assertEqual(doc[2]._.holmes.subwords[4].lemma, 'handel')
self.assertEqual(doc[2]._.holmes.subwords[4].index, 4)
self.assertEqual(doc[2]._.holmes.subwords[4].containing_token_index, 2)
self.assertEqual(doc[2]._.holmes.subwords[4].char_start_index, 13)
def test_subword_conjunction_three_words_single_subwords_first_and_last_words_hyphenated(self):
- doc = analyzer.parse("Textilgroß-, -klein- und -einzelhandel")
+ doc = nlp("Textilgroß-, -klein- und -einzelhandel")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Textil')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'textil')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
@@ -1492,7 +1498,7 @@ def test_subword_conjunction_three_words_single_subwords_first_and_last_words_hy
self.assertEqual(doc[0]._.holmes.subwords[1].char_start_index, 6)
self.assertEqual(doc[0]._.holmes.subwords[2].text, 'handel')
- self.assertEqual(doc[0]._.holmes.subwords[2].lemma, 'handeln')
+ self.assertEqual(doc[0]._.holmes.subwords[2].lemma, 'handel')
self.assertEqual(doc[0]._.holmes.subwords[2].index, 2)
self.assertEqual(doc[0]._.holmes.subwords[2].containing_token_index, 4)
self.assertEqual(doc[0]._.holmes.subwords[2].char_start_index, 7)
@@ -1510,7 +1516,7 @@ def test_subword_conjunction_three_words_single_subwords_first_and_last_words_hy
self.assertEqual(doc[2]._.holmes.subwords[1].char_start_index, 1)
self.assertEqual(doc[2]._.holmes.subwords[2].text, 'handel')
- self.assertEqual(doc[2]._.holmes.subwords[2].lemma, 'handeln')
+ self.assertEqual(doc[2]._.holmes.subwords[2].lemma, 'handel')
self.assertEqual(doc[2]._.holmes.subwords[2].index, 2)
self.assertEqual(doc[2]._.holmes.subwords[2].containing_token_index, 4)
self.assertEqual(doc[2]._.holmes.subwords[2].char_start_index, 7)
@@ -1528,14 +1534,14 @@ def test_subword_conjunction_three_words_single_subwords_first_and_last_words_hy
self.assertEqual(doc[4]._.holmes.subwords[1].char_start_index, 1)
self.assertEqual(doc[4]._.holmes.subwords[2].text, 'handel')
- self.assertEqual(doc[4]._.holmes.subwords[2].lemma, 'handeln')
+ self.assertEqual(doc[4]._.holmes.subwords[2].lemma, 'handel')
self.assertEqual(doc[4]._.holmes.subwords[2].index, 2)
self.assertEqual(doc[4]._.holmes.subwords[2].containing_token_index, 4)
self.assertEqual(doc[4]._.holmes.subwords[2].char_start_index, 7)
def test_subword_conjunction_4_words_multiple_subwords_first_and_last_words_hyphenated(self):
- doc = analyzer.parse(
+ doc = nlp(
"Feintextilgroß-, -klein-, -mittel- und -einzeldetailhandel")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Fein')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'fein')
@@ -1589,7 +1595,7 @@ def test_subword_conjunction_4_words_multiple_subwords_first_and_last_words_hyph
doc[0]._.holmes.subwords[3].governing_dependency_label, 'intcompound')
self.assertEqual(doc[0]._.holmes.subwords[4].text, 'handel')
- self.assertEqual(doc[0]._.holmes.subwords[4].lemma, 'handeln')
+ self.assertEqual(doc[0]._.holmes.subwords[4].lemma, 'handel')
self.assertEqual(doc[0]._.holmes.subwords[4].index, 4)
self.assertEqual(doc[0]._.holmes.subwords[4].containing_token_index, 6)
self.assertEqual(doc[0]._.holmes.subwords[4].char_start_index, 13)
@@ -1653,7 +1659,7 @@ def test_subword_conjunction_4_words_multiple_subwords_first_and_last_words_hyph
doc[2]._.holmes.subwords[3].governing_dependency_label, 'intcompound')
self.assertEqual(doc[2]._.holmes.subwords[4].text, 'handel')
- self.assertEqual(doc[2]._.holmes.subwords[4].lemma, 'handeln')
+ self.assertEqual(doc[2]._.holmes.subwords[4].lemma, 'handel')
self.assertEqual(doc[2]._.holmes.subwords[4].index, 4)
self.assertEqual(doc[2]._.holmes.subwords[4].containing_token_index, 6)
self.assertEqual(doc[2]._.holmes.subwords[4].char_start_index, 13)
@@ -1665,6 +1671,8 @@ def test_subword_conjunction_4_words_multiple_subwords_first_and_last_words_hyph
self.assertEqual(
doc[2]._.holmes.subwords[4].governing_dependency_label, None)
+ self.assertTrue(doc[2]._.holmes.is_matchable)
+
self.assertEqual(doc[4]._.holmes.subwords[0].text, 'Fein')
self.assertEqual(doc[4]._.holmes.subwords[0].lemma, 'fein')
self.assertEqual(doc[4]._.holmes.subwords[0].index, 0)
@@ -1691,7 +1699,7 @@ def test_subword_conjunction_4_words_multiple_subwords_first_and_last_words_hyph
doc[4]._.holmes.subwords[1].governing_dependency_label, 'intcompound')
self.assertEqual(doc[4]._.holmes.subwords[2].text, 'mittel')
- self.assertEqual(doc[4]._.holmes.subwords[2].lemma, 'mitteln')
+ self.assertEqual(doc[4]._.holmes.subwords[2].lemma, 'mittel')
self.assertEqual(doc[4]._.holmes.subwords[2].index, 2)
self.assertEqual(doc[4]._.holmes.subwords[2].containing_token_index, 4)
self.assertEqual(doc[4]._.holmes.subwords[2].char_start_index, 1)
@@ -1717,7 +1725,7 @@ def test_subword_conjunction_4_words_multiple_subwords_first_and_last_words_hyph
doc[4]._.holmes.subwords[3].governing_dependency_label, 'intcompound')
self.assertEqual(doc[4]._.holmes.subwords[4].text, 'handel')
- self.assertEqual(doc[4]._.holmes.subwords[4].lemma, 'handeln')
+ self.assertEqual(doc[4]._.holmes.subwords[4].lemma, 'handel')
self.assertEqual(doc[4]._.holmes.subwords[4].index, 4)
self.assertEqual(doc[4]._.holmes.subwords[4].containing_token_index, 6)
self.assertEqual(doc[4]._.holmes.subwords[4].char_start_index, 13)
@@ -1781,7 +1789,7 @@ def test_subword_conjunction_4_words_multiple_subwords_first_and_last_words_hyph
doc[6]._.holmes.subwords[3].governing_dependency_label, 'intcompound')
self.assertEqual(doc[6]._.holmes.subwords[4].text, 'handel')
- self.assertEqual(doc[6]._.holmes.subwords[4].lemma, 'handeln')
+ self.assertEqual(doc[6]._.holmes.subwords[4].lemma, 'handel')
self.assertEqual(doc[6]._.holmes.subwords[4].index, 4)
self.assertEqual(doc[6]._.holmes.subwords[4].containing_token_index, 6)
self.assertEqual(doc[6]._.holmes.subwords[4].char_start_index, 13)
@@ -1795,7 +1803,7 @@ def test_subword_conjunction_4_words_multiple_subwords_first_and_last_words_hyph
def test_inner_hyphens_single_word(self):
- doc = analyzer.parse("Mozart-Symphonien")
+ doc = nlp("Mozart-Symphonien")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Mozart')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'mozart')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
@@ -1823,7 +1831,7 @@ def test_inner_hyphens_single_word(self):
def test_inner_hyphens_single_word_fugen_s(self):
- doc = analyzer.parse("Informations-Extraktion")
+ doc = nlp("Informations-Extraktion")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Information')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'information')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
@@ -1838,24 +1846,24 @@ def test_inner_hyphens_single_word_fugen_s(self):
def test_extraneous_final_hyphen(self):
- doc = analyzer.parse("Mozart- und Leute")
+ doc = nlp("Mozart- und Leute")
self.assertEqual(len(doc[0]._.holmes.subwords), 0)
def test_extraneous_initial_hyphen(self):
- doc = analyzer.parse("Mozart und -Leute")
+ doc = nlp("Mozart und -Leute")
self.assertEqual(len(doc[2]._.holmes.subwords), 0)
def test_hyphen_alone(self):
- doc = analyzer.parse("Mozart und - Leute")
+ doc = nlp("Mozart und - Leute")
self.assertEqual(len(doc[2]._.holmes.subwords), 0)
self.assertEqual(doc[2].text, '-')
self.assertEqual(doc[2]._.holmes.lemma, '-')
def test_inner_hyphens_last_word_hyphenated(self):
- doc = analyzer.parse("Mozart-Symphonien und -Sonaten")
+ doc = nlp("Mozart-Symphonien und -Sonaten")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Mozart')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'mozart')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
@@ -1882,7 +1890,7 @@ def test_inner_hyphens_last_word_hyphenated(self):
def test_inner_hyphens_last_word_hyphenated_fugen_s(self):
- doc = analyzer.parse("Informations-Extraktion und -beurteilung")
+ doc = nlp("Informations-Extraktion und -beurteilung")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Information')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'information')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
@@ -1909,7 +1917,7 @@ def test_inner_hyphens_last_word_hyphenated_fugen_s(self):
def test_inner_hyphens_first_word_hyphenated(self):
- doc = analyzer.parse("Mozart-, Mahler- und Wagner-Symphonien")
+ doc = nlp("Mozart-, Mahler- und Wagner-Symphonien")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Mozart')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'mozart')
self.assertEqual(doc[0]._.holmes.subwords[0].index, 0)
@@ -1948,7 +1956,7 @@ def test_inner_hyphens_first_word_hyphenated(self):
def test_inner_hyphens_first_word_hyphenated_fugen_s(self):
- doc = analyzer.parse("Informations- und Extraktions-Beurteilung")
+ doc = nlp("Informations- und Extraktions-Beurteilung")
self.assertEqual(doc[0]._.holmes.subwords[0].text, 'Information')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'information')
self.assertEqual(doc[0]._.holmes.subwords[0].derived_lemma, None)
@@ -1981,176 +1989,206 @@ def test_inner_hyphens_first_word_hyphenated_fugen_s(self):
def test_conjunction_switched_round_with_hyphenated_subword_expression(self):
- doc = analyzer.parse(
+ doc = nlp(
"Ein Informationsextraktions- und Besprechungspaket wird aufgelöst")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:oa; 3:oa')
def test_conjunction_switched_round_with_hyphenated_subword_expression_and_relative_clause(self):
- doc = analyzer.parse(
+ doc = nlp(
"Das Informationsextraktions- und Besprechungspaket, welches aufgelöst wurde")
self.assertEqual(
doc[6]._.holmes.string_representation_of_children(), '1:oa(U); 3:oa')
def test_subword_is_abbreviation_no_error_thrown(self):
- doc = analyzer.parse("Briljanten")
+ doc = nlp("Briljanten")
def test_derived_lemma_from_dictionary(self):
- doc = analyzer.parse("Er schießt.")
+ doc = nlp("Er schießt.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'schuss')
def test_derived_lemma_root_word_from_dictionary(self):
- doc = analyzer.parse("Der Schuss war laut.")
+ doc = nlp("Der Schuss war laut.")
self.assertEqual(doc[1]._.holmes.derived_lemma, None)
def test_derived_lemma_ung(self):
- doc = analyzer.parse("Eine hohe Regung.")
+ doc = nlp("Eine hohe Regung.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'regen')
def test_derived_lemma_lung(self):
- doc = analyzer.parse("Die Drosselung.")
+ doc = nlp("Die Drosselung.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'drosseln')
def test_derived_lemma_ierung(self):
- doc = analyzer.parse("Die Validierung.")
+ doc = nlp("Die Validierung.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'validation')
def test_derived_lemma_ieren(self):
- doc = analyzer.parse("Wir validieren das.")
+ doc = nlp("Wir validieren das.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'validation')
def test_derived_lemma_rung(self):
- doc = analyzer.parse("Eine Behinderung.")
+ doc = nlp("Eine Behinderung.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'behindern')
def test_derived_lemma_ung_blacklist_direct(self):
- doc = analyzer.parse("Der Nibelung.")
+ doc = nlp("Der Nibelung.")
self.assertEqual(doc[1]._.holmes.derived_lemma, None)
def test_derived_lemma_heit(self):
- doc = analyzer.parse("Die ganze Schönheit.")
+ doc = nlp("Die ganze Schönheit.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'schön')
def test_derived_lemma_keit(self):
- doc = analyzer.parse("Seine Langlebigkeit.")
+ doc = nlp("Seine Langlebigkeit.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'langlebig')
def test_derived_lemma_chen_no_change(self):
- doc = analyzer.parse("Das Tischchen.")
+ doc = nlp("Das Tischchen.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'tisch')
def test_derived_lemma_lein_no_change(self):
- doc = analyzer.parse("Das Tischlein.")
+ doc = nlp("Das Tischlein.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'tisch')
def test_derived_lemma_chen_umlaut(self):
- doc = analyzer.parse("Das kleine Bäuchchen.")
+ doc = nlp("Das kleine Bäuchchen.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'bauch')
def test_derived_lemma_four_letter_ending_ch(self):
- doc = analyzer.parse("Das Dach.")
+ doc = nlp("Das Dach.")
self.assertEqual(doc[1]._.holmes.derived_lemma, None)
def test_derived_lemma_lein_umlaut(self):
- doc = analyzer.parse("Das kleine Bäuchlein.")
+ doc = nlp("Das kleine Bäuchlein.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'bauch')
def test_derived_lemma_chen_5_chars(self):
- doc = analyzer.parse("Das kleine Öchen.")
+ doc = nlp("Das kleine Öchen.")
self.assertEqual(doc[2]._.holmes.derived_lemma, None)
def test_derived_lemma_chen_4_chars(self):
- doc = analyzer.parse("Das kleine Chen.")
+ doc = nlp("Das kleine Chen.")
self.assertEqual(doc[2]._.holmes.derived_lemma, None)
def test_derived_lemma_chen_no_umlaut_change(self):
- doc = analyzer.parse("Das kleine Löffelchen.")
+ doc = nlp("Das kleine Löffelchen.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'löffel')
def test_derived_lemma_lein_no_umlaut_change_l_ending(self):
- doc = analyzer.parse("Das kleine Löffelein.")
+ doc = nlp("Das kleine Löffelein.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'löffel')
def test_derived_lemma_lein_l_ending(self):
- doc = analyzer.parse("Das kleine Schakalein.")
+ doc = nlp("Das kleine Schakalein.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'schakal')
def test_derived_lemma_e(self):
- doc = analyzer.parse("Das große Auge.")
+ doc = nlp("Das große Auge.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'aug')
def test_derived_lemma_e_with_preceding_vowel(self):
- doc = analyzer.parse("Die große Lappalie.")
+ doc = nlp("Die große Lappalie.")
self.assertEqual(doc[2]._.holmes.derived_lemma, None)
def test_derived_lemma_e_1_char(self):
- doc = analyzer.parse("Das große E.")
+ doc = nlp("Das große E.")
self.assertEqual(doc[2]._.holmes.derived_lemma, None)
def test_derived_lemma_subword_positive_case(self):
- doc = analyzer.parse("Informierensextraktion.")
+ doc = nlp("Informierensextraktion.")
self.assertEqual(
doc[0]._.holmes.subwords[0].derived_lemma, 'information')
def test_derived_lemma_subword_negative_case(self):
- doc = analyzer.parse("Elefantenschau.")
+ doc = nlp("Elefantenschau.")
self.assertEqual(doc[0]._.holmes.subwords[0].derived_lemma, None)
def test_derived_lemma_subword_conjunction_first_word(self):
- doc = analyzer.parse("Fitness- und Freizeitsjogging.")
+ doc = nlp("Fitness- und Freizeitsjogging.")
self.assertEqual(doc[0]._.holmes.subwords[1].derived_lemma, 'joggen')
def test_derived_lemma_subword_conjunction_last_word(self):
- doc = analyzer.parse("Investitionsanfänge und -auswirkungen.")
+ doc = nlp("Investitionsanfänge und -auswirkungen.")
self.assertEqual(
doc[0]._.holmes.subwords[0].derived_lemma, 'investieren')
def test_derived_lemma_lung_after_consonant(self):
- doc = analyzer.parse("Verwandlung.")
+ doc = nlp("Verwandlung.")
self.assertEqual(doc[0]._.holmes.derived_lemma, 'verwandeln')
def test_derived_lemma_ierung_without_ation(self):
- doc = analyzer.parse("Bilanzierung.")
+ doc = nlp("Bilanzierung.")
self.assertEqual(doc[0]._.holmes.derived_lemma, 'bilanzieren')
def test_derived_lemma_lung_after_vowel_sound(self):
- doc = analyzer.parse("Erzählung.")
+ doc = nlp("Erzählung.")
self.assertEqual(doc[0]._.holmes.derived_lemma, 'erzählen')
+ def test_participle_lemma_adja(self):
+ doc = nlp("Für die studierten Kinder.")
+ self.assertEqual(doc[2]._.holmes.lemma, 'studieren')
+
+ def test_participle_lemma_adjd(self):
+ doc = nlp("Das Kind war studiert.")
+ self.assertEqual(doc[3]._.holmes.lemma, 'studieren')
+
def test_non_recorded_subword_alone(self):
- doc = analyzer.parse('Messerlein.')
+ doc = nlp('Messerlein.')
self.assertEqual(len(doc[0]._.holmes.subwords), 0)
def test_non_recorded_subword_at_end(self):
- doc = analyzer.parse('Informationsmesserlein.')
+ doc = nlp('Informationsmesserlein.')
self.assertEqual(len(doc[0]._.holmes.subwords), 2)
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'information')
self.assertEqual(doc[0]._.holmes.subwords[1].lemma, 'messer')
def test_non_recorded_subword_in_middle(self):
- doc = analyzer.parse('Messerleininformation.')
+ doc = nlp('Messerleininformation.')
self.assertEqual(len(doc[0]._.holmes.subwords), 2)
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'messer')
self.assertEqual(doc[0]._.holmes.subwords[1].lemma, 'information')
def test_non_recorded_subword_at_beginning(self):
- doc = analyzer.parse('Leinmesserinformation.')
+ doc = nlp('Leinmesserinformation.')
self.assertEqual(len(doc[0]._.holmes.subwords), 2)
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'messer')
self.assertEqual(doc[0]._.holmes.subwords[1].lemma, 'information')
def test_non_recorded_subword_as_first_member_of_compound(self):
- doc = analyzer.parse('Messerlein- und Tellerleingespräche.')
+ doc = nlp('Messerlein- und Tellerleingespräche.')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'messer')
self.assertEqual(doc[0]._.holmes.subwords[1].lemma, 'gespräch')
self.assertEqual(doc[2]._.holmes.subwords[0].lemma, 'teller')
self.assertEqual(doc[2]._.holmes.subwords[1].lemma, 'gespräch')
def test_non_recorded_subword_as_second_member_of_compound(self):
- doc = analyzer.parse('Nahrungsmesserlein und -tellerlein.')
+ doc = nlp('Nahrungsmesserlein und -tellerlein.')
self.assertEqual(doc[0]._.holmes.subwords[0].lemma, 'nahrung')
self.assertEqual(doc[0]._.holmes.subwords[1].lemma, 'messer')
self.assertEqual(doc[2]._.holmes.subwords[0].lemma, 'nahrung')
self.assertEqual(doc[2]._.holmes.subwords[1].lemma, 'teller')
+
+ def test_question_word_initial(self):
+ doc = nlp("Wem hast Du geholfen?")
+ self.assertTrue(doc[0]._.holmes.is_initial_question_word)
+
+ def test_question_word_after_preposition(self):
+ doc = nlp("Mit wem hast Du gesprochen?")
+ self.assertTrue(doc[1]._.holmes.is_initial_question_word)
+
+ def test_question_word_in_complex_phrase(self):
+ doc = nlp("Auf der Basis welcher Information bist Du gekommen?")
+ self.assertTrue(doc[3]._.holmes.is_initial_question_word)
+
+ def test_question_word_control_1(self):
+ doc = nlp(". Wem hast Du geholfen?")
+ for token in doc:
+ self.assertFalse(token._.holmes.is_initial_question_word)
+
+ def test_question_word_control_2(self):
+ doc = nlp("Du bist gekommen wegen wem?")
+ for token in doc:
+ self.assertFalse(token._.holmes.is_initial_question_word)
diff --git a/holmes_extractor/tests/de/test_structural_matching_DE.py b/tests/de/test_structural_matching_DE.py
similarity index 66%
rename from holmes_extractor/tests/de/test_structural_matching_DE.py
rename to tests/de/test_structural_matching_DE.py
index 63838ae..ef34120 100644
--- a/holmes_extractor/tests/de/test_structural_matching_DE.py
+++ b/tests/de/test_structural_matching_DE.py
@@ -5,7 +5,8 @@
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
(script_directory, 'test_ontology.owl')))
-holmes_manager = holmes.Manager(model='de_core_news_md', ontology=ontology)
+holmes_manager = holmes.Manager(model='de_core_news_lg', ontology=ontology,
+ number_of_workers=2)
holmes_manager.register_search_phrase("Ein Hund jagt eine Katze")
holmes_manager.register_search_phrase("Ein Hund jagt einen Bären")
holmes_manager.register_search_phrase("Ein Hund frisst einen Knochen")
@@ -46,22 +47,27 @@
"Wort-Mit-Bindestrich-Nicht-In-Ontologie")
holmes_manager.register_search_phrase("Wortohnebindestrichnichtinontologie")
holmes_manager.register_search_phrase("Information eines Messers")
+holmes_manager.register_search_phrase("Eine verkaufte Reise")
+holmes_manager.register_search_phrase("Jemand wohnt in einem ENTITYLOC")
holmes_manager_with_variable_search_phrases = holmes.Manager(
- model='de_core_news_md')
-holmes_manager_with_embeddings = holmes.Manager(model='de_core_news_md',
+ model='de_core_news_lg', number_of_workers=2)
+holmes_manager_with_embeddings = holmes.Manager(model='de_core_news_lg',
overall_similarity_threshold=0.7, perform_coreference_resolution=False,
- embedding_based_matching_on_root_words=True)
+ embedding_based_matching_on_root_words=True,
+ use_reverse_dependency_matching=False,
+ number_of_workers=1)
holmes_manager_with_embeddings.register_search_phrase(
"Ein Mann sieht einen großen Hund")
holmes_manager_with_embeddings.register_search_phrase("Der Himmel ist grün")
holmes_manager_with_embeddings.register_search_phrase("Ein König tritt zurück")
holmes_manager_with_embeddings.register_search_phrase(
"Die Abdankung eines Königs")
-holmes_manager_with_embeddings.register_search_phrase("Informationskönig")
-holmes_manager_with_embeddings.register_search_phrase("Teller")
+holmes_manager_with_embeddings.register_search_phrase("Erholung")
holmes_manager_with_embeddings.register_search_phrase("herabfallen")
holmes_manager_with_embeddings.register_search_phrase(
- "Jemand isst von einem Jeden")
+ "Jemand geht auf eine Kur")
+holmes_manager_with_embeddings.register_search_phrase(
+ "Der König einer Stadt")
class GermanStructuralMatchingTest(unittest.TestCase):
@@ -74,17 +80,17 @@ def _get_matches(self, holmes_manager, text):
def test_direct_matching(self):
matches = self._get_matches(holmes_manager, "Der Hund jagte die Katze")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_negated)
- self.assertFalse(matches[0].is_uncertain)
- self.assertEqual(matches[0].search_phrase_label,
+ self.assertFalse(matches[0]['negated'])
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertEqual(matches[0]['search_phrase_label'],
"Ein Hund jagt eine Katze")
def test_matching_with_negation_in_subordinate_clause(self):
matches = self._get_matches(holmes_manager,
"Es wurde nie behauptet, dass ein Hund eine Katze gejagt hatte.")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_negated)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['negated'])
+ self.assertFalse(matches[0]['uncertain'])
def test_nouns_inverted(self):
matches = self._get_matches(holmes_manager, "Die Katze jagte den Hund")
@@ -98,64 +104,64 @@ def test_verb_negation(self):
matches = self._get_matches(
holmes_manager, "Der Hund jagte die Katze nicht")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_negated)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['negated'])
+ self.assertFalse(matches[0]['uncertain'])
def test_noun_phrase_negation(self):
matches = self._get_matches(
holmes_manager, "Kein Hund jagte keine Katze")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_negated)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['negated'])
+ self.assertFalse(matches[0]['uncertain'])
def test_irrelevant_negation(self):
matches = self._get_matches(
holmes_manager, "Der nicht alte Hund jagte die Katze")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_negated)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['negated'])
+ self.assertFalse(matches[0]['uncertain'])
def test_adjective_swapping(self):
matches = self._get_matches(holmes_manager, "Der schlaue Mann")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(holmes_manager, "Der Mann war reich")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_adjective_swapping_with_conjunction(self):
matches = self._get_matches(
holmes_manager, "Der schlaue und schlaue Mann")
self.assertEqual(len(matches), 2)
- self.assertFalse(matches[0].is_uncertain)
- self.assertFalse(matches[1].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertFalse(matches[1]['uncertain'])
matches = self._get_matches(
holmes_manager, "Der Mann war reich und reich")
self.assertEqual(len(matches), 2)
- self.assertFalse(matches[0].is_uncertain)
- self.assertFalse(matches[1].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertFalse(matches[1]['uncertain'])
def test_conjunction_with_and(self):
matches = self._get_matches(holmes_manager,
"Der Hund und der Hund jagten die Katze und eine Katze")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_conjunction_with_or(self):
matches = self._get_matches(holmes_manager,
"Der Hund oder der Hund jagten die Katze und eine Katze")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertTrue(text_match.is_uncertain)
+ self.assertTrue(text_match['uncertain'])
def test_threeway_conjunction_with_or(self):
matches = self._get_matches(holmes_manager,
"Der Hund, der Hund oder der Hund jagten die Katze und eine Katze")
self.assertEqual(len(matches), 6)
for text_match in matches:
- self.assertTrue(text_match.is_uncertain)
+ self.assertTrue(text_match['uncertain'])
def test_generic_pronoun_with_auxiliary(self):
matches = self._get_matches(holmes_manager, "Ein Berg wurde gesehen")
@@ -169,65 +175,65 @@ def test_active(self):
matches = self._get_matches(
holmes_manager, "Der Hund wird die Katze jagen")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(
holmes_manager, "Der Hund hatte die Katze gejagt")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_passive_with_von(self):
matches = self._get_matches(
holmes_manager, "Die Katze wird vom Hund gejagt")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(
holmes_manager, "Die Katze wird vom Hund gejagt werden")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(
holmes_manager, "Die Katze war vom Hund gejagt worden")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(
holmes_manager, "Die Katze wird vom Hund gejagt worden sein")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_passive_with_durch(self):
matches = self._get_matches(
holmes_manager, "Die Katze wird durch den Hund gejagt")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(
holmes_manager, "Die Katze wird durch den Hund gejagt werden")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(
holmes_manager, "Die Katze war durch den Hund gejagt worden")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(holmes_manager,
"Die Katze wird durch den Hund gejagt worden sein")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_modal(self):
matches = self._get_matches(
holmes_manager, "Der Hund könnte eine Katze jagen")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_tricky_passive(self):
matches = self._get_matches(
holmes_manager, "Warum der Berg gesehen wurde, ist unklar")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_relative_pronoun_nominative(self):
matches = self._get_matches(
holmes_manager, "Der Hund, der die Katze jagte, war müde")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_relative_pronoun_nominative_inverted(self):
matches = self._get_matches(
@@ -238,89 +244,89 @@ def test_relative_pronoun_nominative_with_conjunction(self):
matches = self._get_matches(holmes_manager,
"Der Hund, der die Katze und die Katze jagte, war müde")
self.assertEqual(len(matches), 2)
- self.assertFalse(matches[0].is_uncertain)
- self.assertFalse(matches[1].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertFalse(matches[1]['uncertain'])
def test_relative_pronoun_nominative_with_passive(self):
matches = self._get_matches(holmes_manager,
"Die Katze, die vom Hund gejagt wurde, war müde")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_relative_pronoun_accusative(self):
matches = self._get_matches(
holmes_manager, "Der Bär, den der Hund jagte, war müde")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_separable_verb(self):
matches = self._get_matches(
holmes_manager, "Die Studenten werden ausgehen")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
- self.assertEqual(matches[0].search_phrase_label, "excursion")
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertEqual(matches[0]['search_phrase_label'], "excursion")
def test_von_phrase_matches_genitive_phrase(self):
matches = self._get_matches(
holmes_manager, "Der Abschluss von einer Versicherung")
self.assertEqual(len(matches), 2)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_von_phrase_matches_genitive_phrase_with_coordination(self):
matches = self._get_matches(holmes_manager,
"Der Abschluss und der Abschluss von einer Versicherung und einer Versicherung")
self.assertEqual(len(matches), 4)
for match in matches:
- self.assertFalse(match.is_uncertain)
+ self.assertFalse(match['uncertain'])
def test_genitive_phrase_matches_von_phrase(self):
matches = self._get_matches(
holmes_manager, "Die Kündigung einer Versicherung")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_genitive_phrase_matches_von_phrase_with_coordination(self):
matches = self._get_matches(holmes_manager,
"Die Kündigung einer Versicherung und einer Versicherung")
self.assertEqual(len(matches), 2)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_subjective_zu_clause_complement_with_conjunction_active(self):
matches = self._get_matches(holmes_manager,
"Der Hund und der Löwe entschlossen sich, eine Katze und eine Maus zu jagen")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_adjective_complement_with_conjunction_active(self):
matches = self._get_matches(holmes_manager,
"Der Hund war darüber besorgt, eine Katze und eine Maus zu jagen")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_passive_governing_clause_zu_clause_complement_with_conjunction_active(self):
matches = self._get_matches(holmes_manager,
"Dem Hund und dem Löwen wurde vorgeschlagen, eine Katze und eine Maus zu jagen")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_verb_complement_simple_passive(self):
matches = self._get_matches(holmes_manager,
"Die Katze dachte darüber nach, von einem Hund gejagt zu werden")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_subjective_zu_clause_complement_simple_passive(self):
matches = self._get_matches(holmes_manager,
"Die Katze entschied, vom Hund gejagt zu werden")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_um_zu_clause_complement_with_conjunction_passive(self):
matches = self._get_matches(holmes_manager,
"Die Katze benutzte den Elefant und die Maus, um vom Hund und Löwen gejagt zu werden")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_passive_search_phrase_with_active_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -329,7 +335,7 @@ def test_passive_search_phrase_with_active_searched_sentence(self):
matches = self._get_matches(holmes_manager_with_variable_search_phrases,
"Der Hund wird die Katze jagen")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_passive_search_phrase_with_active_conjunction_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -339,7 +345,7 @@ def test_passive_search_phrase_with_active_conjunction_searched_sentence(self):
"Der Hund und der Hund haben die Katze und die Katze gejagt")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_passive_search_phrase_with_passive_conjunction_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -349,7 +355,7 @@ def test_passive_search_phrase_with_passive_conjunction_searched_sentence(self):
"Die Katze und die Katze werden von einem Hund und einem Hund gejagt werden")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_passive_search_phrase_with_negated_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -358,8 +364,8 @@ def test_passive_search_phrase_with_negated_searched_sentence(self):
matches = self._get_matches(holmes_manager_with_variable_search_phrases,
"Der Hund jagte die Katze nie")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
- self.assertTrue(matches[0].is_negated)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertTrue(matches[0]['negated'])
def test_question_search_phrase_with_active_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -368,7 +374,7 @@ def test_question_search_phrase_with_active_searched_sentence(self):
matches = self._get_matches(holmes_manager_with_variable_search_phrases,
"Der Hund wird den Knochen fressen")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_question_search_phrase_with_active_conjunction_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -378,7 +384,7 @@ def test_question_search_phrase_with_active_conjunction_searched_sentence(self):
"Der Hund und der Hund haben einen Knochen und einen Knochen gefressen")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_question_search_phrase_with_passive_conjunction_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -388,7 +394,7 @@ def test_question_search_phrase_with_passive_conjunction_searched_sentence(self)
"Der Knochen und der Knochen werden von einem Hund und einem Hund gefressen werden")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_question_search_phrase_with_negated_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -397,8 +403,8 @@ def test_question_search_phrase_with_negated_searched_sentence(self):
matches = self._get_matches(holmes_manager_with_variable_search_phrases,
"Der Hund fraß den Knochen nie")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
- self.assertTrue(matches[0].is_negated)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertTrue(matches[0]['negated'])
def test_original_search_phrase_root_not_matchable(self):
matches = self._get_matches(
@@ -433,16 +439,6 @@ def test_entity_token_does_not_match_subwords(self):
holmes_manager_with_variable_search_phrases.register_search_phrase(
"Ein ENTITYMISC")
- def test_entitynoun_as_non_root_node(self):
- matches = self._get_matches(
- holmes_manager, "Das Fahrzeug hat einen Fehler.")
- holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
- holmes_manager_with_variable_search_phrases.register_search_phrase(
- "Ich sah ein ENTITYNOUN")
- matches = self._get_matches(holmes_manager_with_variable_search_phrases,
- "Ich sah einen Hund und eine Elefantenkatze")
- self.assertEqual(len(matches), 2)
-
def test_separable_verb_in_main_and_dependent_clauses(self):
matches = self._get_matches(holmes_manager,
"Der Mitarbeiter hatte vor, eine Versicherung abzuschließen.")
@@ -452,22 +448,16 @@ def test_matching_additional_preposition_dependency_on_verb(self):
matches = self._get_matches(holmes_manager,
"Der Mitarbeiter braucht eine Versicherung für die nächsten fünf Jahre")
self.assertEqual(len(matches), 3)
- for match in matches:
- if len(match.word_matches) == 5:
- self.assertFalse(match.is_uncertain)
- else:
- self.assertTrue(match.is_uncertain)
- self.assertEqual(len(match.word_matches), 4)
def test_involves_coreference_false(self):
holmes_manager.remove_all_documents()
holmes_manager.parse_and_register_document(
"Ein Hund jagte eine Katze.")
matches = holmes_manager.match()
- self.assertFalse(matches[0].involves_coreference)
- self.assertFalse(matches[0].word_matches[0].involves_coreference)
- self.assertFalse(matches[0].word_matches[0].involves_coreference)
- self.assertFalse(matches[0].word_matches[0].involves_coreference)
+ self.assertFalse(matches[0]['involves_coreference'])
+ self.assertFalse(matches[0]['word_matches'][0]['involves_coreference'])
+ self.assertFalse(matches[0]['word_matches'][0]['involves_coreference'])
+ self.assertFalse(matches[0]['word_matches'][0]['involves_coreference'])
def test_empty_string_does_not_match_entity_search_phrase_token(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -497,7 +487,7 @@ def test_adjective_verb_phrase_as_search_phrase_matches_simple(self):
matches = self._get_matches(holmes_manager,
"Der Urlaub war sehr schwer zu buchen")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_adjective_verb_phrase_as_search_phrase_no_match_with_normal_phrase(self):
matches = self._get_matches(holmes_manager,
@@ -509,59 +499,59 @@ def test_adjective_verb_phrase_as_search_phrase_matches_compound(self):
"Der Urlaub und der Urlaub waren sehr schwer und schwer zu buchen und zu buchen")
self.assertEqual(len(matches), 8)
for match in matches:
- self.assertFalse(match.is_uncertain)
+ self.assertFalse(match['uncertain'])
def test_objective_adjective_verb_phrase_separate_zu_matches_normal_search_phrase_simple(self):
matches = self._get_matches(holmes_manager,
"Die Versicherung war sehr schwer zu finden")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_objective_adjective_verb_phrase_separate_zu_matches_normal_search_phrase_compound(self):
matches = self._get_matches(holmes_manager,
"Die Versicherung und die Versicherung waren sehr schwer und schwer zu finden und zu finden")
self.assertEqual(len(matches), 4)
for match in matches:
- self.assertTrue(match.is_uncertain)
+ self.assertTrue(match['uncertain'])
def test_objective_adjective_verb_phrase_integrated_zu_matches_normal_search_phrase_simple(self):
matches = self._get_matches(holmes_manager,
"Die Versicherung war sehr schwer abzuschließen")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_objective_adjective_verb_phrase_integrated_zu_matches_normal_search_phrase_compound(self):
matches = self._get_matches(holmes_manager,
"Die Versicherung und die Versicherung waren sehr schwer und schwer abzuschließen und abzuschließen")
self.assertEqual(len(matches), 4)
for match in matches:
- self.assertTrue(match.is_uncertain)
+ self.assertTrue(match['uncertain'])
def test_subjective_adjective_verb_phrase_separate_zu_matches_normal_search_phrase_simple(self):
matches = self._get_matches(holmes_manager,
"Der Mann war sehr froh zu singen")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_subjective_adjective_verb_phrase_separate_zu_matches_normal_search_phrase_compound(self):
matches = self._get_matches(holmes_manager,
"Der Mann und der Mann waren sehr froh zu singen und zu singen")
self.assertEqual(len(matches), 4)
for match in matches:
- self.assertTrue(match.is_uncertain)
+ self.assertTrue(match['uncertain'])
def test_subjective_adjective_verb_phrase_integrated_zu_matches_normal_search_phrase_simple(self):
matches = self._get_matches(holmes_manager,
"Der Mann war sehr froh auszugehen")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_subjective_adjective_verb_phrase_integrated_zu_matches_normal_search_phrase_compound(self):
matches = self._get_matches(holmes_manager,
"Der Mann und der Mann waren sehr froh auszugehen")
self.assertEqual(len(matches), 2)
for match in matches:
- self.assertTrue(match.is_uncertain)
+ self.assertTrue(match['uncertain'])
def test_german_embeddings(self):
matches = self._get_matches(holmes_manager_with_embeddings,
@@ -632,163 +622,163 @@ def test_two_subwords_filling_same_word(self):
matches = self._get_matches(holmes_manager,
"Informationsextraktion")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 1)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 0)
def test_two_subwords_at_beginning_of_same_word(self):
matches = self._get_matches(holmes_manager,
"Informationsextraktionsmaßnahmen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 1)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 0)
def test_two_subwords_at_end_of_same_word(self):
matches = self._get_matches(holmes_manager,
"Maßnahmeninformationsextraktion")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
def test_two_subwords_in_different_words(self):
matches = self._get_matches(holmes_manager,
"Maßnahmenextraktion der Maßnahmeninformation")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 1)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
def test_two_subwords_two_word_conjunction_first_element(self):
matches = self._get_matches(holmes_manager,
"Informationsentnahme und -extraktion")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 0)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 0)
def test_two_subwords_three_word_conjunction_first_element(self):
matches = self._get_matches(holmes_manager,
"Informationsentnahme, -extraktion und -freude")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 0)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 0)
def test_two_subwords_two_word_conjunction_last_element(self):
matches = self._get_matches(holmes_manager,
"Informations- und Entnahmeextraktion")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 0)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 0)
def test_two_subwords_three_word_conjunction_last_element(self):
matches = self._get_matches(holmes_manager,
"Freude-, Informations- und Entnahmeextraktion")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 4)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 0)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 2)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 2)
def test_two_subwords_in_middle_element(self):
matches = self._get_matches(holmes_manager,
"Freudeverwaltungs--, -informationsextraktions- und -entnahmeverwaltung")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 0)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 2)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 2)
def test_three_subwords_filling_same_word_initial_position(self):
matches = self._get_matches(holmes_manager,
"Informationsbeschaffungsmaßnahmen waren das, worüber wir sprachen.")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
def test_three_subwords_filling_same_word_later_position(self):
matches = self._get_matches(holmes_manager,
"Wir redeten über Informationsbeschaffungsmaßnahmen.")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 3)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 3)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 3)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 3)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 3)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 3)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
def test_three_subwords_filling_same_word_beginning_of_word(self):
matches = self._get_matches(holmes_manager,
"Informationsbeschaffungsmaßnahmenextraktion.")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
def test_three_subwords_filling_same_word_end_of_word(self):
matches = self._get_matches(holmes_manager,
"Extraktionsinformationsbeschaffungsmaßnahmen.")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 3)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 2)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 3)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 1)
def test_three_subwords_split_two_one(self):
matches = self._get_matches(holmes_manager,
"Maßnahmen der Informationsbeschaffung")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].document_subword, None)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], None)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 2)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 2)
def test_three_subwords_split_two_one_with_more_subwords(self):
matches = self._get_matches(holmes_manager,
"Extraktionsmaßnahmen der Extraktionsinformationsbeschaffung")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 0)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 2)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 1)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 2)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 2)
def test_three_subwords_split_one_two(self):
matches = self._get_matches(holmes_manager,
@@ -804,192 +794,196 @@ def test_three_subwords_two_word_conjunction_first_elements_two_one(self):
matches = self._get_matches(holmes_manager,
"Informationsbeschaffungsprobleme und -maßnahmen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 0)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 0)
def test_three_subwords_three_word_conjunction_first_elements_two_one(self):
matches = self._get_matches(holmes_manager,
"Informationsbeschaffungsprobleme, -maßnahmen und -interessen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 0)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 0)
def test_three_subwords_two_word_conjunction_first_elements_one_two(self):
matches = self._get_matches(holmes_manager,
"Informationsprobleme und -beschaffungsmaßnahmen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 0)
def test_three_subwords_three_word_conjunction_first_elements_one_two(self):
matches = self._get_matches(holmes_manager,
"Informationsprobleme, -interessen und -beschaffungsmaßnahmen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 4)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 4)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 4)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 4)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 4)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 0)
def test_three_subwords_two_word_conjunction_last_elements_one_two(self):
matches = self._get_matches(holmes_manager,
"Informations- und Interessenbeschaffungsmaßnahmen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 0)
def test_three_subwords_three_word_conjunction_last_elements_one_two(self):
matches = self._get_matches(holmes_manager,
"Informations-, Problem- und Interessenbeschaffungsmaßnahmen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 4)
- self.assertEqual(matches[0].word_matches[0].document_word, 'maßnahme')
- self.assertEqual(matches[0].word_matches[1].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][0]['document_word'], 'maßnahme')
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 4)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 4)
self.assertEqual(
- matches[0].word_matches[1].document_word, 'beschaffung')
- self.assertEqual(matches[0].word_matches[2].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_word'], 'beschaffung')
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_word, 'Information')
+ matches[0]['word_matches'][2]['document_word'], 'Information')
def test_three_subwords_two_word_conjunction_last_elements_two_one(self):
matches = self._get_matches(holmes_manager,
"Informationsbeschaffungs- und Interessenmaßnahmen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 0)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 0)
def test_three_subwords_three_word_conjunction_last_elements_two_one(self):
matches = self._get_matches(holmes_manager,
"Informationsbeschaffungs-, Problem- und Interessenmaßnahmen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 4)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 0)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 0)
def test_three_subwords_three_word_conjunction(self):
matches = self._get_matches(holmes_manager,
"Informationsinteressen, -beschaffungs- und Problemmaßnahmen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 4)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 2)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 0)
def test_three_subwords_three_word_conjunction_with_other_words(self):
matches = self._get_matches(holmes_manager,
"Informationsinteressen, -interessen-, -beschaffungs-, -interessen- und Problemmaßnahmen")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 4)
- self.assertEqual(matches[0].word_matches[0].document_subword.index, 2)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][0]['document_subword_index'], 2)
self.assertEqual(
- matches[0].word_matches[0].document_subword.containing_token_index, 8)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 4)
- self.assertEqual(matches[0].word_matches[1].document_subword.index, 1)
+ matches[0]['word_matches'][0]['document_subword_containing_token_index'], 8)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][1]['document_subword_index'], 1)
self.assertEqual(
- matches[0].word_matches[1].document_subword.containing_token_index, 4)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 4)
- self.assertEqual(matches[0].word_matches[2].document_subword.index, 0)
+ matches[0]['word_matches'][1]['document_subword_containing_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][2]['document_subword_index'], 0)
self.assertEqual(
- matches[0].word_matches[2].document_subword.containing_token_index, 0)
+ matches[0]['word_matches'][2]['document_subword_containing_token_index'], 0)
def test_uncertain_subword_match_with_or_conjunction(self):
matches = self._get_matches(holmes_manager,
"Informationsinteressen oder -extraktion")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_embedding_match_on_root_subword(self):
matches = self._get_matches(holmes_manager_with_embeddings,
- "Ein Informationskönig")
+ "Stadtprinz")
self.assertEqual(len(matches), 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_word'], 'prinz')
+ self.assertEqual(matches[0]['word_matches'][1]['document_word'], 'Stadt')
def test_embedding_match_on_non_root_subword(self):
matches = self._get_matches(holmes_manager_with_embeddings,
"Die Prinzenabdankung")
self.assertEqual(len(matches), 1)
+ self.assertEqual(matches[0]['word_matches'][0]['document_word'], 'abdankung')
+ self.assertEqual(matches[0]['word_matches'][1]['document_word'], 'prinz')
def test_ontology_matching_with_subwords(self):
matches = self._get_matches(holmes_manager,
@@ -1066,11 +1060,26 @@ def test_hyphenation_11(self):
"Wortmitbindestrichnichtinontologie")
self.assertEqual(len(matches), 1)
- def test_hyphenation_10(self):
+ def test_hyphenation_12(self):
matches = self._get_matches(holmes_manager,
"Wort-Ohne-Bindestrich-Nicht-In-Ontologie")
self.assertEqual(len(matches), 1)
+ def test_reverse_dependency_subword_in_document_1(self):
+ matches = self._get_matches(holmes_manager,
+ "Reiseverkauf")
+ self.assertEqual(len(matches), 1)
+
+ def test_reverse_dependency_subword_in_document_2(self):
+ matches = self._get_matches(holmes_manager,
+ "Reise- und Reiseverkauf")
+ self.assertEqual(len(matches), 2)
+
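+ # The test name suggests that holmes_manager_with_embeddings is configured elsewhere in
+ # this module with use_reverse_dependency_matching=False, which is presumably why no
+ # matches are expected below.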
+ def test_reverse_dependency_subword_in_document_use_reverse_dependency_matching_false(self):
+ matches = self._get_matches(holmes_manager_with_embeddings,
+ "Reise- und Reiseverkauf")
+ self.assertEqual(len(matches), 0)
+
def test_matching_across_non_reported_subword(self):
matches = self._get_matches(holmes_manager,
"Messerleininformation")
@@ -1078,8 +1087,13 @@ def test_matching_across_non_reported_subword(self):
def test_no_embedding_match_word_too_short(self):
matches = self._get_matches(holmes_manager_with_embeddings,
- "Ein Jeden hat das gemacht")
- self.assertEqual(len(matches), 0)
+ "Sie ging auf eine Erholung")
+ self.assertEqual(len(matches), 1)
+
+ def test_no_embedding_match_word_too_short_control(self):
+ matches = self._get_matches(holmes_manager_with_embeddings,
+ "Sie ging auf eine Kur")
+ self.assertEqual(len(matches), 1)
def test_no_embedding_wrong_pos(self):
matches = self._get_matches(holmes_manager_with_embeddings,
@@ -1088,10 +1102,15 @@ def test_no_embedding_wrong_pos(self):
def test_no_embedding_subword_too_short(self):
matches = self._get_matches(holmes_manager_with_embeddings,
- "Jedeninteresse")
+ "Kurinteresse")
self.assertEqual(len(matches), 0)
def test_no_embedding_search_phrase_word_too_short(self):
matches = self._get_matches(holmes_manager_with_embeddings,
- "Jemand isst von einem Teller")
+ "Jemand geht auf eine Kur")
+ self.assertEqual(len(matches), 1)
+
+ def test_recursion_problem_solved(self):
+ matches = self._get_matches(holmes_manager,
+ "Ich wohne seit 2010 in München")
self.assertEqual(len(matches), 1)
diff --git a/tests/de/test_structural_matching_with_coreference_DE.py b/tests/de/test_structural_matching_with_coreference_DE.py
new file mode 100644
index 0000000..ce2c221
--- /dev/null
+++ b/tests/de/test_structural_matching_with_coreference_DE.py
@@ -0,0 +1,363 @@
+import unittest
+import holmes_extractor as holmes
+import os
+
+script_directory = os.path.dirname(os.path.realpath(__file__))
+ontology = holmes.Ontology(os.sep.join(
+ (script_directory, 'test_ontology.owl')))
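+# Two Manager instances are shared across this module: coref_holmes_manager performs
+# coreference resolution (the default), while nocoref_holmes_manager below has it
+# disabled so that tests can compare behaviour with and without coreference.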
+coref_holmes_manager = holmes.Manager(model='de_core_news_lg',
+ ontology=ontology, number_of_workers=2)
+coref_holmes_manager.register_search_phrase("Ein Hund jagt eine Katze")
+coref_holmes_manager.register_search_phrase("Ein großes Pferd jagt eine Katze")
+coref_holmes_manager.register_search_phrase("Ein Tiger jagt eine kleine Katze")
+coref_holmes_manager.register_search_phrase("Ein großer Löwe jagt eine Katze")
+coref_holmes_manager.register_search_phrase("Ein ENTITYPER braucht Versicherung")
+coref_holmes_manager.register_search_phrase("Jemand versucht, zu erklären")
+coref_holmes_manager.register_search_phrase("ein müder Hund")
+coref_holmes_manager.register_search_phrase("Ein Gepard jagt einen Gepard")
+coref_holmes_manager.register_search_phrase("Ein Leopard jagt einen Leopard")
+
+coref_holmes_manager.register_search_phrase("Ein Urlaub ist schwer zu finden")
+coref_holmes_manager.register_search_phrase("Jemand liebt einen Elefanten")
+coref_holmes_manager.register_search_phrase("Jemand folgt einem Elefanten der Vergangenheit")
+coref_holmes_manager.register_search_phrase("Ein verkaufter Urlaub")
+coref_holmes_manager.register_search_phrase("Eine große Firma hat Probleme")
+nocoref_holmes_manager = holmes.Manager(model='de_core_news_lg',
+ perform_coreference_resolution=False,
+ number_of_workers=1)
+nocoref_holmes_manager.register_search_phrase("Ein Hund jagt eine Katze")
+
+
+class CoreferenceGermanMatchingTest(unittest.TestCase):
+
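+ # Helper assertion: the word match at word_match_index must point at the expected
+ # document token index and extracted word and, when subword_index is given, at the
+ # expected subword within that token.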
+ def _check_word_match(self, match, word_match_index, document_token_index, extracted_word,
+ subword_index=None):
+ word_match = match['word_matches'][word_match_index]
+ self.assertEqual(word_match['document_token_index'], document_token_index)
+ self.assertEqual(word_match['extracted_word'], extracted_word)
+ if subword_index is not None:
+ self.assertEqual(word_match['document_subword_index'], subword_index)
+
+ def test_simple_pronoun_coreference_same_sentence(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah einen Hund, und er jagte eine Katze.")
+ matches = coref_holmes_manager.match()
+ self._check_word_match(matches[0], 0, 3, 'Hund')
+ self._check_word_match(matches[0], 1, 7, 'jagen')
+ self._check_word_match(matches[0], 2, 9, 'Katze')
+
+ def test_perform_coreference_resolution_false(self):
+ nocoref_holmes_manager.remove_all_documents()
+ nocoref_holmes_manager.parse_and_register_document(
+ "Ich sah einen Hund, und er jagte eine Katze.")
+ matches = nocoref_holmes_manager.match()
+ self.assertEqual(len(matches), 0)
+
+ def test_simple_pronoun_coreference_same_sentence_wrong_structure(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah einen Hund und er wurde von einer Katze gejagt.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 0)
+
+ def test_simple_pronoun_coreference_same_sentence_plural_antecedent(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah Hunde und sie jagten eine Katze.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 0, 2, 'hund')
+
+ def test_simple_pronoun_coreference_same_sentence_conjunction_in_antecedent_both_match(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah einen Hund und einen Hund, und die jagten eine Katze.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 0, 3, 'Hund')
+ self._check_word_match(matches[1], 0, 6, 'Hund')
+
+ def test_simple_pronoun_coreference_same_sentence_conjunction_in_antecedent_left_matches(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah einen Hund und ein Pferd, und sie jagten eine Katze.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 0, 3, 'Hund')
+
+ def test_simple_pronoun_coreference_same_sentence_conjunction_in_antecedent_right_matches(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah ein Pferd und einen Hund, und die jagten eine Katze.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 0, 6, 'Hund')
+
+ def test_simple_pronoun_coreference_same_sentence_conjunction_pronouns_both_match(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich redete mit Peter Müller und Jana Müller, während er und sie Versicherung brauchten.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 0, 4, 'Peter Müller')
+ self._check_word_match(matches[1], 0, 7, 'Jana Müller')
+
+ def test_simple_pronoun_coreference_same_sentence_conjunction_lefthand_is_pronoun(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich redete mit Peter Müller, während er und Jana Müller Versicherung brauchten.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 0, 4, 'Peter Müller')
+ self._check_word_match(matches[1], 0, 10, 'Jana Müller')
+
+ def test_simple_pronoun_coreference_same_sentence_conjunction_righthand_is_pronoun(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "I redete mit Jana Müller, während Peter Müller und sie Versicherung brauchten.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 0, 8, 'Peter Müller')
+ self._check_word_match(matches[1], 0, 4, 'Jana Müller')
+
+ def test_simple_pronoun_coreference_same_sentence_conjunction_righthand_noun_not_match(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich redete mit Peter Müller, während er und ein Pferd Versicherung brauchten.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 0, 4, 'Peter Müller')
+
+ def test_simple_pronoun_coreference_diff_sentence(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah eine Katze. Ein Hund jagte sie.")
+ matches = coref_holmes_manager.match()
+ self._check_word_match(matches[0], 0, 6, 'Hund')
+ self._check_word_match(matches[0], 1, 7, 'jagen')
+ self._check_word_match(matches[0], 2, 3, 'Katze')
+
+ def test_simple_pronoun_coreference_diff_sentence_wrong_structure(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah einen Hund. Er wurde durch eine Katze gejagt.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 0)
+
+ def test_simple_pronoun_coreference_diff_sentence_plural_antecedent(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah Katzen. Sie wurden durch einen Hund gejagt.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 2, 2, 'katze')
+
+ def test_simple_pronoun_coreference_diff_sentence_conjunction_in_antecedent_both_match(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah eine Katze und eine Katze. Ein Hund hat sie gejagt.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 2, 3, 'Katze')
+ self._check_word_match(matches[1], 2, 6, 'Katze')
+
+ def test_simple_pronoun_coreference_diff_sentence_conjunction_in_antecedent_left_matches(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah eine Katze und ein Pferd. Ein Hund hat sie gejagt.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 2, 3, 'Katze')
+
+ def test_simple_pronoun_coreference_diff_sentence_conjunction_in_antecedent_right_matches(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah ein Pferd und eine Katze. Ein Hund hat sie gejagt")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 2, 6, 'Katze')
+
+ def test_pronoun_coreferent_has_dependency_same_sentence(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah ein großes Pferd und es jagte eine Katze.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 0, 3, 'groß')
+ self._check_word_match(matches[0], 1, 4, 'Pferd')
+
+ def test_pronoun_coreferents_with_dependency_conjunction_same_sentence_both_match(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah ein großes Pferd und ein großes Pferd und sie jagten eine Katze.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 0, 3, 'groß')
+ self._check_word_match(matches[0], 1, 4, 'Pferd')
+ self._check_word_match(matches[1], 0, 7, 'groß')
+ self._check_word_match(matches[1], 1, 8, 'Pferd')
+
+ def test_noun_coreferent_has_dependency_same_sentence(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah ein großes Pferd, und das Pferd jagte eine Katze.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 0, 3, 'groß')
+ self._check_word_match(matches[0], 1, 8, 'Pferd')
+
+ def test_pronoun_coreferent_has_dependency_three_sentences(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ich sah ein Pferd. Es jagte eine Katze. Es war groß")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 0, 12, 'groß')
+ self._check_word_match(matches[0], 1, 3, 'Pferd')
+
+ def test_reflexive_pronoun_coreferent(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Der Gepard jagte sich")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 0, 1, 'Gepard')
+ self._check_word_match(matches[0], 2, 1, 'Gepard')
+
+ def test_reflexive_pronoun_coreferents_with_conjunction_same_noun(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Der Gepard und der Gepard jagten sich")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 4)
+ self._check_word_match(matches[0], 0, 1, 'Gepard')
+ self._check_word_match(matches[0], 2, 1, 'Gepard')
+ self._check_word_match(matches[1], 0, 4, 'Gepard')
+ self._check_word_match(matches[1], 2, 1, 'Gepard')
+ self._check_word_match(matches[2], 0, 1, 'Gepard')
+ self._check_word_match(matches[2], 2, 4, 'Gepard')
+ self._check_word_match(matches[3], 0, 4, 'Gepard')
+ self._check_word_match(matches[3], 2, 4, 'Gepard')
+
+ def test_reflexive_pronoun_coreferents_with_conjunction_diff_noun(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Der Gepard und der Leopard jagten sich")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 0, 1, 'Gepard')
+ self._check_word_match(matches[0], 2, 1, 'Gepard')
+ self._check_word_match(matches[1], 0, 4, 'Leopard')
+ self._check_word_match(matches[1], 2, 4, 'Leopard')
+
+ def test_repeated_noun(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Wir sahen einen großes Hund. Der Hund jagte eine Katze")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 0, 7, 'Hund')
+
+ def test_repeated_noun_match_both_mentions(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Wir sahen einen müden Hund. Der Hund jagte einen Esel")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 0, 3, 'müde')
+ self._check_word_match(matches[0], 1, 4, 'Hund')
+ self._check_word_match(matches[1], 0, 3, 'müde')
+ self._check_word_match(matches[1], 1, 7, 'Hund')
+
+ def test_mentions_following_structural_match(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Ein großes Pferd jagte eine Katze. Das Pferd war glücklich.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 0, 1, 'groß')
+ self._check_word_match(matches[0], 1, 2, 'Pferd')
+
+ def test_adjective_verb_phrase_as_search_phrase_matches_simple(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ """Wir besprachen einen Urlaub. Er war sehr schwer zu finden.""")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self.assertFalse(matches[0]['uncertain'])
+
+ def test_coreference_and_derivation(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ """Ich wollte eine Erklärung. Der Nachbar hat sie versucht.""")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'derivation')
+
+ def test_coreference_and_last_subword_matched_simple(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ """Es gab einen Riesenelefanten. Alle liebten ihn.""")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 1, 3, 'elefant', 1)
+
+ def test_coreference_and_last_subword_matched_compound(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ """Es gab einen Riesenelefanten und einen zweiten Riesenelefanten. Alle liebten sie.""")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 1, 3, 'elefant', 1)
+ self._check_word_match(matches[1], 1, 7, 'elefant', 1)
+
+ def test_coreference_and_last_subword_and_previous_subword_matched_simple(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ """Es gab einen Vergangenheitselefanten. Alle folgten ihm.""")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 1, 3, 'elefant', 1)
+
+ def test_coreference_and_last_subword_and_previous_subword_matched_compound(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ """Es gab einen Vergangenheitselefanten und einen zweiten Vergangenheitselefanten. Alle folgten ihnen.""")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 1, 3, 'elefant', 1)
+ self._check_word_match(matches[1], 1, 7, 'elefant', 1)
+
+ def test_coreference_and_last_subword_and_reverse_dependency_simple(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ """Es gab einen Versicherungsurlaub. Jemand verkaufte ihn.""")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 1, 3, 'urlaub', 1)
+
+ def test_coreference_and_last_subword_and_reverse_dependency_compound(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ """Es gab einen Versicherungsurlaub und einen Versicherungsurlaub. Jemand verkaufte sie.""")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self._check_word_match(matches[0], 1, 3, 'urlaub', 1)
+ self._check_word_match(matches[1], 1, 6, 'urlaub', 1)
+
+ def test_different_extracted_word_not_in_ontology_with_pronoun(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Wir besprachen Peters GmbH. Die große Firma hatte Schwierigkeiten. Sie hatte Probleme.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 1, 6, 'Peters')
+
+ def test_different_extracted_word_not_in_ontology_without_pronoun(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Wir besprachen Peters GmbH. Die große Firma hatte Probleme.")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self._check_word_match(matches[0], 1, 6, 'Peters')
diff --git a/holmes_extractor/tests/de/test_supervised_topic_classification_DE.py b/tests/de/test_supervised_topic_classification_DE.py
similarity index 92%
rename from holmes_extractor/tests/de/test_supervised_topic_classification_DE.py
rename to tests/de/test_supervised_topic_classification_DE.py
index 9efeae5..f90cbcb 100644
--- a/holmes_extractor/tests/de/test_supervised_topic_classification_DE.py
+++ b/tests/de/test_supervised_topic_classification_DE.py
@@ -1,8 +1,8 @@
import unittest
import holmes_extractor as holmes
-from holmes_extractor.extensive_matching import SupervisedTopicClassifier
+from holmes_extractor.classification import SupervisedTopicClassifier
-holmes_manager = holmes.Manager('de_core_news_md')
+holmes_manager = holmes.Manager('de_core_news_lg', number_of_workers=1)
class GermanSupervisedTopicClassificationTest(unittest.TestCase):
@@ -16,11 +16,11 @@ def test_get_labels_to_classification_frequencies_direct_matching(self):
sttb.prepare()
freq = sttb.labels_to_classification_frequencies
self.assertEqual(freq['verb-nom: jagd-löw'], {'Tiere': 1})
- self.assertEqual(freq['verb-acc: jagd-tigern'], {'Tiere': 1})
+ self.assertEqual(freq['verb-acc: jagd-tiger'], {'Tiere': 1})
self.assertEqual(
- freq['verb-acc: jagd-tigern/verb-nom: jagd-löw'], {'Tiere': 1})
+ freq['verb-acc: jagd-tiger/verb-nom: jagd-löw'], {'Tiere': 1})
self.assertEqual(freq['word: löw'], {'Tiere': 1})
- self.assertEqual(freq['word: tigern'], {'Tiere': 1})
+ self.assertEqual(freq['word: tiger'], {'Tiere': 1})
def test_linked_matching_common_dependent(self):
sttb = holmes_manager.get_supervised_topic_training_basis(
@@ -33,13 +33,13 @@ def test_linked_matching_common_dependent(self):
# for some reason spaCy does not resolve 'isst' and 'frisst' to the infinitive forms
self.assertEqual(freq['verb-nom: isst-löw'], {'Tiere': 1})
self.assertEqual(freq['verb-nom: frisst-löw'], {'Tiere': 1})
- self.assertEqual(freq['verb-acc: frisst-tigern'], {'Tiere': 1})
- self.assertEqual(freq['verb-acc: frisst-tigern/verb-nom: frisst-löw'],
+ self.assertEqual(freq['verb-acc: frisst-tiger'], {'Tiere': 1})
+ self.assertEqual(freq['verb-acc: frisst-tiger/verb-nom: frisst-löw'],
{'Tiere': 1})
self.assertEqual(freq['verb-nom: frisst-löw/verb-nom: isst-löw'],
{'Tiere': 1})
self.assertEqual(freq['word: löw'], {'Tiere': 1})
- self.assertEqual(freq['word: tigern'], {'Tiere': 1})
+ self.assertEqual(freq['word: tiger'], {'Tiere': 1})
def test_linked_matching_common_dependent_control(self):
sttb = holmes_manager.get_supervised_topic_training_basis(
@@ -107,17 +107,17 @@ def test_get_labels_to_classification_frequencies_direct_matching_with_subwords(
sttb.prepare()
freq = sttb.labels_to_classification_frequencies
self.assertEqual(freq['verb-nom: jagd-löw'], {'Tiere': 1})
- self.assertEqual(freq['verb-acc: jagd-tigern'], {'Tiere': 1})
+ self.assertEqual(freq['verb-acc: jagd-tiger'], {'Tiere': 1})
self.assertEqual(
- freq['verb-acc: jagd-tigern/verb-nom: jagd-löw'], {'Tiere': 1})
- self.assertEqual(freq['word: informationslöw'], {'Tiere': 1})
+ freq['verb-acc: jagd-tiger/verb-nom: jagd-löw'], {'Tiere': 1})
+ self.assertEqual(freq['word: informationslöwe'], {'Tiere': 1})
self.assertEqual(freq['word: informationstiger'], {'Tiere': 1})
self.assertEqual(freq['intcompound: löw-information'], {'Tiere': 1})
- self.assertEqual(freq['intcompound: tigern-information'], {'Tiere': 1})
+ self.assertEqual(freq['intcompound: tiger-information'], {'Tiere': 1})
self.assertEqual(
freq['intcompound: löw-information/verb-nom: jagd-löw'], {'Tiere': 1})
self.assertEqual(
- freq['intcompound: tigern-information/verb-acc: jagd-tigern'], {'Tiere': 1})
+ freq['intcompound: tiger-information/verb-acc: jagd-tiger'], {'Tiere': 1})
def test_get_labels_to_classification_frequencies_direct_matching_with_subwords_and_conjunction_of_verb(self):
sttb = holmes_manager.get_supervised_topic_training_basis(
@@ -129,19 +129,19 @@ def test_get_labels_to_classification_frequencies_direct_matching_with_subwords_
freq = sttb.labels_to_classification_frequencies
self.assertEqual(freq['verb-nom: jagd-löw'], {'Tiere': 1})
self.assertEqual(freq['verb-nom: tragen-löw'], {'Tiere': 1})
- self.assertEqual(freq['verb-acc: tragen-tigern'], {'Tiere': 1})
+ self.assertEqual(freq['verb-acc: tragen-tiger'], {'Tiere': 1})
self.assertEqual(
- freq['verb-acc: tragen-tigern/verb-nom: tragen-löw'], {'Tiere': 1})
+ freq['verb-acc: tragen-tiger/verb-nom: tragen-löw'], {'Tiere': 1})
self.assertEqual(freq['word: informationslöw'], {'Tiere': 1})
self.assertEqual(freq['word: informationstiger'], {'Tiere': 1})
self.assertEqual(freq['intcompound: löw-information'], {'Tiere': 1})
- self.assertEqual(freq['intcompound: tigern-information'], {'Tiere': 1})
+ self.assertEqual(freq['intcompound: tiger-information'], {'Tiere': 1})
self.assertEqual(
freq['intcompound: löw-information/verb-nom: jagd-löw'], {'Tiere': 1})
self.assertEqual(
freq['intcompound: löw-information/verb-nom: tragen-löw'], {'Tiere': 1})
self.assertEqual(
- freq['intcompound: tigern-information/verb-acc: tragen-tigern'], {'Tiere': 1})
+ freq['intcompound: tiger-information/verb-acc: tragen-tiger'], {'Tiere': 1})
def test_get_labels_to_classification_frequencies_with_front_subword_conjunction(self):
sttb = holmes_manager.get_supervised_topic_training_basis(
@@ -157,7 +157,7 @@ def test_get_labels_to_classification_frequencies_with_front_subword_conjunction
self.assertEqual(freq['intcompound: löw-maßnahm'], {'Tiere': 1})
self.assertEqual(freq['intcompound: löw-raket'], {'Tiere': 1})
self.assertEqual(freq['verb-nom: fressen-löw'], {'Tiere': 1})
- self.assertEqual(freq['word: raketenlöw'], {'Tiere': 1})
+ self.assertEqual(freq['word: raketenlöwe'], {'Tiere': 1})
self.assertEqual(
freq['intcompound: extraktion-information/intcompound: maßnahm-extraktion'], {'Tiere': 1})
self.assertEqual(
@@ -271,7 +271,7 @@ def _test_whole_scenario(self, oneshot):
'verb-acc: benutzen-maus')
# should not have any effect because the supervised topic objects have their own
# StructuralMatcher instance
- self.assertEqual(list(trainer._sorted_label_dict.keys()),
+ self.assertEqual(list(trainer.sorted_label_dict.keys()),
['intcompound: hund-plüsch', 'intcompound: hund-plüsch/verb-nom: jagd-hund',
'verb-acc: benutzen-maus',
'verb-acc: benutzen-maus/verb-nom: benutzen-programmierer', 'verb-acc: jagd-hund',
@@ -293,7 +293,7 @@ def _test_whole_scenario(self, oneshot):
0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0,
0.0, 0.0, 0.0]],
- trainer._input_matrix.toarray().tolist())
+ trainer.input_matrix.toarray().tolist())
else:
self.assertEqual([
[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
@@ -304,10 +304,10 @@ def _test_whole_scenario(self, oneshot):
0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0,
0.0, 0.0, 0.0]],
- trainer._input_matrix.toarray().tolist())
+ trainer.input_matrix.toarray().tolist())
self.assertEqual([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]],
- trainer._output_matrix.toarray().tolist())
+ trainer.output_matrix.toarray().tolist())
self.assertEqual((22, 15, 8), trainer._hidden_layer_sizes)
stc = trainer.classifier()
self.assertEqual(stc.parse_and_classify(
@@ -321,7 +321,7 @@ def _test_whole_scenario(self, oneshot):
serialized_supervised_topic_classifier_model = stc.serialize_model()
stc2 = holmes_manager.deserialize_supervised_topic_classifier(
serialized_supervised_topic_classifier_model)
- self.assertEqual(list(stc2._model.sorted_label_dict.keys()),
+ self.assertEqual(list(stc2.model.sorted_label_dict.keys()),
['intcompound: hund-plüsch', 'intcompound: hund-plüsch/verb-nom: jagd-hund',
'verb-acc: benutzen-maus',
'verb-acc: benutzen-maus/verb-nom: benutzen-programmierer', 'verb-acc: jagd-hund',
@@ -376,22 +376,22 @@ def test_filtering(self):
"Ein Programmierer schreibt Python", 'IT', 'i2')
sttb.prepare()
trainer = sttb.train(minimum_occurrences=2, cv_threshold=0.0)
- self.assertEqual(list(trainer._sorted_label_dict.keys()),
+ self.assertEqual(list(trainer.sorted_label_dict.keys()),
['verb-acc: benutzen-maus',
'verb-acc: benutzen-maus/verb-nom: benutzen-programmierer',
'verb-nom: benutzen-programmierer', 'verb-nom: jagd-katz',
'word: hund', 'word: katz', 'word: maus', 'word: programmierer'])
- self.assertEqual(set(map(lambda phr: phr.label, trainer._phraselet_infos)),
+ self.assertEqual(set(map(lambda phr: phr.label, trainer.phraselet_infos)),
{'verb-acc: benutzen-maus',
'verb-nom: benutzen-programmierer', 'verb-nom: jagd-katz',
'word: hund', 'word: katz', 'word: maus', 'word: programmierer'})
trainer2 = sttb.train(minimum_occurrences=2, cv_threshold=1)
- self.assertEqual(list(trainer2._sorted_label_dict.keys()),
+ self.assertEqual(list(trainer2.sorted_label_dict.keys()),
['verb-acc: benutzen-maus',
'verb-acc: benutzen-maus/verb-nom: benutzen-programmierer',
'verb-nom: benutzen-programmierer', 'verb-nom: jagd-katz',
'word: hund', 'word: katz', 'word: programmierer'])
- self.assertEqual(set(map(lambda phr: phr.label, trainer2._phraselet_infos)),
+ self.assertEqual(set(map(lambda phr: phr.label, trainer2.phraselet_infos)),
{'verb-acc: benutzen-maus',
'verb-nom: benutzen-programmierer', 'verb-nom: jagd-katz',
'word: hund', 'word: katz', 'word: programmierer'})
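Taken together, the calls exercised above form the end-to-end supervised classification flow. The following is a minimal sketch rather than a verbatim excerpt from this patch: the no-argument training-basis constructor, the parse_and_register_training_document signature (text, classification, document label) and the toy training corpus are assumptions based on the test context, and the train() thresholds are loosened so the toy corpus survives filtering.

    import holmes_extractor as holmes

    manager = holmes.Manager('de_core_news_lg', number_of_workers=1)
    sttb = manager.get_supervised_topic_training_basis()
    # assumed registration signature: (training text, classification, document label)
    sttb.parse_and_register_training_document("Ein Programmierer schreibt Python", 'IT', 'i1')
    sttb.parse_and_register_training_document("Der Löwe jagt den Tiger", 'Tiere', 't1')
    sttb.prepare()
    # thresholds loosened for the toy corpus; the tests above pass minimum_occurrences=2
    trainer = sttb.train(minimum_occurrences=1, cv_threshold=0.0)
    stc = trainer.classifier()
    print(stc.parse_and_classify("Ein Programmierer benutzt eine Maus"))
    # the trained classifier can be serialized and restored through the manager
    serialized = stc.serialize_model()
    stc2 = manager.deserialize_supervised_topic_classifier(serialized)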
diff --git a/holmes_extractor/tests/de/test_topic_matching_DE.py b/tests/de/test_topic_matching_DE.py
similarity index 62%
rename from holmes_extractor/tests/de/test_topic_matching_DE.py
rename to tests/de/test_topic_matching_DE.py
index 97355e0..3f7f13c 100644
--- a/holmes_extractor/tests/de/test_topic_matching_DE.py
+++ b/tests/de/test_topic_matching_DE.py
@@ -5,20 +5,22 @@
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
(script_directory, 'test_ontology.owl')))
-holmes_manager = holmes.Manager('de_core_news_md', ontology=ontology)
-holmes_manager_with_embeddings = holmes.Manager('de_core_news_md',
- overall_similarity_threshold=0.65)
-
+holmes_manager = holmes.Manager('de_core_news_lg', ontology=ontology,
+ number_of_workers=1)
class GermanTopicMatchingTest(unittest.TestCase):
- def _check_equals(self, text_to_match, document_text, highest_score, manager=holmes_manager):
+ def _check_equals(self, text_to_match, document_text, highest_score, manager=holmes_manager,
+ word_embedding_match_threshold=1.0):
manager.remove_all_documents()
manager.parse_and_register_document(document_text)
topic_matches = manager.topic_match_documents_against(text_to_match,
+ word_embedding_match_threshold=word_embedding_match_threshold,
relation_score=20, reverse_only_relation_score=15,
- single_word_score=10, single_word_any_tag_score=5)
- self.assertEqual(int(topic_matches[0].score), highest_score)
+ single_word_score=10, single_word_any_tag_score=5,
+ different_match_cutoff_score=10)
+ self.assertEqual(int(topic_matches[0]['score']), highest_score)
def test_direct_matching(self):
self._check_equals("Eine Pflanze wächst", "Eine Pflanze wächst", 34)
@@ -27,7 +29,7 @@ def test_direct_matching_nonsense_word(self):
self._check_equals("Ein Gegwghg wächst", "Ein Gegwghg wächst", 34)
def test_entity_matching(self):
- self._check_equals("Ein ENTITYPER singt", "Peter singt", 34)
+ self._check_equals("Ein ENTITYPER singt", "Richard singt", 34)
def test_entitynoun_matching(self):
self._check_equals("Ein ENTITYNOUN singt", "Ein Vogel singt", 25)
@@ -44,67 +46,67 @@ def test_matching_no_change_from_template_words(self):
def test_reverse_only_parent_lemma_aux_threeway(self):
self._check_equals("Der Esel hat ein Dach", "Der Esel hat ein Dach", 68,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_aux_twoway(self):
self._check_equals("Der Esel hat ein Dach", "Der Esel hat ein Haus", 29,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_aux_auxiliary_threeway(self):
self._check_equals("Der Esel hat ein Dach", "Der Esel wird ein Dach haben", 69,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_aux_auxiliary_twoway(self):
self._check_equals("Der Esel hat ein Dach", "Der Esel wird ein Haus haben", 29,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_aux_modal_threeway(self):
self._check_equals("Der Esel hat ein Dach", "Der Esel soll ein Dach haben", 69,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_aux_modal_twoway(self):
self._check_equals("Der Esel hat ein Dach", "Der Esel soll ein Haus haben", 29,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_verb_threeway(self):
self._check_equals("Der Esel macht ein Dach", "Der Esel macht ein Dach", 68,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_verb_twoway(self):
self._check_equals("Der Esel macht ein Dach", "Der Esel macht ein Haus", 29,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_threeway_control(self):
self._check_equals("Der Esel malt ein Dach an", "Der Esel malt ein Dach an", 82,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_twoway_control_no_embedding_based_match(self):
self._check_equals("Der Esel malt ein Dach an", "Der Esel malt eine Maus an", 34,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_be(self):
self._check_equals("Ein Präsident ist ein Politiker", "Ein Präsident ist ein Politiker", 68,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_be_reversed(self):
self._check_equals("Ein Präsident ist ein Politiker", "Ein Politiker ist ein Präsident", 24,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_become(self):
self._check_equals("Ein Präsident wird ein Politiker", "Ein Präsident wird ein Politiker", 68,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_become_reversed(self):
- self._check_equals("Ein Präsident wird ein Politiker", "Ein Politiker wird ein Präsident", 24,
- holmes_manager_with_embeddings)
+ self._check_equals("Ein Präsident wird ein Politiker", "Ein Politiker wird ein Präsident", 39,
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_only_parent_lemma_aux_in_document(self):
self._check_equals("Ein Esel hat ein Dach", "Ein Esel hat ein Dach gesehen", 24,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_matching_noun(self):
- self._check_equals("Ein König mit einem Land", "Ein Präsident mit einem Land", 49,
- holmes_manager_with_embeddings)
+ self._check_equals("Ein König mit einem Land", "Ein Präsident mit einem Land", 48,
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_reverse_matching_noun_control_no_embeddings(self):
self._check_equals("Ein König mit einem Land", "Ein Präsident mit einem Land", 29,
@@ -157,7 +159,7 @@ def test_multiword_in_text_to_search_dependent_words_in_document_root(self):
def test_double_match(self):
self._check_equals("vier Ochsen und sechs Ochsen",
"vier Ochsen", 34,
- holmes_manager_with_embeddings)
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_separate_words_in_text_to_match_subwords_in_document_text_with_fugen_s(self):
self._check_equals("Die Extraktion der Information",
@@ -191,22 +193,22 @@ def test_subwords_in_text_to_match_subwords_in_document_text_without_fugen_s(sel
def test_subwords_in_text_to_match_subwords_in_document_text_lemmatization_failed(self):
self._check_equals("Mozartsymphonien",
- "Mozartsymphonie", 30,
+ "Mozartsymphonie", 20,
holmes_manager)
def test_subwords_conjunction_in_text_to_match(self):
self._check_equals("Mozart- und Beethovensymphonie",
- "Mozartsymphonie", 30,
+ "Mozartsymphonie", 20,
holmes_manager)
def test_subwords_conjunction_in_document_text(self):
self._check_equals("Mozartsymphonie",
- "Mozart- und Beethovensymphonie", 30,
+ "Mozart- und Beethovensymphonie", 20,
holmes_manager)
def test_subwords_conjunction_in_text_to_match_and_document_text(self):
self._check_equals("Mozart- und Mahlersymphonie",
- "Mozart- und Beethovensymphonie", 30,
+ "Mozart- und Beethovensymphonie", 20,
holmes_manager)
def test_subword_matches_verbal_expression(self):
@@ -244,8 +246,8 @@ def test_single_word_matches_word_with_subwords_linked_via_ontology_control(self
def test_embedding_matching_with_subwords(self):
self._check_equals("Eine Königsabdanken",
- "Der Prinz dankte ab", 14,
- holmes_manager_with_embeddings)
+ "Der Prinz dankte ab", 15,
+ holmes_manager, word_embedding_match_threshold=0.42)
def test_embedding_matching_with_subwords_control(self):
self._check_equals("Eine Königsabdanken",
@@ -272,18 +274,24 @@ def test_derivation_in_subwords_2(self):
self._check_equals("Informierung wird extrahiert",
"Informationsextraktion", 35)
+ def test_reverse_derivation_1(self):
+ self._check_equals("Wohnungsverkauf",
+ "eine verkaufte Wohnung", 29)
+
+ def test_reverse_derivation_2(self):
+ self._check_equals("eine verkaufte Wohnung",
+ "Wohnungsverkauf", 35)
+
def test_indexes(self):
holmes_manager.remove_all_documents()
holmes_manager.parse_and_register_document(
"Dies ist ein irrelevanter Satz. Ich glaube, dass eine Pflanze wächst.")
topic_matches = holmes_manager.topic_match_documents_against(
"Eine Pflanze wächst")
- self.assertEqual(topic_matches[0].sentences_start_index, 6)
- self.assertEqual(topic_matches[0].sentences_end_index, 13)
- self.assertEqual(topic_matches[0].start_index, 11)
- self.assertEqual(topic_matches[0].end_index, 12)
- self.assertEqual(topic_matches[0].relative_start_index, 5)
- self.assertEqual(topic_matches[0].relative_end_index, 6)
+ self.assertEqual(topic_matches[0]['sentences_start_index'], 6)
+ self.assertEqual(topic_matches[0]['sentences_end_index'], 13)
+ self.assertEqual(topic_matches[0]['start_index'], 11)
+ self.assertEqual(topic_matches[0]['end_index'], 12)
def test_same_index_different_documents(self):
holmes_manager.remove_all_documents()
@@ -294,12 +302,12 @@ def test_same_index_different_documents(self):
topic_matches = holmes_manager.topic_match_documents_against(
"Eine Pflanze wächst")
self.assertEqual(len(topic_matches), 2)
- self.assertEqual(topic_matches[0].document_label, '1')
- self.assertEqual(topic_matches[1].document_label, '2')
- self.assertEqual(topic_matches[0].start_index, 1)
- self.assertEqual(topic_matches[0].end_index, 2)
- self.assertEqual(topic_matches[1].start_index, 1)
- self.assertEqual(topic_matches[1].end_index, 2)
+ self.assertEqual(topic_matches[0]['document_label'], '1')
+ self.assertEqual(topic_matches[1]['document_label'], '2')
+ self.assertEqual(topic_matches[0]['start_index'], 1)
+ self.assertEqual(topic_matches[0]['end_index'], 2)
+ self.assertEqual(topic_matches[1]['start_index'], 1)
+ self.assertEqual(topic_matches[1]['end_index'], 2)
def test_suppressed_relation_matching_picked_up_during_reverse_matching_subwords(self):
holmes_manager.remove_all_documents()
@@ -308,53 +316,56 @@ def test_suppressed_relation_matching_picked_up_during_reverse_matching_subwords
topic_matches = holmes_manager.topic_match_documents_against("Das Königabdanken",
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 29)
+ different_match_cutoff_score=10)
+ self.assertEqual(int(topic_matches[0]['score']), 29)
def test_reverse_matching_suppressed_with_embedding_reverse_matching_parent(self):
- holmes_manager_with_embeddings.remove_all_documents()
- holmes_manager_with_embeddings.parse_and_register_document(
- "Der Prinz dankte ab")
- topic_matches = holmes_manager_with_embeddings.topic_match_documents_against(
- "Das Königsabdanken",
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "Der Prinz dankte ab. Jemand dankte ab. Jemand dankte ab.")
+ topic_matches = holmes_manager.topic_match_documents_against(
+ "Das Königsabdanken", word_embedding_match_threshold=0.42,
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
- single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 5)
+ single_word_any_tag_score=5, different_match_cutoff_score=10,
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 5)
def test_reverse_matching_suppressed_with_embedding_reverse_matching_parent_control(self):
- holmes_manager_with_embeddings.remove_all_documents()
- holmes_manager_with_embeddings.parse_and_register_document(
- "Der Prinz dankte ab")
- topic_matches = holmes_manager_with_embeddings.topic_match_documents_against(
- "Das Königsabdanken",
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "Der Prinz dankte ab. Jemand dankte ab. Jemand dankte ab.")
+ topic_matches = holmes_manager.topic_match_documents_against(
+ "Das Königsabdanken", word_embedding_match_threshold=0.42,
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
- single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_embedding_matching=1)
- self.assertEqual(int(topic_matches[0].score), 14)
+ single_word_any_tag_score=5, different_match_cutoff_score=10,
+ relation_matching_frequency_threshold=0.0, embedding_matching_frequency_threshold=0.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 15)
def test_reverse_matching_suppressed_with_embedding_reverse_matching_child(self):
- holmes_manager_with_embeddings.remove_all_documents()
- holmes_manager_with_embeddings.parse_and_register_document(
- "Der König vom Abdanken")
- topic_matches = holmes_manager_with_embeddings.topic_match_documents_against(
- "Die Abdankenprinzen",
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "Der König vom Abdanken. Das Abdanken. Das Abdanken.")
+ topic_matches = holmes_manager.topic_match_documents_against(
+ "Die Abdankenprinzen", word_embedding_match_threshold=0.42,
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
- single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 5)
+ single_word_any_tag_score=5, different_match_cutoff_score=10,
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 5)
def test_reverse_matching_suppressed_with_embedding_reverse_matching_child_control(self):
- holmes_manager_with_embeddings.remove_all_documents()
- holmes_manager_with_embeddings.parse_and_register_document(
- "Der König vom Abdanken")
- topic_matches = holmes_manager_with_embeddings.topic_match_documents_against(
- "Die Abdankenprinzen",
+ holmes_manager.remove_all_documents()
+ holmes_manager.parse_and_register_document(
+ "Der König vom Abdanken. Das Abdanken. Das Abdanken.")
+ topic_matches = holmes_manager.topic_match_documents_against(
+ "Die Abdankenprinzen", word_embedding_match_threshold=0.42,
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
- single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_embedding_matching=1)
- self.assertEqual(int(topic_matches[0].score), 14)
+ single_word_any_tag_score=5, different_match_cutoff_score=10,
+ relation_matching_frequency_threshold=0.0, embedding_matching_frequency_threshold=0.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 14)
def test_disjunct_relation_mapping_within_subword_dictionaries(self):
holmes_manager.remove_all_documents()
@@ -362,10 +373,10 @@ def test_disjunct_relation_mapping_within_subword_dictionaries(self):
holmes_manager.parse_and_register_document(
"Informationssymphonieentführung von Löwen")
topic_match_dictionaries = \
- holmes_manager.topic_match_documents_returning_dictionaries_against(
+ holmes_manager.topic_match_documents_against(
"Symphonie von Information und Entführung von Löwen")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Informationssymphonieentführung von Löwen', 'text_to_match': 'Symphonie von Information und Entführung von Löwen', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 41, 'score': 78.0, 'word_infos': [[0, 11, 'relation', False, "Matches INFORMATION directly."], [12, 21, 'relation', False, "Matches SYMPHONIE directly."], [21, 31, 'relation', False, "Matches ENTFÜHRUNG directly."], [36, 41, 'relation', True, "Matches LÖWE directly."]]}])
+ [{'document_label': '', 'text': 'Informationssymphonieentführung von Löwen', 'text_to_match': 'Symphonie von Information und Entführung von Löwen', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 0, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 2, 'sentences_character_start_index': 0, 'sentences_character_end_index': 41, 'score': 780.0, 'word_infos': [[0, 11, 'relation', False, 'Matches INFORMATION directly.'], [12, 21, 'relation', False, 'Matches SYMPHONIE directly.'], [21, 31, 'relation', False, 'Matches ENTFÜHRUNG directly.'], [36, 41, 'relation', True, 'Matches LÖWE directly.']], 'answers': []}])
def test_overlapping_relation_mapping_within_subword_dictionaries(self):
holmes_manager.remove_all_documents()
@@ -373,11 +384,10 @@ def test_overlapping_relation_mapping_within_subword_dictionaries(self):
holmes_manager.parse_and_register_document(
"Informationsextraktion von Löwen")
topic_match_dictionaries = \
- holmes_manager.topic_match_documents_returning_dictionaries_against(
+ holmes_manager.topic_match_documents_against(
"Extraktion von Information und Löwen")
-
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Informationsextraktion von Löwen', 'text_to_match': 'Extraktion von Information und Löwen', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 32, 'score': 102.33333333333334, 'word_infos': [[0, 11, 'overlapping_relation', False, "Matches INFORMATION directly."], [12, 22, 'overlapping_relation', False, "Matches EXTRAKTION directly."], [27, 32, 'overlapping_relation', True, "Matches LÖWE directly."]]}])
+ [{'document_label': '', 'text': 'Informationsextraktion von Löwen', 'text_to_match': 'Extraktion von Information und Löwen', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 0, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 2, 'sentences_character_start_index': 0, 'sentences_character_end_index': 32, 'score': 1023.3333333333334, 'word_infos': [[0, 11, 'overlapping_relation', False, 'Matches INFORMATION directly.'], [12, 22, 'overlapping_relation', False, 'Matches EXTRAKTION directly.'], [27, 32, 'overlapping_relation', True, 'Matches LÖWE directly.']], 'answers': []}])
def test_subword_dictionaries_subword_is_not_peak(self):
holmes_manager.remove_all_documents()
@@ -385,11 +395,10 @@ def test_subword_dictionaries_subword_is_not_peak(self):
holmes_manager.parse_and_register_document(
"Information und Löwen wurden genommen")
topic_match_dictionaries = \
- holmes_manager.topic_match_documents_returning_dictionaries_against(
+ holmes_manager.topic_match_documents_against(
"Informationsnehmen der Löwen")
-
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Information und Löwen wurden genommen', 'text_to_match': 'Informationsnehmen der Löwen', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 37, 'score': 98.75999999999999, 'word_infos': [[0, 11, 'overlapping_relation', False, "Matches INFORMATION directly."], [16, 21, 'overlapping_relation', False, "Matches LÖWE directly."], [29, 37, 'overlapping_relation', True, "Matches NEHMEN directly."]]}])
+ [{'document_label': '', 'text': 'Information und Löwen wurden genommen', 'text_to_match': 'Informationsnehmen der Löwen', 'rank': '1', 'index_within_document': 4, 'subword_index': None, 'start_index': 0, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 37, 'score': 987.6, 'word_infos': [[0, 11, 'overlapping_relation', False, 'Matches INFORMATION directly.'], [16, 21, 'overlapping_relation', False, 'Matches LÖWE directly.'], [29, 37, 'overlapping_relation', True, 'Matches NEHMEN directly.']], 'answers': []}])
def test_subword_conjunction_within_dictionaries_single_word_hyphen_first_word(self):
holmes_manager.remove_all_documents()
@@ -397,11 +406,10 @@ def test_subword_conjunction_within_dictionaries_single_word_hyphen_first_word(s
holmes_manager.parse_and_register_document(
"Informations- und Informationsextraktion")
topic_match_dictionaries = \
- holmes_manager.topic_match_documents_returning_dictionaries_against(
+ holmes_manager.topic_match_documents_against(
"Extraktion")
-
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Informations- und Informationsextraktion', 'text_to_match': 'Extraktion', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 40, 'score': 5.0, 'word_infos': [[30, 40, 'single', True, 'Matches EXTRAKTION directly.']]}])
+ [{'document_label': '', 'text': 'Informations- und Informationsextraktion', 'text_to_match': 'Extraktion', 'rank': '1', 'index_within_document': 0, 'subword_index': 1, 'start_index': 0, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 2, 'sentences_character_start_index': 0, 'sentences_character_end_index': 40, 'score': 50.0, 'word_infos': [[30, 40, 'single', True, 'Matches EXTRAKTION directly.']], 'answers': []}])
def test_subword_conjunction_within_dictionaries_single_word_hyphen_second_word(self):
holmes_manager.remove_all_documents()
@@ -409,10 +417,10 @@ def test_subword_conjunction_within_dictionaries_single_word_hyphen_second_word(
holmes_manager.parse_and_register_document(
"Informationsextraktion und -extraktion")
topic_match_dictionaries = \
- holmes_manager.topic_match_documents_returning_dictionaries_against(
+ holmes_manager.topic_match_documents_against(
"Information")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Informationsextraktion und -extraktion', 'text_to_match': 'Information', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 38, 'score': 5.0, 'word_infos': [[0, 11, 'single', True, 'Matches INFORMATION directly.']]}])
+ [{'document_label': '', 'text': 'Informationsextraktion und -extraktion', 'text_to_match': 'Information', 'rank': '1', 'index_within_document': 0, 'subword_index': 0, 'start_index': 0, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 2, 'sentences_character_start_index': 0, 'sentences_character_end_index': 38, 'score': 50.0, 'word_infos': [[0, 11, 'single', True, 'Matches INFORMATION directly.']], 'answers': []}])
def test_subword_conjunction_within_dictionaries_relation_hyphen_first_word(self):
holmes_manager.remove_all_documents()
@@ -420,10 +428,10 @@ def test_subword_conjunction_within_dictionaries_relation_hyphen_first_word(self
holmes_manager.parse_and_register_document(
"Ein Königs- und Prinzenabdanken")
topic_match_dictionaries = \
- holmes_manager.topic_match_documents_returning_dictionaries_against(
+ holmes_manager.topic_match_documents_against(
"Das Abdanken eines Königs")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Ein Königs- und Prinzenabdanken', 'text_to_match': 'Das Abdanken eines Königs', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 31, 'score': 40.0, 'word_infos': [[4, 9, 'relation', False, 'Matches KÖNIG directly.'], [23, 31, 'relation', True, 'Matches ABDANKEN directly.']]}])
+ [{'document_label': '', 'text': 'Ein Königs- und Prinzenabdanken', 'text_to_match': 'Das Abdanken eines Königs', 'rank': '1', 'index_within_document': 1, 'subword_index': 1, 'start_index': 1, 'end_index': 3, 'sentences_start_index': 0, 'sentences_end_index': 3, 'sentences_character_start_index': 0, 'sentences_character_end_index': 31, 'score': 400.0, 'word_infos': [[4, 9, 'relation', False, 'Matches KÖNIG directly.'], [23, 31, 'relation', True, 'Matches ABDANKEN directly.']], 'answers': []}])
def test_subword_conjunction_within_dictionaries_relation_hyphen_second_word(self):
holmes_manager.remove_all_documents()
@@ -431,7 +439,16 @@ def test_subword_conjunction_within_dictionaries_relation_hyphen_second_word(sel
holmes_manager.parse_and_register_document(
"Ein Königsabdanken und -prinz")
topic_match_dictionaries = \
- holmes_manager.topic_match_documents_returning_dictionaries_against(
- "Das Prinz eines Königs")
+ holmes_manager.topic_match_documents_against(
+ "Der Prinz eines Königs")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Ein Königsabdanken und -prinz', 'text_to_match': 'Das Prinz eines Königs', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 29, 'score': 40.0, 'word_infos': [[4, 9, 'relation', False, 'Matches KÖNIG directly.'], [24, 29, 'relation', True, 'Matches PRINZ directly.']]}])
+ [{'document_label': '', 'text': 'Ein Königsabdanken und -prinz', 'text_to_match': 'Der Prinz eines Königs', 'rank': '1', 'index_within_document': 3, 'subword_index': 1, 'start_index': 1, 'end_index': 3, 'sentences_start_index': 0, 'sentences_end_index': 3, 'sentences_character_start_index': 0, 'sentences_character_end_index': 29, 'score': 400.0, 'word_infos': [[4, 9, 'relation', False, 'Matches KÖNIG directly.'], [24, 29, 'relation', True, 'Matches PRINZ directly.']], 'answers': []}])
+
+ def test_no_error(self):
+ holmes_manager.remove_all_documents()
+ holmes_manager.remove_all_search_phrases()
+ holmes_manager.parse_and_register_document(
+ "Ein Krankenhaus für demenzkranke Menschen")
+ topic_match_dictionaries = \
+ holmes_manager.topic_match_documents_against(
+ "Mein Kind ist krank")
diff --git a/holmes_extractor/tests/en/test_ontology.owl b/tests/en/test_ontology.owl
similarity index 92%
rename from holmes_extractor/tests/en/test_ontology.owl
rename to tests/en/test_ontology.owl
index bd89213..701cb74 100644
--- a/holmes_extractor/tests/en/test_ontology.owl
+++ b/tests/en/test_ontology.owl
@@ -7,10 +7,10 @@
xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
diff --git a/holmes_extractor/tests/en/test_phraselet_production_EN.py b/tests/en/test_phraselet_production_EN.py
similarity index 71%
rename from holmes_extractor/tests/en/test_phraselet_production_EN.py
rename to tests/en/test_phraselet_production_EN.py
index 7f5bd46..06d6f5b 100644
--- a/holmes_extractor/tests/en/test_phraselet_production_EN.py
+++ b/tests/en/test_phraselet_production_EN.py
@@ -5,55 +5,68 @@
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
(script_directory, 'test_ontology.owl')))
-ontology_holmes_manager = holmes.Manager(model='en_core_web_lg',
- perform_coreference_resolution=False, ontology=ontology)
-ontology_holmes_manager_adm_false = holmes.Manager(model='en_core_web_lg',
+ontology_holmes_manager = holmes.Manager(model='en_core_web_trf',
+ perform_coreference_resolution=False, ontology=ontology,
+ number_of_workers=1)
+ontology_holmes_manager_adm_false = holmes.Manager(model='en_core_web_trf',
perform_coreference_resolution=False, ontology=ontology,
- analyze_derivational_morphology=False)
+ analyze_derivational_morphology=False,
+ number_of_workers=1)
symmetric_ontology = holmes.Ontology(os.sep.join((script_directory, 'test_ontology.owl')),
symmetric_matching=True)
-symmetric_ontology_nocoref_holmes_manager = holmes.Manager(model='en_core_web_lg',
- ontology=symmetric_ontology, perform_coreference_resolution=False)
-no_ontology_coref_holmes_manager = holmes.Manager(model='en_core_web_lg',
- perform_coreference_resolution=True)
+symmetric_ontology_nocoref_holmes_manager = holmes.Manager(model='en_core_web_trf',
+ ontology=symmetric_ontology, perform_coreference_resolution=False,
+ number_of_workers=1)
+no_ontology_coref_holmes_manager = holmes.Manager(model='en_core_web_trf',
+ perform_coreference_resolution=True,
+ number_of_workers=1)
class EnglishPhraseletProductionTest(unittest.TestCase):
def _check_equals(self, manager, text_to_match, phraselet_labels,
replace_with_hypernym_ancestors=True, match_all_words=False,
- include_reverse_only=False):
+ include_reverse_only=False, process_initial_question_words=False):
manager.remove_all_search_phrases()
doc = manager.semantic_analyzer.parse(text_to_match)
phraselet_labels_to_phraselet_infos = {}
- manager.structural_matcher.add_phraselets_to_dict(doc,
+ manager.linguistic_object_factory.add_phraselets_to_dict(doc,
phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
replace_with_hypernym_ancestors=replace_with_hypernym_ancestors,
match_all_words=match_all_words,
ignore_relation_phraselets=False,
include_reverse_only=include_reverse_only,
- stop_lemmas=manager.semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=manager.semantic_analyzer.
- topic_matching_reverse_only_parent_lemmas)
+ stop_lemmas=manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ stop_tags=manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=manager.semantic_matching_helper.
+ topic_matching_reverse_only_parent_lemmas,
+ words_to_corpus_frequencies=None,
+ maximum_corpus_frequency=None,
+ process_initial_question_words=process_initial_question_words)
self.assertEqual(
set(phraselet_labels_to_phraselet_infos.keys()),
set(phraselet_labels))
self.assertEqual(len(phraselet_labels_to_phraselet_infos.keys()),
len(phraselet_labels))
- def _get_phraselet_dict(self, manager, text_to_match):
+ def _get_phraselet_dict(self, manager, text_to_match, words_to_corpus_frequencies=None,
+ maximum_corpus_frequency=None):
manager.remove_all_search_phrases()
doc = manager.semantic_analyzer.parse(text_to_match)
phraselet_labels_to_phraselet_infos = {}
- manager.structural_matcher.add_phraselets_to_dict(doc,
+ manager.linguistic_object_factory.add_phraselets_to_dict(doc,
phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
replace_with_hypernym_ancestors=False,
match_all_words=True,
ignore_relation_phraselets=False,
include_reverse_only=True,
- stop_lemmas=manager.semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=manager.semantic_analyzer.
- topic_matching_reverse_only_parent_lemmas)
+ stop_lemmas=manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ stop_tags=manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=manager.semantic_matching_helper.
+ topic_matching_reverse_only_parent_lemmas,
+ words_to_corpus_frequencies=words_to_corpus_frequencies,
+ maximum_corpus_frequency=maximum_corpus_frequency,
+ process_initial_question_words=False)
return phraselet_labels_to_phraselet_infos
def test_verb_subject_no_entry_in_ontology(self):
@@ -76,7 +89,7 @@ def test_verb_direct_object_no_entry_in_ontology(self):
def test_verb_indirect_object_no_entry_in_ontology(self):
self._check_equals(ontology_holmes_manager, "Somebody gives something to a plant",
- ['predicate-recipient: give-plant', 'word: plant'])
+ ['predicate-recipient: gift-plant', 'word: plant'])
def test_noun_adjective_no_entry_in_ontology(self):
self._check_equals(ontology_holmes_manager, "A healthy plant",
@@ -251,12 +264,23 @@ def test_phraselet_stop_words_governed_suppressed(self):
"So he did it at home", ['word: home'],
include_reverse_only=False)
+ def test_question_word(self):
+ self._check_equals(no_ontology_coref_holmes_manager,
+ "Who opened the door?",
+ ['head-WHsubj: open-who', 'predicate-patient: open-door', 'word: door'],
+ process_initial_question_words=True)
+
+ def test_question_word_control(self):
+ self._check_equals(no_ontology_coref_holmes_manager,
+ "Who opened the door?", ['predicate-patient: open-door', 'word: door'],
+ process_initial_question_words=False)
+
def test_coref_and_phraselet_labels(self):
no_ontology_coref_holmes_manager.remove_all_search_phrases()
doc = no_ontology_coref_holmes_manager.semantic_analyzer.parse(
"I saw a dog. He was chasing a cat and a cat")
phraselet_labels_to_phraselet_infos = {}
- no_ontology_coref_holmes_manager.structural_matcher.add_phraselets_to_dict(
+ no_ontology_coref_holmes_manager.linguistic_object_factory.add_phraselets_to_dict(
doc,
phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
replace_with_hypernym_ancestors=False,
@@ -264,9 +288,14 @@ def test_coref_and_phraselet_labels(self):
include_reverse_only=False,
ignore_relation_phraselets=False,
stop_lemmas=no_ontology_coref_holmes_manager.
- semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=no_ontology_coref_holmes_manager.semantic_analyzer.
- topic_matching_reverse_only_parent_lemmas)
+ semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ stop_tags=no_ontology_coref_holmes_manager.
+ semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=no_ontology_coref_holmes_manager.semantic_matching_helper.
+ topic_matching_reverse_only_parent_lemmas,
+ words_to_corpus_frequencies=None,
+ maximum_corpus_frequency=None,
+ process_initial_question_words=False)
self.assertEqual(set(
phraselet_labels_to_phraselet_infos.keys()),
set(['predicate-patient: see-dog', 'predicate-actor: chase-dog',
@@ -305,12 +334,6 @@ def test_ontology_defined_multiword_not_match_all_words_with_adjective(self):
['governor-adjective: mimi momo-big',
'word: mimi momo'], False, False)
- def test_ontology_and_entity_defined_multiword_not_match_all_words_with_adjective(self):
- self._check_equals(ontology_holmes_manager,
- "The big Richard Mimi Momo",
- ['governor-adjective: richard mimi momo-big',
- 'word: richard mimi momo'], False, False)
-
def test_entity_defined_multiword_match_all_words(self):
self._check_equals(no_ontology_coref_holmes_manager,
"Richard Paul Hudson came",
@@ -367,7 +390,7 @@ def test_noun_lemmas_preferred_control(self):
word_phraselet = dict['word: anonymity']
self.assertEqual(word_phraselet.parent_lemma, 'anonymous')
self.assertEqual(word_phraselet.parent_derived_lemma, 'anonymity')
- relation_phraselet = dict['governor-adjective: use-anonymity']
+ relation_phraselet = dict['predicate-patient: use-anonymity']
self.assertEqual(relation_phraselet.child_lemma, 'anonymous')
self.assertEqual(relation_phraselet.child_derived_lemma, 'anonymity')
@@ -429,7 +452,7 @@ def test_shorter_lemmas_preferred_control(self):
self.assertEqual(relation_phraselet.child_lemma, 'behaviour')
self.assertEqual(relation_phraselet.child_derived_lemma, 'behave')
- def test_reverse_derived_lemmas_in_ontology_one_lemma(self):
+ def test_reverse_derived_lemmas_in_ontology_one_lemma_1(self):
dict = self._get_phraselet_dict(ontology_holmes_manager,
"He ate moodily")
self.assertFalse('word: moody' in dict)
@@ -441,7 +464,7 @@ def test_reverse_derived_lemmas_in_ontology_one_lemma(self):
self.assertEqual(relation_phraselet.child_lemma, 'moodily')
self.assertEqual(relation_phraselet.child_derived_lemma, 'moodiness')
- def test_reverse_derived_lemmas_in_ontology_one_lemma(self):
+ def test_reverse_derived_lemmas_in_ontology_one_lemma_2(self):
dict = self._get_phraselet_dict(ontology_holmes_manager,
"He offended the cat")
self.assertFalse('word: offend' in dict)
@@ -454,42 +477,134 @@ def test_reverse_derived_lemmas_in_ontology_one_lemma(self):
self.assertEqual(relation_phraselet.parent_derived_lemma, 'offence')
doc = ontology_holmes_manager.semantic_analyzer.parse(
'He took offense')
- ontology_holmes_manager.structural_matcher.add_phraselets_to_dict(doc,
+ ontology_holmes_manager.linguistic_object_factory.add_phraselets_to_dict(doc,
phraselet_labels_to_phraselet_infos=dict,
replace_with_hypernym_ancestors=False,
match_all_words=True,
ignore_relation_phraselets=False,
include_reverse_only=True,
- stop_lemmas=ontology_holmes_manager.semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=ontology_holmes_manager.semantic_analyzer.
- topic_matching_reverse_only_parent_lemmas)
+ stop_lemmas=ontology_holmes_manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ stop_tags=ontology_holmes_manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=ontology_holmes_manager.semantic_matching_helper.
+ topic_matching_reverse_only_parent_lemmas,
+ words_to_corpus_frequencies=None,
+ maximum_corpus_frequency=None,
+ process_initial_question_words=False)
word_phraselet = dict['word: offence']
self.assertEqual(word_phraselet.parent_lemma, 'offense')
self.assertEqual(word_phraselet.parent_derived_lemma, 'offence')
doc = ontology_holmes_manager.semantic_analyzer.parse(
'He took offence')
- ontology_holmes_manager.structural_matcher.add_phraselets_to_dict(doc,
+ ontology_holmes_manager.linguistic_object_factory.add_phraselets_to_dict(doc,
phraselet_labels_to_phraselet_infos=dict,
replace_with_hypernym_ancestors=False,
match_all_words=True,
ignore_relation_phraselets=False,
include_reverse_only=True,
- stop_lemmas=ontology_holmes_manager.semantic_analyzer.topic_matching_phraselet_stop_lemmas,
- reverse_only_parent_lemmas=ontology_holmes_manager.semantic_analyzer.
- topic_matching_reverse_only_parent_lemmas)
+ stop_lemmas=ontology_holmes_manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
+ stop_tags=ontology_holmes_manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
+ reverse_only_parent_lemmas=ontology_holmes_manager.semantic_matching_helper.
+ topic_matching_reverse_only_parent_lemmas,
+ words_to_corpus_frequencies=None,
+ maximum_corpus_frequency=None,
+ process_initial_question_words=False)
word_phraselet = dict['word: offence']
self.assertEqual(word_phraselet.parent_lemma, 'offense')
self.assertEqual(word_phraselet.parent_derived_lemma, 'offence')
def test_reverse_derived_lemmas_in_ontology_multiword(self):
dict = self._get_phraselet_dict(ontology_holmes_manager,
- "He used a vault horse")
- self.assertFalse('word: vault horse' in dict)
- self.assertFalse('predicate-patient: use-vault horse' in dict)
- word_phraselet = dict['word: vaulting horse']
- self.assertEqual(word_phraselet.parent_lemma, 'vaulting horse')
- self.assertEqual(word_phraselet.parent_derived_lemma, 'vaulting horse')
- relation_phraselet = dict['predicate-patient: use-vaulting horse']
- self.assertEqual(relation_phraselet.child_lemma, 'vaulting horse')
+ "He used a waste horse")
+ self.assertFalse('word: waste horse' in dict)
+ self.assertFalse('predicate-patient: use-waste horse' in dict)
+ word_phraselet = dict['word: wastage horse']
+ self.assertEqual(word_phraselet.parent_lemma, 'wastage horse')
+ self.assertEqual(word_phraselet.parent_derived_lemma, 'wastage horse')
+ relation_phraselet = dict['predicate-patient: use-wastage horse']
+ self.assertEqual(relation_phraselet.child_lemma, 'wastage horse')
self.assertEqual(
- relation_phraselet.child_derived_lemma, 'vaulting horse')
+ relation_phraselet.child_derived_lemma, 'wastage horse')
+
+ def test_frequency_factors_small(self):
+ dict = self._get_phraselet_dict(ontology_holmes_manager,
+ "The dog chased the cat",
+ words_to_corpus_frequencies={'dog': 1, 'chasing': 1, 'cat': 2}, maximum_corpus_frequency=5)
+ dog_phraselet = dict['word: dog']
+ self.assertEqual(str(dog_phraselet.frequency_factor), '1.0')
+ cat_phraselet = dict['word: cat']
+ self.assertEqual(str(cat_phraselet.frequency_factor), '1.0')
+ chase_phraselet = dict['word: chasing']
+ self.assertEqual(str(chase_phraselet.frequency_factor), '1.0')
+ chase_dog_phraselet = dict['predicate-actor: chasing-dog']
+ self.assertEqual(str(chase_dog_phraselet.frequency_factor), '1.0')
+ chase_cat_phraselet = dict['predicate-patient: chasing-cat']
+ self.assertEqual(str(chase_cat_phraselet.frequency_factor), '1.0')
+
+ def test_frequency_factors_small_with_small_mcf(self):
+ dict = self._get_phraselet_dict(ontology_holmes_manager,
+ "The dog chased the cat",
+ words_to_corpus_frequencies={'dog': 1, 'chasing': 1, 'cat': 2}, maximum_corpus_frequency=2)
+ dog_phraselet = dict['word: dog']
+ self.assertEqual(str(dog_phraselet.frequency_factor), '1.0')
+ cat_phraselet = dict['word: cat']
+ self.assertEqual(str(cat_phraselet.frequency_factor), '1.0')
+ chase_phraselet = dict['word: chasing']
+ self.assertEqual(str(chase_phraselet.frequency_factor), '1.0')
+ chase_dog_phraselet = dict['predicate-actor: chasing-dog']
+ self.assertEqual(str(chase_dog_phraselet.frequency_factor), '1.0')
+ chase_cat_phraselet = dict['predicate-patient: chasing-cat']
+ self.assertEqual(str(chase_cat_phraselet.frequency_factor), '1.0')
+
+ def test_frequency_factors_large(self):
+ dict = self._get_phraselet_dict(ontology_holmes_manager,
+ "The dog chased the cat",
+ words_to_corpus_frequencies={'dog': 3, 'chasing': 4, 'cat': 5}, maximum_corpus_frequency=5)
+ dog_phraselet = dict['word: dog']
+ self.assertEqual(str(dog_phraselet.frequency_factor), '0.5693234419266069')
+ cat_phraselet = dict['word: cat']
+ self.assertEqual(str(cat_phraselet.frequency_factor), '0.1386468838532139')
+ chase_phraselet = dict['word: chasing']
+ self.assertEqual(str(chase_phraselet.frequency_factor), '0.31739380551401464')
+ chase_dog_phraselet = dict['predicate-actor: chasing-dog']
+ self.assertEqual(str(chase_dog_phraselet.frequency_factor), '0.18069973380142287')
+ chase_cat_phraselet = dict['predicate-patient: chasing-cat']
+ self.assertEqual(str(chase_cat_phraselet.frequency_factor), '0.044005662088831145')
+
+ def test_frequency_factors_large_with_ontology_match(self):
+ dict = self._get_phraselet_dict(ontology_holmes_manager,
+ "The dog chased the cat",
+ words_to_corpus_frequencies={'dog': 2, 'puppy': 4, 'chasing': 4, 'cat': 5}, maximum_corpus_frequency=5)
+ dog_phraselet = dict['word: dog']
+ self.assertEqual(str(dog_phraselet.frequency_factor), '0.31739380551401464')
+ cat_phraselet = dict['word: cat']
+ self.assertEqual(str(cat_phraselet.frequency_factor), '0.1386468838532139')
+ chase_phraselet = dict['word: chasing']
+ self.assertEqual(str(chase_phraselet.frequency_factor), '0.31739380551401464')
+ chase_dog_phraselet = dict['predicate-actor: chasing-dog']
+ self.assertEqual(str(chase_dog_phraselet.frequency_factor), '0.10073882777866815')
+ chase_cat_phraselet = dict['predicate-patient: chasing-cat']
+ self.assertEqual(str(chase_cat_phraselet.frequency_factor), '0.044005662088831145')
+
+ def test_frequency_factors_very_large(self):
+ dict = self._get_phraselet_dict(ontology_holmes_manager,
+ "The dog chased the cat",
+ words_to_corpus_frequencies={'dog': 97, 'chasing': 98, 'cat': 99}, maximum_corpus_frequency=100)
+ dog_phraselet = dict['word: dog']
+ self.assertEqual(str(dog_phraselet.frequency_factor), '0.008864383480215898')
+ cat_phraselet = dict['word: cat']
+ self.assertEqual(str(cat_phraselet.frequency_factor), '0.0043869621537525605')
+ chase_phraselet = dict['word: chasing']
+ self.assertEqual(str(chase_phraselet.frequency_factor), '0.00661413286687762')
+ chase_dog_phraselet = dict['predicate-actor: chasing-dog']
+ self.assertEqual(str(chase_dog_phraselet.frequency_factor), '5.863021012110299e-05')
+ chase_cat_phraselet = dict['predicate-patient: chasing-cat']
+ self.assertEqual(str(chase_cat_phraselet.frequency_factor), '2.9015950566883042e-05')
+
+ def test_ent_types(self):
+ dict = self._get_phraselet_dict(ontology_holmes_manager,
+ "The big Richard came home.")
+ self.assertEqual(dict['word: richard'].parent_ent_type, 'PERSON')
+ self.assertEqual(dict['predicate-actor: come-richard'].parent_ent_type, '')
+ self.assertEqual(dict['predicate-actor: come-richard'].child_ent_type, 'PERSON')
+ self.assertEqual(dict['governor-adjective: richard-big'].parent_ent_type, 'PERSON')
+ self.assertEqual(dict['governor-adjective: richard-big'].child_ent_type, '')
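The hunks above relocate phraselet production from structural_matcher to linguistic_object_factory and extend its keyword arguments with stop_tags, corpus-frequency inputs and initial question word handling. A minimal sketch mirroring the _get_phraselet_dict helper in this file; these are internal objects exercised by the tests rather than the public Manager API, and the example sentence is arbitrary.

    import holmes_extractor as holmes

    manager = holmes.Manager(model='en_core_web_trf', number_of_workers=1)
    doc = manager.semantic_analyzer.parse("The dog chased the cat")
    phraselet_infos = {}
    manager.linguistic_object_factory.add_phraselets_to_dict(
        doc,
        phraselet_labels_to_phraselet_infos=phraselet_infos,
        replace_with_hypernym_ancestors=False,
        match_all_words=True,
        ignore_relation_phraselets=False,
        include_reverse_only=True,
        stop_lemmas=manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
        stop_tags=manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
        reverse_only_parent_lemmas=manager.semantic_matching_helper.topic_matching_reverse_only_parent_lemmas,
        words_to_corpus_frequencies=None,
        maximum_corpus_frequency=None,
        process_initial_question_words=False)
    print(sorted(phraselet_infos.keys()))  # phraselet labels such as 'word: dog'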
diff --git a/tests/en/test_questions_EN.py b/tests/en/test_questions_EN.py
new file mode 100644
index 0000000..c94a182
--- /dev/null
+++ b/tests/en/test_questions_EN.py
@@ -0,0 +1,380 @@
+import unittest
+import holmes_extractor as holmes
+from holmes_extractor.topic_matching import TopicMatcher
+import os
+
+script_directory = os.path.dirname(os.path.realpath(__file__))
+ontology = holmes.Ontology(os.sep.join((script_directory, 'test_ontology.owl')),
+ symmetric_matching=True)
+manager = holmes.Manager(model='en_core_web_trf', ontology=ontology,
+ number_of_workers=1)
+
+class EnglishInitialQuestionsTest(unittest.TestCase):
+
+ def _check_equals(self, text_to_match, document_text, highest_score, answer_start, answer_end,
+ word_embedding_match_threshold=0.42, initial_question_word_embedding_match_threshold=0.42,
+ use_frequency_factor=True, initial_question_word_answer_score=40,
+ relation_matching_frequency_threshold=0.0, embedding_matching_frequency_threshold=0.0,
+ ):
+ manager.remove_all_documents()
+ manager.parse_and_register_document(document_text)
+ topic_matches = manager.topic_match_documents_against(text_to_match,
+ word_embedding_match_threshold=word_embedding_match_threshold,
+ initial_question_word_embedding_match_threshold=initial_question_word_embedding_match_threshold,
+ initial_question_word_answer_score=initial_question_word_answer_score,
+ relation_score=20,
+ reverse_only_relation_score=15, single_word_score=10, single_word_any_tag_score=5,
+ different_match_cutoff_score=10,
+ relation_matching_frequency_threshold=relation_matching_frequency_threshold,
+ embedding_matching_frequency_threshold=embedding_matching_frequency_threshold,
+ use_frequency_factor=use_frequency_factor)
+ self.assertEqual(int(topic_matches[0]['score']), highest_score)
+ if answer_start is not None:
+ self.assertEqual(topic_matches[0]['answers'][0][0], answer_start)
+ self.assertEqual(topic_matches[0]['answers'][0][1], answer_end)
+ else:
+ self.assertEqual(len(topic_matches[0]['answers']), 0)
+
+ def test_basic_matching(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("Richard and Peter sang a duet.", 'q')
+ manager.parse_and_register_document("A book sings an elogy", 'n')
+ topic_matches = manager.topic_match_documents_against("Who sings?")
+ self.assertEqual([{'document_label': 'q', 'text': 'Richard and Peter sang a duet.', 'text_to_match': 'Who sings?', 'rank': '1', 'index_within_document': 3, 'subword_index': None, 'start_index': 0, 'end_index': 3, 'sentences_start_index': 0, 'sentences_end_index': 6, 'sentences_character_start_index': 0, 'sentences_character_end_index': 30, 'score': 620.0, 'word_infos': [[0, 7, 'relation', False, 'Matches the question word WHO.'], [12, 17, 'relation', False, 'Matches the question word WHO.'], [18, 22, 'relation', True, 'Matches SING directly.']], 'answers': [[0, 7], [12, 17]]}, {'document_label': 'n', 'text': 'A book sings an elogy', 'text_to_match': 'Who sings?', 'rank': '2', 'index_within_document': 2, 'subword_index': None, 'start_index': 2, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 21, 'score': 20.0, 'word_infos': [[7, 12, 'single', True, 'Matches SING directly.']], 'answers': []}], topic_matches)
+
+ def test_ignore_questions(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("Richard and Peter sang a duet.", 'q')
+ manager.parse_and_register_document("A book sings an elogy", 'n')
+ topic_matches = manager.topic_match_documents_against("Who sings?", initial_question_word_behaviour='ignore')
+ self.assertEqual([{'document_label': 'q', 'text': 'Richard and Peter sang a duet.', 'text_to_match': 'Who sings?', 'rank': '1=', 'index_within_document': 3, 'subword_index': None, 'start_index': 3, 'end_index': 3, 'sentences_start_index': 0, 'sentences_end_index': 6, 'sentences_character_start_index': 0, 'sentences_character_end_index': 30, 'score': 20.0, 'word_infos': [[18, 22, 'single', True, 'Matches SING directly.']], 'answers': []}, {'document_label': 'n', 'text': 'A book sings an elogy', 'text_to_match': 'Who sings?', 'rank': '1=', 'index_within_document': 2, 'subword_index': None, 'start_index': 2, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 21, 'score': 20.0, 'word_infos': [[7, 12, 'single', True, 'Matches SING directly.']], 'answers': []}], topic_matches)
+
+ def test_exclusive_questions(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("Richard and Peter sang a duet.", 'q')
+ manager.parse_and_register_document("A book sings an elogy", 'n')
+ topic_matches = manager.topic_match_documents_against("Who sings?", initial_question_word_behaviour='exclusive')
+ self.assertEqual(len(topic_matches), 1)
+ self.assertEqual(topic_matches[0]['document_label'], 'q')
+
+ def test_governed_interrogative_pronoun_matching_common_noun(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("The man sang a duet.", 'q')
+ topic_matches = manager.topic_match_documents_against("Which person sings?",
+ initial_question_word_embedding_match_threshold=0.5)
+ self.assertEqual([{'document_label': 'q', 'text': 'The man sang a duet.', 'text_to_match': 'Which person sings?', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 1, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 5, 'sentences_character_start_index': 0, 'sentences_character_end_index': 20, 'score': 288.3671696, 'word_infos': [[4, 7, 'relation', False, 'Has a word embedding that is 55% similar to PERSON.'], [8, 12, 'relation', True, 'Matches SING directly.']], 'answers': [[0, 7]]}], topic_matches)
+ topic_matches = manager.topic_match_documents_against("A person sings", word_embedding_match_threshold=0.42)
+ self.assertEqual([{'document_label': 'q', 'text': 'The man sang a duet.', 'text_to_match': 'A person sings', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 1, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 5, 'sentences_character_start_index': 0, 'sentences_character_end_index': 20, 'score': 154.1835848, 'word_infos': [[4, 7, 'relation', False, 'Has a word embedding that is 55% similar to PERSON.'], [8, 12, 'relation', True, 'Matches SING directly.']], 'answers': []}], topic_matches)
+
+ def test_governed_interrogative_pronoun_matching_proper_noun(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("Richard Hudson sang a duet.", 'q')
+ topic_matches = manager.topic_match_documents_against("Which person sings?")
+ self.assertEqual([{'document_label': 'q', 'text': 'Richard Hudson sang a duet.', 'text_to_match': 'Which person sings?', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 0, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 5, 'sentences_character_start_index': 0, 'sentences_character_end_index': 27, 'score': 620.0, 'word_infos': [[0, 14, 'relation', False, 'Has an entity label that is 100% similar to the word embedding corresponding to PERSON.'], [15, 19, 'relation', True, 'Matches SING directly.']], 'answers': [[0, 14]]}], topic_matches)
+ topic_matches = manager.topic_match_documents_against("A person sings")
+ self.assertEqual([{'document_label': 'q', 'text': 'Richard Hudson sang a duet.', 'text_to_match': 'A person sings', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 0, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 5, 'sentences_character_start_index': 0, 'sentences_character_end_index': 27, 'score': 320.0, 'word_infos': [[0, 14, 'relation', False, 'Has an entity label that is 100% similar to the word embedding corresponding to PERSON.'], [15, 19, 'relation', True, 'Matches SING directly.']], 'answers': []}], topic_matches)
+
+ def test_basic_matching_with_coreference(self):
+ self._check_equals("Who came home?", 'I spoke to Richard. He came home', 98, 11, 18)
+
+ def test_basic_matching_with_coreference_and_coordination(self):
+ self._check_equals("Who came home?", 'I spoke to Richard Hudson and Peter Hudson. They came home', 98, 11, 25)
+
+ def test_governed_interrogative_pronoun_matching_direct(self):
+ self._check_equals('Which politician lied?', 'The politician lied', 54, 0, 14)
+
+ def test_governed_interrogative_pronoun_matching_direct_control(self):
+ self._check_equals('A politician lies', 'The politician lied', 34, None, None)
+
+ def test_governed_interrogative_pronoun_matching_derivation(self):
+ self._check_equals('Which performance by the boys was important?', 'The boys performed', 59, 0, 18)
+
+ def test_governed_interrogative_pronoun_matching_derivation_control(self):
+ self._check_equals('A performance by the boys is important', 'The boys performed', 39, None, None)
+
+ def test_governed_interrogative_pronoun_matching_ontology(self):
+ self._check_equals('Which animal woke up?', 'The cat woke up', 45, 0, 7)
+
+ def test_governed_interrogative_pronoun_matching_ontology_control(self):
+ self._check_equals('An animal woke up', 'The cat woke up', 29, None, None)
+
+ def test_governed_interrogative_pronoun_reverse_dependency(self):
+ self._check_equals('Which child did its parents adopt?', 'The adopted child', 54, 0, 17)
+
+ def test_governed_interrogative_pronoun_reverse_dependency_control(self):
+ self._check_equals('A child is adopted by its parents', 'The adopted child', 34, None, None)
+
+ def test_governed_interrogative_pronoun_with_coreference(self):
+ self._check_equals("Which person came home?", 'I spoke to Richard. He came home', 98, 11, 18)
+
+ def test_separate_embedding_threshold_for_question_words_normal_threshold_1(self):
+ self._check_equals("Which man came home?", 'The person came home', 52, 0, 10,
+ word_embedding_match_threshold=1.0, initial_question_word_answer_score=20)
+
+ def test_separate_embedding_threshold_for_question_words_normal_threshold_1_control(self):
+ self._check_equals("A man comes home", 'The person came home', 29, None, None,
+ word_embedding_match_threshold=1.0, initial_question_word_answer_score=20)
+
+ def test_separate_embedding_threshold_for_question_words_normal_threshold_below_1(self):
+ self._check_equals("Which man came home?", 'The person came home', 52, 0, 10,
+ word_embedding_match_threshold=0.9, initial_question_word_answer_score=20)
+
+ def test_separate_embedding_threshold_for_question_words_normal_threshold_below_1_control(self):
+ self._check_equals("A man comes home", 'The person came home', 29, None, None,
+ word_embedding_match_threshold=0.9, initial_question_word_answer_score=20)
+
+ def test_single_word_match_does_not_recognize_dependent_question_word(self):
+ self._check_equals("Which man?", 'The man', 10, None, None)
+
+ def test_single_word_match_with_dependent_question_word_control(self):
+ self._check_equals("The man?", 'The man', 10, None, None)
+
+ def test_no_relation_frequency_threshold_for_direct_question_words(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("Richard came. Come. Come.", 'q')
+ topic_matches = manager.topic_match_documents_against("What came?", relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0)
+ self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'Richard came. Come. Come.', 'text_to_match': 'What came?', 'rank': '1', 'index_within_document': 1, 'subword_index': None, 'start_index': 0, 'end_index': 5, 'sentences_start_index': 0, 'sentences_end_index': 6, 'sentences_character_start_index': 0, 'sentences_character_end_index': 25, 'score': 228.8235527856964, 'word_infos': [[0, 7, 'relation', False, 'Matches the question word WHAT.'], [8, 12, 'relation', True, 'Matches COME directly.'], [14, 18, 'single', False, 'Matches COME directly.'], [20, 24, 'single', False, 'Matches COME directly.']], 'answers': [[0, 7]]}])
+
+ def test_no_relation_frequency_threshold_for_direct_question_words_control(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("Richard came. Come. Come.", 'd')
+ topic_matches = manager.topic_match_documents_against("Richard came?", relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0)
+ self.assertEqual(topic_matches, [{'document_label': 'd', 'text': 'Richard came. Come. Come.', 'text_to_match': 'Richard came?', 'rank': '1', 'index_within_document': 1, 'subword_index': None, 'start_index': 0, 'end_index': 5, 'sentences_start_index': 0, 'sentences_end_index': 6, 'sentences_character_start_index': 0, 'sentences_character_end_index': 25, 'score': 167.43581219046695, 'word_infos': [[0, 7, 'relation', False, 'Matches RICHARD directly.'], [8, 12, 'relation', True, 'Matches COME directly.'], [14, 18, 'single', False, 'Matches COME directly.'], [20, 24, 'single', False, 'Matches COME directly.']], 'answers': []}])
+
+ def test_no_relation_frequency_threshold_for_governed_question_words(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("The dog barked. The dog barked. The dog barked.", 'q')
+ topic_matches = manager.topic_match_documents_against("Which dog barked?",
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0)
+ self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'The dog barked. The dog barked. The dog barked.', 'text_to_match': 'Which dog barked?', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 1, 'end_index': 10, 'sentences_start_index': 0, 'sentences_end_index': 11, 'sentences_character_start_index': 0, 'sentences_character_end_index': 47, 'score': 107.3165784983407, 'word_infos': [[4, 7, 'relation', False, 'Matches DOG directly.'], [8, 14, 'relation', True, 'Matches BARK directly.'], [20, 23, 'relation', False, 'Matches DOG directly.'], [24, 30, 'relation', False, 'Matches BARK directly.'], [36, 39, 'relation', False, 'Matches DOG directly.'], [40, 46, 'relation', False, 'Matches BARK directly.']], 'answers': [[0, 7], [16, 23], [32, 39]]}])
+
+ def test_no_relation_frequency_threshold_for_governed_question_words_control(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("The dog barked. The dog barked. The dog barked.", 'q')
+ topic_matches = manager.topic_match_documents_against("The dog barked?",
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0)
+ self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'The dog barked. The dog barked. The dog barked.', 'text_to_match': 'The dog barked?', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 1, 'end_index': 10, 'sentences_start_index': 0, 'sentences_end_index': 11, 'sentences_character_start_index': 0, 'sentences_character_end_index': 47, 'score': 25.58887041904562, 'word_infos': [[4, 7, 'single', False, 'Matches DOG directly.'], [8, 14, 'single', True, 'Matches BARK directly.'], [20, 23, 'single', False, 'Matches DOG directly.'], [24, 30, 'single', False, 'Matches BARK directly.'], [36, 39, 'single', False, 'Matches DOG directly.'], [40, 46, 'single', False, 'Matches BARK directly.']], 'answers': []}])
+
+ def test_no_reverse_relation_frequency_threshold_for_governed_question_words(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("in a house. in a house. in a house.", 'q')
+ topic_matches = manager.topic_match_documents_against("In which house?",
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0)
+ self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'in a house. in a house. in a house.', 'text_to_match': 'In which house?', 'rank': '1', 'index_within_document': 4, 'subword_index': None, 'start_index': 0, 'end_index': 10, 'sentences_start_index': 0, 'sentences_end_index': 11, 'sentences_character_start_index': 0, 'sentences_character_end_index': 35, 'score': 107.07053166738835, 'word_infos': [[0, 2, 'relation', False, 'Matches IN directly.'], [5, 10, 'relation', False, 'Matches HOUSE directly.'], [12, 14, 'relation', True, 'Matches IN directly.'], [17, 22, 'relation', False, 'Matches HOUSE directly.'], [24, 26, 'relation', False, 'Matches IN directly.'], [29, 34, 'relation', False, 'Matches HOUSE directly.']], 'answers': [[3, 10], [15, 22], [27, 34]]}])
+
+ def test_no_reverse_relation_frequency_threshold_for_governed_question_words_control(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("in a house. in a house. in a house.", 'q')
+ topic_matches = manager.topic_match_documents_against("In a house",
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0)
+ self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'in a house. in a house. in a house.', 'text_to_match': 'In a house', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 0, 'end_index': 10, 'sentences_start_index': 0, 'sentences_end_index': 11, 'sentences_character_start_index': 0, 'sentences_character_end_index': 35, 'score': 25.638079785236094, 'word_infos': [[0, 2, 'single', False, 'Matches IN directly.'], [5, 10, 'single', True, 'Matches HOUSE directly.'], [12, 14, 'single', False, 'Matches IN directly.'], [17, 22, 'single', False, 'Matches HOUSE directly.'], [24, 26, 'single', False, 'Matches IN directly.'], [29, 34, 'single', False, 'Matches HOUSE directly.']], 'answers': []}])
+
+ def test_no_embedding_frequency_threshold_for_governed_question_words_on_child(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("The dog barked. The dog barked. The dog barked.", 'q')
+ topic_matches = manager.topic_match_documents_against("Which cat barked?",
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0,
+ initial_question_word_embedding_match_threshold=0.2, word_embedding_match_threshold=0.2)
+ self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'The dog barked. The dog barked. The dog barked.', 'text_to_match': 'Which cat barked?', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 1, 'end_index': 10, 'sentences_start_index': 0, 'sentences_end_index': 11, 'sentences_character_start_index': 0, 'sentences_character_end_index': 47, 'score': 126.34484701824243, 'word_infos': [[4, 7, 'relation', False, 'Has a word embedding that is 80% similar to CAT.'], [8, 14, 'relation', True, 'Matches BARK directly.'], [20, 23, 'relation', False, 'Has a word embedding that is 80% similar to CAT.'], [24, 30, 'relation', False, 'Matches BARK directly.'], [36, 39, 'relation', False, 'Has a word embedding that is 80% similar to CAT.'], [40, 46, 'relation', False, 'Matches BARK directly.']], 'answers': [[0, 7], [16, 23], [32, 39]]}])
+
+ def test_no_embedding_frequency_threshold_for_governed_question_words_on_child_control(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("The dog barked. The dog barked. The dog barked.", 'q')
+ topic_matches = manager.topic_match_documents_against("The cat barked?",
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0,
+ initial_question_word_embedding_match_threshold=0.2, word_embedding_match_threshold=0.2)
+ self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'The dog barked.', 'text_to_match': 'The cat barked?', 'rank': '1=', 'index_within_document': 2, 'subword_index': None, 'start_index': 2, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 3, 'sentences_character_start_index': 0, 'sentences_character_end_index': 15, 'score': 7.381404928570852, 'word_infos': [[8, 14, 'single', True, 'Matches BARK directly.']], 'answers': []}, {'document_label': 'q', 'text': 'The dog barked.', 'text_to_match': 'The cat barked?', 'rank': '1=', 'index_within_document': 6, 'subword_index': None, 'start_index': 6, 'end_index': 6, 'sentences_start_index': 4, 'sentences_end_index': 7, 'sentences_character_start_index': 16, 'sentences_character_end_index': 31, 'score': 7.381404928570852, 'word_infos': [[8, 14, 'single', True, 'Matches BARK directly.']], 'answers': []}, {'document_label': 'q', 'text': 'The dog barked.', 'text_to_match': 'The cat barked?', 'rank': '1=', 'index_within_document': 10, 'subword_index': None, 'start_index': 10, 'end_index': 10, 'sentences_start_index': 8, 'sentences_end_index': 11, 'sentences_character_start_index': 32, 'sentences_character_end_index': 47, 'score': 7.381404928570852, 'word_infos': [[8, 14, 'single', True, 'Matches BARK directly.']], 'answers': []}])
+
+ def test_no_embedding_frequency_threshold_for_governed_question_words_on_parent(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("A big dog. A big dog. A big dog.", 'q')
+ topic_matches = manager.topic_match_documents_against("Which big cat?",
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0,
+ initial_question_word_embedding_match_threshold=0.2, word_embedding_match_threshold=0.2)
+ self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'A big dog. A big dog. A big dog.', 'text_to_match': 'Which big cat?', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 1, 'end_index': 10, 'sentences_start_index': 0, 'sentences_end_index': 11, 'sentences_character_start_index': 0, 'sentences_character_end_index': 32, 'score': 126.24642828586148, 'word_infos': [[2, 5, 'relation', False, 'Matches BIG directly.'], [6, 9, 'relation', True, 'Has a word embedding that is 80% similar to CAT.'], [13, 16, 'relation', False, 'Matches BIG directly.'], [17, 20, 'relation', False, 'Has a word embedding that is 80% similar to CAT.'], [24, 27, 'relation', False, 'Matches BIG directly.'], [28, 31, 'relation', False, 'Has a word embedding that is 80% similar to CAT.']], 'answers': [[0, 9], [11, 20], [22, 31]]}])
+
+ def test_no_embedding_frequency_threshold_for_governed_question_words_on_parent_control(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("A big dog. A big dog. A big dog.", 'q')
+ topic_matches = manager.topic_match_documents_against("The big cat?",
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0,
+ initial_question_word_embedding_match_threshold=0.2, word_embedding_match_threshold=0.2)
+ self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'A big dog.', 'text_to_match': 'The big cat?', 'rank': '1=', 'index_within_document': 1, 'subword_index': None, 'start_index': 1, 'end_index': 1, 'sentences_start_index': 0, 'sentences_end_index': 3, 'sentences_character_start_index': 0, 'sentences_character_end_index': 10, 'score': 7.381404928570852, 'word_infos': [[2, 5, 'single', True, 'Matches BIG directly.']], 'answers': []}, {'document_label': 'q', 'text': 'A big dog.', 'text_to_match': 'The big cat?', 'rank': '1=', 'index_within_document': 5, 'subword_index': None, 'start_index': 5, 'end_index': 5, 'sentences_start_index': 4, 'sentences_end_index': 7, 'sentences_character_start_index': 11, 'sentences_character_end_index': 21, 'score': 7.381404928570852, 'word_infos': [[2, 5, 'single', True, 'Matches BIG directly.']], 'answers': []}, {'document_label': 'q', 'text': 'A big dog.', 'text_to_match': 'The big cat?', 'rank': '1=', 'index_within_document': 9, 'subword_index': None, 'start_index': 9, 'end_index': 9, 'sentences_start_index': 8, 'sentences_end_index': 11, 'sentences_character_start_index': 22, 'sentences_character_end_index': 32, 'score': 7.381404928570852, 'word_infos': [[2, 5, 'single', True, 'Matches BIG directly.']], 'answers': []}])
+
+ def test_check_what_be_positive_case(self):
+ self._check_equals('What is this?', 'this is a house', 45, 8, 15)
+
+ def test_check_who_subj_positive_case(self):
+ self._check_equals('Who looked into the sun?', 'the man looked into the sun', 127, 0, 7)
+
+ def test_check_who_subj_question_in_second_sentence(self):
+ self._check_equals('Hello. Who looked into the sun?', 'the man looked into the sun', 70, None, None)
+
+ def test_check_who_subj_wrong_syntax(self):
+ self._check_equals('Who looked into the sun?', 'the sun looked into the man', 19, None, None)
+
+ def test_check_who_subj_wrong_noun(self):
+ self._check_equals('Who looked into the sun?', 'the dog looked into the sun', 70, None, None)
+
+ def test_check_who_obj_positive_case(self):
+ self._check_equals('Who did the building see?', 'the building saw its man', 104, 17, 24)
+
+ def test_check_who_obj_wrong_syntax(self):
+ self._check_equals('Who did the building see?', 'the building saw his dog', 34, None, None)
+
+ def test_check_who_prep_positive_case(self):
+ self._check_equals('who did the dog talk with', 'the dog talked with its man', 108, 20, 27)
+
+ def test_check_who_prep_at_beginning_positive_case(self):
+ self._check_equals('with whom did the dog talk', 'the dog talked with its man', 108, 20, 27)
+
+ def test_check_who_prep_control_no_question_word(self):
+ self._check_equals('a dog talks with a man', 'the dog talked with its man', 108, None, None)
+
+ def test_check_who_prep_control_wrong_prep(self):
+ self._check_equals('a dog talks about a man', 'the dog talked with its man', 81, None, None)
+
+ def test_check_who_prep_to_positive_case(self):
+ self._check_equals('who did the dog talk to', 'the dog talked to its man', 104, 18, 25)
+
+ def test_check_who_wrong_prep(self):
+ self._check_equals('who did the dog talk to', 'the dog talked with its man', 34, None, None)
+
+ def test_check_who_prep_to_control_no_question_word(self):
+ self._check_equals('a dog talks to a man', 'the dog talked to its man', 81, None, None)
+
+ def test_check_who_prep_by_positive_case(self):
+ self._check_equals('who did the dog swear by', 'the dog swore by its man', 104, 17, 24)
+
+ def test_check_who_prep_by_control_no_question_word(self):
+ self._check_equals('a dog swears by a man', 'the dog swore by its man', 81, None, None)
+
+ def test_check_who_prep_of_positive_case(self):
+ self._check_equals('who did the dog speak of', 'the dog spoke of its man', 104, 17, 24)
+
+ def test_check_who_prep_of_control_no_question_word(self):
+ self._check_equals('a dog speaks of a man', 'the dog spoke of its man', 81, None, None)
+
+ def test_check_who_masc_personal_pronoun(self):
+ self._check_equals('who spoke', 'There came a doctor. He spoke.', 45, 11, 19,
+ initial_question_word_embedding_match_threshold=1.0)
+
+ def test_check_who_fem_personal_pronoun(self):
+ self._check_equals('who spoke', 'There came a doctor. She spoke.', 45, 11, 19,
+ initial_question_word_embedding_match_threshold=1.0)
+
+ def test_check_who_masc_personal_pronoun_elsewhere_in_chain(self):
+ self._check_equals('who spoke', 'A doctor spoke. He was angry.', 45, 0, 8,
+ initial_question_word_embedding_match_threshold=1.0)
+
+ def test_check_who_fem_personal_pronoun_elsewhere_in_chain(self):
+ self._check_equals('who spoke', 'A doctor spoke. She was angry.', 45, 0, 8,
+ initial_question_word_embedding_match_threshold=1.0)
+
+ def test_check_who_personal_pronoun_control_1(self):
+ self._check_equals('who spoke', 'A doctor spoke.', 5, None, None,
+ initial_question_word_embedding_match_threshold=1.0)
+
+ def test_check_who_personal_pronoun_control_2(self):
+ self._check_equals('who spoke', 'A doctor spoke. It was angry.', 5, None, None,
+ initial_question_word_embedding_match_threshold=1.0)
+
+ def test_check_who_personal_pronoun_control_3(self):
+ self._check_equals('who spoke', 'There came a doctor. It spoke.', 5, None, None,
+ initial_question_word_embedding_match_threshold=1.0)
+
+ def test_check_whom_positive_case(self):
+ self._check_equals('Whom did you talk about?', 'the dog talked about its man', 49, 21, 28)
+
+ def test_check_whom_wrong_syntax(self):
+ self._check_equals('Whom did you talk about?', 'the man talked about his dog', 9, None, None)
+
+ def test_check_where_positive_case(self):
+ self._check_equals('Where did the meeting take place?', 'the meeting took place in the office', 143, 23, 36)
+
+ def test_check_where_wrong_prep(self):
+ self._check_equals('Where did the meeting take place?', 'the meeting took place about the office', 83, None, None)
+
+ def test_check_when_positive_case(self):
+ self._check_equals('When did the meeting take place?', 'the meeting took place yesterday', 143, 23, 32)
+
+ def test_check_when_right_prep(self):
+ self._check_equals('When did the meeting take place?', 'the meeting took place after dawn', 143, 23, 33)
+
+ def test_check_when_wrong_prep(self):
+ self._check_equals('When did the meeting take place?', 'the meeting took place about dawn', 83, None, None)
+
+ def test_check_when_wrong_entity(self):
+ self._check_equals('When did the meeting take place?', 'the meeting took place with Richard', 83, None, None)
+
+ def test_check_when_wrong_syntax(self):
+ self._check_equals('When did the meeting take place?', 'the meeting took place', 83, None, None)
+
+ def test_check_when_in_time_phrase(self):
+ self._check_equals('When will the meeting take place?', 'the meeting will take place in three weeks', 142, 31, 42)
+
+ def test_check_where_in_time_phrase(self):
+ self._check_equals('Where will the meeting take place?', 'the meeting will take place in three weeks', 83, None, None)
+
+ def test_check_how_positive_case_phrase(self):
+ self._check_equals('How did the team manage it?', 'the team managed it by working hard', 104, 20, 35)
+
+ def test_check_how_positive_case_preposition(self):
+ self._check_equals('How did the team manage it?', 'the team managed it with hard work', 104, 20, 34)
+
+ def test_check_how_wrong_preposition(self):
+ self._check_equals('How did the team manage it?', 'the team managed it without hard work', 34, None, None)
+
+ def test_check_how_negative_case(self):
+ self._check_equals('How did the team manage it?', 'the team managed it because of the weather', 34, None, None)
+
+ def test_check_why_positive_because(self):
+ self._check_equals('Why did the team manage it?', 'the team managed it because they had ambition', 104, 20, 45)
+
+ def test_check_why_positive_owing_to(self):
+ self._check_equals('Why did the team manage it?', 'the team managed it owing to their ambition', 104, 20, 43)
+
+ def test_check_why_positive_thanks_to(self):
+ self._check_equals('Why did the team manage it?', 'the team managed it thanks to their ambition', 104, 20, 44)
+
+ def test_check_why_positive_to(self):
+ self._check_equals('Why did the team manage it?', 'the team managed it to show everyone', 104, 20, 36)
+
+ def test_check_why_positive_in_order_to(self):
+ self._check_equals('Why did the team manage it?', 'the team managed it in order to show everyone', 104, 20, 45)
+
+ def test_check_why_positive_in_order_to_control(self):
+ self._check_equals('Why did the team manage it?', 'the team managed it in place', 34, None, None)
+
+ def test_check_why_because_of(self):
+ self._check_equals('Why did the team manage it?', 'the team managed it because of the weather', 104, 20, 42)
+
+ def test_check_why_because_of_be(self):
+ self._check_equals('Why did the team manage it?', 'the team managed it because it was efficient', 104, 20, 44)
+
+ def test_in_answers_split_1(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("I lived in a house and a flat.")
+ topic_matches = manager.topic_match_documents_against("What did you live in?")
+ self.assertEqual(topic_matches[0]['answers'], [[11, 18], [23, 29]])
+
+ def test_in_answers_split_2(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("I am going in two weeks and in three weeks")
+ topic_matches = manager.topic_match_documents_against("When are you going?")
+ self.assertEqual(topic_matches[0]['answers'], [[14, 23], [31, 42]])
+
+ def test_in_answers_split_3(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("I am going in two weeks and three weeks")
+ topic_matches = manager.topic_match_documents_against("When are you going?")
+ self.assertEqual(topic_matches[0]['answers'], [[14, 23], [28, 39]])
+
+ def test_entity_multiword(self):
+ manager.remove_all_documents()
+ manager.parse_and_register_document("Then Richard Hudson spoke")
+ topic_matches = manager.topic_match_documents_against("Who spoke?")
+ self.assertEqual(topic_matches, [{'document_label': '', 'text': 'Then Richard Hudson spoke', 'text_to_match': 'Who spoke?', 'rank': '1', 'index_within_document': 3, 'subword_index': None, 'start_index': 1, 'end_index': 3, 'sentences_start_index': 0, 'sentences_end_index': 3, 'sentences_character_start_index': 0, 'sentences_character_end_index': 25, 'score': 620.0, 'word_infos': [[5, 19, 'relation', False, 'Matches the question word WHO.'], [20, 25, 'relation', True, 'Matches SPEAK directly.']], 'answers': [[5, 19]]}])
+
+ def test_governing_verb_within_noun_phrase(self):
+ self._check_equals('Who did Richard see?', 'The person Richard saw was angry', 34, None, None)
diff --git a/holmes_extractor/tests/en/test_semantics_EN.py b/tests/en/test_semantics_EN.py
similarity index 77%
rename from holmes_extractor/tests/en/test_semantics_EN.py
rename to tests/en/test_semantics_EN.py
index af43375..ea8b515 100644
--- a/holmes_extractor/tests/en/test_semantics_EN.py
+++ b/tests/en/test_semantics_EN.py
@@ -1,14 +1,15 @@
import unittest
-from holmes_extractor.semantics import SemanticAnalyzerFactory
-
-analyzer = SemanticAnalyzerFactory().semantic_analyzer(model='en_core_web_lg', debug=False,
- perform_coreference_resolution=True)
-
+import spacy
+import coreferee
+import holmes_extractor
+nlp = spacy.load('en_core_web_trf')
+nlp.add_pipe('coreferee')
+nlp.add_pipe('holmes')
class EnglishSemanticAnalyzerTest(unittest.TestCase):
def test_initialize_semantic_dependencies(self):
- doc = analyzer.parse("The dog chased the cat.")
+ doc = nlp("The dog chased the cat.")
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '1:nsubj; 4:dobj')
self.assertEqual(
@@ -17,14 +18,14 @@ def test_initialize_semantic_dependencies(self):
doc[5]._.holmes.string_representation_of_children(), '')
def test_one_righthand_sibling_with_and_conjunction(self):
- doc = analyzer.parse("The dog and the hound chased the cat")
+ doc = nlp("The dog and the hound chased the cat")
self.assertEqual(doc[1]._.holmes.righthand_siblings, [4])
self.assertFalse(doc[1]._.holmes.is_involved_in_or_conjunction)
self.assertFalse(doc[4]._.holmes.is_involved_in_or_conjunction)
self.assertEqual(doc[4]._.holmes.righthand_siblings, [])
def test_many_righthand_siblings_with_and_conjunction(self):
- doc = analyzer.parse("The dog, the wolf and the hound chased the cat")
+ doc = nlp("The dog, the wolf and the hound chased the cat")
self.assertEqual(doc[1]._.holmes.righthand_siblings, [4, 7])
self.assertFalse(doc[1]._.holmes.is_involved_in_or_conjunction)
self.assertFalse(doc[4]._.holmes.is_involved_in_or_conjunction)
@@ -33,14 +34,14 @@ def test_many_righthand_siblings_with_and_conjunction(self):
self.assertEqual(doc[7]._.holmes.righthand_siblings, [])
def test_one_righthand_sibling_with_or_conjunction(self):
- doc = analyzer.parse("The dog or the hound chased the cat")
+ doc = nlp("The dog or the hound chased the cat")
self.assertEqual(doc[1]._.holmes.righthand_siblings, [4])
self.assertTrue(doc[1]._.holmes.is_involved_in_or_conjunction)
self.assertTrue(doc[4]._.holmes.is_involved_in_or_conjunction)
self.assertEqual(doc[4]._.holmes.righthand_siblings, [])
def test_many_righthand_siblings_with_or_conjunction(self):
- doc = analyzer.parse("The dog, the wolf or the hound chased the cat")
+ doc = nlp("The dog, the wolf or the hound chased the cat")
self.assertEqual(doc[1]._.holmes.righthand_siblings, [4, 7])
self.assertTrue(doc[1]._.holmes.is_involved_in_or_conjunction)
self.assertTrue(doc[4]._.holmes.is_involved_in_or_conjunction)
@@ -49,21 +50,20 @@ def test_many_righthand_siblings_with_or_conjunction(self):
self.assertEqual(doc[7]._.holmes.righthand_siblings, [])
def test_righthand_siblings_of_semantic_children_two(self):
- doc = analyzer.parse("The large and strong dog came home")
+ doc = nlp("The large and strong dog came home")
self.assertEqual(
doc[4]._.holmes.string_representation_of_children(), '1:amod; 3:amod')
self.assertEqual(doc[1]._.holmes.righthand_siblings, [3])
def test_righthand_siblings_of_semantic_children_many(self):
- doc = analyzer.parse("The large, strong and fierce dog came home")
+ doc = nlp("The large or strong and fierce dog came home")
self.assertEqual(
doc[6]._.holmes.string_representation_of_children(), '1:amod; 3:amod; 5:amod')
- self.assertEqual(doc[1]._.holmes.righthand_siblings, [])
- self.assertEqual(doc[3]._.holmes.righthand_siblings, [5])
- # Conjunction between 1 and 3 is already reflected in the underlying spaCy structure and does not need to be dealt with by Holmes
+ self.assertEqual(doc[1]._.holmes.righthand_siblings, [3,5])
+ self.assertEqual(doc[3]._.holmes.righthand_siblings, [])
def test_semantic_children_of_righthand_siblings_two(self):
- doc = analyzer.parse("The large dog and cat")
+ doc = nlp("The large dog and cat")
self.assertEqual(doc[2]._.holmes.string_representation_of_children(),
'1:amod; 3:cc; 4:conj')
self.assertEqual(doc[2]._.holmes.righthand_siblings, [4])
@@ -71,7 +71,7 @@ def test_semantic_children_of_righthand_siblings_two(self):
doc[4]._.holmes.string_representation_of_children(), '1:amod(U)')
def test_semantic_children_of_righthand_siblings_many(self):
- doc = analyzer.parse("The large dog, cat and mouse")
+ doc = nlp("The large dog, cat and mouse")
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '1:amod; 4:conj')
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
@@ -80,146 +80,146 @@ def test_semantic_children_of_righthand_siblings_many(self):
doc[6]._.holmes.string_representation_of_children(), '1:amod(U)')
def test_predicative_adjective(self):
- doc = analyzer.parse("The dog was big")
+ doc = nlp("The dog was big")
self.assertEqual(
doc[1]._.holmes.string_representation_of_children(), '3:amod')
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '-2:None')
def test_predicative_adjective_with_conjunction(self):
- doc = analyzer.parse("The dog and the cat were big and strong")
+ doc = nlp("The dog and the cat were big and strong")
self.assertEqual(doc[1]._.holmes.string_representation_of_children(),
'2:cc; 4:conj; 6:amod; 8:amod')
self.assertEqual(
doc[4]._.holmes.string_representation_of_children(), '6:amod; 8:amod')
def test_predicative_adjective_with_non_coreferring_pronoun(self):
- doc = analyzer.parse("It was big")
+ doc = nlp("It was big")
self.assertEqual(
doc[0]._.holmes.string_representation_of_children(), '2:amod')
self.assertEqual(
doc[1]._.holmes.string_representation_of_children(), '-1:None')
def test_predicative_adjective_with_coreferring_pronoun(self):
- doc = analyzer.parse("I saw a dog. It was big")
+ doc = nlp("I saw a dog. It was big")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '7:amod')
self.assertEqual(
doc[6]._.holmes.string_representation_of_children(), '-6:None')
def test_negator_negation_within_clause(self):
- doc = analyzer.parse("The dog did not chase the cat")
+ doc = nlp("The dog did not chase the cat")
self.assertEqual(doc[4]._.holmes.is_negated, True)
def test_operator_negation_within_clause(self):
- doc = analyzer.parse("No dog chased any cat")
+ doc = nlp("No dog chased any cat")
self.assertEqual(doc[1]._.holmes.is_negated, True)
self.assertEqual(doc[2]._.holmes.is_negated, False)
def test_negator_negation_within_parent_clause(self):
- doc = analyzer.parse(
+ doc = nlp(
"It had not been claimed that the dog had chased the cat")
self.assertEqual(doc[4]._.holmes.is_negated, True)
def test_operator_negation_within_parent_clause(self):
- doc = analyzer.parse("Nobody said the dog had chased the cat")
+ doc = nlp("Nobody said the dog had chased the cat")
self.assertEqual(doc[5]._.holmes.is_negated, True)
def test_negator_negation_within_child_clause(self):
- doc = analyzer.parse("The dog chased the cat who was not happy")
+ doc = nlp("The dog chased the cat who was not happy")
self.assertEqual(doc[2]._.holmes.is_negated, False)
def test_operator_negation_within_child_clause(self):
- doc = analyzer.parse("The dog chased the cat who told nobody")
+ doc = nlp("The dog chased the cat who told nobody")
self.assertEqual(doc[2]._.holmes.is_negated, False)
def test_passive(self):
- doc = analyzer.parse("The dog was chased")
+ doc = nlp("The dog was chased")
self.assertEqual(
doc[3]._.holmes.string_representation_of_children(), '1:nsubjpass; 2:auxpass')
def test_used_to_positive(self):
- doc = analyzer.parse("The dog always used to chase the cat")
+ doc = nlp("The dog always used to chase the cat")
self.assertEqual(
doc[3]._.holmes.string_representation_of_children(), '-6:None')
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'1:nsubj; 2:advmod; 4:aux; 7:dobj')
def test_used_to_negative_1(self):
- doc = analyzer.parse("The dog was used to chase the cat")
+ doc = nlp("The dog was used to chase the cat")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'1:nsubjpass; 2:auxpass; 5:xcomp')
def test_used_to_negative_2(self):
- doc = analyzer.parse("The dog used the mouse to chase the cat")
+ doc = nlp("The dog used the mouse to chase the cat")
self.assertEqual(doc[2]._.holmes.string_representation_of_children(),
'1:nsubj; 4:dobj; 6:xcomp')
def test_going_to(self):
- doc = analyzer.parse("The dog is going to chase the cat")
+ doc = nlp("The dog is going to chase the cat")
self.assertEqual(
doc[3]._.holmes.string_representation_of_children(), '-6:None')
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'1:nsubj; 2:aux; 4:aux; 7:dobj')
def test_was_going_to(self):
- doc = analyzer.parse("The dog was going to chase the cat")
+ doc = nlp("The dog was going to chase the cat")
self.assertEqual(
doc[3]._.holmes.string_representation_of_children(), '-6:None')
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'1:nsubj(U); 2:aux(U); 4:aux(U); 7:dobj(U)')
def test_complementizing_clause_active_child_clause_active(self):
- doc = analyzer.parse("The dog decided to chase the cat")
+ doc = nlp("The dog decided to chase the cat")
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
'1:nsubj(U); 3:aux; 6:dobj')
def test_complementizing_clause_passive_child_clause_active(self):
- doc = analyzer.parse("The dog was ordered to chase the cat")
+ doc = nlp("The dog was ordered to chase the cat")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'1:nsubj(U); 4:aux; 7:dobj')
def test_complementizing_clause_object_child_clause_active(self):
- doc = analyzer.parse("The mouse ordered the dog to chase the cat")
+ doc = nlp("The mouse ordered the dog to chase the cat")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'4:nsubj(U); 5:aux; 8:dobj')
def test_complementizing_clause_active_child_clause_passive(self):
- doc = analyzer.parse("The dog decided to be chased")
+ doc = nlp("The dog decided to be chased")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'1:nsubjpass(U); 3:aux; 4:auxpass')
def test_complementizing_clause_passive_child_clause_passive(self):
- doc = analyzer.parse("The dog was ordered to be chased")
+ doc = nlp("The dog was ordered to be chased")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'1:nsubjpass(U); 4:aux; 5:auxpass')
def test_complementizing_clause_object_child_clause_passive(self):
- doc = analyzer.parse("The mouse ordered the dog to be chased")
+ doc = nlp("The mouse ordered the dog to be chased")
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'4:nsubjpass(U); 5:aux; 6:auxpass')
def test_complementization_with_conjunction_and_agent(self):
- doc = analyzer.parse(
+ doc = nlp(
"The mouse ordered the dog and the cat to be chased by the cat and the tiger")
self.assertEqual(doc[10]._.holmes.string_representation_of_children(),
- '4:nsubjpass; 7:nsubjpass; 8:aux; 9:auxpass; 11:agent; 13:pobjb; 16:pobjb')
+ '4:nsubjpass(U); 7:nsubjpass(U); 8:aux; 9:auxpass; 11:agent; 13:pobjb; 16:pobjb')
def test_complementizing_clause_atypical_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"I had spent three years ruminating and that I knew")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
- '4:nsubj(U)')
+ '0:nsubj(U)')
def test_who_one_antecedent(self):
- doc = analyzer.parse("The dog who chased the cat was tired")
+ doc = nlp("The dog who chased the cat was tired")
self.assertEqual(
doc[3]._.holmes.string_representation_of_children(), '1:nsubj; 5:dobj')
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '-2:None')
def test_who_predicate_conjunction(self):
- doc = analyzer.parse("The dog who chased and caught the cat was tired")
+ doc = nlp("The dog who chased and caught the cat was tired")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'1:nsubj; 4:cc; 5:conj')
self.assertEqual(
@@ -228,30 +228,20 @@ def test_who_predicate_conjunction(self):
doc[2]._.holmes.string_representation_of_children(), '-2:None')
def test_who_many_antecedents(self):
- doc = analyzer.parse(
+ doc = nlp(
"The lion, the tiger and the dog who chased the cat were tired")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:nsubj(U); 4:nsubj(U); 7:nsubj; 11:dobj')
def test_which_one_antecedent(self):
- doc = analyzer.parse("The dog which chased the cat was tired")
+ doc = nlp("The dog which chased the cat was tired")
self.assertEqual(
doc[3]._.holmes.string_representation_of_children(), '1:nsubj; 5:dobj')
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '-2:None')
- def test_who_predicate_conjunction(self):
- doc = analyzer.parse(
- "The dog which chased and caught the cat was tired")
- self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
- '1:nsubj; 4:cc; 5:conj')
- self.assertEqual(
- doc[5]._.holmes.string_representation_of_children(), '1:nsubj; 7:dobj')
- self.assertEqual(
- doc[2]._.holmes.string_representation_of_children(), '-2:None')
-
def test_which_many_antecedents(self):
- doc = analyzer.parse(
+ doc = nlp(
"The lion, the tiger and the dog which chased the cat were tired")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:nsubj(U); 4:nsubj(U); 7:nsubj; 11:dobj')
@@ -259,14 +249,14 @@ def test_which_many_antecedents(self):
doc[8]._.holmes.string_representation_of_children(), '-8:None')
def test_that_subj_one_antecedent(self):
- doc = analyzer.parse("The dog that chased the cat was tired")
+ doc = nlp("The dog that chased the cat was tired")
self.assertEqual(
doc[3]._.holmes.string_representation_of_children(), '1:nsubj; 5:dobj')
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '-2:None')
def test_that_predicate_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"The dog that chased and caught the cat was tired")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'1:nsubj; 4:cc; 5:conj')
@@ -276,54 +266,44 @@ def test_that_predicate_conjunction(self):
doc[2]._.holmes.string_representation_of_children(), '-2:None')
def test_that_subj_many_antecedents(self):
- doc = analyzer.parse(
+ doc = nlp(
"The dog and the tiger that chased the cat were tired")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'1:nsubj(U); 4:nsubj; 8:dobj')
def test_that_obj_one_antecedent(self):
- doc = analyzer.parse("The cat that the dog chased was tired")
+ doc = nlp("The cat that the dog chased was tired")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:dobj; 4:nsubj')
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '-2:None')
def test_that_obj_many_antecedents(self):
- doc = analyzer.parse(
+ doc = nlp(
"The cat and the mouse that the dog chased were tired")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
- '1:dobj(U); 4:dobj; 7:nsubj')
+ '1:dobj; 4:dobj(U); 7:nsubj')
def test_relant_one_antecedent(self):
- doc = analyzer.parse("The cat the dog chased was tired")
+ doc = nlp("The cat the dog chased was tired")
self.assertEqual(
doc[4]._.holmes.string_representation_of_children(), '1:relant; 3:nsubj')
- def test_relant_many_antecedents_and_predicate_conjunction(self):
- doc = analyzer.parse(
- "The cat and the mouse the dog chased and pursued were tired")
- self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
- '1:relant(U); 4:relant; 6:nsubj; 8:cc; 9:conj')
- self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
- '1:relant(U); 4:relant; 6:nsubj(U)')
-
- def test_relant_multiple_predicate_conjunction(self):
- doc = analyzer.parse(
- "The cat the dog pursued, caught and chased was dead")
+ def test_relant_predicate_conjunction(self):
+ doc = nlp(
+ "The cat the dog chased and pursued were tired")
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
- '1:relant; 3:nsubj; 6:conj')
+ '1:relant; 3:nsubj; 5:cc; 6:conj')
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
- '1:relant; 3:nsubj(U); 7:cc; 8:conj')
- self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
'1:relant; 3:nsubj(U)')
def test_displaced_preposition_phrasal_verb(self):
- doc = analyzer.parse("The office you ate your roll in was new")
+ doc = nlp("The office you ate your roll in was new")
self.assertEqual(
- doc[6]._.holmes.string_representation_of_children(), '')
+ doc[6]._.holmes.string_representation_of_children(), '1:pobj')
def test_displaced_preposition_no_complementizer(self):
- doc = analyzer.parse("The office you ate your roll at was new")
+ doc = nlp("The office you ate your roll at was new")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'1:pobj')
self.assertEqual(
@@ -332,7 +312,7 @@ def test_displaced_preposition_no_complementizer(self):
'1:pobjp; 2:nsubj; 5:dobj; 6:prep')
def test_displaced_preposition_no_complementizer_with_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"The building and the office you ate and consumed your roll at were new")
self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
'1:pobj(U); 4:pobj')
@@ -342,7 +322,7 @@ def test_displaced_preposition_no_complementizer_with_conjunction(self):
'1:pobjp(U); 4:pobjp; 5:nsubj(U); 10:dobj; 11:prep')
def test_displaced_preposition_no_complementizer_with_second_preposition(self):
- doc = analyzer.parse(
+ doc = nlp(
"The office you ate your roll with gusto at was new")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
'1:pobj')
@@ -352,7 +332,7 @@ def test_displaced_preposition_no_complementizer_with_second_preposition(self):
'1:pobjp; 2:nsubj; 5:dobj; 6:prep; 7:pobjp; 8:prep')
def test_displaced_preposition_no_complementizer_with_second_preposition_and_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"The building and the office you ate and consumed your roll with gusto at were new")
self.assertEqual(doc[13]._.holmes.string_representation_of_children(),
'1:pobj(U); 4:pobj')
@@ -362,7 +342,7 @@ def test_displaced_preposition_no_complementizer_with_second_preposition_and_con
'1:pobjp(U); 4:pobjp; 5:nsubj(U); 10:dobj; 11:prep; 12:pobjp; 13:prep')
def test_displaced_preposition_that(self):
- doc = analyzer.parse("The office that you ate your roll at was new")
+ doc = nlp("The office that you ate your roll at was new")
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'1:pobj')
self.assertEqual(
@@ -372,7 +352,7 @@ def test_displaced_preposition_that(self):
def test_displaced_preposition_that_preposition_points_to_that(self):
        # For some reason this gets a different spaCy representation than the previous one
- doc = analyzer.parse("The building that you ate your roll at was new")
+ doc = nlp("The building that you ate your roll at was new")
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'1:pobj')
self.assertEqual(
@@ -381,7 +361,7 @@ def test_displaced_preposition_that_preposition_points_to_that(self):
'1:pobjp; 3:nsubj; 6:dobj; 7:prep')
def test_displaced_preposition_that_with_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"The building and the office that you ate and consumed your roll at were new")
self.assertEqual(doc[12]._.holmes.string_representation_of_children(),
'1:pobj(U); 4:pobj')
@@ -391,7 +371,7 @@ def test_displaced_preposition_that_with_conjunction(self):
'1:pobjp(U); 4:pobjp; 6:nsubj(U); 11:dobj; 12:prep')
def test_displaced_preposition_that_with_second_preposition_preposition_points_to_that(self):
- doc = analyzer.parse(
+ doc = nlp(
"The building that you ate your roll with gusto at was new")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:pobj')
@@ -401,7 +381,7 @@ def test_displaced_preposition_that_with_second_preposition_preposition_points_t
'1:pobjp; 3:nsubj; 6:dobj; 7:prep; 8:pobjp; 9:prep')
def test_displaced_preposition_that_with_second_preposition(self):
- doc = analyzer.parse(
+ doc = nlp(
"The office that you ate your roll with gusto at was new")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:pobj')
@@ -411,24 +391,24 @@ def test_displaced_preposition_that_with_second_preposition(self):
'1:pobjp; 3:nsubj; 6:dobj; 7:prep; 8:pobjp; 9:prep')
def test_displaced_preposition_that_with_second_preposition_and_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"The building and the office that you ate and consumed your roll with gusto at were new")
self.assertEqual(doc[14]._.holmes.string_representation_of_children(),
- '1:pobj(U); 4:pobj')
+ '1:pobj; 4:pobj(U)')
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'1:pobjp(U); 4:pobjp(U); 6:nsubj; 8:cc; 9:conj; 14:prep(U)')
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
- '1:pobjp(U); 4:pobjp; 6:nsubj(U); 11:dobj; 12:prep; 13:pobjp; 14:prep')
+ '1:pobjp; 4:pobjp(U); 6:nsubj(U); 11:dobj; 12:prep; 13:pobjp; 14:prep')
def test_simple_whose_clause(self):
- doc = analyzer.parse("The dog whose owner I met was tired")
+ doc = nlp("The dog whose owner I met was tired")
self.assertEqual(
doc[3]._.holmes.string_representation_of_children(), '1:poss')
self.assertEqual(
doc[2]._.holmes.string_representation_of_children(), '-2:None')
def test_whose_clause_with_conjunction_of_possessor(self):
- doc = analyzer.parse("The dog whose owner and friend I met was tired")
+ doc = nlp("The dog whose owner and friend I met was tired")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'1:poss; 4:cc; 5:conj')
self.assertEqual(
@@ -437,234 +417,238 @@ def test_whose_clause_with_conjunction_of_possessor(self):
doc[2]._.holmes.string_representation_of_children(), '-2:None')
def test_whose_clause_with_conjunction_of_possessed(self):
- doc = analyzer.parse("The lion and dog whose owner I met were tired")
+ doc = nlp("The lion and dog whose owner I met were tired")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:poss(U); 3:poss')
self.assertEqual(
doc[4]._.holmes.string_representation_of_children(), '-4:None')
- def test_phrasal_verb(self):
- doc = analyzer.parse("He took out insurance")
+ def test_phrasal_verb_1(self):
+ doc = nlp("He took out insurance")
self.assertEqual(doc[1]._.holmes.lemma, 'take out')
self.assertEqual(
doc[1]._.holmes.string_representation_of_children(), '0:nsubj; 3:dobj')
+ def test_participle(self):
+ doc = nlp("An adopted child")
+ self.assertEqual(doc[1]._.holmes.lemma, 'adopt')
+
def test_positive_modal_verb(self):
- doc = analyzer.parse("He should do it")
+ doc = nlp("He should do it")
self.assertEqual(doc[2]._.holmes.string_representation_of_children(),
'0:nsubj(U); 1:aux; 3:dobj(U)')
def test_negative_modal_verb(self):
- doc = analyzer.parse("He cannot do it")
+ doc = nlp("He cannot do it")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'0:nsubj(U); 1:aux; 2:neg(U); 4:dobj(U)')
self.assertTrue(doc[3]._.holmes.is_negated)
def test_ought_to(self):
- doc = analyzer.parse("He ought to do it")
+ doc = nlp("He ought to do it")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'0:nsubj(U); 2:aux; 4:dobj')
- def test_phrasal_verb(self):
- doc = analyzer.parse("He will have been doing it")
+ def test_phrasal_verb_2(self):
+ doc = nlp("He will have been doing it")
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
'0:nsubj; 1:aux; 2:aux; 3:aux; 5:dobj')
- def test_pobjb(self):
- doc = analyzer.parse("Eating by employees")
+ def test_pobjb_1(self):
+ doc = nlp("Eating by employees")
self.assertEqual(
doc[0]._.holmes.string_representation_of_children(), '1:prep; 2:pobjb')
- def test_pobjb(self):
- doc = analyzer.parse("Eating of icecream")
+ def test_pobjb_2(self):
+ doc = nlp("Eating of icecream")
self.assertEqual(
doc[0]._.holmes.string_representation_of_children(), '1:prep; 2:pobjo')
def test_pobjt(self):
- doc = analyzer.parse("Travelling to Munich")
+ doc = nlp("Travelling to Munich")
self.assertEqual(
doc[0]._.holmes.string_representation_of_children(), '1:prep; 2:pobjt')
def test_dative_prepositional_phrase(self):
- doc = analyzer.parse("He gave it to the employee")
+ doc = nlp("He gave it to the employee")
self.assertEqual(doc[1]._.holmes.string_representation_of_children(),
'0:nsubj; 2:dobj; 3:prep; 5:pobjt')
self.assertFalse(doc[3]._.holmes.is_matchable)
def test_dative_prepositional_phrase_with_conjunction(self):
- doc = analyzer.parse("He gave it to the employee and the boss")
+ doc = nlp("He gave it to the employee and the boss")
self.assertEqual(doc[1]._.holmes.string_representation_of_children(),
'0:nsubj; 2:dobj; 3:prep; 5:pobjt; 8:pobjt')
self.assertFalse(doc[3]._.holmes.is_matchable)
def test_simple_participle_phrase(self):
- doc = analyzer.parse("He talked about the cat chased by the dog")
+ doc = nlp("He talked about the cat chased by the dog")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'4:dobj; 6:agent; 8:pobjb')
def test_participle_phrase_with_conjunction(self):
- doc = analyzer.parse(
+ doc = nlp(
"He talked about the cat and the mouse chased by the dog and the tiger")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
- '4:dobj; 7:dobj; 9:agent; 11:pobjb; 14:pobjb')
+ '4:dobj; 7:dobj; 9:agent; 11:pobjb; 14:dobj')
def test_subjective_modifying_adverbial_phrase(self):
- doc = analyzer.parse("The lion-chased cat came home")
+ doc = nlp("The lion-chased cat came home")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'1:advmodsubj; 4:advmodobj')
def test_subjective_modifying_adverbial_phrase_with_conjunction(self):
- doc = analyzer.parse("The lion-chased cat and mouse came home")
+ doc = nlp("The lion-chased cat and mouse came home")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'1:advmodsubj; 4:advmodobj; 6:advmodobj(U)')
def test_objective_modifying_adverbial_phrase(self):
- doc = analyzer.parse("The cat-chasing lion came home")
+ doc = nlp("The cat-chasing lion came home")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'1:advmodobj; 4:advmodsubj')
def test_objective_modifying_adverbial_phrase_with_conjunction(self):
- doc = analyzer.parse("The cat-chasing lion and dog came home")
+ doc = nlp("The cat-chasing lion and dog came home")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'1:advmodobj; 4:advmodsubj; 6:advmodsubj(U)')
def test_verb_prepositional_complement_simple_active(self):
- doc = analyzer.parse("The dog was thinking about chasing a cat")
+ doc = nlp("The dog was thinking about chasing a cat")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:nsubj(U); 7:dobj')
def test_verb_prepositional_complement_with_conjunction_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"The dog and the lion were thinking about chasing a cat and a mouse")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
'1:nsubj(U); 4:nsubj(U); 10:dobj; 13:dobj')
def test_verb_prepositional_complement_with_relative_clause_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"The dog who was thinking about chasing a cat came home")
self.assertEqual(
doc[6]._.holmes.string_representation_of_children(), '1:nsubj(U); 8:dobj')
def test_verb_preposition_complement_with_coreferring_pronoun_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"He saw a dog. It was thinking about chasing a cat")
self.assertEqual(
doc[9]._.holmes.string_representation_of_children(), '5:nsubj(U); 11:dobj')
def test_verb_preposition_complement_with_non_coreferring_pronoun_active(self):
- doc = analyzer.parse("It was thinking about chasing a cat")
+ doc = nlp("It was thinking about chasing a cat")
self.assertEqual(
doc[4]._.holmes.string_representation_of_children(), '6:dobj')
def test_adjective_prepositional_complement_simple_active(self):
- doc = analyzer.parse("The dog was worried about chasing a cat")
+ doc = nlp("The dog was worried about chasing a cat")
self.assertEqual(
doc[5]._.holmes.string_representation_of_children(), '1:nsubj(U); 7:dobj')
def test_adjective_prepositional_complement_with_conjunction_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"The dog and the lion were worried about chasing a cat and a mouse")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
'1:nsubj(U); 4:nsubj(U); 10:dobj; 13:dobj')
def test_adjective_prepositional_complement_with_relative_clause_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"The dog who was worried about chasing a cat came home")
self.assertEqual(
doc[6]._.holmes.string_representation_of_children(), '1:nsubj(U); 8:dobj')
def test_adjective_preposition_complement_with_coreferring_pronoun_active(self):
- doc = analyzer.parse(
+ doc = nlp(
"He saw a dog. He was worried about chasing a cat")
self.assertEqual(
doc[9]._.holmes.string_representation_of_children(), '5:nsubj(U); 11:dobj')
def test_adjective_preposition_complement_with_non_coreferring_pronoun_active(self):
- doc = analyzer.parse("It was worried about chasing a cat")
+ doc = nlp("It was worried about chasing a cat")
self.assertEqual(
doc[4]._.holmes.string_representation_of_children(), '6:dobj')
def test_verb_prepositional_complement_simple_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"The cat was thinking about being chased by a dog")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'1:nsubjpass(U); 5:auxpass; 7:agent; 9:pobjb')
def test_verb_prepositional_complement_with_conjunction_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"The cat and the mouse were thinking about being chased by a dog and a lion")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:nsubjpass(U); 4:nsubjpass(U); 8:auxpass; 10:agent; 12:pobjb; 15:pobjb')
def test_verb_prepositional_complement_with_relative_clause_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"The cat who was thinking about being chased by a dog came home")
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'1:nsubjpass(U); 6:auxpass; 8:agent; 10:pobjb')
def test_verb_preposition_complement_with_coreferring_pronoun_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"He saw a dog. It was thinking about being chased by a cat")
self.assertEqual(doc[10]._.holmes.string_representation_of_children(),
'5:nsubjpass(U); 9:auxpass; 11:agent; 13:pobjb')
def test_verb_preposition_complement_with_non_coreferring_pronoun_passive(self):
- doc = analyzer.parse("It was thinking about being chased by a cat")
+ doc = nlp("It was thinking about being chased by a cat")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'4:auxpass; 6:agent; 8:pobjb')
def test_adjective_prepositional_complement_simple_passive(self):
- doc = analyzer.parse("The cat was worried about being chased by a dog")
+ doc = nlp("The cat was worried about being chased by a dog")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'1:nsubjpass(U); 5:auxpass; 7:agent; 9:pobjb')
def test_adjective_prepositional_complement_with_conjunction_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"The cat and the mouse were worried about being chased by a dog and a lion")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:nsubjpass(U); 4:nsubjpass(U); 8:auxpass; 10:agent; 12:pobjb; 15:pobjb')
def test_adjective_prepositional_complement_with_relative_clause_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"The cat who was worried about being chased by a dog came home")
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
'1:nsubjpass(U); 6:auxpass; 8:agent; 10:pobjb')
def test_adjective_preposition_complement_with_coreferring_pronoun_passive(self):
- doc = analyzer.parse(
+ doc = nlp(
"He saw a dog. It was worried about being chased by a cat")
self.assertEqual(doc[10]._.holmes.string_representation_of_children(),
'5:nsubjpass(U); 9:auxpass; 11:agent; 13:pobjb')
def test_adjective_preposition_complement_with_non_coreferring_pronoun_passive(self):
- doc = analyzer.parse("It was worried about being chased by a cat")
+ doc = nlp("It was worried about being chased by a cat")
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'4:auxpass; 6:agent; 8:pobjb')
def test_verb_prepositional_complement_with_conjunction_of_dependent_verb(self):
- doc = analyzer.parse(
+ doc = nlp(
"The cat and the mouse kept on singing and shouting")
self.assertEqual(doc[7]._.holmes.string_representation_of_children(),
- '1:nsubj(U); 4:nsubj(U)')
+ '1:nsubj(U); 4:nsubj(U); 8:cc; 9:conj')
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'1:nsubj(U); 4:nsubj(U)')
def test_verb_p_c_with_conjunction_of_dependent_verb_and_coreferring_pronoun(self):
- doc = analyzer.parse("I saw a cat. It kept on singing and shouting")
+ doc = nlp("I saw a cat. It kept on singing and shouting")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
- '5:nsubj(U)')
+ '5:nsubj(U); 9:cc; 10:conj')
self.assertEqual(doc[10]._.holmes.string_representation_of_children(),
'5:nsubj(U)')
- def test_verb_p_c_with_conjunction_of_dependent_verb_and_non_coreferring_pronoun(self):
- doc = analyzer.parse("It kept on singing and shouting")
+ def test_verb_p_c_with_conjunction_of_dependent_verb_and_non_coreferring_pronoun_1(self):
+ doc = nlp("It kept on singing and shouting")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
- '4:cc; 5:conj')
+ '0:nsubj(U); 4:cc; 5:conj')
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
- '')
+ '0:nsubj(U)')
def test_adjective_prepositional_complement_with_conjunction_of_dependent_verb(self):
- doc = analyzer.parse(
+ doc = nlp(
"The cat and the mouse were worried about singing and shouting")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
'1:nsubj(U); 4:nsubj(U); 9:cc; 10:conj')
@@ -672,50 +656,45 @@ def test_adjective_prepositional_complement_with_conjunction_of_dependent_verb(s
'1:nsubj(U); 4:nsubj(U)')
def test_adjective_p_c_with_conjunction_of_dependent_verb_and_coreferring_pronoun(self):
- doc = analyzer.parse(
+ doc = nlp(
"I saw a cat. It was worried about singing and shouting")
self.assertEqual(doc[9]._.holmes.string_representation_of_children(),
'5:nsubj(U); 10:cc; 11:conj')
self.assertEqual(doc[11]._.holmes.string_representation_of_children(),
'5:nsubj(U)')
- def test_verb_p_c_with_conjunction_of_dependent_verb_and_non_coreferring_pronoun(self):
- doc = analyzer.parse("It was worried about singing and shouting")
+ def test_verb_p_c_with_conjunction_of_dependent_verb_and_non_coreferring_pronoun_2(self):
+ doc = nlp("It was worried about singing and shouting")
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
'5:cc; 6:conj')
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
'')
def test_single_preposition_dependency_added_to_noun(self):
- doc = analyzer.parse(
+ doc = nlp(
"The employee needs insurance for the next five years")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'4:prepposs(U); 8:pobjp(U)')
def test_multiple_preposition_dependencies_added_to_noun(self):
- doc = analyzer.parse(
+ doc = nlp(
"The employee needs insurance for the next five years and in Europe")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'4:prepposs(U); 8:pobjp(U); 10:prepposs(U); 11:pobjp(U)')
def test_single_preposition_dependency_added_to_coreferring_pronoun(self):
- doc = analyzer.parse(
+ doc = nlp(
"We discussed the house. The employee needs it for the next five years")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
'9:prepposs(U); 13:pobjp(U)')
- def test_single_preposition_dependency_not_added_to_non_coreferring_pronoun(self):
- doc = analyzer.parse("The employee needs it for the next five years")
- self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
- '')
-
def test_dependencies_not_added_to_sibling_to_the_right(self):
- doc = analyzer.parse("He saw them and laughed")
+ doc = nlp("He saw them and laughed")
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
'0:nsubj(U)')
def test_coreference_within_sentence(self):
- doc = analyzer.parse("The employee got home and he was surprised")
+ doc = nlp("The employee got home and he was surprised")
self.assertEqual(
doc[1]._.holmes.token_and_coreference_chain_indexes, [1, 5])
self.assertEqual(
@@ -724,7 +703,7 @@ def test_coreference_within_sentence(self):
doc[3]._.holmes.token_and_coreference_chain_indexes, [3])
def test_coreference_between_sentences(self):
- doc = analyzer.parse("The employee got home. He was surprised")
+ doc = nlp("The employee got home. He was surprised")
self.assertEqual(
doc[1]._.holmes.token_and_coreference_chain_indexes, [1, 5])
self.assertEqual(
@@ -733,7 +712,7 @@ def test_coreference_between_sentences(self):
doc[3]._.holmes.token_and_coreference_chain_indexes, [3])
def test_coreference_three_items_in_chain(self):
- doc = analyzer.parse(
+ doc = nlp(
"Richard was at work. He went home. He was surprised")
self.assertEqual(
doc[0]._.holmes.token_and_coreference_chain_indexes, [0, 5, 9])
@@ -745,7 +724,7 @@ def test_coreference_three_items_in_chain(self):
doc[3]._.holmes.token_and_coreference_chain_indexes, [3])
def test_coreference_conjunction_in_antecedent(self):
- doc = analyzer.parse(
+ doc = nlp(
"Richard and Carol came to work. They had a discussion")
self.assertEqual(
doc[0]._.holmes.token_and_coreference_chain_indexes, [0, 7])
@@ -757,26 +736,14 @@ def test_coreference_conjunction_in_antecedent(self):
doc[3]._.holmes.token_and_coreference_chain_indexes, [3])
def test_coreference_within_relative_clause(self):
- doc = analyzer.parse("The man who knows himself has an advantage")
+ doc = nlp("The man who knows himself has an advantage")
self.assertEqual(
doc[1]._.holmes.token_and_coreference_chain_indexes, [1, 4])
self.assertEqual(
doc[4]._.holmes.token_and_coreference_chain_indexes, [4, 1])
- def test_coreference_repeated_conjunctions(self):
- doc = analyzer.parse("A dog and a man came. A dog and a man sang")
- # suboptimal situation that has to be rectified during structural matching
- self.assertEqual(doc[1]._.holmes.token_and_coreference_chain_indexes,
- [1, 8, 11])
- self.assertEqual(doc[4]._.holmes.token_and_coreference_chain_indexes,
- [4, 8, 11])
- self.assertEqual(
- doc[8]._.holmes.token_and_coreference_chain_indexes, [8, 1, 4])
- self.assertEqual(doc[11]._.holmes.token_and_coreference_chain_indexes,
- [11, 1, 4])
-
def test_maximum_mentions_difference(self):
- doc = analyzer.parse("""Richard came to work. He was happy. He was happy. He was happy.
+ doc = nlp("""Richard came to work. He was happy. He was happy. He was happy.
He was happy. He was happy. He was happy. He was happy. He was happy.""")
self.assertEqual(
doc[0]._.holmes.token_and_coreference_chain_indexes, [0, 5, 9, 13])
@@ -797,13 +764,44 @@ def test_maximum_mentions_difference(self):
self.assertEqual(
doc[34]._.holmes.token_and_coreference_chain_indexes, [34, 22, 26, 30])
+ def test_most_specific_coreferring_term_index_with_pronoun(self):
+ doc = nlp("I saw Richard. The person came home. He was surprised.")
+ self.assertEqual(
+ doc[2]._.holmes.most_specific_coreferring_term_index, 2)
+ self.assertEqual(
+ doc[5]._.holmes.most_specific_coreferring_term_index, 2)
+ self.assertEqual(
+ doc[9]._.holmes.most_specific_coreferring_term_index, 2)
+ self.assertEqual(
+ doc[3]._.holmes.most_specific_coreferring_term_index, None)
+
+ def test_most_specific_coreferring_term_index_without_pronoun(self):
+ doc = nlp("I saw Richard. The person came home.")
+ self.assertEqual(
+ doc[2]._.holmes.most_specific_coreferring_term_index, 2)
+ self.assertEqual(
+ doc[5]._.holmes.most_specific_coreferring_term_index, 2)
+ self.assertEqual(
+ doc[3]._.holmes.most_specific_coreferring_term_index, None)
+
+ def test_most_specific_coreferring_term_index_with_coordination(self):
+ doc = nlp("I saw Richard. The person and Maria were talking. They came home.")
+ self.assertEqual(
+ doc[2]._.holmes.most_specific_coreferring_term_index, 2)
+ self.assertEqual(
+ doc[5]._.holmes.most_specific_coreferring_term_index, 2)
+ self.assertEqual(
+ doc[7]._.holmes.most_specific_coreferring_term_index, None)
+ self.assertEqual(
+ doc[9]._.holmes.most_specific_coreferring_term_index, None)
+
def test_adjective_verb_clause_subjective_simple(self):
- doc = analyzer.parse("Richard was glad to understand.")
+ doc = nlp("Richard was glad to understand.")
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
'0:arg(U); 3:aux')
def test_adjective_verb_clause_subjective_compound(self):
- doc = analyzer.parse(
+ doc = nlp(
"Richard and Thomas were glad and happy to understand and to comprehend.")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
'0:arg(U); 2:arg(U); 7:aux; 9:cc; 11:conj')
@@ -811,12 +809,12 @@ def test_adjective_verb_clause_subjective_compound(self):
'0:arg(U); 2:arg(U); 10:aux')
def test_adjective_verb_clause_objective_simple(self):
- doc = analyzer.parse("Richard was hard to reach.")
+ doc = nlp("Richard was hard to reach.")
self.assertEqual(doc[4]._.holmes.string_representation_of_children(),
'0:arg(U); 3:aux')
def test_adjective_verb_clause_objective_compound(self):
- doc = analyzer.parse(
+ doc = nlp(
"Richard and Thomas were hard and difficult to reach and to call.")
self.assertEqual(doc[8]._.holmes.string_representation_of_children(),
'0:arg(U); 2:arg(U); 7:aux; 9:cc; 11:conj')
@@ -824,107 +822,144 @@ def test_adjective_verb_clause_objective_compound(self):
'0:arg(U); 2:arg(U); 10:aux')
def test_prepositional_phrase_dependent_on_noun_no_conjunction(self):
- doc = analyzer.parse("Houses in the village.")
+ doc = nlp("Houses in the village.")
self.assertEqual(doc[0]._.holmes.string_representation_of_children(),
'1:prep; 3:pobjp')
def test_prepositional_phrase_dependent_on_noun_with_conjunction(self):
- doc = analyzer.parse("Houses in the village and the town.")
+ doc = nlp("Houses in the village and the town.")
self.assertEqual(doc[0]._.holmes.string_representation_of_children(),
'1:prep; 3:pobjp; 6:pobjp')
def test_simple_relative_prepositional_phrase(self):
- doc = analyzer.parse("The table from which we ate.")
+ doc = nlp("The table from which we ate.")
self.assertEqual(doc[3]._.holmes.string_representation_of_children(),
'-2:None')
self.assertEqual(doc[5]._.holmes.string_representation_of_children(),
'1:pobjp; 2:prep; 4:nsubj')
def test_conjunction_relative_prepositional_phrase(self):
- doc = analyzer.parse(
+ doc = nlp(
"The table and the chair from which you and I ate and drank.")
self.assertEqual(doc[6]._.holmes.string_representation_of_children(),
- '-5:None')
+ '-2:None')
self.assertEqual(doc[10]._.holmes.string_representation_of_children(),
'1:pobjp(U); 4:pobjp(U); 5:prep(U); 7:nsubj; 9:nsubj; 11:cc; 12:conj')
self.assertEqual(doc[12]._.holmes.string_representation_of_children(),
- '1:pobjp(U); 4:pobjp; 5:prep; 7:nsubj(U); 9:nsubj(U)')
+ '1:pobjp; 4:pobjp(U); 5:prep; 7:nsubj(U); 9:nsubj(U)')
def test_parent_token_indexes(self):
- doc = analyzer.parse("Houses in the village.")
+ doc = nlp("Houses in the village.")
self.assertEqual(doc[0]._.holmes.string_representation_of_children(),
'1:prep; 3:pobjp')
- self.assertEqual(doc[3]._.holmes.parent_dependencies, [
+ self.assertEqual(doc[3]._.holmes.coreference_linked_parent_dependencies, [
[0, 'pobjp'], [1, 'pobj']])
+ self.assertEqual(doc[3]._.holmes.string_representation_of_parents(),
+ '0:pobjp; 1:pobj')
def test_derived_lemma_from_dictionary(self):
- doc = analyzer.parse("A long imprisonment.")
+ doc = nlp("A long imprisonment.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'imprison')
def test_derived_lemma_root_word_from_dictionary(self):
- doc = analyzer.parse("He was imprisoned.")
+ doc = nlp("He was imprisoned.")
self.assertEqual(doc[2]._.holmes.derived_lemma, None)
def test_derived_lemma_ization(self):
- doc = analyzer.parse("Linearization problems.")
+ doc = nlp("Linearization problems.")
self.assertEqual(doc[0]._.holmes.derived_lemma, 'linearize')
def test_derived_lemma_isation(self):
- doc = analyzer.parse("Linearisation problems.")
+ doc = nlp("Linearisation problems.")
self.assertEqual(doc[0]._.holmes.derived_lemma, 'linearise')
def test_derived_lemma_ically(self):
- doc = analyzer.parse("They used it very economically.")
+ doc = nlp("They used it very economically.")
self.assertEqual(doc[4]._.holmes.derived_lemma, 'economic')
def test_derived_lemma_ibly(self):
- doc = analyzer.parse("It stank horribly.")
+ doc = nlp("It stank horribly.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'horrible')
- def test_derived_lemma_ibly(self):
- doc = analyzer.parse("Regrettably it was a problem.")
+ def test_derived_lemma_ably(self):
+ doc = nlp("Regrettably it was a problem.")
self.assertEqual(doc[0]._.holmes.derived_lemma, 'regrettable')
- def test_derived_lemma_ibly(self):
- doc = analyzer.parse("He used the software happily.")
+ def test_derived_lemma_ily(self):
+ doc = nlp("He used the software happily.")
self.assertEqual(doc[4]._.holmes.derived_lemma, 'happy')
def test_derived_lemma_ly(self):
- doc = analyzer.parse("It went swingingly.")
+ doc = nlp("It went swingingly.")
self.assertEqual(doc[2]._.holmes.derived_lemma, 'swinging')
def test_derived_lemma_ness(self):
- doc = analyzer.parse("There was a certain laxness.")
+ doc = nlp("There was a certain laxness.")
self.assertEqual(doc[4]._.holmes.derived_lemma, 'lax')
def test_derived_lemma_ness_with_y(self):
- doc = analyzer.parse("There was a certain bawdiness.")
+ doc = nlp("There was a certain bawdiness.")
self.assertEqual(doc[4]._.holmes.derived_lemma, 'bawdy')
def test_derived_lemma_ing(self):
- doc = analyzer.parse("The playing was very loud.")
+ doc = nlp("The playing was very loud.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'play')
def test_derived_lemma_ing_with_doubling(self):
- doc = analyzer.parse("The ramming of the vehicle was very loud.")
+ doc = nlp("The ramming of the vehicle was very loud.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'ram')
def test_derived_lemma_ication(self):
- doc = analyzer.parse("The verification of the results.")
+ doc = nlp("The verification of the results.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'verify')
def test_derived_lemma_ation_3(self):
- doc = analyzer.parse("The manipulation of the results.")
+ doc = nlp("The manipulation of the results.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'manipulate')
def test_derived_lemma_ication_in_icate(self):
- doc = analyzer.parse("The domestication of the dog.")
+ doc = nlp("The domestication of the dog.")
self.assertEqual(doc[1]._.holmes.derived_lemma, 'domesticate')
def test_no_derived_lemma(self):
- doc = analyzer.parse("vehicle.")
+ doc = nlp("vehicle.")
self.assertEqual(doc[0]._.holmes.derived_lemma, None)
def test_formerly_problematic_sentence_no_exception_thrown(self):
- analyzer.parse(
+ nlp(
"Mothers with vouchers for themselves and their young childrenwere finding that many eligible products were gone.")
+
+ def test_pipe(self):
+ docs = list(nlp.pipe(['some dogs', 'some cats']))
+ self.assertEqual(docs[0][1]._.holmes.lemma, 'dog')
+ self.assertEqual(docs[1][1]._.holmes.lemma, 'cat')
+
+ def test_predicative_adjective_in_relative_clause(self):
+ doc = nlp("He saw his son, who was sad.")
+ self.assertEqual(doc[3]._.holmes.string_representation_of_children(), '2:poss; 6:relcl; 7:amod')
+
+ def test_question_word_initial(self):
+ doc = nlp("Whom did you talk to?")
+ self.assertTrue(doc[0]._.holmes.is_initial_question_word)
+
+ def test_question_word_after_preposition(self):
+ doc = nlp("To whom did you talk?")
+ self.assertTrue(doc[1]._.holmes.is_initial_question_word)
+
+ def test_question_word_after_double_preposition(self):
+ doc = nlp("Because of whom did you come?")
+ self.assertTrue(doc[2]._.holmes.is_initial_question_word)
+
+ def test_question_word_in_complex_phrase(self):
+ doc = nlp("On the basis of what information did you come?")
+ self.assertTrue(doc[4]._.holmes.is_initial_question_word)
+
+ def test_question_word_control_1(self):
+ doc = nlp(". Whom did you talk to?")
+ for token in doc:
+ self.assertFalse(token._.holmes.is_initial_question_word)
+
+ def test_question_word_control_2(self):
+ doc = nlp("You came because of whom?")
+ for token in doc:
+ self.assertFalse(token._.holmes.is_initial_question_word)
diff --git a/holmes_extractor/tests/en/test_structural_matching_EN.py b/tests/en/test_structural_matching_EN.py
similarity index 70%
rename from holmes_extractor/tests/en/test_structural_matching_EN.py
rename to tests/en/test_structural_matching_EN.py
index eab6804..91f0a18 100644
--- a/holmes_extractor/tests/en/test_structural_matching_EN.py
+++ b/tests/en/test_structural_matching_EN.py
@@ -4,19 +4,19 @@
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
- (script_directory, 'test_ontology.owl')))
-nocoref_holmes_manager = holmes.Manager(model='en_core_web_lg', ontology=ontology,
- perform_coreference_resolution=False)
+ (script_directory, 'test_ontology.owl')), symmetric_matching=True)
+nocoref_holmes_manager = holmes.Manager(model='en_core_web_trf', ontology=ontology,
+ perform_coreference_resolution=False,
+ number_of_workers=2)
nocoref_holmes_manager.register_search_phrase("A dog chases a cat")
nocoref_holmes_manager.register_search_phrase("The man was poor")
nocoref_holmes_manager.register_search_phrase("The rich man")
nocoref_holmes_manager.register_search_phrase("Someone eats a sandwich")
-nocoref_holmes_manager.register_search_phrase("The giving to a beneficiary")
+nocoref_holmes_manager.register_search_phrase("The gift to a beneficiary")
nocoref_holmes_manager.register_search_phrase("A colleague's computer")
nocoref_holmes_manager.register_search_phrase(
"An ENTITYPERSON opens an account")
nocoref_holmes_manager.register_search_phrase("A dog eats a bone")
-nocoref_holmes_manager.register_search_phrase("Who fell asleep?")
nocoref_holmes_manager.register_search_phrase("Who is sad?")
nocoref_holmes_manager.register_search_phrase("Insurance for years")
nocoref_holmes_manager.register_search_phrase(
@@ -41,23 +41,29 @@
nocoref_holmes_manager.register_search_phrase("neatness")
nocoref_holmes_manager.register_search_phrase("modest")
nocoref_holmes_manager.register_search_phrase("monthly")
-nocoref_holmes_manager.register_search_phrase("Somebody uses a vaulting horse")
-nocoref_holmes_manager.register_search_phrase("A big vaulting horse")
-nocoref_holmes_manager.register_search_phrase("Somebody sees a vault horse")
-nocoref_holmes_manager.register_search_phrase("A small vault horse")
-nocoref_holmes_manager.register_search_phrase("a vaulting horse")
+nocoref_holmes_manager.register_search_phrase("Somebody uses a wastage horse")
+nocoref_holmes_manager.register_search_phrase("A big wastage horse")
+nocoref_holmes_manager.register_search_phrase("Somebody sees a waste horse")
+nocoref_holmes_manager.register_search_phrase("A small waste horse")
+nocoref_holmes_manager.register_search_phrase("a wastage horse")
nocoref_holmes_manager.register_search_phrase("a big hyphenated multiword")
nocoref_holmes_manager.register_search_phrase("a small hyphenated-multiword")
nocoref_holmes_manager.register_search_phrase("a big unhyphenated multiword")
nocoref_holmes_manager.register_search_phrase("a small unhyphenated-multiword")
nocoref_holmes_manager.register_search_phrase("hyphenated single multiword")
nocoref_holmes_manager.register_search_phrase("unhyphenated single multiword")
-
-holmes_manager_with_variable_search_phrases = holmes.Manager(model='en_core_web_lg',
- ontology=ontology, perform_coreference_resolution=False)
-holmes_manager_with_embeddings = holmes.Manager(model='en_core_web_lg',
- overall_similarity_threshold=0.7, perform_coreference_resolution=False)
-
+nocoref_holmes_manager.register_search_phrase("An adopted boy")
+nocoref_holmes_manager.register_search_phrase("Someone adopts a girl")
+nocoref_holmes_manager.register_search_phrase("An running boy")
+nocoref_holmes_manager.register_search_phrase("A girl is running")
+nocoref_holmes_manager.register_search_phrase("A son is excited")
+
+holmes_manager_with_variable_search_phrases = holmes.Manager(model='en_core_web_trf',
+ ontology=ontology, perform_coreference_resolution=False,
+ number_of_workers=1)
+holmes_manager_with_embeddings = holmes.Manager(model='en_core_web_trf',
+ overall_similarity_threshold=0.7, perform_coreference_resolution=False, use_reverse_dependency_matching=False,
+ number_of_workers=2)
class EnglishStructuralMatchingTest(unittest.TestCase):
@@ -70,13 +76,13 @@ def test_direct_matching(self):
matches = self._get_matches(
nocoref_holmes_manager, "The dog chased the cat")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_negated)
+ self.assertFalse(matches[0]['negated'])
def test_matching_within_large_sentence_with_negation(self):
matches = self._get_matches(
nocoref_holmes_manager, "We discussed various things. Although it had never been claimed that a dog had ever chased a cat, it was nonetheless true. This had always been a difficult topic.")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_negated)
+ self.assertTrue(matches[0]['negated'])
def test_nouns_inverted(self):
matches = self._get_matches(
@@ -97,19 +103,19 @@ def test_verb_negation(self):
matches = self._get_matches(
nocoref_holmes_manager, "The dog did not chase the cat")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_negated)
+ self.assertTrue(matches[0]['negated'])
def test_noun_phrase_negation(self):
matches = self._get_matches(
nocoref_holmes_manager, "No dog chased any cat")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_negated)
+ self.assertTrue(matches[0]['negated'])
def test_irrelevant_negation(self):
matches = self._get_matches(
nocoref_holmes_manager, "The dog who was not old chased the cat")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_negated)
+ self.assertFalse(matches[0]['negated'])
def test_adjective_swapping(self):
matches = self._get_matches(nocoref_holmes_manager, "The poor man")
@@ -121,8 +127,8 @@ def test_adjective_swapping_with_conjunction(self):
matches = self._get_matches(
nocoref_holmes_manager, "The poor and poor man")
self.assertEqual(len(matches), 2)
- self.assertFalse(matches[0].is_uncertain)
- self.assertFalse(matches[1].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertFalse(matches[1]['uncertain'])
matches = self._get_matches(
nocoref_holmes_manager, "The man was rich and rich")
self.assertEqual(len(matches), 2)
@@ -132,24 +138,24 @@ def test_conjunction_with_and(self):
"The dog and the dog chased a cat and another cat")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_conjunction_with_or(self):
matches = self._get_matches(nocoref_holmes_manager,
"The dog or the dog chased a cat and another cat")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertTrue(text_match.is_uncertain)
+ self.assertTrue(text_match['uncertain'])
def test_threeway_conjunction_with_or(self):
matches = self._get_matches(nocoref_holmes_manager,
"The dog, the dog or the dog chased a cat and another cat")
- self.assertFalse(matches[0].is_uncertain)
- self.assertTrue(matches[1].is_uncertain)
- self.assertTrue(matches[2].is_uncertain)
- self.assertFalse(matches[3].is_uncertain)
- self.assertTrue(matches[4].is_uncertain)
- self.assertTrue(matches[5].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
+ self.assertTrue(matches[1]['uncertain'])
+ self.assertTrue(matches[2]['uncertain'])
+ self.assertTrue(matches[3]['uncertain'])
+ self.assertTrue(matches[4]['uncertain'])
+ self.assertTrue(matches[5]['uncertain'])
def test_generic_pronoun(self):
matches = self._get_matches(
@@ -160,70 +166,70 @@ def test_active(self):
matches = self._get_matches(
nocoref_holmes_manager, "The dog will chase the cat")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(
nocoref_holmes_manager, "The dog always used to chase the cat")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_passive(self):
matches = self._get_matches(
nocoref_holmes_manager, "The cat is chased by the dog")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(
nocoref_holmes_manager, "The cat will be chased by the dog")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
matches = self._get_matches(nocoref_holmes_manager,
"The cat was going to be chased by the dog")
self.assertEqual(len(matches), 1)
matches = self._get_matches(nocoref_holmes_manager,
"The cat always used to be chased by the dog")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_was_going_to_active(self):
matches = self._get_matches(
nocoref_holmes_manager, "The dog was going to chase the cat")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_was_going_to_passive(self):
matches = self._get_matches(nocoref_holmes_manager,
"The cat was going to be chased by the dog")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_active_complement_without_object(self):
matches = self._get_matches(
nocoref_holmes_manager, "The dog decided to chase the cat")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_active_complement_with_object(self):
matches = self._get_matches(
nocoref_holmes_manager, "He told the dog to chase the cat")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_passive_complement_without_object(self):
matches = self._get_matches(
nocoref_holmes_manager, "The sandwich decided to be eaten")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_passive_complement_with_object(self):
matches = self._get_matches(nocoref_holmes_manager,
"He told the cat to be chased by the dog")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_relative_clause_without_pronoun(self):
matches = self._get_matches(
nocoref_holmes_manager, "The cat the dog chased was scared")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_relative_clause_without_pronoun_inverted(self):
matches = self._get_matches(
@@ -234,95 +240,89 @@ def test_subjective_relative_clause_with_pronoun(self):
matches = self._get_matches(
nocoref_holmes_manager, "The dog who chased the cat came home")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_subjective_relative_clause_with_pronoun_and_conjunction(self):
matches = self._get_matches(nocoref_holmes_manager,
"The dog who chased the cat and cat came home")
self.assertEqual(len(matches), 2)
- self.assertFalse(matches[0].is_uncertain)
- self.assertFalse(matches[1].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertFalse(matches[1]['uncertain'])
def test_objective_relative_clause_with_wh_pronoun(self):
matches = self._get_matches(
nocoref_holmes_manager, "The cat who the dog chased came home")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_objective_relative_clause_with_that_pronoun(self):
matches = self._get_matches(
nocoref_holmes_manager, "The cat that the dog chased came home")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_whose_clause(self):
matches = self._get_matches(nocoref_holmes_manager,
"The colleague whose computer I repaired last week has gone home")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_whose_clause_with_conjunction_of_possessor(self):
matches = self._get_matches(nocoref_holmes_manager,
"The colleague and colleague whose computer I repaired last week have gone home")
self.assertEqual(len(matches), 2)
- self.assertTrue(matches[0].is_uncertain)
- self.assertFalse(matches[1].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
+ self.assertFalse(matches[1]['uncertain'])
def test_whose_clause_with_conjunction_of_possessed(self):
matches = self._get_matches(nocoref_holmes_manager,
"The colleague whose computer and computer I repaired last week has gone home")
self.assertEqual(len(matches), 2)
- self.assertFalse(matches[0].is_uncertain)
- self.assertFalse(matches[1].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertFalse(matches[1]['uncertain'])
def test_phrasal_verb(self):
matches = self._get_matches(
nocoref_holmes_manager, "Richard Hudson took out an account")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_modal_verb(self):
matches = self._get_matches(
nocoref_holmes_manager, "The dog could chase the cat")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_active_participle(self):
matches = self._get_matches(
nocoref_holmes_manager, "The dog chasing the cat was a problem")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_passive_participle(self):
matches = self._get_matches(nocoref_holmes_manager,
"He talked about the cat chased by the dog")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
-
- def test_active_participle(self):
- matches = self._get_matches(
- nocoref_holmes_manager, "The dog chasing the cat was a problem")
- self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_gerund_with_of(self):
matches = self._get_matches(nocoref_holmes_manager,
"The dog's chasing of the cat was a problem")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_gerund_with_by(self):
matches = self._get_matches(nocoref_holmes_manager,
- "The cat's chasing by the dog was a problem")
+ "The cat's being chased by the dog was a problem")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_objective_modifying_adverbial_phrase(self):
matches = self._get_matches(
nocoref_holmes_manager, "The cat-chasing dog and dog came home")
self.assertEqual(len(matches), 2)
- self.assertFalse(matches[0].is_uncertain)
- self.assertTrue(matches[1].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertTrue(matches[1]['uncertain'])
def test_objective_modifying_adverbial_phrase_with_inversion(self):
matches = self._get_matches(
@@ -333,8 +333,8 @@ def test_subjective_modifying_adverbial_phrase(self):
matches = self._get_matches(
nocoref_holmes_manager, "The dog-chased cat and cat came home")
self.assertEqual(len(matches), 2)
- self.assertFalse(matches[0].is_uncertain)
- self.assertTrue(matches[1].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertTrue(matches[1]['uncertain'])
def test_subjective_modifying_adverbial_phrase_with_inversion(self):
matches = self._get_matches(
@@ -345,25 +345,25 @@ def test_adjective_prepositional_complement_with_conjunction_active(self):
matches = self._get_matches(nocoref_holmes_manager,
"The dog and the lion were worried about chasing a cat and a mouse")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_adjective_prepositional_complement_with_conjunction_passive(self):
matches = self._get_matches(nocoref_holmes_manager,
"The cat and the mouse were worried about being chased by a dog and a lion")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_verb_prepositional_complement_with_conjunction_active(self):
matches = self._get_matches(nocoref_holmes_manager,
"The dog and the lion were thinking about chasing a cat and a mouse")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_verb_prepositional_complement_with_conjunction_passive(self):
matches = self._get_matches(nocoref_holmes_manager,
"The cat and the mouse were thinking about being chased by a dog and a lion")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_passive_search_phrase_with_active_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -372,7 +372,7 @@ def test_passive_search_phrase_with_active_searched_sentence(self):
matches = self._get_matches(holmes_manager_with_variable_search_phrases,
"The dog will chase the cat")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_passive_search_phrase_with_active_conjunction_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -382,7 +382,7 @@ def test_passive_search_phrase_with_active_conjunction_searched_sentence(self):
"The dog and the dog have chased a cat and a cat")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_passive_search_phrase_with_passive_conjunction_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -392,7 +392,7 @@ def test_passive_search_phrase_with_passive_conjunction_searched_sentence(self):
"The cat and the cat will be chased by a dog and a dog")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_passive_search_phrase_with_negated_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -401,8 +401,8 @@ def test_passive_search_phrase_with_negated_searched_sentence(self):
matches = self._get_matches(holmes_manager_with_variable_search_phrases,
"The dog never chased the cat")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
- self.assertTrue(matches[0].is_negated)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertTrue(matches[0]['negated'])
def test_question_search_phrase_with_active_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -411,7 +411,7 @@ def test_question_search_phrase_with_active_searched_sentence(self):
matches = self._get_matches(holmes_manager_with_variable_search_phrases,
"The dog will chase the cat")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_question_search_phrase_with_active_conjunction_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -421,7 +421,7 @@ def test_question_search_phrase_with_active_conjunction_searched_sentence(self):
"The dog and the dog have chased a cat and a cat")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_question_search_phrase_with_passive_conjunction_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -431,7 +431,7 @@ def test_question_search_phrase_with_passive_conjunction_searched_sentence(self)
"The cat and the cat will be chased by a dog and a dog")
self.assertEqual(len(matches), 4)
for text_match in matches:
- self.assertFalse(text_match.is_uncertain)
+ self.assertFalse(text_match['uncertain'])
def test_question_search_phrase_with_negated_searched_sentence(self):
holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
@@ -440,13 +440,13 @@ def test_question_search_phrase_with_negated_searched_sentence(self):
matches = self._get_matches(holmes_manager_with_variable_search_phrases,
"The dog never chased the cat")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
- self.assertTrue(matches[0].is_negated)
+ self.assertFalse(matches[0]['uncertain'])
+ self.assertTrue(matches[0]['negated'])
def test_coherent_matching_1(self):
holmes_manager_with_embeddings.register_search_phrase(
"Farmers go into the mountains")
- match_dict = holmes_manager_with_embeddings.match_search_phrases_against(
+ match_dict = holmes_manager_with_embeddings.match(document_text=
"In Norway the peasants go into the mountains")
self.assertEqual(len(match_dict), 1)
self.assertEqual(match_dict[0]['word_matches']
@@ -466,12 +466,6 @@ def test_coherent_matching_1(self):
self.assertEqual(match_dict[0]['word_matches']
[3]['document_word'], "mountain")
- def test_coherent_matching_2(self):
- matches = self._get_matches(nocoref_holmes_manager,
- "It was quite early when she kissed her old grandmother, who was still asleep.")
- # error if coherent matching not working properly
- self.assertEqual(len(matches), 1)
-
def test_original_search_phrase_root_not_matchable(self):
matches = self._get_matches(
nocoref_holmes_manager, "The man was very sad.")
@@ -497,11 +491,6 @@ def test_matching_additional_preposition_dependency_on_noun(self):
matches = self._get_matches(nocoref_holmes_manager,
"An employee needs insurance for the next five years")
self.assertEqual(len(matches), 2)
- for match in matches:
- if len(match.word_matches) == 7:
- self.assertFalse(match.is_uncertain)
- else:
- self.assertTrue(match.is_uncertain)
def test_dative_prepositional_phrase_in_document_dative_noun_phrase_in_search_phrase_1(self):
matches = self._get_matches(nocoref_holmes_manager,
@@ -548,21 +537,11 @@ def test_capital_entity_is_not_analysed_as_entity_search_phrase_token(self):
"We discussed an entity and a second ENTITY.")
self.assertEqual(len(matches), 2)
- def test_entity_matching_with_underscore_in_entity_label(self):
- holmes_manager_with_variable_search_phrases.remove_all_search_phrases()
- holmes_manager_with_variable_search_phrases.register_search_phrase(
- "ENTITYWORK_OF_ART")
- holmes_manager_with_variable_search_phrases.register_search_phrase(
- "Somebody buys an ENTITYWORK_OF_ART")
- matches = self._get_matches(holmes_manager_with_variable_search_phrases,
- "I bought a Picasso")
- self.assertEqual(len(matches), 2)
-
def test_adjective_verb_phrase_as_search_phrase_matches_simple(self):
matches = self._get_matches(nocoref_holmes_manager,
"The holiday was very hard to book")
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_adjective_verb_phrase_as_search_phrase_no_match_with_normal_phrase(self):
matches = self._get_matches(nocoref_holmes_manager,
@@ -572,227 +551,276 @@ def test_adjective_verb_phrase_as_search_phrase_no_match_with_normal_phrase(self
def test_adjective_verb_phrase_as_search_phrase_matches_compound(self):
matches = self._get_matches(nocoref_holmes_manager,
"The holiday and the holiday were very hard and hard to book and to book")
- self.assertEqual(len(matches), 4)
+ self.assertEqual(len(matches), 8)
for match in matches:
- self.assertFalse(match.is_uncertain)
+ self.assertFalse(match['uncertain'])
def test_objective_adjective_verb_phrase_matches_normal_search_phrase_simple(self):
matches = self._get_matches(nocoref_holmes_manager,
"The insurance was very hard to find")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_objective_adjective_verb_phrase_matches_normal_search_phrase_compound(self):
matches = self._get_matches(nocoref_holmes_manager,
"The insurance and the insurance were very hard and hard to find and to find")
- self.assertEqual(len(matches), 2)
+ self.assertEqual(len(matches), 4)
for match in matches:
- self.assertTrue(match.is_uncertain)
+ self.assertTrue(match['uncertain'])
def test_subjective_adjective_verb_phrase_matches_normal_search_phrase_simple(self):
matches = self._get_matches(nocoref_holmes_manager,
"The man was very glad to sing")
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_subjective_adjective_verb_phrase_matches_normal_search_phrase_compound(self):
matches = self._get_matches(nocoref_holmes_manager,
"The man and the man were very glad and glad to sing and to sing")
self.assertEqual(len(matches), 4)
for match in matches:
- self.assertTrue(match.is_uncertain)
+ self.assertTrue(match['uncertain'])
def test_matching_with_prepositional_phrase_dependent_on_verb(self):
matches = self._get_matches(nocoref_holmes_manager,
"The salesman lived in England, Germany and France")
self.assertEqual(len(matches), 3)
for match in matches:
- self.assertFalse(match.is_uncertain)
+ self.assertFalse(match['uncertain'])
def test_matching_with_prepositional_phrase_dependent_on_noun(self):
matches = self._get_matches(nocoref_holmes_manager,
"The salesman had a house in England, Germany and France")
self.assertEqual(len(matches), 3)
for match in matches:
- self.assertFalse(match.is_uncertain)
+ self.assertFalse(match['uncertain'])
def test_derivation_in_document_on_root(self):
matches = self._get_matches(nocoref_holmes_manager,
"The eating of a bone by a puppy")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'derivation')
def test_derivation_in_search_phrase_on_root(self):
matches = self._get_matches(nocoref_holmes_manager,
"Somebody gives to a beneficiary")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][0]['match_type'], 'derivation')
def test_derivation_in_document_on_non_root(self):
matches = self._get_matches(nocoref_holmes_manager,
"Somebody attempts an explanation")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'derivation')
def test_derivation_in_search_phrase_on_non_root(self):
matches = self._get_matches(nocoref_holmes_manager,
"Somebody demands to explain")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'derivation')
def test_derivation_in_document_on_non_root_with_conjunction(self):
matches = self._get_matches(nocoref_holmes_manager,
"Somebody attempts an explanation and an explanation")
self.assertEqual(len(matches), 2)
- self.assertEqual(matches[0].word_matches[1].type, 'derivation')
- self.assertEqual(matches[1].word_matches[1].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'derivation')
+ self.assertEqual(matches[1]['word_matches'][1]['match_type'], 'derivation')
def test_derivation_in_search_phrase_on_non_root_with_conjunction(self):
matches = self._get_matches(nocoref_holmes_manager,
"Somebody demands to explain and to explain")
self.assertEqual(len(matches), 2)
- self.assertEqual(matches[0].word_matches[1].type, 'derivation')
- self.assertEqual(matches[1].word_matches[1].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'derivation')
+ self.assertEqual(matches[1]['word_matches'][1]['match_type'], 'derivation')
def test_derivation_in_document_on_single_word(self):
matches = self._get_matches(nocoref_holmes_manager,
"neat")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][0]['match_type'], 'derivation')
def test_derivation_in_search_phrase_on_single_word(self):
matches = self._get_matches(nocoref_holmes_manager,
"musical")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][0]['match_type'], 'derivation')
def test_derivation_in_document_on_single_word_with_ontology(self):
matches = self._get_matches(nocoref_holmes_manager,
"month")
self.assertEqual(len(matches), 2)
- self.assertEqual(matches[0].word_matches[0].type, 'ontology')
- self.assertEqual(matches[1].word_matches[0].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][0]['match_type'], 'ontology')
+ self.assertEqual(matches[1]['word_matches'][0]['match_type'], 'derivation')
def test_derivation_in_search_phrase_on_single_word_with_ontology(self):
matches = self._get_matches(nocoref_holmes_manager,
"modestly")
self.assertEqual(len(matches), 2)
- self.assertEqual(matches[0].word_matches[0].type, 'derivation')
- self.assertEqual(matches[1].word_matches[0].type, 'ontology')
+ self.assertEqual(matches[0]['word_matches'][0]['match_type'], 'derivation')
+ self.assertEqual(matches[1]['word_matches'][0]['match_type'], 'ontology')
def test_derivation_in_document_on_non_root_with_ontology(self):
matches = self._get_matches(nocoref_holmes_manager,
"Somebody attempts an invitation")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'ontology')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'ontology')
def test_derivation_in_search_phrase_on_non_root_with_ontology(self):
matches = self._get_matches(nocoref_holmes_manager,
"Somebody shouts to explain")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'ontology')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'ontology')
def test_derivation_in_search_phrase_and_document_on_root_with_ontology(self):
matches = self._get_matches(nocoref_holmes_manager,
"Somebody explains to a salesman")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].type, 'ontology')
+ self.assertEqual(matches[0]['word_matches'][0]['match_type'], 'ontology')
def test_derivation_in_document_with_multiword_root_word(self):
matches = self._get_matches(nocoref_holmes_manager,
- "A big vault horse")
+ "A big waste horse")
self.assertEqual(len(matches), 2)
- self.assertEqual(matches[0].word_matches[1].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'derivation')
def test_derivation_in_document_with_multiword_non_root_word(self):
matches = self._get_matches(nocoref_holmes_manager,
- "A vault horse was used")
+ "A waste horse was used")
self.assertEqual(len(matches), 2)
- self.assertEqual(matches[0].word_matches[1].type, 'derivation')
+ self.assertEqual(matches[1]['word_matches'][1]['match_type'], 'derivation')
def test_derivation_in_document_with_multiword_single_word(self):
matches = self._get_matches(nocoref_holmes_manager,
- "a vault horse")
+ "a waste horse")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][0]['match_type'], 'derivation')
def test_derivation_in_document_with_multiword_single_word_control(self):
matches = self._get_matches(nocoref_holmes_manager,
- "a vaulting horse")
+ "a wastage horse")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][0]['match_type'], 'direct')
def test_derivation_in_search_phrase_with_multiword_root_word(self):
matches = self._get_matches(nocoref_holmes_manager,
- "A small vaulting horse")
+ "A small wastage horse")
self.assertEqual(len(matches), 2)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'direct')
def test_derivation_in_search_phrase_with_multiword_non_root_word(self):
matches = self._get_matches(nocoref_holmes_manager,
- "A vaulting horse was seen")
+ "A wastage horse was seen")
self.assertEqual(len(matches), 2)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[1]['word_matches'][1]['match_type'], 'direct')
def test_hyphenation_1(self):
matches = self._get_matches(nocoref_holmes_manager,
"A big hyphenated-multiword")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'direct')
def test_hyphenation_2(self):
matches = self._get_matches(nocoref_holmes_manager,
"A big hyphenated multiword")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'direct')
def test_hyphenation_3(self):
matches = self._get_matches(nocoref_holmes_manager,
"A small hyphenated-multiword")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'direct')
def test_hyphenation_4(self):
matches = self._get_matches(nocoref_holmes_manager,
"A small hyphenated multiword")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'direct')
def test_hyphenation_5(self):
matches = self._get_matches(nocoref_holmes_manager,
"A big unhyphenated-multiword")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'direct')
def test_hyphenation_6(self):
matches = self._get_matches(nocoref_holmes_manager,
"A big unhyphenated multiword")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'direct')
def test_hyphenation_7(self):
matches = self._get_matches(nocoref_holmes_manager,
"A small unhyphenated-multiword")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'direct')
def test_hyphenation_8(self):
matches = self._get_matches(nocoref_holmes_manager,
"A small unhyphenated multiword")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'direct')
def test_hyphenation_9(self):
matches = self._get_matches(nocoref_holmes_manager,
"hyphenated-single-multiword")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'direct')
def test_hyphenation_10(self):
matches = self._get_matches(nocoref_holmes_manager,
"unhyphenated-single-multiword")
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'direct')
+ self.assertEqual(matches[0]['word_matches'][0]['match_type'], 'direct')
+
+ def test_dobj_matches_amod(self):
+ matches = self._get_matches(nocoref_holmes_manager,
+ "Someone adopts a boy")
+ self.assertEqual(len(matches), 1)
+ self.assertTrue(matches[0]['uncertain'])
+
+ def test_amod_matches_dobj(self):
+ matches = self._get_matches(nocoref_holmes_manager,
+ "An adopted girl")
+ self.assertEqual(len(matches), 1)
+
+ def test_nsubj_matches_amod(self):
+ matches = self._get_matches(nocoref_holmes_manager,
+ "A boy is running")
+ self.assertEqual(len(matches), 1)
+
+ def test_amod_matches_nsubj(self):
+ matches = self._get_matches(nocoref_holmes_manager,
+ "A running girl")
+ self.assertEqual(len(matches), 1)
+
+ def test_dobj_matches_amod_with_conjunction(self):
+ matches = self._get_matches(nocoref_holmes_manager,
+ "Someone adopts a boy and a boy")
+ self.assertEqual(len(matches), 2)
+ self.assertTrue(matches[0]['uncertain'])
+ self.assertTrue(matches[1]['uncertain'])
+
+ def test_amod_matches_dobj_with_conjunction(self):
+ matches = self._get_matches(nocoref_holmes_manager,
+ "An adopted girl and girl")
+ self.assertEqual(len(matches), 2)
+
+ def test_nsubj_matches_amod_with_conjunction(self):
+ matches = self._get_matches(nocoref_holmes_manager,
+ "A boy and a boy are running")
+ self.assertEqual(len(matches), 2)
+
+ def test_amod_matches_nsubj_with_conjunction(self):
+ matches = self._get_matches(nocoref_holmes_manager,
+ "A running girl and girl")
+ self.assertEqual(len(matches), 2)
+
+ def test_amod_matches_nsubj_with_conjunction_use_reverse_dependency_matching_false(self):
+ holmes_manager_with_embeddings.register_search_phrase("A girl is running")
+ matches = self._get_matches(holmes_manager_with_embeddings,
+ "A running girl and girl")
+ self.assertEqual(len(matches), 0)
def test_ontology_multiword_information_in_word_match_objects_at_sentence_boundaries(self):
holmes_manager_with_variable_search_phrases.remove_all_documents()
@@ -803,15 +831,15 @@ def test_ontology_multiword_information_in_word_match_objects_at_sentence_bounda
"A dog chases a cat")
matches = holmes_manager_with_variable_search_phrases.match()
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].first_document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].last_document_token.i, 0)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 1)
- self.assertEqual(matches[0].word_matches[1].first_document_token.i, 1)
- self.assertEqual(matches[0].word_matches[1].last_document_token.i, 1)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 3)
- self.assertEqual(matches[0].word_matches[2].first_document_token.i, 2)
- self.assertEqual(matches[0].word_matches[2].last_document_token.i, 3)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['first_document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['last_document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][1]['first_document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][1]['last_document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 3)
+ self.assertEqual(matches[0]['word_matches'][2]['first_document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['last_document_token_index'], 3)
def test_ontology_multiword_information_in_word_match_objects_not_at_sentence_boundaries(self):
holmes_manager_with_variable_search_phrases.remove_all_documents()
@@ -822,15 +850,15 @@ def test_ontology_multiword_information_in_word_match_objects_not_at_sentence_bo
"A dog chases a cat")
matches = holmes_manager_with_variable_search_phrases.match()
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 1)
- self.assertEqual(matches[0].word_matches[0].first_document_token.i, 1)
- self.assertEqual(matches[0].word_matches[0].last_document_token.i, 1)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].first_document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].last_document_token.i, 2)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 4)
- self.assertEqual(matches[0].word_matches[2].first_document_token.i, 3)
- self.assertEqual(matches[0].word_matches[2].last_document_token.i, 4)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][0]['first_document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][0]['last_document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['first_document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['last_document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][2]['first_document_token_index'], 3)
+ self.assertEqual(matches[0]['word_matches'][2]['last_document_token_index'], 4)
def test_entity_multiword_information_in_word_match_objects_at_sentence_boundaries(self):
holmes_manager_with_variable_search_phrases.remove_all_documents()
@@ -841,15 +869,15 @@ def test_entity_multiword_information_in_word_match_objects_at_sentence_boundari
"A dog chases an ENTITYPERSON")
matches = holmes_manager_with_variable_search_phrases.match()
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].first_document_token.i, 0)
- self.assertEqual(matches[0].word_matches[0].last_document_token.i, 0)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 1)
- self.assertEqual(matches[0].word_matches[1].first_document_token.i, 1)
- self.assertEqual(matches[0].word_matches[1].last_document_token.i, 1)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 4)
- self.assertEqual(matches[0].word_matches[2].first_document_token.i, 2)
- self.assertEqual(matches[0].word_matches[2].last_document_token.i, 4)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['first_document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][0]['last_document_token_index'], 0)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][1]['first_document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][1]['last_document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 4)
+ self.assertEqual(matches[0]['word_matches'][2]['first_document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['last_document_token_index'], 4)
def test_entity_multiword_information_in_word_match_objects_not_at_sentence_boundaries(self):
holmes_manager_with_variable_search_phrases.remove_all_documents()
@@ -860,12 +888,28 @@ def test_entity_multiword_information_in_word_match_objects_not_at_sentence_boun
"A dog chases an ENTITYPERSON")
matches = holmes_manager_with_variable_search_phrases.match()
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[0].document_token.i, 1)
- self.assertEqual(matches[0].word_matches[0].first_document_token.i, 1)
- self.assertEqual(matches[0].word_matches[0].last_document_token.i, 1)
- self.assertEqual(matches[0].word_matches[1].document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].first_document_token.i, 2)
- self.assertEqual(matches[0].word_matches[1].last_document_token.i, 2)
- self.assertEqual(matches[0].word_matches[2].document_token.i, 5)
- self.assertEqual(matches[0].word_matches[2].first_document_token.i, 3)
- self.assertEqual(matches[0].word_matches[2].last_document_token.i, 5)
+ self.assertEqual(matches[0]['word_matches'][0]['document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][0]['first_document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][0]['last_document_token_index'], 1)
+ self.assertEqual(matches[0]['word_matches'][1]['document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['first_document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][1]['last_document_token_index'], 2)
+ self.assertEqual(matches[0]['word_matches'][2]['document_token_index'], 5)
+ self.assertEqual(matches[0]['word_matches'][2]['first_document_token_index'], 3)
+ self.assertEqual(matches[0]['word_matches'][2]['last_document_token_index'], 5)
+
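+    # Checks that get_corpus_frequency_information() aggregates word and entity-label
+    # counts across all registered documents and also returns the highest single count.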
+ def test_corpus_frequency_information(self):
+ holmes_manager_with_variable_search_phrases.remove_all_documents()
+ holmes_manager_with_variable_search_phrases.parse_and_register_document(
+ "Yesterday Fido chased Richard Paul Hudson in Prague with Fido and Balu.", '1')
+ holmes_manager_with_variable_search_phrases.parse_and_register_document(
+ "Yesterday Balu chased Hudson in Munich.", '2')
+ dictionary, maximum = holmes_manager_with_variable_search_phrases.\
+ get_corpus_frequency_information()
+ self.assertEqual(dictionary, {'ENTITYDATE': 2, 'yesterday': 2, 'ENTITYPERSON': 6, 'fido': 2, 'chased': 2, 'chase': 2, 'richard': 1, 'paul': 1, 'hudson': 2, 'richard paul hudson': 1, 'in': 2, 'ENTITYGPE': 2, 'prague': 1, 'with': 1, 'and': 1, 'balu': 2, 'munich': 1})
+ self.assertEqual(maximum, 6)
+
+ def test_predicative_adjective_in_relative_clause(self):
+ matches = self._get_matches(nocoref_holmes_manager,
+ "He saw his son, who was excited.")
+ self.assertEqual(len(matches), 1)
diff --git a/holmes_extractor/tests/en/test_structural_matching_with_coreference_EN.py b/tests/en/test_structural_matching_with_coreference_EN.py
similarity index 85%
rename from holmes_extractor/tests/en/test_structural_matching_with_coreference_EN.py
rename to tests/en/test_structural_matching_with_coreference_EN.py
index 25e87f8..fb6c695 100644
--- a/holmes_extractor/tests/en/test_structural_matching_with_coreference_EN.py
+++ b/tests/en/test_structural_matching_with_coreference_EN.py
@@ -5,8 +5,9 @@
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
(script_directory, 'test_ontology.owl')))
-coref_holmes_manager = holmes.Manager(model='en_core_web_lg', ontology=ontology,
- perform_coreference_resolution=True)
+coref_holmes_manager = holmes.Manager(model='en_core_web_trf', ontology=ontology,
+ perform_coreference_resolution=True,
+ number_of_workers=2)
coref_holmes_manager.register_search_phrase("A dog chases a cat")
coref_holmes_manager.register_search_phrase("A big horse chases a cat")
coref_holmes_manager.register_search_phrase("A tiger chases a little cat")
@@ -26,20 +27,24 @@
"Somebody writes a book about an animal")
coref_holmes_manager.register_search_phrase("Hermione breaks")
coref_holmes_manager.register_search_phrase("Somebody attempts to explain")
-no_coref_holmes_manager = holmes.Manager(model='en_core_web_lg', ontology=ontology,
- perform_coreference_resolution=False)
+coref_holmes_manager.register_search_phrase("An adopted boy")
+coref_holmes_manager.register_search_phrase("A running boy")
+no_coref_holmes_manager = holmes.Manager(model='en_core_web_trf', ontology=ontology,
+ perform_coreference_resolution=False,
+ number_of_workers=1)
no_coref_holmes_manager.register_search_phrase("A dog chases a cat")
-embeddings_coref_holmes_manager = holmes.Manager(model='en_core_web_lg',
- overall_similarity_threshold=0.85)
+embeddings_coref_holmes_manager = holmes.Manager(model='en_core_web_trf',
+ overall_similarity_threshold=0.85,
+ number_of_workers=2)
embeddings_coref_holmes_manager.register_search_phrase('A man loves a woman')
class CoreferenceEnglishMatchingTest(unittest.TestCase):
def _check_word_match(self, match, word_match_index, document_token_index, extracted_word):
- word_match = match.word_matches[word_match_index]
- self.assertEqual(word_match.document_token.i, document_token_index)
- self.assertEqual(word_match.extracted_word, extracted_word)
+ word_match = match['word_matches'][word_match_index]
+ self.assertEqual(word_match['document_token_index'], document_token_index)
+ self.assertEqual(word_match['extracted_word'], extracted_word)
def test_simple_pronoun_coreference_same_sentence(self):
coref_holmes_manager.remove_all_documents()
@@ -75,7 +80,7 @@ def test_simple_pronoun_coreference_same_sentence_plural_antecedent(self):
def test_simple_pronoun_coreference_same_sentence_conjunction_in_antecedent_both_match(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I saw a dog and a dog and they were chasing a cat.")
+ "I saw a dog and a dog, while they were chasing a cat.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
self._check_word_match(matches[0], 0, 3, 'dog')
@@ -84,7 +89,7 @@ def test_simple_pronoun_coreference_same_sentence_conjunction_in_antecedent_both
def test_simple_pronoun_coreference_same_sentence_conjunction_in_antecedent_left_matches(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I saw a dog and a horse and they were chasing a cat.")
+ "I saw a dog and a horse while they were chasing a cat.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 0, 3, 'dog')
@@ -100,34 +105,34 @@ def test_simple_pronoun_coreference_same_sentence_conjunction_in_antecedent_righ
def test_simple_pronoun_coreference_same_sentence_conjunction_pronouns_both_match(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I talked to Peter and Jane, and he and she needed insurance.")
+ "I talked to Peter Jones and Jane Jones, while he and she needed insurance.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
- self._check_word_match(matches[0], 0, 3, 'Peter')
- self._check_word_match(matches[1], 0, 5, 'Jane')
+ self._check_word_match(matches[0], 0, 4, 'Peter Jones')
+ self._check_word_match(matches[1], 0, 7, 'Jane Jones')
def test_simple_pronoun_coreference_same_sentence_conjunction_lefthand_is_pronoun(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I talked to Peter, and he and Jane needed insurance.")
+ "I talked to Peter Jones, while he and Jane Jones needed insurance.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
- self._check_word_match(matches[0], 0, 3, 'Peter')
- self._check_word_match(matches[1], 0, 8, 'Jane')
+ self._check_word_match(matches[0], 0, 4, 'Peter Jones')
+ self._check_word_match(matches[1], 0, 10, 'Jane Jones')
def test_simple_pronoun_coreference_same_sentence_conjunction_righthand_is_pronoun(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I talked to Jane, and Peter and she needed insurance.")
+ "I talked to Jane Jones, while Peter Jones and she needed insurance.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
- self._check_word_match(matches[0], 0, 6, 'Peter')
- self._check_word_match(matches[1], 0, 3, 'Jane')
+ self._check_word_match(matches[0], 0, 8, 'Peter Jones')
+ self._check_word_match(matches[1], 0, 4, 'Jane Jones')
def test_simple_pronoun_coreference_same_sentence_conjunction_lefthand_noun_not_match(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I talked to Jane, and a horse and she needed insurance.")
+ "I talked to Jane, while a horse and she needed insurance.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 0, 3, 'Jane')
@@ -135,10 +140,10 @@ def test_simple_pronoun_coreference_same_sentence_conjunction_lefthand_noun_not_
def test_simple_pronoun_coreference_same_sentence_conjunction_righthand_noun_not_match(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I talked to Peter, and he and a horse need insurance.")
+ "I talked to Peter Jones, while he and a horse need insurance.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
- self._check_word_match(matches[0], 0, 3, 'Peter')
+ self._check_word_match(matches[0], 0, 4, 'Peter Jones')
def test_simple_pronoun_coreference_diff_sentence(self):
coref_holmes_manager.remove_all_documents()
@@ -192,45 +197,45 @@ def test_simple_pronoun_coreference_diff_sentence_conjunction_in_antecedent_righ
def test_simple_pronoun_coreference_diff_sentence_conjunction_pronouns_both_match(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I talked to Peter and Jane. He and she needed insurance.")
+ "I talked to Peter Jones and Jane Jones. He and she needed insurance.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
- self._check_word_match(matches[0], 0, 3, 'Peter')
- self._check_word_match(matches[1], 0, 5, 'Jane')
+ self._check_word_match(matches[0], 0, 4, 'Peter Jones')
+ self._check_word_match(matches[1], 0, 7, 'Jane Jones')
def test_simple_pronoun_coreference_diff_sentence_conjunction_lefthand_is_pronoun(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I talked to Peter. He and Jane needed insurance.")
+ "I talked to Peter Jones. He and Jane Jones needed insurance.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
- self._check_word_match(matches[0], 0, 3, 'Peter')
- self._check_word_match(matches[1], 0, 7, 'Jane')
+ self._check_word_match(matches[0], 0, 4, 'Peter Jones')
+ self._check_word_match(matches[1], 0, 9, 'Jane Jones')
def test_simple_pronoun_coreference_diff_sentence_conjunction_righthand_is_pronoun(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I talked to Jane. Peter and she needed insurance.")
+ "I talked to Jane Jones. Both Peter Jones and she needed insurance.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
- self._check_word_match(matches[0], 0, 5, 'Peter')
- self._check_word_match(matches[1], 0, 3, 'Jane')
+ self._check_word_match(matches[0], 0, 8, 'Peter Jones')
+ self._check_word_match(matches[1], 0, 4, 'Jane Jones')
def test_simple_pronoun_coreference_diff_sentence_conjunction_lefthand_noun_not_match(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I talked to Jane. A horse and she needed insurance.")
+ "I talked to Jane Jones. A horse and she needed insurance.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
- self._check_word_match(matches[0], 0, 3, 'Jane')
+ self._check_word_match(matches[0], 0, 4, 'Jane Jones')
def test_simple_pronoun_coreference_diff_sentence_conjunction_righthand_noun_not_match(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I talked to Peter. He and a horse need insurance.")
+ "I talked to Peter Jones. He and a horse need insurance.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
- self._check_word_match(matches[0], 0, 3, 'Peter')
+ self._check_word_match(matches[0], 0, 4, 'Peter Jones')
def test_pronoun_coreferent_has_dependency_same_sentence(self):
coref_holmes_manager.remove_all_documents()
@@ -320,11 +325,11 @@ def test_noun_coreferent_has_dependency_same_sentence(self):
def test_plural_noun_coreferent_has_dependency_same_sentence(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I saw big horses and the horses were chasing a cat.")
+ "I saw some big horses and the horses were chasing a cat.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
- self._check_word_match(matches[0], 0, 2, 'big')
- self._check_word_match(matches[0], 1, 6, 'horse')
+ self._check_word_match(matches[0], 0, 3, 'big')
+ self._check_word_match(matches[0], 1, 7, 'horse')
def test_noun_coreferents_with_pronoun_conjunction_same_sentence_noun_matches(self):
coref_holmes_manager.remove_all_documents()
@@ -423,11 +428,11 @@ def test_noun_coreferent_has_dependency_diff_sentence(self):
def test_plural_noun_coreferent_has_dependency_diff_sentence(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "I saw big horses. The horses were chasing a cat.")
+ "I saw some big horses. The horses were chasing a cat.")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
- self._check_word_match(matches[0], 0, 2, 'big')
- self._check_word_match(matches[0], 1, 6, 'horse')
+ self._check_word_match(matches[0], 0, 3, 'big')
+ self._check_word_match(matches[0], 1, 7, 'horse')
def test_noun_coreferents_with_pronoun_conjunction_diff_sentence_noun_matches(self):
coref_holmes_manager.remove_all_documents()
@@ -438,15 +443,6 @@ def test_noun_coreferents_with_pronoun_conjunction_diff_sentence_noun_matches(se
self._check_word_match(matches[0], 0, 10, 'big')
self._check_word_match(matches[0], 1, 11, 'horse')
- def test_noun_coreferent_has_dependency_diff_sentence_relative_clause(self):
- coref_holmes_manager.remove_all_documents()
- coref_holmes_manager.parse_and_register_document(
- "I saw a big horse. The horse who was chasing a cat was happy.")
- matches = coref_holmes_manager.match()
- self.assertEqual(len(matches), 1)
- self._check_word_match(matches[0], 0, 3, 'big')
- self._check_word_match(matches[0], 1, 7, 'horse')
-
def test_pronoun_coreferent_has_dependency_three_sentences(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
@@ -463,7 +459,7 @@ def test_pronoun_coreferent_in_active_verbal_governing_clause(self):
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 0, 3, 'dog')
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_pronoun_coreferent_in_passive_verbal_governing_clause(self):
coref_holmes_manager.remove_all_documents()
@@ -472,7 +468,7 @@ def test_pronoun_coreferent_in_passive_verbal_governing_clause(self):
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 2, 3, 'cat')
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_pronoun_coreferent_in_active_adjectival_governing_clause(self):
coref_holmes_manager.remove_all_documents()
@@ -481,7 +477,7 @@ def test_pronoun_coreferent_in_active_adjectival_governing_clause(self):
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 0, 3, 'dog')
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_pronoun_coreferent_in_passive_adjectival_governing_clause(self):
coref_holmes_manager.remove_all_documents()
@@ -490,7 +486,7 @@ def test_pronoun_coreferent_in_passive_adjectival_governing_clause(self):
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 2, 3, 'cat')
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_noun_coreferent_in_active_verbal_governing_clause(self):
coref_holmes_manager.remove_all_documents()
@@ -500,7 +496,7 @@ def test_noun_coreferent_in_active_verbal_governing_clause(self):
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 0, 3, 'big')
self._check_word_match(matches[0], 1, 7, 'horse')
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_noun_coreferent_in_passive_verbal_governing_clause(self):
coref_holmes_manager.remove_all_documents()
@@ -510,7 +506,7 @@ def test_noun_coreferent_in_passive_verbal_governing_clause(self):
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 2, 3, 'little')
self._check_word_match(matches[0], 3, 7, 'cat')
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_noun_coreferent_in_active_adjectival_governing_clause(self):
coref_holmes_manager.remove_all_documents()
@@ -520,7 +516,7 @@ def test_noun_coreferent_in_active_adjectival_governing_clause(self):
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 0, 3, 'big')
self._check_word_match(matches[0], 1, 7, 'horse')
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_noun_coreferent_in_passive_adjectival_governing_clause(self):
coref_holmes_manager.remove_all_documents()
@@ -530,7 +526,7 @@ def test_noun_coreferent_in_passive_adjectival_governing_clause(self):
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 2, 3, 'little')
self._check_word_match(matches[0], 3, 7, 'cat')
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_pronoun_coreferent_in_ambiguous_noun_or_verb_dependency(self):
coref_holmes_manager.remove_all_documents()
@@ -539,7 +535,7 @@ def test_pronoun_coreferent_in_ambiguous_noun_or_verb_dependency(self):
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 0, 3, 'university')
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_reflexive_pronoun_coreferent(self):
coref_holmes_manager.remove_all_documents()
@@ -576,7 +572,7 @@ def test_reflexive_pronoun_coreferents_with_conjunction_diff_noun(self):
self._check_word_match(matches[1], 0, 4, 'leopard')
self._check_word_match(matches[1], 0, 4, 'leopard')
- def test_different_extracted_word_preceding_hyponym(self):
+ def test_different_extracted_word(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
"We discussed Peters plc. The big company was in difficulties. It had made a loss")
@@ -584,14 +580,6 @@ def test_different_extracted_word_preceding_hyponym(self):
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 1, 7, 'Peters plc')
- def test_different_extracted_word_preceding_individual(self):
- coref_holmes_manager.remove_all_documents()
- coref_holmes_manager.parse_and_register_document(
- "We discussed Bakers plc. The big company was in difficulties. It had made a loss")
- matches = coref_holmes_manager.match()
- self.assertEqual(len(matches), 1)
- self._check_word_match(matches[0], 1, 7, 'Bakers plc')
-
def test_repeated_noun(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
@@ -600,7 +588,7 @@ def test_repeated_noun(self):
self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 0, 7, 'dog')
- def test_repeated_noun_match_first_mention(self):
+ def test_repeated_noun_match_both_mentions(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
"We saw a tired dog. The dog was chasing a donkey.")
@@ -611,31 +599,14 @@ def test_repeated_noun_match_first_mention(self):
self._check_word_match(matches[1], 0, 3, 'tired')
self._check_word_match(matches[1], 1, 7, 'dog')
- def test_repeated_noun_match_both_mentions(self):
- coref_holmes_manager.remove_all_documents()
- coref_holmes_manager.parse_and_register_document(
- "We saw a tired dog. The tired dog was chasing a donkey.")
- matches = coref_holmes_manager.match()
- self.assertEqual(len(matches), 4)
- self._check_word_match(matches[0], 0, 3, 'tired')
- self._check_word_match(matches[0], 1, 4, 'dog')
- self._check_word_match(matches[1], 0, 7, 'tired')
- self._check_word_match(matches[1], 1, 4, 'dog')
- self._check_word_match(matches[2], 0, 7, 'tired')
- self._check_word_match(matches[2], 1, 8, 'dog')
- self._check_word_match(matches[3], 0, 3, 'tired')
- self._check_word_match(matches[3], 1, 8, 'dog')
-
def test_mentions_following_structural_match(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "A big horse was chasing a cat. The big horse was happy.")
+ "A big horse was chasing a cat. The horse was happy.")
matches = coref_holmes_manager.match()
- self.assertEqual(len(matches), 2)
+ self.assertEqual(len(matches), 1)
self._check_word_match(matches[0], 0, 1, 'big')
self._check_word_match(matches[0], 1, 2, 'horse')
- self._check_word_match(matches[1], 0, 9, 'big')
- self._check_word_match(matches[1], 1, 2, 'horse')
def test_relative_clause(self):
coref_holmes_manager.remove_all_documents()
@@ -643,37 +614,37 @@ def test_relative_clause(self):
"I saw a cat. The dog that had been chasing it was tired")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
- self._check_word_match(matches[0], 2, 3, 'cat')
+ self._check_word_match(matches[1], 2, 3, 'cat')
def test_dictionary_sentences_one_sentence(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "A sentence. I saw a dog and it was chasing a cat. Another sentence.")
- matches = coref_holmes_manager.match_returning_dictionaries()
+ "A sentence. I saw a dog and he was chasing a cat. Another sentence.")
+ matches = coref_holmes_manager.match()
self.assertEqual(matches[0]['sentences_within_document'],
- "I saw a dog and it was chasing a cat.")
+ "I saw a dog and he was chasing a cat.")
def test_dictionary_sentences_two_sentences(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "A sentence. I saw a dog.It was chasing a cat. Another sentence.")
- matches = coref_holmes_manager.match_returning_dictionaries()
+ "A sentence. I saw a dog.He was chasing a cat. Another sentence.")
+ matches = coref_holmes_manager.match()
self.assertEqual(matches[0]['sentences_within_document'],
- "I saw a dog. It was chasing a cat.")
+ "I saw a dog. He was chasing a cat.")
def test_dictionary_sentences_three_sentences(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
- "A sentence. I saw a dog. I was happy.It was chasing a cat. Another sentence.")
- matches = coref_holmes_manager.match_returning_dictionaries()
+ "A sentence. I saw a dog. I was happy.He was chasing a cat. Another sentence.")
+ matches = coref_holmes_manager.match()
self.assertEqual(matches[0]['sentences_within_document'],
- "I saw a dog. I was happy. It was chasing a cat.")
+ "I saw a dog. I was happy. He was chasing a cat.")
def test_dictionary_sentences_three_sentences_none_surrounding(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
"I saw a dog.I was happy. It was chasing a cat.")
- matches = coref_holmes_manager.match_returning_dictionaries()
+ matches = coref_holmes_manager.match()
self.assertEqual(matches[0]['sentences_within_document'],
"I saw a dog. I was happy. It was chasing a cat.")
@@ -684,7 +655,7 @@ def test_no_loop_with_difficult_sentence(self):
the kindest heart; he had loved her so much, and she had loved
him in return; they had kissed and loved each other, and the
boy had been her joy, her second life.""")
- matches = embeddings_coref_holmes_manager.match_returning_dictionaries()
+ matches = embeddings_coref_holmes_manager.match()
def test_maximum_mentions_difference(self):
coref_holmes_manager.remove_all_documents()
@@ -705,20 +676,20 @@ def test_involves_coreference_true(self):
coref_holmes_manager.parse_and_register_document(
"""I saw a dog. It was chasing a cat.""")
matches = coref_holmes_manager.match()
- self.assertTrue(matches[0].involves_coreference)
- self.assertTrue(matches[0].word_matches[0].involves_coreference)
- self.assertFalse(matches[0].word_matches[1].involves_coreference)
- self.assertFalse(matches[0].word_matches[2].involves_coreference)
+ self.assertTrue(matches[0]['involves_coreference'])
+ self.assertTrue(matches[0]['word_matches'][0]['involves_coreference'])
+ self.assertFalse(matches[0]['word_matches'][1]['involves_coreference'])
+ self.assertFalse(matches[0]['word_matches'][2]['involves_coreference'])
def test_involves_coreference_false(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
"""A dog was chasing a cat.""")
matches = coref_holmes_manager.match()
- self.assertFalse(matches[0].involves_coreference)
- self.assertFalse(matches[0].word_matches[0].involves_coreference)
- self.assertFalse(matches[0].word_matches[0].involves_coreference)
- self.assertFalse(matches[0].word_matches[0].involves_coreference)
+ self.assertFalse(matches[0]['involves_coreference'])
+        self.assertFalse(matches[0]['word_matches'][0]['involves_coreference'])
+        self.assertFalse(matches[0]['word_matches'][1]['involves_coreference'])
+        self.assertFalse(matches[0]['word_matches'][2]['involves_coreference'])
def test_adjective_verb_phrase_as_search_phrase_matches_simple(self):
coref_holmes_manager.remove_all_documents()
@@ -726,7 +697,7 @@ def test_adjective_verb_phrase_as_search_phrase_matches_simple(self):
"""We discussed holidays. They were very hard to find.""")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
- self.assertFalse(matches[0].is_uncertain)
+ self.assertFalse(matches[0]['uncertain'])
def test_adjective_verb_phrase_as_search_phrase_no_match_with_normal_phrase(self):
coref_holmes_manager.remove_all_documents()
@@ -742,7 +713,7 @@ def test_adjective_verb_phrase_as_search_phrase_matches_compound(self):
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
for match in matches:
- self.assertFalse(match.is_uncertain)
+ self.assertFalse(match['uncertain'])
def test_objective_adjective_verb_phrase_matches_normal_search_phrase_simple(self):
coref_holmes_manager.remove_all_documents()
@@ -750,7 +721,7 @@ def test_objective_adjective_verb_phrase_matches_normal_search_phrase_simple(sel
"""We discussed policies. They was very hard to find""")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_objective_adjective_verb_phrase_matches_normal_search_phrase_compound(self):
coref_holmes_manager.remove_all_documents()
@@ -759,7 +730,7 @@ def test_objective_adjective_verb_phrase_matches_normal_search_phrase_compound(s
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 4)
for match in matches:
- self.assertTrue(match.is_uncertain)
+ self.assertTrue(match['uncertain'])
def test_subjective_adjective_verb_phrase_matches_normal_search_phrase_simple(self):
coref_holmes_manager.remove_all_documents()
@@ -767,7 +738,7 @@ def test_subjective_adjective_verb_phrase_matches_normal_search_phrase_simple(se
"""We saw the man. He was very glad to sing""")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
- self.assertTrue(matches[0].is_uncertain)
+ self.assertTrue(matches[0]['uncertain'])
def test_subjective_adjective_verb_phrase_matches_normal_search_phrase_compound(self):
coref_holmes_manager.remove_all_documents()
@@ -776,7 +747,7 @@ def test_subjective_adjective_verb_phrase_matches_normal_search_phrase_compound(
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 4)
for match in matches:
- self.assertTrue(match.is_uncertain)
+ self.assertTrue(match['uncertain'])
def test_prepositional_phrase_no_conjunction(self):
coref_holmes_manager.remove_all_documents()
@@ -785,7 +756,7 @@ def test_prepositional_phrase_no_conjunction(self):
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
for match in matches:
- self.assertFalse(match.is_uncertain)
+ self.assertFalse(match['uncertain'])
def test_prepositional_phrase_with_conjunction(self):
coref_holmes_manager.remove_all_documents()
@@ -794,7 +765,7 @@ def test_prepositional_phrase_with_conjunction(self):
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
for match in matches:
- self.assertFalse(match.is_uncertain)
+ self.assertFalse(match['uncertain'])
def test_coreference_of_noun_phrase_with_conjunction_only_one_matches(self):
coref_holmes_manager.remove_all_documents()
@@ -803,7 +774,7 @@ def test_coreference_of_noun_phrase_with_conjunction_only_one_matches(self):
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
for match in matches:
- self.assertFalse(match.is_uncertain)
+ self.assertFalse(match['uncertain'])
def test_coreference_of_noun_phrase_with_conjunction_both_match(self):
coref_holmes_manager.remove_all_documents()
@@ -812,7 +783,7 @@ def test_coreference_of_noun_phrase_with_conjunction_both_match(self):
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 2)
for match in matches:
- self.assertFalse(match.is_uncertain)
+ self.assertFalse(match['uncertain'])
def test_coreference_of_noun_phrase_with_conjunction_multiple_clusters(self):
coref_holmes_manager.remove_all_documents()
@@ -827,15 +798,43 @@ def test_coreference_and_derivation(self):
"""They demanded an explanation. Somebody attempted it.""")
matches = coref_holmes_manager.match()
self.assertEqual(len(matches), 1)
- self.assertEqual(matches[0].word_matches[1].type, 'derivation')
+ self.assertEqual(matches[0]['word_matches'][1]['match_type'], 'derivation')
- def test_parent_token_indexes(self):
+ def test_coreference_linked_parent_token_indexes(self):
coref_holmes_manager.remove_all_documents()
coref_holmes_manager.parse_and_register_document(
"I saw a house. I saw it in the village.", 'village')
- doc = coref_holmes_manager.threadsafe_container.get_document(
- 'village')
- self.assertTrue(
- coref_holmes_manager.semantic_analyzer.is_involved_in_coreference(doc[7]))
- self.assertEqual(doc[10]._.holmes.parent_dependencies,
+ doc = coref_holmes_manager.get_document('village')
+ self.assertTrue(doc[7]._.holmes.is_involved_in_coreference)
+ self.assertEqual(doc[10]._.holmes.coreference_linked_parent_dependencies,
[[3, 'pobjp'], [6, 'pobjp'], [7, 'pobjp'], [8, 'pobj']])
+
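+    # The tests below exercise reverse dependency matching: verbal document phrases
+    # are matched against the adjectival search phrases "An adopted boy" and
+    # "A running boy", with the document noun only reachable via coreference.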
+ def test_dobj_matches_amod(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document("I saw a boy. Someone had adopted him")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+ self.assertTrue(matches[0]['uncertain'])
+ self._check_word_match(matches[0], 1, 3, 'boy')
+
+ def test_nsubj_matches_amod(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document("I saw a boy. He was running")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 1)
+
+ def test_dobj_matches_amod_with_conjunction(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "I saw a boy and a boy. Someone had adopted them")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
+ self.assertTrue(matches[0]['uncertain'])
+ self.assertTrue(matches[1]['uncertain'])
+
+ def test_nsubj_matches_amod_with_conjunction(self):
+ coref_holmes_manager.remove_all_documents()
+ coref_holmes_manager.parse_and_register_document(
+ "Yesterday I saw a boy and a boy. They were running")
+ matches = coref_holmes_manager.match()
+ self.assertEqual(len(matches), 2)
diff --git a/holmes_extractor/tests/en/test_supervised_topic_classification_EN.py b/tests/en/test_supervised_topic_classification_EN.py
similarity index 94%
rename from holmes_extractor/tests/en/test_supervised_topic_classification_EN.py
rename to tests/en/test_supervised_topic_classification_EN.py
index 0391753..98d34a4 100644
--- a/holmes_extractor/tests/en/test_supervised_topic_classification_EN.py
+++ b/tests/en/test_supervised_topic_classification_EN.py
@@ -1,17 +1,20 @@
import unittest
import holmes_extractor as holmes
-from holmes_extractor.extensive_matching import SupervisedTopicClassifier
+from holmes_extractor.classification import SupervisedTopicClassifier
import os
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join(
(script_directory, 'test_ontology.owl')))
-holmes_manager = holmes.Manager('en_core_web_lg',
- perform_coreference_resolution=True, ontology=ontology)
-no_ontology_holmes_manager = holmes.Manager('en_core_web_lg',
- perform_coreference_resolution=True)
-no_coref_holmes_manager = holmes.Manager('en_core_web_lg',
- perform_coreference_resolution=False, ontology=ontology)
+holmes_manager = holmes.Manager('en_core_web_trf',
+ perform_coreference_resolution=True, ontology=ontology,
+ number_of_workers=1)
+no_ontology_holmes_manager = holmes.Manager('en_core_web_trf',
+ perform_coreference_resolution=True,
+ number_of_workers=1)
+no_coref_holmes_manager = holmes.Manager('en_core_web_trf',
+ perform_coreference_resolution=False, ontology=ontology,
+ number_of_workers=1)
class EnglishSupervisedTopicClassificationTest(unittest.TestCase):
@@ -52,7 +55,7 @@ def test_get_labels_to_classification_frequencies_ontology_multiword_matching(se
sttb = holmes_manager.get_supervised_topic_training_basis(
oneshot=False)
sttb.parse_and_register_training_document(
- "A gymnast jumps over a vaulting horse", 'gym')
+ "A gymnast jumps over a wastage horse", 'gym')
sttb.parse_and_register_training_document("fast", 'dummy')
sttb.prepare()
freq = sttb.labels_to_classification_frequencies
@@ -200,7 +203,7 @@ def test_multiple_document_classes(self):
sttb.parse_and_register_training_document(
"A gymnast jumps over a horse", 'gym')
sttb.parse_and_register_training_document(
- "A gymnast jumps over a vaulting horse", 'gym')
+ "A gymnast jumps over a wastage horse", 'gym')
sttb.prepare()
freq = sttb.labels_to_classification_frequencies
self.assertEqual(
@@ -241,15 +244,15 @@ def test_whole_scenario_with_classification_ontology(self):
cv_threshold=0, mlp_max_iter=10000)
self.assertEqual(['prepgovernor-noun: animal-lead', 'word: animal', 'word: computer',
'word: lead', 'word: robot'],
- list(trainer._sorted_label_dict.keys()))
+ list(trainer.sorted_label_dict.keys()))
self.assertEqual([[0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0],
[1.0, 1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0,
0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0]], trainer._input_matrix.toarray().tolist())
+ [0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0]], trainer.input_matrix.toarray().tolist())
self.assertEqual([[0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0], [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0], [0.0,
1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0,
- 1.0, 0.0, 0.0, 0.0]], trainer._output_matrix.toarray().tolist())
+ 1.0, 0.0, 0.0, 0.0]], trainer.output_matrix.toarray().tolist())
self.assertEqual((5, 5, 6), trainer._hidden_layer_sizes)
stc = trainer.classifier()
self.assertEqual(stc.parse_and_classify(
@@ -263,7 +266,7 @@ def test_whole_scenario_with_classification_ontology(self):
stc2 = no_ontology_holmes_manager.deserialize_supervised_topic_classifier(
serialized_supervised_topic_classifier_model, verbose=True)
self.assertEqual(['prepgovernor-noun: animal-lead', 'word: animal', 'word: computer',
- 'word: lead', 'word: robot'], list(stc2._model.sorted_label_dict.keys()))
+ 'word: lead', 'word: robot'], list(stc2.model.sorted_label_dict.keys()))
self.assertEqual(stc2.parse_and_classify(
"You are a robot."), ['computers'])
self.assertEqual(stc2.parse_and_classify("You are a cat."), ['animal'])
@@ -298,17 +301,17 @@ def test_whole_scenario_with_classification_ontology_and_match_all_words(self):
cv_threshold=0, mlp_max_iter=10000)
self.assertEqual(['prepgovernor-noun: animal-lead', 'word: animal', 'word: computer',
'word: lead', 'word: on', 'word: robot'],
- list(trainer._sorted_label_dict.keys()))
+ list(trainer.sorted_label_dict.keys()))
self.assertEqual([[0.0, 1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
[1.0, 1.0, 0.0, 1.0, 1.0, 0.0], [
0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0], [
0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]], trainer._input_matrix.toarray().tolist())
+ [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]], trainer.input_matrix.toarray().tolist())
self.assertEqual([[0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0], [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0], [0.0,
1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0,
- 1.0, 0.0, 0.0, 0.0]], trainer._output_matrix.toarray().tolist())
+ 1.0, 0.0, 0.0, 0.0]], trainer.output_matrix.toarray().tolist())
self.assertEqual((6, 6, 6), trainer._hidden_layer_sizes)
stc = trainer.classifier()
self.assertEqual(stc.parse_and_classify(
@@ -323,7 +326,7 @@ def test_whole_scenario_with_classification_ontology_and_match_all_words(self):
serialized_supervised_topic_classifier_model)
self.assertEqual(['prepgovernor-noun: animal-lead', 'word: animal', 'word: computer',
'word: lead', 'word: on', 'word: robot'],
- list(stc2._model.sorted_label_dict.keys()))
+ list(stc2.model.sorted_label_dict.keys()))
self.assertEqual(stc2.parse_and_classify(
"You are a robot."), ['computers'])
self.assertEqual(stc2.parse_and_classify("You are a cat."), ['animal'])
@@ -348,18 +351,18 @@ def test_filtering(self):
'gym')
sttb.prepare()
trainer = sttb.train(minimum_occurrences=4, cv_threshold=0.0)
- self.assertEqual(list(trainer._sorted_label_dict.keys()),
+ self.assertEqual(list(trainer.sorted_label_dict.keys()),
['predicate-actor: chasing-animal',
'predicate-actor: chasing-animal/predicate-patient: chasing-animal',
'predicate-patient: chasing-animal', 'word: animal'])
- self.assertEqual(set(map(lambda phr: phr.label, trainer._phraselet_infos)),
+ self.assertEqual(set(map(lambda phr: phr.label, trainer.phraselet_infos)),
{'predicate-actor: chasing-animal',
'predicate-patient: chasing-animal', 'word: animal'})
trainer2 = sttb.train(minimum_occurrences=4, cv_threshold=1)
- self.assertEqual(list(trainer2._sorted_label_dict.keys()),
+ self.assertEqual(list(trainer2.sorted_label_dict.keys()),
['predicate-actor: chasing-animal',
'predicate-actor: chasing-animal/predicate-patient: chasing-animal',
'predicate-patient: chasing-animal'])
- self.assertEqual(set(map(lambda phr: phr.label, trainer2._phraselet_infos)),
+ self.assertEqual(set(map(lambda phr: phr.label, trainer2.phraselet_infos)),
{'predicate-actor: chasing-animal',
'predicate-patient: chasing-animal'})
diff --git a/holmes_extractor/tests/en/test_topic_matching_EN.py b/tests/en/test_topic_matching_EN.py
similarity index 50%
rename from holmes_extractor/tests/en/test_topic_matching_EN.py
rename to tests/en/test_topic_matching_EN.py
index ef03509..fa0f4c4 100644
--- a/holmes_extractor/tests/en/test_topic_matching_EN.py
+++ b/tests/en/test_topic_matching_EN.py
@@ -1,27 +1,34 @@
import unittest
import holmes_extractor as holmes
-from holmes_extractor.extensive_matching import TopicMatcher
+from holmes_extractor.topic_matching import TopicMatcher
import os
script_directory = os.path.dirname(os.path.realpath(__file__))
ontology = holmes.Ontology(os.sep.join((script_directory, 'test_ontology.owl')),
symmetric_matching=True)
-holmes_manager_coref = holmes.Manager(model='en_core_web_lg', ontology=ontology,
- overall_similarity_threshold=0.65, perform_coreference_resolution=True)
-holmes_manager_coref_embedding_on_root = holmes.Manager(model='en_core_web_lg', ontology=ontology,
- overall_similarity_threshold=0.65, embedding_based_matching_on_root_words=True)
-holmes_manager_coref_no_embeddings = holmes.Manager(model='en_core_web_lg', ontology=ontology,
- overall_similarity_threshold=1, perform_coreference_resolution=True)
+holmes_manager_coref = holmes.Manager(model='en_core_web_trf', ontology=ontology,
+ perform_coreference_resolution=True,
+ number_of_workers=2)
+ontology_for_sm_tests = holmes.Ontology(os.sep.join((script_directory, 'test_ontology.owl')))
+
class EnglishTopicMatchingTest(unittest.TestCase):
- def _check_equals(self, text_to_match, document_text, highest_score, manager):
+ def _check_equals(self, text_to_match, document_text, highest_score, manager,
+ word_embedding_match_threshold=0.42, use_frequency_factor=True):
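+        # word_embedding_match_threshold and use_frequency_factor are passed straight
+        # through to topic_match_documents_against, letting individual tests disable
+        # embedding matching (threshold 1.0) or the corpus frequency weighting.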
manager.remove_all_documents()
manager.parse_and_register_document(document_text)
- topic_matches = manager.topic_match_documents_against(text_to_match, relation_score=20,
- reverse_only_relation_score=15, single_word_score=10, single_word_any_tag_score=5)
- self.assertEqual(int(topic_matches[0].score), highest_score)
+ topic_matches = manager.topic_match_documents_against(text_to_match,
+ word_embedding_match_threshold=
+ word_embedding_match_threshold,
+ relation_score=20,
+ reverse_only_relation_score=15, single_word_score=10, single_word_any_tag_score=5,
+ different_match_cutoff_score=10,
+ relation_matching_frequency_threshold=0.0,
+ embedding_matching_frequency_threshold=0.0,
+ use_frequency_factor=use_frequency_factor)
+ self.assertEqual(int(topic_matches[0]['score']), highest_score)
def test_no_match(self):
holmes_manager_coref.remove_all_documents()
@@ -43,6 +50,34 @@ def test_direct_matching(self):
self._check_equals("A plant grows", "A plant grows",
34, holmes_manager_coref)
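+    # The frequency-factor tests below expect the topic match score to shrink as the
+    # matched words occur more often in the registered corpus; the "_control" variant
+    # passes use_frequency_factor=False and keeps the unscaled score of 34.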
+ def test_direct_matching_frequency_factor_2_word(self):
+ self._check_equals("A plant grows", "A plant grows. A plant",
+ 34, holmes_manager_coref)
+
+ def test_direct_matching_frequency_factor_3_word(self):
+ self._check_equals("A plant grows", "A plant grows. A plant and a plant",
+ 16, holmes_manager_coref)
+
+ def test_direct_matching_frequency_factor_3_word_control(self):
+ self._check_equals("A plant grows", "A plant grows. A plant and a plant",
+ 34, holmes_manager_coref, use_frequency_factor=False)
+
+ def test_direct_matching_frequency_factor_3_word_with_higher_maximum(self):
+ self._check_equals("A plant grows", "A plant grows. A plant and a plant. Word word word word word.",
+ 22, holmes_manager_coref)
+
+ def test_direct_matching_frequency_factor_2_relation(self):
+ self._check_equals("A plant grows", "A plant grows. A plant grows.",
+ 34, holmes_manager_coref)
+
+ def test_direct_matching_frequency_factor_3_relation(self):
+ self._check_equals("A plant grows", "A plant grows. A plant grows. A plant grows.",
+ 8, holmes_manager_coref)
+
+ def test_direct_matching_frequency_factor_3_relation_with_higher_maximum(self):
+ self._check_equals("A plant grows", "A plant grows. A plant grows. A plant grows. Word word word word word.",
+ 14, holmes_manager_coref)
+
def test_direct_matching_nonsense_word(self):
self._check_equals("My friend visited gegwghg", "Peter visited gegwghg", 34,
holmes_manager_coref)
@@ -56,21 +91,53 @@ def test_coref_matching(self):
holmes_manager_coref)
def test_entity_matching(self):
- self._check_equals("My friend visited ENTITYGPE", "Peter visited Paris", 34,
+ self._check_equals("My house visited ENTITYGPE", "Peter visited London", 34,
+ holmes_manager_coref)
+
+ def test_entity_matching_frequency_factor(self):
+ self._check_equals("My house visited ENTITYGPE", "Peter visited Paris. London. Berlin.", 15,
+ holmes_manager_coref)
+
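+    # Unlike the plain entity tests above, "my friend" can additionally match "Peter"
+    # via word embeddings, which is why the expected scores here are higher.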
+ def test_entity_embedding_matching(self):
+ self._check_equals("My friend visited ENTITYGPE", "Peter visited London", 57,
+ holmes_manager_coref)
+
+ def test_entity_embedding_matching_frequency_factor(self):
+ self._check_equals("My friend visited ENTITYGPE", "Peter visited Paris. London. Berlin.", 32,
holmes_manager_coref)
def test_entitynoun_matching(self):
self._check_equals("My friend visited ENTITYNOUN", "Peter visited a city", 25,
holmes_manager_coref)
+ def test_entitynoun_matching_control(self):
+ self._check_equals("My friend visited ENTITYNOUN", "Peter visited a city. Word. word.", 25,
+ holmes_manager_coref)
+
def test_ontology_matching_synonym(self):
self._check_equals("I saw an pussy", "Somebody saw a cat", 31,
holmes_manager_coref)
+ def test_ontology_matching_synonym_frequency_factor(self):
+ self._check_equals("I saw an pussy", "Somebody saw a cat. A cat. A cat.", 14,
+ holmes_manager_coref)
+
+ def test_ontology_matching_synonym_frequency_factor_different_ontology_words(self):
+ self._check_equals("I saw an pussy", "Somebody saw a cat. A kitten. A cat.", 31,
+ holmes_manager_coref)
+
def test_ontology_matching_hyponym_depth_1(self):
self._check_equals("I saw an animal", "Somebody saw a cat", 28,
holmes_manager_coref)
+ def test_ontology_matching_hyponym_depth_1_frequency_factor(self):
+        self._check_equals("I saw an animal", "Somebody saw a cat. A cat. A cat.", 13,
+ holmes_manager_coref)
+
+ def test_ontology_matching_hyponym_depth_1_frequency_factor_different_ontology_words(self):
+        self._check_equals("I saw an animal", "Somebody saw a cat. A kitten. A cat.", 28,
+ holmes_manager_coref)
+
def test_ontology_matching_hyponym_depth_2(self):
self._check_equals("I saw an animal", "Somebody saw a kitten", 26,
holmes_manager_coref)
@@ -91,6 +158,14 @@ def test_ontology_matching_multiword_in_document(self):
self._check_equals("I saw an animal", "Somebody saw Mimi Momo", 26,
holmes_manager_coref)
+ def test_ontology_matching_multiword_in_document_frequency_factor(self):
+ self._check_equals("I saw an animal", "Somebody saw Mimi Momo. Mimi Momo. Mimi Momo.", 12,
+ holmes_manager_coref)
+
+ def test_ontology_matching_multiword_in_document_frequency_factor_control(self):
+ self._check_equals("I saw an animal", "Somebody saw Mimi Momo. Momo. Momo.", 26,
+ holmes_manager_coref)
+
def test_ontology_matching_multiword_in_search_text(self):
self._check_equals("I saw Mimi Momo", "Somebody saw an animal", 26,
holmes_manager_coref)
@@ -111,17 +186,30 @@ def test_embedding_matching_not_root(self):
self._check_equals("I saw a king", "Somebody saw a queen", 15,
holmes_manager_coref)
- def test_embedding_matching_root(self):
- self._check_equals("I saw a king", "Somebody saw a queen", 19,
- holmes_manager_coref_embedding_on_root)
-
def test_embedding_matching_root_overall_similarity_too_low(self):
- self._check_equals("I saw a king", "Somebody viewed a queen", 4,
- holmes_manager_coref_embedding_on_root)
+
+ holmes_manager_coref.remove_all_documents()
+ holmes_manager_coref.parse_and_register_document(
+ "I saw a king.")
+ topic_matches = holmes_manager_coref.topic_match_documents_against("Somebody viewed a queen",
+ relation_score=20, reverse_only_relation_score=15, single_word_score=10,
+ single_word_any_tag_score=5,
+ relation_matching_frequency_threshold=0.0,
+ embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(len(topic_matches), 0)
def test_embedding_matching_root_word_only(self):
- self._check_equals("king", "queen", 4,
- holmes_manager_coref_embedding_on_root)
+ holmes_manager_coref.remove_all_documents()
+ holmes_manager_coref.parse_and_register_document(
+ "king")
+ topic_matches = holmes_manager_coref.topic_match_documents_against("queen",
+ relation_score=20, reverse_only_relation_score=15, single_word_score=10,
+ single_word_any_tag_score=5,
+ relation_matching_frequency_threshold=0.0,
+ embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(len(topic_matches), 0)
def test_matching_only_adjective(self):
self._check_equals("nice", "nice", 5, holmes_manager_coref)
@@ -169,13 +257,17 @@ def test_reverse_matching_noun_no_coreference(self):
self._check_equals("A car with an engine", "An automobile with an engine", 51,
holmes_manager_coref)
+ def test_reverse_matching_noun_no_coreference_frequency_factor(self):
+ self._check_equals("A car with an engine", "An automobile with an engine. An engine. An engine.", 28,
+ holmes_manager_coref)
+
def test_reverse_matching_noun_no_coreference_control_no_embeddings(self):
self._check_equals("A car with an engine", "An automobile with an engine", 29,
- holmes_manager_coref_no_embeddings)
+ holmes_manager_coref, word_embedding_match_threshold=1.0)
def test_reverse_matching_noun_no_coreference_control_same_word(self):
self._check_equals("A car with an engine", "A car with an engine", 75,
- holmes_manager_coref_no_embeddings)
+ holmes_manager_coref, word_embedding_match_threshold=1.0)
def test_forward_matching_noun_entity_governor_match(self):
self._check_equals("An ENTITYPERSON with a car", "Richard Hudson with a vehicle", 23,
@@ -203,155 +295,177 @@ def test_reverse_matching_noun_entitynoun_governed(self):
def test_relation_matching_suppressed(self):
holmes_manager_coref.remove_all_documents()
- holmes_manager_coref.parse_and_register_document("A dog chases a cat")
- topic_matches = holmes_manager_coref.topic_match_documents_against("A dog chases a cat",
+ holmes_manager_coref.parse_and_register_document("A dog chases a cat. A dog sees a cat. A dog sees a cat. A person was chasing a person. A person chased a person.")
+ topic_matches = holmes_manager_coref.topic_match_documents_against("A dog chases a cat.",
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=0,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 24)
+ relation_matching_frequency_threshold=1.0,
+ embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 24)
def test_suppressed_relation_matching_picked_up_during_reverse_matching(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "A dog chases a cat. A lion chases a tiger.")
+ "Chasing. Chasing. A dog chases a cat. A lion chases a tiger.")
topic_matches = holmes_manager_coref.topic_match_documents_against("A dog chases a cat",
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 82)
+ relation_matching_frequency_threshold=0.9,
+ embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 82)
def test_suppressed_relation_matching_picked_up_during_reverse_matching_with_coreference(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "There was a man and there was a woman. He saw her. A lion sees a tiger.")
- topic_matches = holmes_manager_coref.topic_match_documents_against("A man sees a woman",
+ "There was a cat. A dog chased it. A lion chases a tiger. Chasing. Chasing. ")
+ topic_matches = holmes_manager_coref.topic_match_documents_against("A dog chases a cat",
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 83)
-
- def test_relation_matching_suppressed_control_embedding_based_matching_on_root_words(self):
- holmes_manager_coref_embedding_on_root.remove_all_documents()
- holmes_manager_coref_embedding_on_root.parse_and_register_document(
- "A dog chases a cat")
- topic_matches = holmes_manager_coref_embedding_on_root.topic_match_documents_against(
- "A dog chases a cat",
- relation_score=20, reverse_only_relation_score=15, single_word_score=10,
- single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=0,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 82)
+ relation_matching_frequency_threshold=0.9,
+ embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 84)
+
+ def test_suppressed_relation_matching_picked_up_during_reverse_matching_with_reverse_dependency(self):
+ holmes_manager_coref.remove_all_documents()
+ holmes_manager_coref.parse_and_register_document(
+ "Someone adopts the child. The child is here. Children. Children. Children.")
+ topic_matches = holmes_manager_coref.topic_match_documents_against("An adopted child",
+ relation_score=20, reverse_only_relation_score=15, single_word_score=10,
+ single_word_any_tag_score=5,
+ relation_matching_frequency_threshold=0.9,
+ embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+
+ self.assertEqual(int(topic_matches[0]['score']), 34)
def test_reverse_matching_suppressed_with_relation_matching(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "I was in Germany. I know Germany.")
+ "I was in Germany. I know Germany. Germany. Germany.")
+ topic_matches = holmes_manager_coref.topic_match_documents_against("in Germany",
+ relation_score=20, reverse_only_relation_score=15, single_word_score=10,
+ single_word_any_tag_score=5,
+ relation_matching_frequency_threshold=0.1, embedding_matching_frequency_threshold=0.6)
+ self.assertEqual(int(topic_matches[0]['score']), 10)
+
+ def test_reverse_matching_suppressed_with_relation_matching_embedding_value_same(self):
+ holmes_manager_coref.remove_all_documents()
+ holmes_manager_coref.parse_and_register_document(
+ "I was in Germany. I know Germany. Germany. Germany.")
topic_matches = holmes_manager_coref.topic_match_documents_against("in Germany",
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 14)
+ relation_matching_frequency_threshold=0.1, embedding_matching_frequency_threshold=0.1)
+ self.assertEqual(int(topic_matches[0]['score']), 10)
- def test_reverse_matching_suppressed_with_relation_matching_embedding_value_also_1(self):
+ def test_reverse_matching_suppressed_with_relation_matching_control(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "I was in Germany. I know Germany.")
+ "I was in Germany. I know Germany. Germany. Germany.")
topic_matches = holmes_manager_coref.topic_match_documents_against("in Germany",
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=1)
- self.assertEqual(int(topic_matches[0].score), 14)
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0)
+ self.assertEqual(int(topic_matches[0]['score']), 7)
def test_reverse_matching_suppressed_with_embedding_reverse_matching_parent(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "An automobile with an engine")
+ "An automobile with an engine. An engine. An engine.")
topic_matches = holmes_manager_coref.topic_match_documents_against("A car with an engine",
+ word_embedding_match_threshold=0.42,
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 29)
+ relation_matching_frequency_threshold=0.0, embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 29)
def test_reverse_matching_suppressed_with_embedding_reverse_matching_parent_control(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "An automobile with an engine")
+ "An automobile with an engine. An engine. An engine.")
topic_matches = holmes_manager_coref.topic_match_documents_against("A car with an engine",
+ word_embedding_match_threshold=0.42,
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_embedding_matching=1)
- self.assertEqual(int(topic_matches[0].score), 51)
+ relation_matching_frequency_threshold=0.0, embedding_matching_frequency_threshold=0.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 51)
def test_reverse_matching_suppressed_with_embedding_reverse_matching_child(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "An engine with an automobile")
+ "An engine with an automobile. An engine. An engine.")
topic_matches = holmes_manager_coref.topic_match_documents_against("An engine with a car",
+ word_embedding_match_threshold=0.42,
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 14)
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 14)
def test_reverse_matching_suppressed_with_embedding_reverse_matching_child_control(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "An engine with an automobile")
+ "An engine with an automobile. An engine. An engine.")
topic_matches = holmes_manager_coref.topic_match_documents_against("An engine with a car",
+ word_embedding_match_threshold=0.42,
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_embedding_matching=1)
- self.assertEqual(int(topic_matches[0].score), 25)
+ relation_matching_frequency_threshold=0.0, embedding_matching_frequency_threshold=0.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 25)
def test_entity_matching_suppressed_with_relation_matching_for_governor(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "I was tired Richard Paul Hudson. I was a tired Richard Paul Hudson.")
+ "I was tired Richard Paul Hudson. I was a tired Richard Paul Hudson. I spoke to Richard Paul Hudson and he was tired.")
topic_matches = holmes_manager_coref.topic_match_documents_against("tired ENTITYPERSON",
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 14)
+ relation_matching_frequency_threshold=1.0, embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 14)
def test_entity_matching_suppressed_with_relation_matching_for_governor_control(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "I was Richard Paul Hudson. I was a tired Richard Paul Hudson.")
+ "I was tired Richard Paul Hudson. I was a tired Richard Paul Hudson. I spoke to Richard Paul Hudson and he was tired.")
topic_matches = holmes_manager_coref.topic_match_documents_against("tired ENTITYPERSON",
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 34)
+ relation_matching_frequency_threshold=0.0, embedding_matching_frequency_threshold=0.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 34)
def test_entity_matching_suppressed_with_relation_matching_for_governed(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "I knew Richard Paul Hudson. I knew Richard Paul Hudson.")
+ "I knew Richard Paul Hudson. I knew Richard Paul Hudson. I knew someone and spoke to Richard Paul Hudson.")
topic_matches = holmes_manager_coref.topic_match_documents_against(
"someone knows an ENTITYPERSON",
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 14)
+ relation_matching_frequency_threshold=1.0,
+ embedding_matching_frequency_threshold=1.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 14)
def test_entity_matching_suppressed_with_relation_matching_for_governed_control(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.parse_and_register_document(
- "I met Richard Paul Hudson. I knew Richard Paul Hudson.")
+ "I knew Richard Paul Hudson. I knew Richard Paul Hudson. I knew someone and spoke to Richard Paul Hudson.")
topic_matches = holmes_manager_coref.topic_match_documents_against(
"someone knows an ENTITYPERSON",
relation_score=20, reverse_only_relation_score=15, single_word_score=10,
single_word_any_tag_score=5,
- maximum_number_of_single_word_matches_for_relation_matching=1,
- maximum_number_of_single_word_matches_for_embedding_matching=0)
- self.assertEqual(int(topic_matches[0].score), 34)
+ relation_matching_frequency_threshold=0.0,
+ embedding_matching_frequency_threshold=0.0,
+ use_frequency_factor=False)
+ self.assertEqual(int(topic_matches[0]['score']), 34)
def test_reverse_matching_noun_coreference_on_governor(self):
self._check_equals("A car with an engine", "I saw an automobile. I saw it with an engine",
@@ -361,27 +475,27 @@ def test_reverse_matching_noun_coreference_on_governor(self):
def test_reverse_matching_noun_coreference_on_governor_control_no_embeddings(self):
self._check_equals("A car with an engine", "I saw an automobile. I saw it with an engine",
29,
- holmes_manager_coref_no_embeddings)
+ holmes_manager_coref, word_embedding_match_threshold=1.0)
def test_reverse_matching_noun_coreference_on_governor_control_same_word(self):
self._check_equals("A car with an engine", "I saw a car. I saw it with an engine",
73,
- holmes_manager_coref_no_embeddings)
+ holmes_manager_coref, word_embedding_match_threshold=1.0)
def test_reverse_matching_noun_coreference_on_governed(self):
self._check_equals(
- "An engine with a car", "I saw an automobile. There was an engine with it", 25,
+ "An engine with a car", "I saw an automobile. I saw the engine with it", 25,
holmes_manager_coref)
def test_reverse_matching_noun_coreference_on_governed_control_no_embeddings(self):
self._check_equals(
- "An engine with a car", "I saw an automobile. There was an engine with it", 14,
- holmes_manager_coref_no_embeddings)
+ "An engine with a car", "I saw an automobile. I saw the engine with it", 14,
+ holmes_manager_coref, word_embedding_match_threshold=1.0)
def test_reverse_matching_noun_coreference_on_governed_control_same_word(self):
self._check_equals(
- "An engine with a car", "I saw a car. There was an engine with it", 76,
- holmes_manager_coref_no_embeddings)
+ "An engine with a car", "I saw a car. I saw the engine with it", 76,
+ holmes_manager_coref, word_embedding_match_threshold=1.0)
def test_reverse_matching_verb_with_coreference_and_conjunction(self):
self._check_equals("A company is bought", "A company is bought and purchased", 34,
@@ -485,26 +599,26 @@ def test_multiword_in_text_to_search_single_in_document_with_coref_root(self):
def test_multiword_in_text_to_search_and_in_document_not_root_match_on_embeddings(self):
self._check_equals("Richard Paul Hudson came",
"I saw Richard Paul Hudson", 19,
- holmes_manager_coref_embedding_on_root)
+ holmes_manager_coref)
def test_multiword_in_text_to_search_and_in_document_root_match_on_embeddings(self):
self._check_equals("the tired Richard Paul Hudson",
"I saw Richard Paul Hudson", 19,
- holmes_manager_coref_embedding_on_root)
+ holmes_manager_coref)
def test_multiword_in_text_to_search_and_in_document_not_root_no_embeddings(self):
self._check_equals("Richard Paul Hudson came",
"I saw Richard Paul Hudson", 19,
- holmes_manager_coref_embedding_on_root)
+ holmes_manager_coref)
def test_multiword_in_text_to_search_and_in_document_root_no_embeddings(self):
self._check_equals("the tired Richard Paul Hudson",
"I saw Richard Paul Hudson", 19,
- holmes_manager_coref_embedding_on_root)
+ holmes_manager_coref)
def test_matches_in_opposite_directions(self):
- self._check_equals("Mirror of Erised",
- "Mirror of Erised", 39,
+ self._check_equals("the mirror of Erised",
+ "the mirror of Erised", 39,
holmes_manager_coref)
def test_derived_form_text_to_match_single_word(self):
@@ -512,14 +626,14 @@ def test_derived_form_text_to_match_single_word(self):
"inform", 10,
holmes_manager_coref)
- def test_derived_form_document_text_single_word(self):
- self._check_equals("inform",
- "information", 5,
+ def test_derived_form_text_to_match_single_word_frequency_factor(self):
+ self._check_equals("information",
+ "inform. inform. inform.", 3,
holmes_manager_coref)
- def test_derived_form_text_to_match_single_word(self):
- self._check_equals("information",
- "inform", 10,
+ def test_derived_form_document_text_single_word(self):
+ self._check_equals("give",
+ "gift", 5,
holmes_manager_coref)
def test_derived_form_single_word_control(self):
@@ -568,43 +682,58 @@ def test_derived_forms_matched_by_ontology_2(self):
holmes_manager_coref)
def test_derived_multiword_child_also_matched_by_ontology_1(self):
- self._check_equals("He used a vault horse",
- "He used a vaulting horse", 34,
+ self._check_equals("He used a waste horse",
+ "He used a wastage horse", 34,
holmes_manager_coref)
def test_derived_multiword_child_also_matched_by_ontology_2(self):
- self._check_equals("He used a vaulting horse",
- "He used a vault horse", 32,
+ self._check_equals("He used a wastage horse",
+ "He used a waste horse", 32,
holmes_manager_coref)
def test_derived_multiword_child_also_matched_by_ontology_3(self):
- self._check_equals("He used a vault horse",
+ self._check_equals("He used a waste horse",
"He used gymnastics equipment", 26,
holmes_manager_coref)
def test_derived_multiword_child_also_matched_by_ontology_4(self):
self._check_equals("He used gymnastics equipment",
- "He used a vault horse", 26,
+ "He used a waste horse", 26,
holmes_manager_coref)
def test_derived_multiword_parent_also_matched_by_ontology_1(self):
- self._check_equals("A big vault horse",
- "A big vaulting horse", 34,
+ self._check_equals("A big waste horse",
+ "A big wastage horse", 34,
holmes_manager_coref)
def test_derived_multiword_parent_also_matched_by_ontology_2(self):
- self._check_equals("A big vaulting horse",
- "A big vault horse", 32,
+ self._check_equals("A big wastage horse",
+ "A big waste horse", 32,
holmes_manager_coref)
def test_derived_multiword_parent_also_matched_by_ontology_3(self):
- self._check_equals("A big vault horse",
+ self._check_equals("A big waste horse",
"A big gymnastics equipment", 26,
holmes_manager_coref)
def test_derived_multiword_parent_also_matched_by_ontology_4(self):
self._check_equals("A big gymnastics equipment",
- "A big vault horse", 26,
+ "A big waste horse", 26,
+ holmes_manager_coref)
+
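+ # The reverse_dependencies tests check that the relation between "adopt" and "child"
+ # is matched whether it is expressed as "an adopted child" or as "someone adopts a
+ # child" (score 34 in both directions), while the control, where the two words are
+ # merely conjoined, scores only 14.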
+ def test_reverse_dependencies_1(self):
+ self._check_equals("An adopted child",
+ "Someone adopts a child", 34,
+ holmes_manager_coref)
+
+ def test_reverse_dependencies_2(self):
+ self._check_equals("Someone adopts a child",
+ "An adopted child", 34,
+ holmes_manager_coref)
+
+ def test_reverse_dependencies_control(self):
+ self._check_equals("Adopted and child",
+ "An adopted child", 14,
holmes_manager_coref)
def test_coreference_double_match_on_governed(self):
@@ -613,13 +742,11 @@ def test_coreference_double_match_on_governed(self):
"I saw a man. The man walked")
topic_matches = holmes_manager_coref.topic_match_documents_against("A man walks",
relation_score=20, single_word_score=10, single_word_any_tag_score=5)
- self.assertEqual(int(topic_matches[0].score), 34)
- self.assertEqual(topic_matches[0].sentences_start_index, 5)
- self.assertEqual(topic_matches[0].sentences_end_index, 7)
- self.assertEqual(topic_matches[0].start_index, 6)
- self.assertEqual(topic_matches[0].end_index, 7)
- self.assertEqual(topic_matches[0].relative_start_index, 1)
- self.assertEqual(topic_matches[0].relative_end_index, 2)
+ self.assertEqual(int(topic_matches[0]['score']), 34)
+ self.assertEqual(topic_matches[0]['sentences_start_index'], 5)
+ self.assertEqual(topic_matches[0]['sentences_end_index'], 7)
+ self.assertEqual(topic_matches[0]['start_index'], 6)
+ self.assertEqual(topic_matches[0]['end_index'], 7)
def test_coreference_double_match_on_governor(self):
holmes_manager_coref.remove_all_documents()
@@ -627,13 +754,11 @@ def test_coreference_double_match_on_governor(self):
"I saw a big man. The man walked")
topic_matches = holmes_manager_coref.topic_match_documents_against(
"A big man", relation_score=20, single_word_score=10, single_word_any_tag_score=5)
- self.assertEqual(int(topic_matches[0].score), 34)
- self.assertEqual(topic_matches[0].sentences_start_index, 0)
- self.assertEqual(topic_matches[0].sentences_end_index, 8)
- self.assertEqual(topic_matches[0].start_index, 3)
- self.assertEqual(topic_matches[0].end_index, 7)
- self.assertEqual(topic_matches[0].relative_start_index, 3)
- self.assertEqual(topic_matches[0].relative_end_index, 7)
+ self.assertEqual(int(topic_matches[0]['score']), 34)
+ self.assertEqual(topic_matches[0]['sentences_start_index'], 0)
+ self.assertEqual(topic_matches[0]['sentences_end_index'], 8)
+ self.assertEqual(topic_matches[0]['start_index'], 3)
+ self.assertEqual(topic_matches[0]['end_index'], 7)
def test_coreference_double_match_same_distance(self):
holmes_manager_coref.remove_all_documents()
@@ -641,13 +766,11 @@ def test_coreference_double_match_same_distance(self):
"The man was big. Man walked.")
topic_matches = holmes_manager_coref.topic_match_documents_against("A big man",
relation_score=20, single_word_score=10, single_word_any_tag_score=5)
- self.assertEqual(int(topic_matches[0].score), 34)
- self.assertEqual(topic_matches[0].sentences_start_index, 0)
- self.assertEqual(topic_matches[0].sentences_end_index, 7)
- self.assertEqual(topic_matches[0].start_index, 1)
- self.assertEqual(topic_matches[0].end_index, 5)
- self.assertEqual(topic_matches[0].relative_start_index, 1)
- self.assertEqual(topic_matches[0].relative_end_index, 5)
+ self.assertEqual(int(topic_matches[0]['score']), 34)
+ self.assertEqual(topic_matches[0]['sentences_start_index'], 0)
+ self.assertEqual(topic_matches[0]['sentences_end_index'], 7)
+ self.assertEqual(topic_matches[0]['start_index'], 1)
+ self.assertEqual(topic_matches[0]['end_index'], 5)
def test_indexes(self):
holmes_manager_coref.remove_all_documents()
@@ -655,12 +778,10 @@ def test_indexes(self):
"This is an irrelevant sentence. I think a plant grows.")
topic_matches = holmes_manager_coref.topic_match_documents_against(
"A plant grows")
- self.assertEqual(topic_matches[0].sentences_start_index, 6)
- self.assertEqual(topic_matches[0].sentences_end_index, 11)
- self.assertEqual(topic_matches[0].start_index, 9)
- self.assertEqual(topic_matches[0].end_index, 10)
- self.assertEqual(topic_matches[0].relative_start_index, 3)
- self.assertEqual(topic_matches[0].relative_end_index, 4)
+ self.assertEqual(topic_matches[0]['sentences_start_index'], 6)
+ self.assertEqual(topic_matches[0]['sentences_end_index'], 11)
+ self.assertEqual(topic_matches[0]['start_index'], 9)
+ self.assertEqual(topic_matches[0]['end_index'], 10)
def test_indexes_with_preceding_non_matched_dependent(self):
holmes_manager_coref.remove_all_documents()
@@ -668,12 +789,10 @@ def test_indexes_with_preceding_non_matched_dependent(self):
"I saw a big dog.")
topic_matches = holmes_manager_coref.topic_match_documents_against(
"A big dog")
- self.assertEqual(topic_matches[0].sentences_start_index, 0)
- self.assertEqual(topic_matches[0].sentences_end_index, 5)
- self.assertEqual(topic_matches[0].start_index, 3)
- self.assertEqual(topic_matches[0].end_index, 4)
- self.assertEqual(topic_matches[0].relative_start_index, 3)
- self.assertEqual(topic_matches[0].relative_end_index, 4)
+ self.assertEqual(topic_matches[0]['sentences_start_index'], 0)
+ self.assertEqual(topic_matches[0]['sentences_end_index'], 5)
+ self.assertEqual(topic_matches[0]['start_index'], 3)
+ self.assertEqual(topic_matches[0]['end_index'], 4)
def test_indexes_with_subsequent_non_matched_dependent(self):
holmes_manager_coref.remove_all_documents()
@@ -681,12 +800,10 @@ def test_indexes_with_subsequent_non_matched_dependent(self):
"The dog I saw was big.")
topic_matches = holmes_manager_coref.topic_match_documents_against(
"A big dog")
- self.assertEqual(topic_matches[0].sentences_start_index, 0)
- self.assertEqual(topic_matches[0].sentences_end_index, 6)
- self.assertEqual(topic_matches[0].start_index, 1)
- self.assertEqual(topic_matches[0].end_index, 5)
- self.assertEqual(topic_matches[0].relative_start_index, 1)
- self.assertEqual(topic_matches[0].relative_end_index, 5)
+ self.assertEqual(topic_matches[0]['sentences_start_index'], 0)
+ self.assertEqual(topic_matches[0]['sentences_end_index'], 6)
+ self.assertEqual(topic_matches[0]['start_index'], 1)
+ self.assertEqual(topic_matches[0]['end_index'], 5)
def test_only_one_result_per_document(self):
holmes_manager_coref.remove_all_documents()
@@ -722,10 +839,10 @@ def test_match_cutoff(self):
""")
topic_matches = holmes_manager_coref.topic_match_documents_against(
"The dog chased the cat")
- self.assertEqual(topic_matches[0].start_index, 117)
- self.assertEqual(topic_matches[0].end_index, 120)
+ self.assertEqual(topic_matches[0]['start_index'], 117)
+ self.assertEqual(topic_matches[0]['end_index'], 120)
- def test_result_ordering_by_match_length_different_documents(self):
+ def test_result_ordering_by_match_length_different_documents_2(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.remove_all_search_phrases()
holmes_manager_coref.parse_and_register_document("""
@@ -741,9 +858,9 @@ def test_result_ordering_by_match_length_different_documents(self):
topic_matches = holmes_manager_coref.topic_match_documents_against(
"The dog chased the cat")
self.assertEqual(
- topic_matches[0].end_index - topic_matches[0].start_index, 7)
+ topic_matches[0]['end_index'] - topic_matches[0]['start_index'], 6)
self.assertEqual(
- topic_matches[1].end_index - topic_matches[1].start_index, 4)
+ topic_matches[1]['end_index'] - topic_matches[1]['start_index'], 3)
def test_dictionaries(self):
holmes_manager_coref.remove_all_documents()
@@ -752,15 +869,15 @@ def test_dictionaries(self):
holmes_manager_coref.parse_and_register_document("Dogs and cats.",
"animals")
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
- "The dog chased the cat")
+ holmes_manager_coref.topic_match_documents_against(
+ "The dog chased the cat", use_frequency_factor=False)
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'A dog chased a cat. A cat.', 'text_to_match': 'The dog chased the cat', 'rank': '1=', 'sentences_character_start_index_in_document': 515, 'sentences_character_end_index_in_document': 541, 'score': 99.34666666666668, 'word_infos': [[2, 5, 'overlapping_relation', False, "Matches DOG directly."], [6, 12, 'overlapping_relation', False, "Matches CHASE directly."], [15, 18, 'overlapping_relation', True, "Matches CAT directly."], [22, 25, 'single', False, "Matches CAT directly."]]}, {'document_label': '', 'text': 'A dog chased a cat.', 'text_to_match': 'The dog chased the cat', 'rank': '1=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 19, 'score': 99.34666666666668, 'word_infos': [[2, 5, 'overlapping_relation', False, "Matches DOG directly."], [6, 12, 'overlapping_relation', False, "Matches CHASE directly."], [15, 18, 'overlapping_relation', True, "Matches CAT directly."]]}, {'document_label': 'animals', 'text': 'Dogs and cats.', 'text_to_match': 'The dog chased the cat', 'rank': '3', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 14, 'score': 9.866666666666667, 'word_infos': [[0, 4, 'single', False, "Matches DOG directly."], [9, 13, 'single', True, "Matches CAT directly."]]}])
+ [{'document_label': '', 'text': 'A dog chased a cat. A cat.', 'text_to_match': 'The dog chased the cat', 'rank': '1=', 'index_within_document': 115, 'subword_index': None, 'start_index': 112, 'end_index': 118, 'sentences_start_index': 111, 'sentences_end_index': 119, 'sentences_character_start_index': 515, 'sentences_character_end_index': 541, 'score': 993.4666666666667, 'word_infos': [[2, 5, 'overlapping_relation', False, 'Matches DOG directly.'], [6, 12, 'overlapping_relation', False, 'Matches CHASE directly.'], [15, 18, 'overlapping_relation', True, 'Matches CAT directly.'], [22, 25, 'single', False, 'Matches CAT directly.']], 'answers': []}, {'document_label': '', 'text': 'A dog chased a cat.', 'text_to_match': 'The dog chased the cat', 'rank': '1=', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 5, 'sentences_character_start_index': 0, 'sentences_character_end_index': 19, 'score': 993.4666666666667, 'word_infos': [[2, 5, 'overlapping_relation', False, 'Matches DOG directly.'], [6, 12, 'overlapping_relation', False, 'Matches CHASE directly.'], [15, 18, 'overlapping_relation', True, 'Matches CAT directly.']], 'answers': []}, {'document_label': 'animals', 'text': 'Dogs and cats.', 'text_to_match': 'The dog chased the cat', 'rank': '3', 'index_within_document': 2, 'subword_index': None, 'start_index': 0, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 3, 'sentences_character_start_index': 0, 'sentences_character_end_index': 14, 'score': 98.66666666666667, 'word_infos': [[0, 4, 'single', False, 'Matches DOG directly.'], [9, 13, 'single', True, 'Matches CAT directly.']], 'answers': []}])
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
- "The dog chased the cat", tied_result_quotient=0.01)
+ holmes_manager_coref.topic_match_documents_against(
+ "The dog chased the cat", tied_result_quotient=0.01, use_frequency_factor=False)
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'A dog chased a cat. A cat.', 'text_to_match': 'The dog chased the cat', 'rank': '1=', 'sentences_character_start_index_in_document': 515, 'sentences_character_end_index_in_document': 541, 'score': 99.34666666666668, 'word_infos': [[2, 5, 'overlapping_relation', False, "Matches DOG directly."], [6, 12, 'overlapping_relation', False, "Matches CHASE directly."], [15, 18, 'overlapping_relation', True, "Matches CAT directly."], [22, 25, 'single', False, "Matches CAT directly."]]}, {'document_label': '', 'text': 'A dog chased a cat.', 'text_to_match': 'The dog chased the cat', 'rank': '1=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 19, 'score': 99.34666666666668, 'word_infos': [[2, 5, 'overlapping_relation', False, "Matches DOG directly."], [6, 12, 'overlapping_relation', False, "Matches CHASE directly."], [15, 18, 'overlapping_relation', True, "Matches CAT directly."]]}, {'document_label': 'animals', 'text': 'Dogs and cats.', 'text_to_match': 'The dog chased the cat', 'rank': '1=', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 14, 'score': 9.866666666666667, 'word_infos': [[0, 4, 'single', False, "Matches DOG directly."], [9, 13, 'single', True, "Matches CAT directly."]]}])
+ [{'document_label': '', 'text': 'A dog chased a cat. A cat.', 'text_to_match': 'The dog chased the cat', 'rank': '1=', 'index_within_document': 115, 'subword_index': None, 'start_index': 112, 'end_index': 118, 'sentences_start_index': 111, 'sentences_end_index': 119, 'sentences_character_start_index': 515, 'sentences_character_end_index': 541, 'score': 993.4666666666667, 'word_infos': [[2, 5, 'overlapping_relation', False, 'Matches DOG directly.'], [6, 12, 'overlapping_relation', False, 'Matches CHASE directly.'], [15, 18, 'overlapping_relation', True, 'Matches CAT directly.'], [22, 25, 'single', False, 'Matches CAT directly.']], 'answers': []}, {'document_label': '', 'text': 'A dog chased a cat.', 'text_to_match': 'The dog chased the cat', 'rank': '1=', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 5, 'sentences_character_start_index': 0, 'sentences_character_end_index': 19, 'score': 993.4666666666667, 'word_infos': [[2, 5, 'overlapping_relation', False, 'Matches DOG directly.'], [6, 12, 'overlapping_relation', False, 'Matches CHASE directly.'], [15, 18, 'overlapping_relation', True, 'Matches CAT directly.']], 'answers': []}, {'document_label': 'animals', 'text': 'Dogs and cats.', 'text_to_match': 'The dog chased the cat', 'rank': '1=', 'index_within_document': 2, 'subword_index': None, 'start_index': 0, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 3, 'sentences_character_start_index': 0, 'sentences_character_end_index': 14, 'score': 98.66666666666667, 'word_infos': [[0, 4, 'single', False, 'Matches DOG directly.'], [9, 13, 'single', True, 'Matches CAT directly.']], 'answers': []}])
def test_dictionaries_with_multiword_in_relation_not_final(self):
holmes_manager_coref.remove_all_documents()
@@ -768,43 +885,43 @@ def test_dictionaries_with_multiword_in_relation_not_final(self):
holmes_manager_coref.parse_and_register_document(
"Richard Paul Hudson came home")
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"Richard Paul Hudson was coming")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Richard Paul Hudson came home', 'text_to_match': 'Richard Paul Hudson was coming', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 29, 'score': 40.8, 'word_infos': [[0, 19, 'relation', False, "Matches RICHARD PAUL HUDSON directly."], [20, 24, 'relation', True, "Matches COME directly."]]}])
+ [{'document_label': '', 'text': 'Richard Paul Hudson came home', 'text_to_match': 'Richard Paul Hudson was coming', 'rank': '1', 'index_within_document': 3, 'subword_index': None, 'start_index': 0, 'end_index': 3, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 29, 'score': 408.0, 'word_infos': [[0, 19, 'relation', False, 'Matches RICHARD PAUL HUDSON directly.'], [20, 24, 'relation', True, 'Matches COME directly.']], 'answers': []}])
def test_dictionaries_with_multiword_alone(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.remove_all_search_phrases()
holmes_manager_coref.parse_and_register_document("Richard Paul Hudson")
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"Richard Paul Hudson")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Richard Paul Hudson', 'text_to_match': 'Richard Paul Hudson', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 19, 'score': 8.92, 'word_infos': [[0, 19, 'single', True, "Matches RICHARD PAUL HUDSON directly."]]}])
+ [{'document_label': '', 'text': 'Richard Paul Hudson', 'text_to_match': 'Richard Paul Hudson', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 0, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 2, 'sentences_character_start_index': 0, 'sentences_character_end_index': 19, 'score': 89.2, 'word_infos': [[0, 19, 'single', True, 'Matches RICHARD PAUL HUDSON directly.']], 'answers': []}])
def test_dictionaries_with_multiword_alone_and_entity_token_in_text_to_match(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.remove_all_search_phrases()
holmes_manager_coref.parse_and_register_document("Richard Paul Hudson")
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"ENTITYPERSON")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Richard Paul Hudson', 'text_to_match': 'ENTITYPERSON', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 19, 'score': 5.0, 'word_infos': [[0, 19, 'single', True, "Matches the ENTITYPERSON placeholder."]]}])
+ [{'document_label': '', 'text': 'Richard Paul Hudson', 'text_to_match': 'ENTITYPERSON', 'rank': '1', 'index_within_document': 2, 'subword_index': None, 'start_index': 0, 'end_index': 2, 'sentences_start_index': 0, 'sentences_end_index': 2, 'sentences_character_start_index': 0, 'sentences_character_end_index': 19, 'score': 50.0, 'word_infos': [[0, 19, 'single', True, 'Has an entity label matching ENTITYPERSON.']], 'answers': []}])
def test_dictionaries_with_multiword_as_single_word_and_relation(self):
- holmes_manager_coref_no_embeddings.remove_all_documents()
- holmes_manager_coref_no_embeddings.remove_all_search_phrases()
- holmes_manager_coref_no_embeddings.parse_and_register_document(
+ holmes_manager_coref.remove_all_documents()
+ holmes_manager_coref.remove_all_search_phrases()
+ holmes_manager_coref.parse_and_register_document(
"Can somebody give Harry Potter his present")
topic_match_dictionaries = \
- holmes_manager_coref_no_embeddings.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"Somebody gives a present to Harry")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'Can somebody give Harry Potter his present', 'text_to_match': 'Somebody gives a present to Harry', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 42, 'score': 92.6351111111111, 'word_infos': [[13, 17, 'overlapping_relation', False, "Matches GIVE directly."], [18, 30, 'overlapping_relation', False, "Is a synonym of HARRY in the ontology."], [35, 42, 'overlapping_relation', True, "Matches PRESENT directly."]]}])
+ [{'document_label': '', 'text': 'Can somebody give Harry Potter his present', 'text_to_match': 'Somebody gives a present to Harry', 'rank': '1', 'index_within_document': 6, 'subword_index': None, 'start_index': 2, 'end_index': 6, 'sentences_start_index': 0, 'sentences_end_index': 6, 'sentences_character_start_index': 0, 'sentences_character_end_index': 42, 'score': 926.3511111111111, 'word_infos': [[13, 17, 'overlapping_relation', False, 'Matches GIVE directly.'], [18, 30, 'overlapping_relation', False, 'Is a synonym of HARRY in the ontology.'], [35, 42, 'overlapping_relation', True, 'Matches PRESENT directly.']], 'answers': []}])
- def test_result_ordering_by_match_length_different_documents(self):
+ def test_result_ordering_by_match_length_different_documents_1(self):
holmes_manager_coref.remove_all_documents()
holmes_manager_coref.remove_all_search_phrases()
holmes_manager_coref.parse_and_register_document(
@@ -813,8 +930,8 @@ def test_result_ordering_by_match_length_different_documents(self):
"A dog chased a cat. A cat.", '2')
topic_matches = holmes_manager_coref.topic_match_documents_against(
"The dog chased the cat")
- self.assertEqual(topic_matches[0].end_index, 7)
- self.assertEqual(topic_matches[1].end_index, 4)
+ self.assertEqual(topic_matches[0]['end_index'], 7)
+ self.assertEqual(topic_matches[1]['end_index'], 4)
def test_filtering_with_topic_matches(self):
holmes_manager_coref.remove_all_documents()
@@ -860,23 +977,23 @@ def test_filtering_with_topic_match_dictionaries(self):
holmes_manager_coref.parse_and_register_document(
"The dog chased the cat", "T22")
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"The dog chased the cat")
self.assertEqual(len(topic_match_dictionaries), 4)
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"The dog chased the cat", document_label_filter="T")
self.assertEqual(len(topic_match_dictionaries), 4)
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"The dog chased the cat", document_label_filter="T1")
self.assertEqual(len(topic_match_dictionaries), 2)
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"The dog chased the cat", document_label_filter="T22")
self.assertEqual(len(topic_match_dictionaries), 1)
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"The dog chased the cat", document_label_filter="X")
self.assertEqual(len(topic_match_dictionaries), 0)
@@ -886,10 +1003,10 @@ def test_adjective_describing_ontology_multiword_with_topic_match_dictionaries(s
holmes_manager_coref.parse_and_register_document(
"A big Unhyphenated Single Multiword")
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"A big Unhyphenated Single Multiword")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'A big Unhyphenated Single Multiword', 'text_to_match': 'A big Unhyphenated Single Multiword', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 35, 'score': 36.92, 'word_infos': [[2, 5, 'relation', False, 'Matches BIG directly.'], [6, 35, 'relation', True, 'Matches UNHYPHENATED SINGLE MULTIWORD directly.']]}])
+ [{'document_label': '', 'text': 'A big Unhyphenated Single Multiword', 'text_to_match': 'A big Unhyphenated Single Multiword', 'rank': '1', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 35, 'score': 369.2, 'word_infos': [[2, 5, 'relation', False, 'Matches BIG directly.'], [6, 35, 'relation', True, 'Matches UNHYPHENATED SINGLE MULTIWORD directly.']], 'answers': []}])
def test_adjective_describing_entity_multiword_with_topic_match_dictionaries(self):
holmes_manager_coref.remove_all_documents()
@@ -897,10 +1014,10 @@ def test_adjective_describing_entity_multiword_with_topic_match_dictionaries(sel
holmes_manager_coref.parse_and_register_document(
"A big Richard Paul Hudson")
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"A big Richard Paul Hudson")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'A big Richard Paul Hudson', 'text_to_match': 'A big Richard Paul Hudson', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 25, 'score': 40.84, 'word_infos': [[2, 5, 'relation', False, 'Matches BIG directly.'], [6, 25, 'relation', True, 'Matches RICHARD PAUL HUDSON directly.']]}])
+ [{'document_label': '', 'text': 'A big Richard Paul Hudson', 'text_to_match': 'A big Richard Paul Hudson', 'rank': '1', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 25, 'score': 408.4, 'word_infos': [[2, 5, 'relation', False, 'Matches BIG directly.'], [6, 25, 'relation', True, 'Matches RICHARD PAUL HUDSON directly.']], 'answers': []}])
def test_adjective_describing_double_multiword_with_topic_match_dictionaries_1(self):
holmes_manager_coref.remove_all_documents()
@@ -908,10 +1025,10 @@ def test_adjective_describing_double_multiword_with_topic_match_dictionaries_1(s
holmes_manager_coref.parse_and_register_document(
"A big Richard Mimi Momo")
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"A big Richard Mimi Momo")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'A big Richard Mimi Momo', 'text_to_match': 'A big Richard Mimi Momo', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 23, 'score': 101.78666666666666, 'word_infos': [[2, 5, 'overlapping_relation', False, 'Matches BIG directly.'], [6, 13, 'overlapping_relation', False, 'Matches RICHARD directly.'], [14, 23, 'overlapping_relation', True, 'Matches MIMI MOMO directly.']]}])
+ [{'document_label': '', 'text': 'A big Richard Mimi Momo', 'text_to_match': 'A big Richard Mimi Momo', 'rank': '1', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 23, 'score': 1017.8666666666667, 'word_infos': [[2, 5, 'overlapping_relation', False, 'Matches BIG directly.'], [6, 13, 'overlapping_relation', False, 'Matches RICHARD directly.'], [14, 23, 'overlapping_relation', True, 'Matches MIMI MOMO directly.']], 'answers': []}])
def test_adjective_describing_double_multiword_with_topic_match_dictionaries_2(self):
holmes_manager_coref.remove_all_documents()
@@ -919,10 +1036,10 @@ def test_adjective_describing_double_multiword_with_topic_match_dictionaries_2(s
holmes_manager_coref.parse_and_register_document(
"A big Richard Mimi Momo")
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"A big Mimi Momo")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'A big Richard Mimi Momo', 'text_to_match': 'A big Mimi Momo', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 23, 'score': 36.92, 'word_infos': [[2, 5, 'relation', False, 'Matches BIG directly.'], [14, 23, 'relation', True, 'Matches MIMI MOMO directly.']]}])
+ [{'document_label': '', 'text': 'A big Richard Mimi Momo', 'text_to_match': 'A big Mimi Momo', 'rank': '1', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 23, 'score': 369.2, 'word_infos': [[2, 5, 'relation', False, 'Matches BIG directly.'], [14, 23, 'relation', True, 'Matches MIMI MOMO directly.']], 'answers': []}])
def test_adjective_describing_double_multiword_with_topic_match_dictionaries_3(self):
holmes_manager_coref.remove_all_documents()
@@ -930,7 +1047,168 @@ def test_adjective_describing_double_multiword_with_topic_match_dictionaries_3(s
holmes_manager_coref.parse_and_register_document(
"A big Richard Mimi Momo")
topic_match_dictionaries = \
- holmes_manager_coref.topic_match_documents_returning_dictionaries_against(
+ holmes_manager_coref.topic_match_documents_against(
"A big Momo")
self.assertEqual(topic_match_dictionaries,
- [{'document_label': '', 'text': 'A big Richard Mimi Momo', 'text_to_match': 'A big Momo', 'rank': '1', 'sentences_character_start_index_in_document': 0, 'sentences_character_end_index_in_document': 23, 'score': 31.92, 'word_infos': [[2, 5, 'relation', False, 'Matches BIG directly.'], [19, 23, 'relation', True, 'Matches MOMO directly.']]}])
+ [{'document_label': '', 'text': 'A big Richard Mimi Momo', 'text_to_match': 'A big Momo', 'rank': '1', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 23, 'score': 319.2, 'word_infos': [[2, 5, 'relation', False, 'Matches BIG directly.'], [19, 23, 'relation', True, 'Matches MOMO directly.']], 'answers': []}])
+
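+ # The different_match_cutoff_score tests below document how the parameter affects the
+ # number of topic matches returned for repeated occurrences of "dog": the default and
+ # a zero value yield a single topic match, while a very high value causes the
+ # occurrences to be reported as separate topic matches.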
+ def test_different_match_cutoff_score_low(self):
+ holmes_manager_coref.remove_all_documents()
+ holmes_manager_coref.remove_all_search_phrases()
+ holmes_manager_coref.parse_and_register_document(
+ "A dog and a dog then and then and then and then and then a dog")
+ topic_match_dictionaries = \
+ holmes_manager_coref.topic_match_documents_against(
+ "A dog")
+ self.assertEqual(len(topic_match_dictionaries), 1)
+
+ def test_different_match_cutoff_score_high_1(self):
+ holmes_manager_coref.remove_all_documents()
+ holmes_manager_coref.remove_all_search_phrases()
+ holmes_manager_coref.parse_and_register_document(
+ "A dog then and then and then and then and then a dog")
+ topic_match_dictionaries = \
+ holmes_manager_coref.topic_match_documents_against(
+ "A dog", different_match_cutoff_score=10000)
+ self.assertEqual(len(topic_match_dictionaries), 2)
+ topic_matches = \
+ holmes_manager_coref.topic_match_documents_against(
+ "A dog", different_match_cutoff_score=10000)
+ self.assertEqual(len(topic_matches), 2)
+
+ def test_different_match_cutoff_score_zero(self):
+ holmes_manager_coref.remove_all_documents()
+ holmes_manager_coref.remove_all_search_phrases()
+ holmes_manager_coref.parse_and_register_document(
+ "A dog then and then and then and then and then a dog")
+ topic_match_dictionaries = \
+ holmes_manager_coref.topic_match_documents_against(
+ "A dog", different_match_cutoff_score=0)
+ self.assertEqual(len(topic_match_dictionaries), 1)
+ topic_matches = \
+ holmes_manager_coref.topic_match_documents_against(
+ "A dog", different_match_cutoff_score=0)
+ self.assertEqual(len(topic_matches), 1)
+
+ def test_different_match_cutoff_score_control_1(self):
+ holmes_manager_coref.remove_all_documents()
+ holmes_manager_coref.remove_all_search_phrases()
+ holmes_manager_coref.parse_and_register_document(
+ "A dog then and then and then and then and then a dog")
+ topic_match_dictionaries = \
+ holmes_manager_coref.topic_match_documents_against(
+ "A dog")
+ self.assertEqual(len(topic_match_dictionaries), 1)
+ topic_matches = \
+ holmes_manager_coref.topic_match_documents_against(
+ "A dog")
+ self.assertEqual(len(topic_matches), 1)
+
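+ # test_one_worker_frequency_factor and test_two_workers_frequency_factor_control pin
+ # the complete dictionary output of topic_match_documents_against for the same
+ # documents and query, confirming that Managers with one worker and with two workers
+ # return identical results.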
+ def test_one_worker_frequency_factor(self):
+ m = holmes.Manager('en_core_web_sm', ontology=ontology_for_sm_tests,
+ number_of_workers=1)
+ m.parse_and_register_document("I saw a dog. It was chasing a cat", 'specific')
+ m.parse_and_register_document("The dog chased the animal", 'exact')
+ m.parse_and_register_document("The cat chased the dog", 'specific-reversed')
+ m.parse_and_register_document("The animal chased the dog", 'exact-reversed')
+ self.assertEqual(m.document_labels(), ['exact', 'exact-reversed', 'specific',
+ 'specific-reversed'])
+ self.assertEqual(m.topic_match_documents_against(
+ "A dog chases an animal",
+ relation_score=30,
+ reverse_only_relation_score=20,
+ single_word_score=5,
+ single_word_any_tag_score=2,
+ different_match_cutoff_score=5,
+ relation_matching_frequency_threshold=0.2),
+ [{'document_label': 'exact', 'text': 'The dog chased the animal', 'text_to_match': 'A dog chases an animal', 'rank': '1', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 25, 'score': 17.654017250803907, 'word_infos': [[4, 7, 'overlapping_relation', False, 'Matches DOG directly.'], [8, 14, 'overlapping_relation', False, 'Matches CHASE directly.'], [19, 25, 'overlapping_relation', True, 'Matches ANIMAL directly.']], 'answers': []}, {'document_label': 'specific', 'text': 'I saw a dog. It was chasing a cat', 'text_to_match': 'A dog chases an animal', 'rank': '2', 'index_within_document': 9, 'subword_index': None, 'start_index': 3, 'end_index': 9, 'sentences_start_index': 0, 'sentences_end_index': 9, 'sentences_character_start_index': 0, 'sentences_character_end_index': 33, 'score': 14.777271168442839, 'word_infos': [[8, 11, 'overlapping_relation', False, 'Matches DOG directly.'], [20, 27, 'overlapping_relation', False, 'Is a synonym of CHASE in the ontology.'], [30, 33, 'overlapping_relation', True, 'Is a child of ANIMAL in the ontology.']], 'answers': []}, {'document_label': 'exact-reversed', 'text': 'The animal chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3=', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 25, 'score': 8.083873269940398, 'word_infos': [[4, 10, 'single', False, 'Matches ANIMAL directly.'], [11, 17, 'relation', False, 'Matches CHASE directly.'], [22, 25, 'relation', True, 'Is a child of ANIMAL in the ontology.']], 'answers': []}, {'document_label': 'specific-reversed', 'text': 'The cat chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3=', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 22, 'score': 7.731063509742494, 'word_infos': [[4, 7, 'single', False, 'Is a child of ANIMAL in the ontology.'], [8, 14, 'relation', False, 'Matches CHASE directly.'], [19, 22, 'relation', True, 'Is a child of ANIMAL in the ontology.']], 'answers': []}])
+ m.close()
+
+ def test_two_workers_frequency_factor_control(self):
+ m = holmes.Manager('en_core_web_sm', ontology=ontology_for_sm_tests, number_of_workers=2)
+ m.parse_and_register_document("I saw a dog. It was chasing a cat", 'specific')
+ m.parse_and_register_document("The dog chased the animal", 'exact')
+ m.parse_and_register_document("The cat chased the dog", 'specific-reversed')
+ m.parse_and_register_document("The animal chased the dog", 'exact-reversed')
+ self.assertEqual(m.document_labels(), ['exact', 'exact-reversed', 'specific',
+ 'specific-reversed'])
+ self.assertEqual(m.topic_match_documents_against(
+ "A dog chases an animal",
+ relation_score=30,
+ reverse_only_relation_score=20,
+ single_word_score=5,
+ single_word_any_tag_score=2,
+ different_match_cutoff_score=5,
+ relation_matching_frequency_threshold=0.2),
+ [{'document_label': 'exact', 'text': 'The dog chased the animal', 'text_to_match': 'A dog chases an animal', 'rank': '1', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 25, 'score': 17.654017250803907, 'word_infos': [[4, 7, 'overlapping_relation', False, 'Matches DOG directly.'], [8, 14, 'overlapping_relation', False, 'Matches CHASE directly.'], [19, 25, 'overlapping_relation', True, 'Matches ANIMAL directly.']], 'answers': []}, {'document_label': 'specific', 'text': 'I saw a dog. It was chasing a cat', 'text_to_match': 'A dog chases an animal', 'rank': '2', 'index_within_document': 9, 'subword_index': None, 'start_index': 3, 'end_index': 9, 'sentences_start_index': 0, 'sentences_end_index': 9, 'sentences_character_start_index': 0, 'sentences_character_end_index': 33, 'score': 14.777271168442839, 'word_infos': [[8, 11, 'overlapping_relation', False, 'Matches DOG directly.'], [20, 27, 'overlapping_relation', False, 'Is a synonym of CHASE in the ontology.'], [30, 33, 'overlapping_relation', True, 'Is a child of ANIMAL in the ontology.']], 'answers': []}, {'document_label': 'exact-reversed', 'text': 'The animal chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3=', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 25, 'score': 8.083873269940398, 'word_infos': [[4, 10, 'single', False, 'Matches ANIMAL directly.'], [11, 17, 'relation', False, 'Matches CHASE directly.'], [22, 25, 'relation', True, 'Is a child of ANIMAL in the ontology.']], 'answers': []}, {'document_label': 'specific-reversed', 'text': 'The cat chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3=', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 22, 'score': 7.731063509742494, 'word_infos': [[4, 7, 'single', False, 'Is a child of ANIMAL in the ontology.'], [8, 14, 'relation', False, 'Matches CHASE directly.'], [19, 22, 'relation', True, 'Is a child of ANIMAL in the ontology.']], 'answers': []}])
+ m.close()
+
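+ # number_of_results=3 restricts the output to the three highest-ranked topic matches;
+ # the otherwise identical queries above return four.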
+ def test_number_of_results(self):
+ m = holmes.Manager('en_core_web_sm', ontology=ontology_for_sm_tests,
+ number_of_workers=2)
+ m.parse_and_register_document("I saw a dog. It was chasing a cat", 'specific')
+ m.parse_and_register_document("The dog chased the animal", 'exact')
+ m.parse_and_register_document("The cat chased the dog", 'specific-reversed')
+ m.parse_and_register_document("The animal chased the dog", 'exact-reversed')
+ self.assertEqual(m.document_labels(), ['exact', 'exact-reversed', 'specific',
+ 'specific-reversed'])
+ self.assertEqual(m.topic_match_documents_against(
+ "A dog chases an animal", number_of_results=3, use_frequency_factor=True,
+ relation_score=30,
+ reverse_only_relation_score=20,
+ single_word_score=5,
+ single_word_any_tag_score=2,
+ different_match_cutoff_score=5,
+ relation_matching_frequency_threshold=0.2),
+ [{'document_label': 'exact', 'text': 'The dog chased the animal', 'text_to_match': 'A dog chases an animal', 'rank': '1', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 25, 'score': 17.654017250803907, 'word_infos': [[4, 7, 'overlapping_relation', False, 'Matches DOG directly.'], [8, 14, 'overlapping_relation', False, 'Matches CHASE directly.'], [19, 25, 'overlapping_relation', True, 'Matches ANIMAL directly.']], 'answers': []}, {'document_label': 'specific', 'text': 'I saw a dog. It was chasing a cat', 'text_to_match': 'A dog chases an animal', 'rank': '2', 'index_within_document': 9, 'subword_index': None, 'start_index': 3, 'end_index': 9, 'sentences_start_index': 0, 'sentences_end_index': 9, 'sentences_character_start_index': 0, 'sentences_character_end_index': 33, 'score': 14.777271168442839, 'word_infos': [[8, 11, 'overlapping_relation', False, 'Matches DOG directly.'], [20, 27, 'overlapping_relation', False, 'Is a synonym of CHASE in the ontology.'], [30, 33, 'overlapping_relation', True, 'Is a child of ANIMAL in the ontology.']], 'answers': []}, {'document_label': 'exact-reversed', 'text': 'The animal chased the dog', 'text_to_match': 'A dog chases an animal', 'rank': '3', 'index_within_document': 4, 'subword_index': None, 'start_index': 1, 'end_index': 4, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 25, 'score': 8.083873269940398, 'word_infos': [[4, 10, 'single', False, 'Matches ANIMAL directly.'], [11, 17, 'relation', False, 'Matches CHASE directly.'], [22, 25, 'relation', True, 'Is a child of ANIMAL in the ontology.']], 'answers': []}], )
+ m.close()
+
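+ # Repeats the document_label_filter checks from
+ # test_filtering_with_topic_match_dictionaries above using a two-worker Manager.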
+ def test_multithreading_filtering_with_topic_match_dictionaries(self):
+ m = holmes.Manager('en_core_web_sm', number_of_workers=2,
+ ontology=ontology_for_sm_tests)
+
+ m.parse_and_register_document("The dog chased the cat", 'T11')
+ m.parse_and_register_document("The dog chased the cat", 'T12')
+ m.parse_and_register_document("The dog chased the cat", 'T21')
+ m.parse_and_register_document("The dog chased the cat", 'T22')
+ topic_match_dictionaries = \
+ m.topic_match_documents_against(
+ "The dog chased the cat")
+ self.assertEqual(len(topic_match_dictionaries), 4)
+ topic_match_dictionaries = \
+ m.topic_match_documents_against(
+ "The dog chased the cat", document_label_filter="T")
+ self.assertEqual(len(topic_match_dictionaries), 4)
+ topic_match_dictionaries = \
+ m.topic_match_documents_against(
+ "The dog chased the cat", document_label_filter="T1")
+ self.assertEqual(len(topic_match_dictionaries), 2)
+ topic_match_dictionaries = \
+ m.topic_match_documents_against(
+ "The dog chased the cat", document_label_filter="T22")
+ self.assertEqual(len(topic_match_dictionaries), 1)
+ topic_match_dictionaries = \
+ m.topic_match_documents_against(
+ "The dog chased the cat", document_label_filter="X")
+ self.assertEqual(len(topic_match_dictionaries), 0)
+ m.close()
+
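+ # The _2 variants repeat the different_match_cutoff_score checks with a two-worker
+ # Manager.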
+ def test_different_match_cutoff_score_high_2(self):
+ m = holmes.Manager('en_core_web_sm', number_of_workers=2,
+ ontology=ontology_for_sm_tests)
+ m.parse_and_register_document("A dog then and then and then and then and then a dog")
+ topic_match_dictionaries = \
+ m.topic_match_documents_against(
+ "A dog", different_match_cutoff_score=10000)
+ self.assertEqual(len(topic_match_dictionaries), 2)
+ m.close()
+
+ def test_different_match_cutoff_score_control_2(self):
+ m = holmes.Manager('en_core_web_sm', number_of_workers=2,
+ ontology=ontology_for_sm_tests)
+ m.parse_and_register_document("A dog then and then and then and then and then a dog")
+ topic_match_dictionaries = m.topic_match_documents_against(
+ "A dog")
+ self.assertEqual(len(topic_match_dictionaries), 1)
+ m.close()