From cdc8332bed6769c82f5db8393ace0ae02e59efbd Mon Sep 17 00:00:00 2001 From: Nicolay Rusnachenko Date: Sat, 9 Jul 2022 10:16:36 +0300 Subject: [PATCH] #327 fix --- arekit/common/opinions/base.py | 2 +- arekit/common/opinions/collection.py | 2 +- arekit/common/synonyms.py | 139 ----------------- arekit/common/synonyms/__init__.py | 0 arekit/common/synonyms/base.py | 145 ++++++++++++++++++ arekit/common/synonyms/grouping.py | 21 +++ .../contrib/experiment_rusentrel/factory.py | 2 +- arekit/contrib/source/rusentrel/entities.py | 2 +- .../contrib/source/rusentrel/news_reader.py | 2 +- .../contrib/utils/synonyms/stemmer_based.py | 2 +- tests/contrib/networks/text/news.py | 2 +- tests/contrib/source/text/news.py | 2 +- tests/contrib/utils/test_eval.py | 2 +- 13 files changed, 175 insertions(+), 148 deletions(-) create mode 100644 arekit/common/synonyms/__init__.py create mode 100644 arekit/common/synonyms/base.py create mode 100644 arekit/common/synonyms/grouping.py diff --git a/arekit/common/opinions/base.py b/arekit/common/opinions/base.py index 8ebcc398..2ad0249f 100644 --- a/arekit/common/opinions/base.py +++ b/arekit/common/opinions/base.py @@ -1,6 +1,6 @@ from arekit.common.labels.base import Label from arekit.common.opinions.enums import OpinionEndTypes -from arekit.common.synonyms import SynonymsCollection +from arekit.common.synonyms.base import SynonymsCollection class Opinion(object): diff --git a/arekit/common/opinions/collection.py b/arekit/common/opinions/collection.py index 2e3e8936..53bc2a28 100644 --- a/arekit/common/opinions/collection.py +++ b/arekit/common/opinions/collection.py @@ -4,7 +4,7 @@ from arekit.common.labels.base import Label from arekit.common.opinions.base import Opinion from arekit.common.opinions.enums import OpinionEndTypes -from arekit.common.synonyms import SynonymsCollection +from arekit.common.synonyms.base import SynonymsCollection class OpinionCollection(object): diff --git a/arekit/common/synonyms.py b/arekit/common/synonyms.py index ae2c93d3..b28b04f6 100644 --- a/arekit/common/synonyms.py +++ b/arekit/common/synonyms.py @@ -1,142 +1,3 @@ -import collections -from arekit.common import log_utils -class SynonymsCollection(object): - - def __init__(self, iter_group_values_lists, is_read_only, debug): - assert(isinstance(iter_group_values_lists, collections.Iterable)) - assert(isinstance(is_read_only, bool)) - assert(isinstance(debug, bool)) - - # Assumes to be filled - self.__by_sid = {} - self.__by_index = [] - - self.__is_read_only = is_read_only - self.__debug = debug - self.__fill(iter_grop_values_lists=iter_group_values_lists) - - # region properties - - @property - def IsReadOnly(self): - return self.__is_read_only - - # endregion - - # region public 'add' methods - - def add_synonym_value(self, value): - assert(isinstance(value, str)) - - if self.__contains_synonym_value(value): - raise Exception(("Collection already contains synonyms '{}'".format(value)).encode('utf-8')) - - if self.__is_read_only: - raise Exception(("Failed to add '{}'. Synonym collection is read only!".format(value)).encode('utf-8')) - - sid = self._create_external_sid(value) - self.__by_sid[sid] = self.__get_groups_count() - self.__by_index.append([value]) - - # endregion - - # region public 'contains' methods - - def contains_synonym_value(self, value): - return self.__contains_synonym_value(value) - - # endregion - - # region public 'get' methods - - def get_synonym_group_index(self, value): - assert(isinstance(value, str)) - return self.__get_group_index(value) - - # endregion - - # region public 'create' methods - - def create_synonym_id(self, value): - return self._create_external_sid(value) - - # endregion - - # region protected methods - - def _contains_sid(self, v_id): - return v_id in self.__by_sid - - def _create_internal_sid(self, value): - """ Utilized during filling stage. - """ - raise NotImplementedError() - - def _create_external_sid(self, value): - raise NotImplementedError() - - # endregion - - # region public 'iter' methods - - def iter_synonym_values(self, value): - assert(isinstance(value, str)) - sid = self._create_external_sid(value) - index = self.__by_sid[sid] - return iter(self.__by_index[index]) - - def iter_by_index(self): - return iter(self.__by_index) - - def iter_group(self, group_index): - assert(isinstance(group_index, int)) - return iter(self.__by_index[group_index]) - - # endregion - - # region private methods - - def __fill(self, iter_grop_values_lists): - for group in iter_grop_values_lists: - self.__process_group(group) - - def __process_group(self, group_values_list): - group_index = len(self.__by_index) - synonym_list = [] - - for synonym_value in group_values_list: - - value = synonym_value.strip() - - sid = self._create_internal_sid(value) - - if self._contains_sid(sid) and self.__debug: - log_utils.log_synonym_existed(value) - continue - - synonym_list.append(value) - self.__by_sid[sid] = group_index - - self.__by_index.append(synonym_list) - - def __get_groups_count(self): - return len(self.__by_index) - - def __get_group_index(self, value): - sid = self._create_external_sid(value) - return self.__by_sid[sid] - - def __contains_synonym_value(self, value): - return self._contains_sid(self._create_external_sid(value)) - - # endregion - - # region overridden methods - - def __len__(self): - return len(self.__by_index) - - # endregion diff --git a/arekit/common/synonyms/__init__.py b/arekit/common/synonyms/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/arekit/common/synonyms/base.py b/arekit/common/synonyms/base.py new file mode 100644 index 00000000..a77b47cc --- /dev/null +++ b/arekit/common/synonyms/base.py @@ -0,0 +1,145 @@ +import collections + +from arekit.common import log_utils + + +class SynonymsCollection(object): + + def __init__(self, iter_group_values_lists, is_read_only, debug): + assert(isinstance(iter_group_values_lists, collections.Iterable)) + assert(isinstance(is_read_only, bool)) + assert(isinstance(debug, bool)) + + # Assumes to be filled + self.__by_sid = {} + self.__by_index = [] + + self.__is_read_only = is_read_only + self.__debug = debug + self.__fill(iter_grop_values_lists=iter_group_values_lists) + + # region properties + + @property + def IsReadOnly(self): + return self.__is_read_only + + # endregion + + # region public 'add' methods + + def add_synonym_value(self, value): + assert(isinstance(value, str)) + + if self.__contains_synonym_value(value): + raise Exception(("Collection already contains synonyms '{}'".format(value)).encode('utf-8')) + + if self.__is_read_only: + raise Exception(("Failed to add '{}'. Synonym collection is read only!".format(value)).encode('utf-8')) + + sid = self._create_external_sid(value) + self.__by_sid[sid] = self.__get_groups_count() + self.__by_index.append([value]) + + # endregion + + # region public 'contains' methods + + def contains_synonym_value(self, value): + return self.__contains_synonym_value(value) + + # endregion + + # region public 'get' methods + + def get_synonym_group_index(self, value): + """ NOTE: Before use this, please take a look at the grouping (see #327 issue). + It is better to use that class API rather than pass that method for `value_to_group_id_func` + """ + assert(isinstance(value, str)) + return self.__get_group_index(value) + + # endregion + + # region public 'create' methods + + def create_synonym_id(self, value): + return self._create_external_sid(value) + + # endregion + + # region protected methods + + def _contains_sid(self, v_id): + return v_id in self.__by_sid + + def _create_internal_sid(self, value): + """ Utilized during filling stage. + """ + raise NotImplementedError() + + def _create_external_sid(self, value): + raise NotImplementedError() + + # endregion + + # region public 'iter' methods + + def iter_synonym_values(self, value): + assert(isinstance(value, str)) + sid = self._create_external_sid(value) + index = self.__by_sid[sid] + return iter(self.__by_index[index]) + + def iter_by_index(self): + return iter(self.__by_index) + + def iter_group(self, group_index): + assert(isinstance(group_index, int)) + return iter(self.__by_index[group_index]) + + # endregion + + # region private methods + + def __fill(self, iter_grop_values_lists): + for group in iter_grop_values_lists: + self.__process_group(group) + + def __process_group(self, group_values_list): + group_index = len(self.__by_index) + synonym_list = [] + + for synonym_value in group_values_list: + + value = synonym_value.strip() + + sid = self._create_internal_sid(value) + + if self._contains_sid(sid) and self.__debug: + log_utils.log_synonym_existed(value) + continue + + synonym_list.append(value) + self.__by_sid[sid] = group_index + + self.__by_index.append(synonym_list) + + def __get_groups_count(self): + return len(self.__by_index) + + def __get_group_index(self, value): + sid = self._create_external_sid(value) + return self.__by_sid[sid] + + def __contains_synonym_value(self, value): + return self._contains_sid(self._create_external_sid(value)) + + # endregion + + # region overridden methods + + def __len__(self): + return len(self.__by_index) + + # endregion \ No newline at end of file diff --git a/arekit/common/synonyms/grouping.py b/arekit/common/synonyms/grouping.py new file mode 100644 index 00000000..39b0e7fa --- /dev/null +++ b/arekit/common/synonyms/grouping.py @@ -0,0 +1,21 @@ +from arekit.common.synonyms.base import SynonymsCollection + + +class SynonymsCollectionValuesGroupingProviders: + """ Providers for the grouping. + """ + + @staticmethod + def provide_existed_or_register_missed_value(synonyms, value): + """ grouping with a potential expansion. + """ + assert(isinstance(synonyms, SynonymsCollection)) + if not synonyms.contains_synonym_value(value): + synonyms.add_synonym_value(value) + return synonyms.get_synonym_group_index(value) + + @staticmethod + def provide_existed_value(synonyms, value): + """ grouping by using only existed value. + """ + return synonyms.get_synonym_group_index(value) diff --git a/arekit/contrib/experiment_rusentrel/factory.py b/arekit/contrib/experiment_rusentrel/factory.py index e1ed8867..811069b3 100644 --- a/arekit/contrib/experiment_rusentrel/factory.py +++ b/arekit/contrib/experiment_rusentrel/factory.py @@ -1,7 +1,7 @@ from arekit.common.experiment.api.ctx_base import ExperimentContext from arekit.common.experiment.api.io_utils import BaseIOUtils from arekit.common.folding.types import FoldingType -from arekit.common.synonyms import SynonymsCollection +from arekit.common.synonyms.base import SynonymsCollection from arekit.contrib.experiment_rusentrel.common import create_text_parser from arekit.contrib.experiment_rusentrel.exp_ds.factory import create_ruattitudes_experiment from arekit.contrib.experiment_rusentrel.exp_ds.folding import create_ruattitudes_experiment_data_folding diff --git a/arekit/contrib/source/rusentrel/entities.py b/arekit/contrib/source/rusentrel/entities.py index 227f8450..bc766803 100644 --- a/arekit/contrib/source/rusentrel/entities.py +++ b/arekit/contrib/source/rusentrel/entities.py @@ -1,5 +1,5 @@ from arekit.common.entities.collection import EntityCollection -from arekit.common.synonyms import SynonymsCollection +from arekit.common.synonyms.base import SynonymsCollection from arekit.contrib.source.brat.annot import BratAnnotationParser from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions, RuSentRelIOUtils diff --git a/arekit/contrib/source/rusentrel/news_reader.py b/arekit/contrib/source/rusentrel/news_reader.py index 65dc8d33..c9fffdb2 100644 --- a/arekit/contrib/source/rusentrel/news_reader.py +++ b/arekit/contrib/source/rusentrel/news_reader.py @@ -1,5 +1,5 @@ from arekit.common.news.base import News -from arekit.common.synonyms import SynonymsCollection +from arekit.common.synonyms.base import SynonymsCollection from arekit.contrib.source.brat.news_reader import BratDocumentSentencesReader from arekit.contrib.source.rusentrel.entities import RuSentRelDocumentEntityCollection from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions, RuSentRelIOUtils diff --git a/arekit/contrib/utils/synonyms/stemmer_based.py b/arekit/contrib/utils/synonyms/stemmer_based.py index 3a80740e..bc834942 100644 --- a/arekit/contrib/utils/synonyms/stemmer_based.py +++ b/arekit/contrib/utils/synonyms/stemmer_based.py @@ -1,4 +1,4 @@ -from arekit.common.synonyms import SynonymsCollection +from arekit.common.synonyms.base import SynonymsCollection from arekit.common.text.stemmer import Stemmer diff --git a/tests/contrib/networks/text/news.py b/tests/contrib/networks/text/news.py index 8b14e84c..6007b24b 100644 --- a/tests/contrib/networks/text/news.py +++ b/tests/contrib/networks/text/news.py @@ -1,6 +1,6 @@ from arekit.common.news.parser import NewsParser from arekit.common.opinions.collection import OpinionCollection -from arekit.common.synonyms import SynonymsCollection +from arekit.common.synonyms.base import SynonymsCollection from arekit.common.text.parser import BaseTextParser from arekit.contrib.source.rusentrel.news_reader import RuSentRelNewsReader from arekit.contrib.source.rusentrel.opinions.collection import RuSentRelOpinionCollection diff --git a/tests/contrib/source/text/news.py b/tests/contrib/source/text/news.py index 684770c1..c39441f1 100644 --- a/tests/contrib/source/text/news.py +++ b/tests/contrib/source/text/news.py @@ -1,6 +1,6 @@ from arekit.common.news.parser import NewsParser from arekit.common.opinions.collection import OpinionCollection -from arekit.common.synonyms import SynonymsCollection +from arekit.common.synonyms.base import SynonymsCollection from arekit.common.text.parser import BaseTextParser from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions diff --git a/tests/contrib/utils/test_eval.py b/tests/contrib/utils/test_eval.py index c5833047..efd67ce9 100644 --- a/tests/contrib/utils/test_eval.py +++ b/tests/contrib/utils/test_eval.py @@ -10,7 +10,7 @@ from arekit.common.evaluation.evaluators.cmp_table import DocumentCompareTable from arekit.common.evaluation.evaluators.modes import EvaluationModes from arekit.common.opinions.collection import OpinionCollection -from arekit.common.synonyms import SynonymsCollection +from arekit.common.synonyms.base import SynonymsCollection from arekit.common.utils import progress_bar_iter from arekit.contrib.experiment_rusentrel.labels.formatters.rusentrel import RuSentRelExperimentLabelsFormatter from arekit.contrib.experiment_rusentrel.synonyms.provider import RuSentRelSynonymsCollectionProvider