Skip to content

Commit

Permalink
#327 fix
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Jul 9, 2022
1 parent a40ccdf commit cdc8332
Show file tree
Hide file tree
Showing 13 changed files with 175 additions and 148 deletions.
2 changes: 1 addition & 1 deletion arekit/common/opinions/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from arekit.common.labels.base import Label
from arekit.common.opinions.enums import OpinionEndTypes
from arekit.common.synonyms import SynonymsCollection
from arekit.common.synonyms.base import SynonymsCollection


class Opinion(object):
Expand Down
2 changes: 1 addition & 1 deletion arekit/common/opinions/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from arekit.common.labels.base import Label
from arekit.common.opinions.base import Opinion
from arekit.common.opinions.enums import OpinionEndTypes
from arekit.common.synonyms import SynonymsCollection
from arekit.common.synonyms.base import SynonymsCollection


class OpinionCollection(object):
Expand Down
139 changes: 0 additions & 139 deletions arekit/common/synonyms.py
Original file line number Diff line number Diff line change
@@ -1,142 +1,3 @@
import collections

from arekit.common import log_utils


class SynonymsCollection(object):
    """ Collection of synonym groups.

    Every group is a list of string values stored by its index. Values are
    addressed through synonym ids (sid), which are produced by the
    `_create_internal_sid` / `_create_external_sid` hooks; both hooks raise
    NotImplementedError here and must be provided by a subclass.
    """

    def __init__(self, iter_group_values_lists, is_read_only, debug):
        """ iter_group_values_lists: iterable of groups, where every group
                is an iterable of string values.
            is_read_only: bool, when True `add_synonym_value` raises.
            debug: bool, when True a value whose sid is already registered
                is logged and skipped during the filling stage.
        """
        # Local import: the `collections.Iterable` alias has been removed
        # in Python 3.10, the ABC lives in `collections.abc`.
        from collections.abc import Iterable

        assert(isinstance(iter_group_values_lists, Iterable))
        assert(isinstance(is_read_only, bool))
        assert(isinstance(debug, bool))

        # Assumes to be filled
        self.__by_sid = {}      # sid -> group index
        self.__by_index = []    # group index -> list of values

        self.__is_read_only = is_read_only
        self.__debug = debug
        self.__fill(iter_group_values_lists=iter_group_values_lists)

    # region properties

    @property
    def IsReadOnly(self):
        return self.__is_read_only

    # endregion

    # region public 'add' methods

    def add_synonym_value(self, value):
        """ Registers `value` as a new group that consists of this value only.
            Raises an Exception when the value is already registered or
            when the collection is read only.
        """
        assert(isinstance(value, str))

        # Messages are passed as `str`: encoding them into bytes results
        # in the "b'...'" representation of the exception text.
        if self.__contains_synonym_value(value):
            raise Exception("Collection already contains synonyms '{}'".format(value))

        if self.__is_read_only:
            raise Exception("Failed to add '{}'. Synonym collection is read only!".format(value))

        sid = self._create_external_sid(value)
        self.__by_sid[sid] = self.__get_groups_count()
        self.__by_index.append([value])

    # endregion

    # region public 'contains' methods

    def contains_synonym_value(self, value):
        """ Checks whether the external sid of `value` is registered.
        """
        return self.__contains_synonym_value(value)

    # endregion

    # region public 'get' methods

    def get_synonym_group_index(self, value):
        """ Returns the index of the group registered for the external sid
            of `value`; a missing sid results in a KeyError.
        """
        assert(isinstance(value, str))
        return self.__get_group_index(value)

    # endregion

    # region public 'create' methods

    def create_synonym_id(self, value):
        """ Public access to the external sid of `value`.
        """
        return self._create_external_sid(value)

    # endregion

    # region protected methods

    def _contains_sid(self, v_id):
        return v_id in self.__by_sid

    def _create_internal_sid(self, value):
        """ Utilized during filling stage.
        """
        raise NotImplementedError()

    def _create_external_sid(self, value):
        """ Utilized by the public API once the collection is filled.
        """
        raise NotImplementedError()

    # endregion

    # region public 'iter' methods

    def iter_synonym_values(self, value):
        """ Iterates over values of the group which `value` belongs to;
            a missing sid results in a KeyError.
        """
        assert(isinstance(value, str))
        sid = self._create_external_sid(value)
        index = self.__by_sid[sid]
        return iter(self.__by_index[index])

    def iter_by_index(self):
        """ Iterates over groups (lists of values) in the order of their indices.
        """
        return iter(self.__by_index)

    def iter_group(self, group_index):
        """ Iterates over values of the group with the given index.
        """
        assert(isinstance(group_index, int))
        return iter(self.__by_index[group_index])

    # endregion

    # region private methods

    def __fill(self, iter_group_values_lists):
        for group in iter_group_values_lists:
            self.__process_group(group)

    def __process_group(self, group_values_list):
        """ Registers a single group: every value is stripped and mapped
            onto the index of the new group by its internal sid.
        """
        group_index = len(self.__by_index)
        synonym_list = []

        for synonym_value in group_values_list:

            value = synonym_value.strip()

            sid = self._create_internal_sid(value)

            # NOTE(review): an already registered sid is skipped in debug
            # mode only; otherwise the value is added to this group as well
            # and the sid gets remapped onto it. Confirm this is intended.
            if self._contains_sid(sid) and self.__debug:
                log_utils.log_synonym_existed(value)
                continue

            synonym_list.append(value)
            self.__by_sid[sid] = group_index

        self.__by_index.append(synonym_list)

    def __get_groups_count(self):
        return len(self.__by_index)

    def __get_group_index(self, value):
        sid = self._create_external_sid(value)
        return self.__by_sid[sid]

    def __contains_synonym_value(self, value):
        return self._contains_sid(self._create_external_sid(value))

    # endregion

    # region overridden methods

    def __len__(self):
        """ Amount of groups in the collection.
        """
        return len(self.__by_index)

    # endregion
Empty file.
145 changes: 145 additions & 0 deletions arekit/common/synonyms/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import collections

from arekit.common import log_utils


class SynonymsCollection(object):
    """ Collection of synonym groups.

    Every group is a list of string values stored by its index. Values are
    addressed through synonym ids (sid), which are produced by the
    `_create_internal_sid` / `_create_external_sid` hooks; both hooks raise
    NotImplementedError here and must be provided by a subclass.
    """

    def __init__(self, iter_group_values_lists, is_read_only, debug):
        """ iter_group_values_lists: iterable of groups, where every group
                is an iterable of string values.
            is_read_only: bool, when True `add_synonym_value` raises.
            debug: bool, when True a value whose sid is already registered
                is logged and skipped during the filling stage.
        """
        # Local import: the `collections.Iterable` alias has been removed
        # in Python 3.10, the ABC lives in `collections.abc`.
        from collections.abc import Iterable

        assert(isinstance(iter_group_values_lists, Iterable))
        assert(isinstance(is_read_only, bool))
        assert(isinstance(debug, bool))

        # Assumes to be filled
        self.__by_sid = {}      # sid -> group index
        self.__by_index = []    # group index -> list of values

        self.__is_read_only = is_read_only
        self.__debug = debug
        self.__fill(iter_group_values_lists=iter_group_values_lists)

    # region properties

    @property
    def IsReadOnly(self):
        return self.__is_read_only

    # endregion

    # region public 'add' methods

    def add_synonym_value(self, value):
        """ Registers `value` as a new group that consists of this value only.
            Raises an Exception when the value is already registered or
            when the collection is read only.
        """
        assert(isinstance(value, str))

        # Messages are passed as `str`: encoding them into bytes results
        # in the "b'...'" representation of the exception text.
        if self.__contains_synonym_value(value):
            raise Exception("Collection already contains synonyms '{}'".format(value))

        if self.__is_read_only:
            raise Exception("Failed to add '{}'. Synonym collection is read only!".format(value))

        sid = self._create_external_sid(value)
        self.__by_sid[sid] = self.__get_groups_count()
        self.__by_index.append([value])

    # endregion

    # region public 'contains' methods

    def contains_synonym_value(self, value):
        """ Checks whether the external sid of `value` is registered.
        """
        return self.__contains_synonym_value(value)

    # endregion

    # region public 'get' methods

    def get_synonym_group_index(self, value):
        """ Returns the index of the group registered for the external sid
            of `value`; a missing sid results in a KeyError.

            NOTE: Before using this, please take a look at the grouping (see #327 issue).
            It is better to use that class API rather than to pass this method
            as `value_to_group_id_func`.
        """
        assert(isinstance(value, str))
        return self.__get_group_index(value)

    # endregion

    # region public 'create' methods

    def create_synonym_id(self, value):
        """ Public access to the external sid of `value`.
        """
        return self._create_external_sid(value)

    # endregion

    # region protected methods

    def _contains_sid(self, v_id):
        return v_id in self.__by_sid

    def _create_internal_sid(self, value):
        """ Utilized during filling stage.
        """
        raise NotImplementedError()

    def _create_external_sid(self, value):
        """ Utilized by the public API once the collection is filled.
        """
        raise NotImplementedError()

    # endregion

    # region public 'iter' methods

    def iter_synonym_values(self, value):
        """ Iterates over values of the group which `value` belongs to;
            a missing sid results in a KeyError.
        """
        assert(isinstance(value, str))
        sid = self._create_external_sid(value)
        index = self.__by_sid[sid]
        return iter(self.__by_index[index])

    def iter_by_index(self):
        """ Iterates over groups (lists of values) in the order of their indices.
        """
        return iter(self.__by_index)

    def iter_group(self, group_index):
        """ Iterates over values of the group with the given index.
        """
        assert(isinstance(group_index, int))
        return iter(self.__by_index[group_index])

    # endregion

    # region private methods

    def __fill(self, iter_group_values_lists):
        for group in iter_group_values_lists:
            self.__process_group(group)

    def __process_group(self, group_values_list):
        """ Registers a single group: every value is stripped and mapped
            onto the index of the new group by its internal sid.
        """
        group_index = len(self.__by_index)
        synonym_list = []

        for synonym_value in group_values_list:

            value = synonym_value.strip()

            sid = self._create_internal_sid(value)

            # NOTE(review): an already registered sid is skipped in debug
            # mode only; otherwise the value is added to this group as well
            # and the sid gets remapped onto it. Confirm this is intended.
            if self._contains_sid(sid) and self.__debug:
                log_utils.log_synonym_existed(value)
                continue

            synonym_list.append(value)
            self.__by_sid[sid] = group_index

        self.__by_index.append(synonym_list)

    def __get_groups_count(self):
        return len(self.__by_index)

    def __get_group_index(self, value):
        sid = self._create_external_sid(value)
        return self.__by_sid[sid]

    def __contains_synonym_value(self, value):
        return self._contains_sid(self._create_external_sid(value))

    # endregion

    # region overridden methods

    def __len__(self):
        """ Amount of groups in the collection.
        """
        return len(self.__by_index)

    # endregion
21 changes: 21 additions & 0 deletions arekit/common/synonyms/grouping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from arekit.common.synonyms.base import SynonymsCollection


class SynonymsCollectionValuesGroupingProviders:
    """ Set of functions that map a value onto the index of its synonym group.
    """

    @staticmethod
    def provide_existed_or_register_missed_value(synonyms, value):
        """ Returns the group index of `value`; a value that is missing
            in `synonyms` is added into the collection first.
        """
        assert(isinstance(synonyms, SynonymsCollection))

        is_known = synonyms.contains_synonym_value(value)
        if not is_known:
            # Expanding the collection with the missed value.
            synonyms.add_synonym_value(value)

        group_index = synonyms.get_synonym_group_index(value)
        return group_index

    @staticmethod
    def provide_existed_value(synonyms, value):
        """ Returns the group index of `value` without any modification
            of the `synonyms` collection.
        """
        group_index = synonyms.get_synonym_group_index(value)
        return group_index
2 changes: 1 addition & 1 deletion arekit/contrib/experiment_rusentrel/factory.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from arekit.common.experiment.api.ctx_base import ExperimentContext
from arekit.common.experiment.api.io_utils import BaseIOUtils
from arekit.common.folding.types import FoldingType
from arekit.common.synonyms import SynonymsCollection
from arekit.common.synonyms.base import SynonymsCollection
from arekit.contrib.experiment_rusentrel.common import create_text_parser
from arekit.contrib.experiment_rusentrel.exp_ds.factory import create_ruattitudes_experiment
from arekit.contrib.experiment_rusentrel.exp_ds.folding import create_ruattitudes_experiment_data_folding
Expand Down
2 changes: 1 addition & 1 deletion arekit/contrib/source/rusentrel/entities.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from arekit.common.entities.collection import EntityCollection
from arekit.common.synonyms import SynonymsCollection
from arekit.common.synonyms.base import SynonymsCollection
from arekit.contrib.source.brat.annot import BratAnnotationParser
from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions, RuSentRelIOUtils

Expand Down
2 changes: 1 addition & 1 deletion arekit/contrib/source/rusentrel/news_reader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from arekit.common.news.base import News
from arekit.common.synonyms import SynonymsCollection
from arekit.common.synonyms.base import SynonymsCollection
from arekit.contrib.source.brat.news_reader import BratDocumentSentencesReader
from arekit.contrib.source.rusentrel.entities import RuSentRelDocumentEntityCollection
from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions, RuSentRelIOUtils
Expand Down
2 changes: 1 addition & 1 deletion arekit/contrib/utils/synonyms/stemmer_based.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from arekit.common.synonyms import SynonymsCollection
from arekit.common.synonyms.base import SynonymsCollection
from arekit.common.text.stemmer import Stemmer


Expand Down
2 changes: 1 addition & 1 deletion tests/contrib/networks/text/news.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from arekit.common.news.parser import NewsParser
from arekit.common.opinions.collection import OpinionCollection
from arekit.common.synonyms import SynonymsCollection
from arekit.common.synonyms.base import SynonymsCollection
from arekit.common.text.parser import BaseTextParser
from arekit.contrib.source.rusentrel.news_reader import RuSentRelNewsReader
from arekit.contrib.source.rusentrel.opinions.collection import RuSentRelOpinionCollection
Expand Down
2 changes: 1 addition & 1 deletion tests/contrib/source/text/news.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from arekit.common.news.parser import NewsParser
from arekit.common.opinions.collection import OpinionCollection
from arekit.common.synonyms import SynonymsCollection
from arekit.common.synonyms.base import SynonymsCollection
from arekit.common.text.parser import BaseTextParser

from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions
Expand Down
2 changes: 1 addition & 1 deletion tests/contrib/utils/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from arekit.common.evaluation.evaluators.cmp_table import DocumentCompareTable
from arekit.common.evaluation.evaluators.modes import EvaluationModes
from arekit.common.opinions.collection import OpinionCollection
from arekit.common.synonyms import SynonymsCollection
from arekit.common.synonyms.base import SynonymsCollection
from arekit.common.utils import progress_bar_iter
from arekit.contrib.experiment_rusentrel.labels.formatters.rusentrel import RuSentRelExperimentLabelsFormatter
from arekit.contrib.experiment_rusentrel.synonyms.provider import RuSentRelSynonymsCollectionProvider
Expand Down

0 comments on commit cdc8332

Please sign in to comment.