Skip to content

Commit

Permalink
Merge pull request #438 from nicolay-r/master
Browse files Browse the repository at this point in the history
Sync with latest updates
  • Loading branch information
nicolay-r authored Jan 18, 2023
2 parents bca6dc1 + 395671f commit a2f6fe8
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 38 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ Please follows th

## Applications

* **AREnets** [[github]](https://github.com/nicolay-r/AREnets)
* is an OpenNRE-like project, but with a kernel based on the TensorFlow library and neural network implementations on top of it, designed for Attitude
* **ARElight** [[site]](https://nicolay-r.github.io/arelight-page/) [[github]](https://github.com/nicolay-r/ARElight)
* **Infer attitudes** from large mass-media documents or **sample texts** for your machine learning model applications

Expand Down
1 change: 1 addition & 0 deletions arekit/contrib/networks/input/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
SynonymObject = "syn_objs"
SynonymSubject = "syn_subjs"
PosTags = "pos_tags"
Text = "text"

ArgsSep = ','
51 changes: 33 additions & 18 deletions arekit/contrib/networks/input/rows_parser.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,35 @@
import pandas as pd

from arekit.common.data import const
from arekit.common.utils import filter_whitespaces, split_by_whitespaces
from . import const as network_input_const

import arekit.contrib.networks.input.const as network_input_const

empty_list = []


def no_value():
    """ Placeholder returned whenever a field is absent or carries no value.
    """
    missing = None
    return missing


def __process_values_list(value):
    """ Splits a serialized value into its string parts,
        using the arguments separator of the network input constants.
    """
    separator = network_input_const.ArgsSep
    parts = value.split(separator)
    return parts


def __process_indices_list(value):
    """ Parses a serialized list of integer indices.

        value: str, int or None
            indices joined by the arguments separator (for example "1,5,7"),
            or a single bare integer.

        returns: list of int, or `no_value()` when nothing was provided.
    """
    # A bare integer is always a valid single index, including 0;
    # a plain truthiness check would wrongly treat index 0 as "missing".
    if not isinstance(value, int) and not value:
        return no_value()

    return [int(v) for v in str(value).split(network_input_const.ArgsSep)]


def __process_int_values_list(value):
    """ Parses a serialized list of integer values;
        the format is the same as for a list of indices.
    """
    parsed = __process_indices_list(value)
    return parsed


def __handle_text(value):
    """ The core method of the input text processing.
        A string is split by whitespaces into terms first;
        a list is considered to be already split into terms.
        In both cases the terms are passed through the whitespace filter.
    """
    assert isinstance(value, (str, list))

    if isinstance(value, str):
        terms = list(split_by_whitespaces(value))
    else:
        terms = value

    return filter_whitespaces(terms)


parse_value = {
const.ID: lambda value: value,
const.DOC_ID: lambda value: int(value),
Expand All @@ -35,18 +46,19 @@ def __process_int_values_list(value):
network_input_const.SynonymObject: lambda value: __process_indices_list(value),
network_input_const.SynonymSubject: lambda value: __process_indices_list(value),
network_input_const.PosTags: lambda value: __process_int_values_list(value),
"text_a": lambda value: filter_whitespaces([term for term in split_by_whitespaces(value)])
network_input_const.Text: lambda value: __handle_text(value)
}


class ParsedSampleRow(object):
"""
Provides a parsed information for a sample row.
TODO. Use this class as API
""" Provides a parsed information for a sample row.
"""

def __init__(self, row):
assert(isinstance(row, pd.Series))
""" row: dict
dict of the pairs ("field_name", value)
"""
assert(isinstance(row, dict))

self.__uint_label = None
self.__params = {}
Expand All @@ -64,13 +76,16 @@ def __init__(self, row):

self.__params[key] = parse_value[key](value)

def __value_or_none(self, key):
return self.__params[key] if key in self.__params else no_value()

@property
def SampleID(self):
return self.__params[const.ID]

@property
def Terms(self):
return self.__params["text_a"]
return self.__params[network_input_const.Text]

@property
def SubjectIndex(self):
Expand All @@ -86,33 +101,33 @@ def UintLabel(self):

@property
def PartOfSpeechTags(self):
return self.__params[network_input_const.PosTags]
return self.__value_or_none(network_input_const.PosTags)

@property
def TextFrameVariantIndices(self):
return self.__params[network_input_const.FrameVariantIndices]
return self.__value_or_none(network_input_const.FrameVariantIndices)

@property
def TextFrameConnotations(self):
return self.__params[network_input_const.FrameConnotations]
return self.__value_or_none(network_input_const.FrameConnotations)

@property
def EntityInds(self):
return self.__params[const.ENTITIES]
return self.__value_or_none(const.ENTITIES)

@property
def SynonymObjectInds(self):
return self.__params[network_input_const.SynonymObject]
return self.__value_or_none(network_input_const.SynonymObject)

@property
def SynonymSubjectInds(self):
return self.__params[network_input_const.SynonymSubject]
return self.__value_or_none(network_input_const.SynonymSubject)

def __getitem__(self, item):
assert (isinstance(item, str) or item is None)
if item not in self.__params:
return None
return self.__params[item] if item is not None else None
return no_value()
return self.__params[item] if item is not None else no_value()

@classmethod
def parse(cls, row):
Expand Down
37 changes: 22 additions & 15 deletions arekit/contrib/source/brat/annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,33 +14,40 @@ def __non_prefixed_id(value):

@staticmethod
def handle_entity(args):
    """ Parses an entity annotation that was already split by tabs.

        Examples of the original lines (fields are tab-separated):
            T2 <TAB> Location 10 23 <TAB> South America
            T1 <TAB> Location 0 5;16 23 <TAB> North America

        args: list of str
            exactly three fields: identifier, "type begin end", and the mention text.

        returns: BratEntity, or None for a non-continuous mention
            (the second example above), which is not supported.
    """
    assert(len(args) == 3)

    e_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
    entity_params = args[1].split()

    if len(entity_params) > 3:
        # We do not support the case of a non-continuous entity mentions.
        return None

    e_str_type, e_begin, e_end = entity_params

    return BratEntity(id_in_doc=e_id,
                      e_type=e_str_type,
                      index_begin=int(e_begin),
                      index_end=int(e_end),
                      value=args[2].strip())

@staticmethod
def handle_relation(args):
""" Example:
R1 Origin Arg1:T3 Arg2:T4
"""

# Parse identifier index.
e_id = args[0][1:]

rel_type = args[1]
source_id = args[2].split(':')[1]
target_id = args[3].split(':')[1]
# Parse relation arguments.
rel_type, source, target = args[1].split()

source_id = source.split(':')[1]
target_id = target.split(':')[1]

return BratRelation(id_in_doc=e_id,
source_id=int(BratAnnotationParser.__non_prefixed_id(source_id)),
Expand All @@ -57,7 +64,7 @@ def parse_annotations(input_file, encoding='utf-8'):
for line in input_file.readlines():
line = line.decode(encoding)

args = line.split()
args = line.split('\t')

record_type = args[0][0]

Expand Down
15 changes: 15 additions & 0 deletions arekit/contrib/utils/data/readers/jsonl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from arekit.contrib.utils.data.readers.base import BaseReader
from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage


class JsonlReader(BaseReader):
    """ Reader of the JSON Lines files: one serialized JSON document per line.
    """

    def read(self, target):
        """ target: str
                path to the `.jsonl` file.

            returns: JsonlBasedRowsStorage over the raw (not yet parsed) lines.
        """
        # JSON text is UTF-8 by convention, so do not rely on the platform default encoding.
        with open(target, "r", encoding="utf-8") as f:
            # Blank lines (typically a trailing newline at the end of the file)
            # are not JSON documents and would fail on parsing later, so drop them here.
            rows = [line for line in f if line.strip()]
        return JsonlBasedRowsStorage(rows)

    def target_extension(self):
        """ Extension of the files supported by this reader.
        """
        return ".jsonl"
18 changes: 18 additions & 0 deletions arekit/contrib/utils/data/storages/jsonl_based.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import json

from arekit.common.data.storages.base import BaseRowsStorage


class JsonlBasedRowsStorage(BaseRowsStorage):
    """ Rows storage over a list of JSON-serialized strings;
        every string is parsed only at the moment of iteration.
    """

    def __init__(self, rows):
        """ rows: list
                serialized rows, one JSON document per item.
        """
        assert isinstance(rows, list)
        self.__rows = rows

    def _iter_rows(self):
        """ Yields pairs (row_index, parsed_row).
        """
        for position, serialized in enumerate(self.__rows):
            assert isinstance(serialized, str)
            parsed = json.loads(serialized)
            yield position, parsed

    def _get_rows_count(self):
        """ Amount of the stored rows.
        """
        total = len(self.__rows)
        return total
5 changes: 2 additions & 3 deletions arekit/contrib/utils/evaluation/analyze_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,12 @@ def extract_errors(eval_result, test_samples_filepath, etalon_samples_filepath,
for sample_col in columns_to_copy:
eval_errors_df.at[row_id, sample_col] = sample_row[sample_col]

text_terms =__post_text_processing(sample_row=sample_row, source_ind=source_ind, target_ind=target_ind)
text_terms = __post_text_processing(sample_row=sample_row, source_ind=source_ind, target_ind=target_ind)
cropped_text = __crop_text_terms(source_ind=source_ind, target_ind=target_ind, text_terms=text_terms)

eval_errors_df.at[row_id, BaseSingleTextProvider.TEXT_A] = cropped_text

# Replace with the values instead of indices.
entity_inds = __get_entity_inds(sample_row)
# Replace source and target the values instead of indices.
eval_errors_df.at[row_id, const.S_IND] = text_terms[source_ind]
eval_errors_df.at[row_id, const.T_IND] = text_terms[target_ind]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def create_text_opinion_extraction_pipeline(text_parser,
version=version,
doc_id_func=lambda doc_id: doc_id,
keep_doc_ids_only=False,
label_scaler=label_scaler,
limit=limit)

doc_ops = DictionaryBasedDocumentOperations(ru_attitudes)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def get_doc(self, doc_id):
return self.__ru_attitudes[doc_id]


def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, label_scaler, limit=None):
def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, limit=None):
""" Performs reading of RuAttitude formatted documents and
selection according to 'doc_ids_set' parameter.
"""
Expand Down

0 comments on commit a2f6fe8

Please sign in to comment.