diff --git a/docs/tutorials/make-a-training-script.md b/docs/tutorials/make-a-training-script.md
index c1f3006f0..ec7a22132 100644
--- a/docs/tutorials/make-a-training-script.md
+++ b/docs/tutorials/make-a-training-script.md
@@ -167,7 +167,7 @@ training loop
     with nlp.cache():
         loss = torch.zeros((), device="cpu")
         for name, component in nlp.torch_components():
-            output = component.module_forward(batch[component.name])  # (1)
+            output = component.module_forward(batch[name])  # (1)
             if "loss" in output:
                 loss += output["loss"]
 
@@ -194,13 +194,13 @@ scorer = create_ner_exact_scorer(nlp.get_pipe('ner').target_span_getter)
 
 with nlp.select_pipes(enable=["ner"]):  # (1)
     print(scorer(val_docs, nlp.pipe(deepcopy(val_docs))))  # (2)
 
-nlp.save("model")  # (3)
+nlp.to_disk("model")  # (3)
 ```
 
 1. In the case we have multiple pipes in our model, we may want to selectively evaluate each pipe, thus we use the `select_pipes` method to disable every pipe except "ner".
 2. We use the `pipe` method to run the "ner" component on the validation dataset. This method is similar to the `__call__` method of EDS-NLP components, but it is used to run a component on a list of spaCy Docs.
-3. We could also have saved the model with `torch.save(model, "model.pt")`, but `nlp.save` avoids pickling and allows to inspect the model's files by saving them into a structured directory.
+3. We could also have saved the model with `torch.save(model, "model.pt")`, but `nlp.to_disk` avoids pickling and allows inspecting the model's files by saving them into a structured directory.
 
 ## Full example
 
@@ -298,7 +298,7 @@ Let's wrap the training code in a function, and make it callable from the comman
         loss = torch.zeros((), device="cpu")
         with nlp.cache():
             for name, component in nlp.torch_components():
-                output = component.module_forward(batch[component.name])
+                output = component.module_forward(batch[name])
                 if "loss" in output:
                     loss += output["loss"]
 
diff --git a/edsnlp/data/standoff.py b/edsnlp/data/standoff.py
index b4dfaecd7..aa753ea82 100644
--- a/edsnlp/data/standoff.py
+++ b/edsnlp/data/standoff.py
@@ -1,21 +1,33 @@
+# ruff: noqa: F401
 import glob
 import os
 import re
 from collections import Counter, defaultdict
 from pathlib import Path
-from typing import Any, Callable, Dict, Optional, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Optional,
+    Union,
+)
 
+import spacy.tokenizer
 from loguru import logger
 
 from edsnlp import registry
+from edsnlp.core import PipelineProtocol
 from edsnlp.core.lazy_collection import LazyCollection
 from edsnlp.data.base import BaseReader, BaseWriter
 from edsnlp.data.converters import (
     FILENAME,
+    AttributesMappingArg,
+    SequenceStr,
     get_dict2doc_converter,
     get_doc2dict_converter,
 )
 from edsnlp.utils.collections import flatten_once
+from edsnlp.utils.span_getters import SpanSetterArg
 
 REGEX_ENTITY = re.compile(r"^(T\d+)\t(\S+)([^\t]+)\t(.*)$")
 REGEX_NOTE = re.compile(r"^(#\d+)\tAnnotatorNotes ([^\t]+)\t(.*)$")
@@ -382,20 +394,55 @@ def read_standoff(
     docs = list(edsnlp.data.read_standoff("path/to/brat/directory"))
     ```
 
+    !!! warning "True/False attributes"
+
+        Boolean values are not supported by the BRAT editor: a true value is
+        stored as an empty attribute (a key with no value) and a false value
+        is simply not stored. Hence `False` values will not be assigned to
+        attributes by default, which is problematic when deciding if an
+        entity is negated: is it not negated, or simply not annotated?
+
+        To avoid this issue, you can use the `bool_attributes` argument to
+        specify which attributes should be treated as booleans when reading
+        a BRAT dataset. These attributes will be assigned `True` if present,
+        and `False` otherwise.
+
+    ```{ .python .no-check }
+    doc_iterator = edsnlp.data.read_standoff(
+        "path/to/brat/directory",
+        # Mapping from 'BRAT attribute name' to 'Doc attribute name'
+        span_attributes={"Negation": "negated"},
+        bool_attributes=["negated"],  # Missing values will be set to False
+    )
+    ```
+
     Parameters
     ----------
-    path: Union[str, Path]
+    path : Union[str, Path]
         Path to the directory containing the BRAT files (will recursively look
         for files in subdirectories).
-    nlp: Optional[PipelineProtocol]
-        The pipeline instance (defaults to `edsnlp.blank("eds")`) used to tokenize the
-        documents.
-    span_setter: SpanSetterArg
+    nlp : Optional[PipelineProtocol]
+        The pipeline object (optional and likely not needed; prefer to use
+        the `tokenizer` argument directly instead).
+    tokenizer : Optional[spacy.tokenizer.Tokenizer]
+        The tokenizer instance used to tokenize the documents. Likely not
+        needed since by default it uses the current context tokenizer:
+
+        - the tokenizer of the next pipeline run by `.map_pipeline` in a
+          [LazyCollection][edsnlp.core.lazy_collection.LazyCollection],
+        - or the `eds` tokenizer by default.
+    span_setter : SpanSetterArg
         The span setter to use when setting the spans in the documents. Defaults to
         setting the spans in the `ents` attribute, and creates a new span group for
         each BRAT entity label.
-    span_attributes: Optional[Union[Sequence[str], Mapping[str, str]]]
-        Mapping from BRAT
+    span_attributes : Optional[AttributesMappingArg]
+        Mapping from BRAT attributes to Span extensions (can be a list too).
+        By default, all attributes are imported as extensions of the same name.
+    keep_raw_attribute_values : bool
+        Whether to keep raw attribute values (as strings) or convert them to
+        Python objects (e.g. booleans).
+    bool_attributes : SequenceStr
+        List of attributes for which missing values should be set to `False`.
 
     Returns
     -------
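
For context on the `nlp.save` → `nlp.to_disk` rename in the tutorial hunks above, here is a minimal save/reload sketch. It is not part of the patch: it assumes `edsnlp.load` accepts the directory written by `nlp.to_disk`, and uses a blank pipeline as a stand-in for the trained model from the tutorial.

```python
import edsnlp

# Stand-in for the pipeline trained in the tutorial.
nlp = edsnlp.blank("eds")

# Save the pipeline as a structured, inspectable directory (no pickling),
# as opposed to the torch.save(model, "model.pt") alternative from note (3).
nlp.to_disk("model")

# Reload the pipeline later from the same directory.
nlp = edsnlp.load("model")
```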