Trainable biaffine dependency parser #353

Open · wants to merge 2 commits into base: master
2 changes: 2 additions & 0 deletions changelog.md
@@ -9,6 +9,8 @@
- Log the training config at the beginning of the trainings
- Support a specific model output dir path for trainings (`output_model_dir`), and whether to save the model or not (`save_model`)
- Specify whether to log the validation results or not (`logger=False`)
- Added support for the CoNLL format with `edsnlp.data.read_conll` and with a specific `eds.conll_dict2doc` converter
- Added a Trainable Biaffine Dependency Parser (`eds.biaffine_dep_parser`) component and metrics

### Fixed

1 change: 1 addition & 0 deletions docs/assets/stylesheets/extra.css
@@ -188,4 +188,5 @@ a.discrete-link {

.sourced-heading > a {
font-size: 1rem;
align-content: center;
}
38 changes: 38 additions & 0 deletions docs/data/conll.md
@@ -0,0 +1,38 @@
# CoNLL

??? abstract "TLDR"

```{ .python .no-check }
import edsnlp

stream = edsnlp.data.read_conll(path)
stream = stream.map_pipeline(nlp)
```

You can easily integrate CoNLL-formatted files into your project by using EDS-NLP's CoNLL reader.

There are many CoNLL formats corresponding to different shared tasks, but one of the most common is the CoNLL-U format, which is used for dependency parsing. In CoNLL files, each line corresponds to a token and contains various columns with information about the token, such as its index, form, lemma, POS tag, and dependency relation.

EDS-NLP lets you specify the `columns` names if they differ from the default CoNLL-U format. If the `columns` parameter is unset, the reader looks for a comment containing `# global.columns` to infer the column names. If no such comment is found, the columns default to:

```
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC
```
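
If your files use a different layout or lack the `# global.columns` comment, you can pass the names explicitly. A minimal sketch (the path and column list below are illustrative):

```{ .python .no-check }
import edsnlp

# Explicit column names for files without a `# global.columns` comment
stream = edsnlp.data.read_conll(
    "path/to/corpus/",
    columns=[
        "ID", "FORM", "LEMMA", "UPOS", "XPOS",
        "FEATS", "HEAD", "DEPREL", "DEPS", "MISC",
    ],
)
```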

A typical CoNLL file looks like this:

```{ title="sample.conllu" }
1 euh euh INTJ _ _ 5 discourse _ SpaceAfter=No
2 , , PUNCT _ _ 1 punct _ _
3 il lui PRON _ Gender=Masc|Number=Sing|Person=3|PronType=Prs 5 expl:subj _ _
...
```

## Reading CoNLL files {: #edsnlp.data.conll.read_conll }

::: edsnlp.data.conll.read_conll
options:
heading_level: 3
show_source: false
show_toc: false
show_bases: false
8 changes: 8 additions & 0 deletions docs/pipes/trainable/biaffine-dependency-parser.md
@@ -0,0 +1,8 @@
# Trainable Biaffine Dependency Parser {: #edsnlp.pipes.trainable.biaffine_dep_parser.factory.create_component }

::: edsnlp.pipes.trainable.biaffine_dep_parser.factory.create_component
options:
heading_level: 2
show_bases: false
show_source: false
only_class_level: true
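
As a rough usage sketch (assuming the component follows the same embedding-based constructor pattern as the other trainable components; the transformer model and parameter names here are illustrative, see the factory documentation above for the exact signature):

```{ .python .no-check }
import edsnlp, edsnlp.pipes as eds

nlp = edsnlp.blank("eds")
# Hedged sketch: the exact parameters are documented in the factory above
nlp.add_pipe(
    eds.biaffine_dep_parser(
        embedding=eds.transformer(model="camembert-base", window=128, stride=96),
    ),
    name="dep_parser",
)
```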
17 changes: 9 additions & 8 deletions docs/pipes/trainable/index.md
@@ -8,13 +8,14 @@ All trainable components implement the [`TorchComponent`][edsnlp.core.torch_comp

<!-- --8<-- [start:components] -->

| Name | Description |
|-----------------------|-----------------------------------------------------------------------|
| `eds.transformer` | Embed text with a transformer model |
| `eds.text_cnn` | Contextualize embeddings with a CNN |
| `eds.span_pooler` | A span embedding component that aggregates word embeddings |
| `eds.ner_crf` | A trainable component to extract entities |
| `eds.span_classifier` | A trainable component for multi-class multi-label span classification |
| `eds.span_linker` | A trainable entity linker (i.e. to a list of concepts) |
| Name | Description |
|---------------------------|-----------------------------------------------------------------------|
| `eds.transformer` | Embed text with a transformer model |
| `eds.text_cnn` | Contextualize embeddings with a CNN |
| `eds.span_pooler` | A span embedding component that aggregates word embeddings |
| `eds.ner_crf` | A trainable component to extract entities |
| `eds.span_classifier` | A trainable component for multi-class multi-label span classification |
| `eds.span_linker` | A trainable entity linker (i.e. to a list of concepts) |
| `eds.biaffine_dep_parser` | A trainable biaffine dependency parser |

<!-- --8<-- [end:components] -->
23 changes: 23 additions & 0 deletions docs/references.bib
@@ -161,3 +161,26 @@ @article{petitjean_2024
url = {https://doi.org/10.1093/jamia/ocae069},
eprint = {https://academic.oup.com/jamia/article-pdf/31/6/1280/57769016/ocae069.pdf},
}

@misc{dozat2017deepbiaffineattentionneural,
title={Deep Biaffine Attention for Neural Dependency Parsing},
author={Timothy Dozat and Christopher D. Manning},
year={2017},
eprint={1611.01734},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1611.01734},
}

@inproceedings{grobol:hal-03223424,
title = {{Analyse en dépendances du français avec des plongements contextualisés}},
author = {Grobol, Loïc and Crabbé, Benoît},
url = {https://hal.archives-ouvertes.fr/hal-03223424},
year = {2021},
booktitle = {{Actes de la 28ème Conférence sur le Traitement Automatique des Langues Naturelles}},
eventtitle = {{TALN-RÉCITAL 2021}},
venue = {Lille, France},
pdf = {https://hal.archives-ouvertes.fr/hal-03223424/file/HOPS_final.pdf},
hal_id = {hal-03223424},
hal_version = {v1},
}
1 change: 1 addition & 0 deletions edsnlp/data/__init__.py
@@ -7,6 +7,7 @@
from .base import from_iterable, to_iterable
from .standoff import read_standoff, write_standoff
from .brat import read_brat, write_brat
from .conll import read_conll
from .json import read_json, write_json
from .parquet import read_parquet, write_parquet
from .spark import from_spark, to_spark
256 changes: 256 additions & 0 deletions edsnlp/data/conll.py
@@ -0,0 +1,256 @@
import os
import random
import warnings
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

from fsspec import filesystem as fsspec
from loguru import logger
from typing_extensions import Literal

from edsnlp import registry
from edsnlp.core.stream import Stream
from edsnlp.data.base import FileBasedReader
from edsnlp.data.converters import FILENAME, get_dict2doc_converter
from edsnlp.utils.collections import shuffle
from edsnlp.utils.file_system import FileSystem, normalize_fs_path, walk_match
from edsnlp.utils.stream_sentinels import DatasetEndSentinel
from edsnlp.utils.typing import AsList

LOCAL_FS = fsspec("file")

DEFAULT_COLUMNS = [
"ID",
"FORM",
"LEMMA",
"UPOS",
"XPOS",
"FEATS",
"HEAD",
"DEPREL",
"DEPS",
"MISC",
]


def parse_conll(
path: str,
cols: Optional[List[str]] = None,
fs: FileSystem = LOCAL_FS,
) -> Iterable[Dict]:
"""
Load a .conll file and yield one dictionary per sentence, each containing the
sentence's words and their attributes. The file is expected to contain multiple
sentences, one word per line, with sentences separated by an empty line.

If `cols` is None, looks for a `#global.columns` comment at the start of the
file to extract the column names.

Examples:

```text
...
11 jeune jeune ADJ _ Number=Sing 12 amod _ _
12 fille fille NOUN _ Gender=Fem|Number=Sing 5 obj _ _
13 qui qui PRON _ PronType=Rel 14 nsubj _ _
...
```

Parameters
----------
path: str
Path or glob path of the CoNLL file to parse
cols: Optional[List[str]]
List of column names to use. If None, the `#global.columns` comment at the
start of the file is used to infer them, falling back to the default
CoNLL-U columns
fs: FileSystem
Filesystem to use

Returns
-------
Iterator[Dict]
"""
with fs.open(path, "r", encoding="utf-8") as f:
lines = f.readlines()

if cols is None:
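# Try to infer the column names from a "# global.columns = ..." header comment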
try:
cols = next(
line.split("=")[1].strip().split()
for line in lines
if line.strip("# ").startswith("global.columns")
)
except StopIteration:
cols = DEFAULT_COLUMNS
warnings.warn(
f"No #global.columns comment found in the CoNLL file. "
f"Using default {cols}"
)

doc = {"words": []}
for line in lines:
line = line.strip()
if not line:
if doc["words"]:
yield doc
doc = {"words": []}
continue
if line.startswith("#"):
continue
parts = line.split("\t")
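# Keep only non-empty fields (CoNLL uses "_" for missing values)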
word = {k: v for k, v in zip(cols, parts) if v != "_"}
doc["words"].append(word)

if doc["words"]:
yield doc


class ConllReader(FileBasedReader):
DATA_FIELDS = ()

def __init__(
self,
path: Union[str, Path],
*,
columns: Optional[List[str]] = None,
filesystem: Optional[FileSystem] = None,
loop: bool = False,
shuffle: Literal["dataset", False] = False,
seed: Optional[int] = None,
):
super().__init__()
self.shuffle = shuffle
self.emitted_sentinels = {"dataset"}
self.rng = random.Random(seed)
self.loop = loop
self.fs, self.path = normalize_fs_path(filesystem, path)
self.columns = columns

files = walk_match(self.fs, self.path, ".*[.]conllu?")
self.files = sorted(files)
assert len(self.files), f"No .conll files found in the directory {self.path}"
logger.info(f"The directory contains {len(self.files)} .conll files.")

def read_records(self) -> Iterable[Any]:
while True:
files = self.files
if self.shuffle:
files = shuffle(files, self.rng)
for item in files:
for anns in parse_conll(item, cols=self.columns, fs=self.fs):
anns[FILENAME] = os.path.relpath(item, self.path).rsplit(".", 1)[0]
anns["doc_id"] = anns[FILENAME]
yield anns
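# Signal the end of a full pass over the dataset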
yield DatasetEndSentinel()
if not self.loop:
break

def __repr__(self):
return (
f"{self.__class__.__name__}("
f"path={self.path!r}, "
f"shuffle={self.shuffle}, "
f"loop={self.loop})"
)


# No writer for CoNLL format yet


@registry.readers.register("conll")
def read_conll(
path: Union[str, Path],
*,
columns: Optional[List[str]] = None,
converter: Optional[AsList[Union[str, Callable]]] = ["conll"],
filesystem: Optional[FileSystem] = None,
shuffle: Literal["dataset", False] = False,
seed: Optional[int] = None,
loop: bool = False,
**kwargs,
) -> Stream:
"""
The ConllReader (or `edsnlp.data.read_conll`) reads a file or directory of CoNLL
files and yields documents.

The raw output (i.e., by setting `converter=None`) will be in the following form
for a single doc:

```
{
"words": [
{"ID": "1", "FORM": ...},
...
],
}
```

Example
-------
```{ .python .no-check }

import edsnlp

nlp = edsnlp.blank("eds")
nlp.add_pipe(...)
doc_iterator = edsnlp.data.read_conll("path/to/conll/file/or/directory")
annotated_docs = nlp.pipe(doc_iterator)
```

!!! note "Generator vs list"

`edsnlp.data.read_conll` returns a
[Stream][edsnlp.core.stream.Stream].
To iterate over the documents multiple times efficiently or to access them by
index, you must convert it to a list:

```{ .python .no-check }
docs = list(edsnlp.data.read_conll("path/to/conll/file/or/directory"))
```

Parameters
----------
path : Union[str, Path]
Path to the directory containing the CoNLL files (will recursively look for
files in subdirectories).
columns: Optional[List[str]]
List of column names to use. If None, the reader looks for a `#global.columns`
comment at the start of each file to infer the column names, falling back to
the default CoNLL-U columns.
shuffle: Literal["dataset", False]
Whether to shuffle the data. If "dataset", the whole dataset will be shuffled
before starting iterating on it (at the start of every epoch if looping).
seed: Optional[int]
The seed to use for shuffling.
loop: bool
Whether to loop over the data indefinitely.
nlp : Optional[PipelineProtocol]
The pipeline object (optional and likely not needed, prefer to use the
`tokenizer` argument directly instead).
tokenizer : Optional[spacy.tokenizer.Tokenizer]
The tokenizer instance used to tokenize the documents. Likely not needed since
by default it uses the current context tokenizer:

- the tokenizer of the next pipeline run by `.map_pipeline` in a
[Stream][edsnlp.core.stream.Stream].
- or the `eds` tokenizer by default.
converter : Optional[AsList[Union[str, Callable]]]
Converter to use to convert the documents to dictionary objects.
filesystem: Optional[FileSystem] = None,
The filesystem to use to read the files. If None, the filesystem will be
inferred from the path (e.g. `s3://` will use S3).
"""

data = Stream(
reader=ConllReader(
path,
columns=columns,
filesystem=filesystem,
loop=loop,
shuffle=shuffle,
seed=seed,
)
)
if converter:
for conv in converter:
conv, kwargs = get_dict2doc_converter(conv, kwargs)
data = data.map(conv, kwargs=kwargs)
return data
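
For training, the `shuffle`, `seed`, and `loop` parameters documented above can be combined to produce an endless shuffled stream. A minimal sketch (the path is hypothetical):

```{ .python .no-check }
import edsnlp

# Shuffle the whole dataset at the start of every pass and loop indefinitely
train_stream = edsnlp.data.read_conll(
    "path/to/conll/train/",
    shuffle="dataset",
    seed=42,
    loop=True,
)
```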