explosion · svlandeg · Jan 3, 2023 · Dec 19, 2022 · Dec 19, 2022 · Dec 19, 2022
diff --git a/README.md b/README.md
@@ -13,6 +13,10 @@ library.
 - [Weights & Biases](https://www.wandb.com)
 - [MLflow](https://www.mlflow.org/)
 - [ClearML](https://www.clear.ml/)
+- [PyTorch](https://pytorch.org/)
+
+`spacy-loggers` also provides additional utility loggers to facilitate interoperation
+between individual loggers.
 
 If you'd like to add a new logger or logging option, please submit a PR to this
 repo!
@@ -114,7 +118,7 @@ the following actions are performed:
 By default, the tracking API writes data into files in a local `./mlruns` directory.
 
 **Note** that by default, the full (interpolated)
-[training config](https://spacy.io/usage/training#config) is sent over to 
+[training config](https://spacy.io/usage/training#config) is sent over to
 MLflow. If you prefer to **exclude certain information** such as path
 names, you can list those fields in "dot notation" in the
 `remove_config_values` parameter. These fields will then be removed from the
@@ -157,28 +161,27 @@ clearml-init
 `spacy.ClearMLLogger.v1` is a logger that tracks the results of each training step
 using the [ClearML](https://www.clear.ml/) tool. To use
 this logger, ClearML should be installed and you should have initialized (using the command above).
-The logger will send all the gathered information to your ClearML server, either [the hosted free tier](https://app.clear.ml) 
+The logger will send all the gathered information to your ClearML server, either [the hosted free tier](https://app.clear.ml)
 or the open source [self-hosted server](https://github.com/allegroai/clearml-server). This logger captures the following information, all of which is visible in the ClearML web UI:
 
 - The full spaCy config file contents.
 - Code information such as git repository, commit ID and uncommitted changes.
 - Full console output.
 - Miscellaneous info such as time, python version and hardware information.
 - Output scalars:
-    - The final score is logged under the scalar `score`.
-    - Individual component scores are grouped together on one scalar plot (filterable using the web UI).
-    - Loss values of different components are logged with the `loss_` prefix.
+  - The final score is logged under the scalar `score`.
+  - Individual component scores are grouped together on one scalar plot (filterable using the web UI).
+  - Loss values of different components are logged with the `loss_` prefix.
 
 In addition to the above, the following artifacts can also be optionally captured:
 
 - Best model directory (zipped).
 - Latest model directory (zipped).
 - Dataset used to train.
-	- Versioned using ClearML Data and linked to under Configuration -> User Properties on the web UI.
-
+  - Versioned using ClearML Data and linked to under Configuration -> User Properties on the web UI.
 
 **Note** that by default, the full (interpolated)
-[training config](https://spacy.io/usage/training#config) is sent over to 
+[training config](https://spacy.io/usage/training#config) is sent over to
 ClearML. If you prefer to **exclude certain information** such as path
 names, you can list those fields in "dot notation" in the
 `remove_config_values` parameter. These fields will then be removed from the
@@ -199,12 +202,105 @@ log_dataset_dir = corpus
 remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
 ```
 
-| Name                   | Type            | Description                                                                                                                                                                                                                     |
-| ---------------------- | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `project_name`         | `str`           | The name of the project in the ClearML interface. The project will be created automatically if it doesn't exist yet.                                                                                                            |
-| `task_name`            | `str`           | The name of the ClearML task. A task is an experiment that lives inside a project. Can be non-unique.                                                                                                                           |
-| `remove_config_values` | `List[str]`     | A list of values to exclude from the config before it is uploaded to ClearML (default: `[]`).                                                                                                                                   |
-| `model_log_interval`   | `Optional[int]` | Steps to wait between logging model checkpoints to the ClearML dasboard (default: `None`). Will have no effect without also setting `log_best_dir` or `log_latest_dir`.                                                         |
-| `log_best_dir`         | `Optional[str]` | Directory containing the best trained model as saved by spaCy (by default in `training/model-best`), to be logged and versioned as a ClearML artifact (default: `None`)                                                         |
-| `log_latest_dir`       | `Optional[str]` | Directory containing the latest trained model as saved by spaCy (by default in `training/model-last`), to be logged and versioned as a ClearML artifact (default: `None`)                                                       |
-| `log_dataset_dir`      | `Optional[str]` | Directory containing the dataset to be logged and versioned as a [ClearML Dataset](https://clear.ml/docs/latest/docs/clearml_data/clearml_data/) (default: `None`).                                                             |
+| Name                   | Type            | Description                                                                                                                                                               |
+| ---------------------- | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `project_name`         | `str`           | The name of the project in the ClearML interface. The project will be created automatically if it doesn't exist yet.                                                      |
+| `task_name`            | `str`           | The name of the ClearML task. A task is an experiment that lives inside a project. Can be non-unique.                                                                     |
+| `remove_config_values` | `List[str]`     | A list of values to exclude from the config before it is uploaded to ClearML (default: `[]`).                                                                             |
+| `model_log_interval`   | `Optional[int]` | Steps to wait between logging model checkpoints to the ClearML dashboard (default: `None`). Will have no effect without also setting `log_best_dir` or `log_latest_dir`.  |
+| `log_best_dir`         | `Optional[str]` | Directory containing the best trained model as saved by spaCy (by default in `training/model-best`), to be logged and versioned as a ClearML artifact (default: `None`)   |
+| `log_latest_dir`       | `Optional[str]` | Directory containing the latest trained model as saved by spaCy (by default in `training/model-last`), to be logged and versioned as a ClearML artifact (default: `None`) |
+| `log_dataset_dir`      | `Optional[str]` | Directory containing the dataset to be logged and versioned as a [ClearML Dataset](https://clear.ml/docs/latest/docs/clearml_data/clearml_data/) (default: `None`).       |
+
+## PyTorchLogger
+
+### Installation
+
+This logger requires `torch` to be installed:
+
+```bash
+pip install torch
+```
+
+### Usage
+
+`spacy.PyTorchLogger.v1` is different from the other loggers above in that it does not act as a bridge between spaCy and
+an external framework. Instead, it is used to query PyTorch-specific metrics and make them available to other loggers.
+Therefore, it's primarily intended to be used with [ChainLogger](#chainlogger).
+
+Whenever a logging checkpoint is reached, it queries statistics from the PyTorch backend and stores them in
+the dictionary passed to it. Downstream loggers can then look up the statistics and log them to their
+preferred framework.
+
+The following PyTorch statistics are currently supported:
+
+- [CUDA memory statistics](https://pytorch.org/docs/stable/generated/torch.cuda.memory_stats.html#torch.cuda.memory_stats)
+
+### Example config
+
+```ini
+[training.logger]
+@loggers = "spacy.ChainLogger.v1"
+logger1 = {"@loggers": "spacy.PyTorchLogger.v1", "prefix": "pytorch", "device": "0", "cuda_mem_metric": "current"}
+logger2 = {"@loggers": "spacy.LookupLogger.v1", "patterns": ["^[pP]ytorch"]}
+```
+
+| Name              | Type  | Description                                                                                                                                                     |
+| ----------------- | ----- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `prefix`          | `str` | All metric names are prefixed with this string using dot notation, e.g: `<prefix>.<metric>` (default: `pytorch`).                                               |
+| `device`          | `int` | The identifier of the CUDA device (default: `0`).                                                                                                               |
+| `cuda_mem_pool`   | `str` | One of the memory pool values specified in the PyTorch docs: `all`, `large_pool`, `small_pool` (default: `all`).                                                |
+| `cuda_mem_metric` | `str` | One of the memory metric values specified in the PyTorch docs: `current`, `peak`, `allocated`, `freed`. To log all metrics, use `all` instead (default: `all`). |
+
+# Utility Loggers
+
+## ChainLogger
+
+### Usage
+
+This logger can be used to daisy-chain multiple loggers and execute them in order. Loggers that are executed earlier in the chain
+can pass information to those that come later by adding it to the dictionary that is passed to them.
+
+Currently, up to 10 loggers can be chained together.
+
+### Example config
+
+```ini
+[training.logger]
+@loggers = "spacy.ChainLogger.v1"
+logger1 = {"@loggers": "spacy.PyTorchLogger.v1"}
+logger2 = {"@loggers": "spacy.ConsoleLogger.v1", "progress_bar": "true"}
+```
+
+| Name       | Type                 | Description                                        |
+| ---------- | -------------------- | -------------------------------------------------- |
+| `logger1`  | `Optional[Callable]` | The first logger in the chain (default: `None`).   |
+| `logger2`  | `Optional[Callable]` | The second logger in the chain (default: `None`).  |
+| `logger3`  | `Optional[Callable]` | The third logger in the chain (default: `None`).   |
+| `logger4`  | `Optional[Callable]` | The fourth logger in the chain (default: `None`).  |
+| `logger5`  | `Optional[Callable]` | The fifth logger in the chain (default: `None`).   |
+| `logger6`  | `Optional[Callable]` | The sixth logger in the chain (default: `None`).   |
+| `logger7`  | `Optional[Callable]` | The seventh logger in the chain (default: `None`). |
+| `logger8`  | `Optional[Callable]` | The eighth logger in the chain (default: `None`).  |
+| `logger9`  | `Optional[Callable]` | The ninth logger in the chain (default: `None`).   |
+| `logger10` | `Optional[Callable]` | The tenth logger in the chain (default: `None`).   |
+
+## LookupLogger
+
+### Usage
+
+This logger can be used to look up statistics in the info dictionary and print them to `stdout`. It is primarily
+intended to be used as a tool when developing new loggers.
+
+### Example config
+
+```ini
+[training.logger]
+@loggers = "spacy.ChainLogger.v1"
+logger1 = {"@loggers": "spacy.PyTorchLogger.v1", "prefix": "pytorch"}
+logger2 = {"@loggers": "spacy.LookupLogger.v1", "patterns": ["^[pP]ytorch"]}
+```
+
+| Name       | Type        | Description                                                                                          |
+| ---------- | ----------- | ---------------------------------------------------------------------------------------------------- |
+| `patterns` | `List[str]` | A list of regular expressions. If a statistic's name matches one of these, it's printed to `stdout`. |
diff --git a/setup.cfg b/setup.cfg
@@ -21,6 +21,9 @@ spacy_loggers =
     spacy.WandbLogger.v1 = spacy_loggers.wandb:wandb_logger_v1
     spacy.MLflowLogger.v1 = spacy_loggers.mlflow:mlflow_logger_v1
     spacy.ClearMLLogger.v1 = spacy_loggers.clearml:clearml_logger_v1
+    spacy.ChainLogger.v1 = spacy_loggers.chain:chain_logger_v1
+    spacy.PyTorchLogger.v1 = spacy_loggers.pytorch:pytorch_logger_v1
+    spacy.LookupLogger.v1 = spacy_loggers.lookup:lookup_logger_v1
 
 [flake8]
 ignore = E203, E266, E501, E731, W503, E741

diff --git a/spacy_loggers/chain.py b/spacy_loggers/chain.py
@@ -0,0 +1,54 @@
+"""
+A utility logger that allows multiple loggers to be daisy-chained.
+"""
+from typing import Dict, Any, Optional, IO
+import sys
+
+from spacy import Language
+from .util import LoggerT
+
+
+def chain_logger_v1(
+    logger1: Optional[LoggerT] = None,
+    logger2: Optional[LoggerT] = None,
+    logger3: Optional[LoggerT] = None,
+    logger4: Optional[LoggerT] = None,
+    logger5: Optional[LoggerT] = None,
+    logger6: Optional[LoggerT] = None,
+    logger7: Optional[LoggerT] = None,
+    logger8: Optional[LoggerT] = None,
+    logger9: Optional[LoggerT] = None,
+    logger10: Optional[LoggerT] = None,
+) -> LoggerT:
+    """Build a logger that runs several loggers in order.
+
+    Up to ten loggers may be given. Each one is handed the same info object,
+    so an earlier logger can add entries for the later ones to read.
+    """
+    slots = (logger1, logger2, logger3, logger4, logger5)
+    slots += (logger6, logger7, logger8, logger9, logger10)
+
+    def setup_logger(nlp: Language, stdout: IO = sys.stdout, stderr: IO = sys.stderr):
+        """Set up each configured logger; raises ValueError if there is none."""
+        if not any(slots):
+            raise ValueError("No loggers passed to chain logger")
+        # Unused slots are None; the configured loggers keep their order.
+        hooks = []
+        for factory in slots:
+            if factory is None:
+                continue
+            hooks.append(factory(nlp, stdout, stderr))
+
+        def log_step(info: Optional[Dict[str, Any]]):
+            """Pass `info` to each logger's step callback, first to last."""
+            for on_step, _ in hooks:
+                on_step(info)
+
+        def finalize():
+            """Run each logger's finalizer, first to last."""
+            for _, on_finish in hooks:
+                on_finish()
+
+        return log_step, finalize
+
+    return setup_logger
diff --git a/spacy_loggers/lookup.py b/spacy_loggers/lookup.py
@@ -0,0 +1,30 @@
+"""
+A utility logger that looks up specific statistics and prints them to stdout.
+"""
+from typing import Dict, Any, Optional, IO, List
+import sys
+
+from spacy import Language
+from .util import dict_to_dot, LoggerT, matcher_for_regex_patterns
+
+
+def lookup_logger_v1(patterns: List[str]) -> LoggerT:
+    """Build a logger that prints statistics whose names match `patterns`."""
+    def setup_logger(nlp: Language, stdout: IO = sys.stdout, stderr: IO = sys.stderr):
+        if not len(patterns):
+            raise ValueError("Lookup logger should receive at least one pattern")
+        is_wanted = matcher_for_regex_patterns(patterns)
+
+        def log_step(info: Optional[Dict[str, Any]]):
+            # dict_to_dot presumably flattens nested keys into dotted names.
+            flat = {} if info is None else dict_to_dot(info)
+            for name, value in flat.items():
+                if is_wanted(name):
+                    stdout.writelines([name, " -> ", str(value), "\n"])
+
+        def finalize():
+            """Nothing to release."""
+
+        return log_step, finalize
+
+    return setup_logger
diff --git a/spacy_loggers/pytorch.py b/spacy_loggers/pytorch.py
@@ -0,0 +1,77 @@
+"""
+A logger that queries PyTorch metrics and passes that information to downstream loggers.
+"""
+from typing import Dict, Any, Optional, Tuple, IO
+import re
+import sys
+
+from spacy import Language
+from .util import LoggerT
+
+
+def pytorch_logger_v1(
+    prefix: str = "pytorch",
+    device: int = 0,
+    cuda_mem_pool: str = "all",
+    cuda_mem_metric: str = "all",
+) -> LoggerT:
+    """Build a logger that adds `torch.cuda.memory_stats` values to the info dict."""
+    try:
+        import torch
+    except ImportError:
+        raise ImportError(
+            "The 'torch' library could not be found - did you install it? "
+            "Alternatively, specify the 'ConsoleLogger' in the "
+            "'training.logger' config section, instead of the 'PyTorchLogger'."
+        )
+
+    def setup_logger(nlp: Language, stdout: IO = sys.stdout, stderr: IO = sys.stderr):
+        expected_cuda_mem_pool = ("all", "large_pool", "small_pool")
+        # Metric names follow torch.cuda.memory_stats, which uses "freed" (not "free").
+        expected_cuda_mem_metric = ("all", "current", "peak", "allocated", "freed")
+
+        if cuda_mem_pool not in expected_cuda_mem_pool:
+            raise ValueError(
+                f"Got CUDA memory pool '{cuda_mem_pool}', but expected one of: '{expected_cuda_mem_pool}'"
+            )
+        elif cuda_mem_metric not in expected_cuda_mem_metric:
+            raise ValueError(
+                f"Got CUDA memory metric '{cuda_mem_metric}', but expected one of: '{expected_cuda_mem_metric}'"
+            )
+
+        def normalize_mem_value_to_mb(name: str, value: int) -> Tuple[str, float]:
+            # Statistics named *_bytes are renamed to *_megabytes and rescaled to match.
+            if "_bytes" in name:
+                return name.replace("_bytes", "_megabytes"), value / (1024.0**2)
+            return name, value
+
+        def log_step(info: Optional[Dict[str, Any]]):
+            if info is None:
+                return
+
+            for stat, val in torch.cuda.memory_stats(device).items():
+                splits = stat.split(".")
+                if len(splits) == 3:
+                    name, pool, metric = splits
+                    # Filter first so that skipped statistics are never converted.
+                    if pool != cuda_mem_pool or cuda_mem_metric not in ("all", metric):
+                        continue
+                    name, val = normalize_mem_value_to_mb(name, val)
+                    info[f"{prefix}.{name}.{pool}.{metric}"] = val
+                elif len(splits) == 2:
+                    name, metric = splits
+                    if cuda_mem_metric not in ("all", metric):
+                        continue
+                    name, val = normalize_mem_value_to_mb(name, val)
+                    info[f"{prefix}.{name}.{metric}"] = val
+                else:
+                    # Either global statistic or something that we haven't accounted for,
+                    # e.g: a newly added statistic. So, we'll just include it to be safe.
+                    info[f"{prefix}.{stat}"] = val
+
+        def finalize():
+            pass
+
+        return log_step, finalize
+
+    return setup_logger
diff --git a/spacy_loggers/tests/test_chain.py b/spacy_loggers/tests/test_chain.py
@@ -0,0 +1,47 @@
+import pytest
+
+from .util import load_logger_from_config
+
+
+valid_config_string = """
+[nlp]
+lang = "en"
+pipeline = ["tok2vec"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[training]
+
+[training.logger]
+@loggers = "spacy.ChainLogger.v1"
+logger1 = {"@loggers": "spacy.ConsoleLogger.v1", "progress_bar": "true"}
+logger9 = {"@loggers": "spacy.LookupLogger.v1", "patterns": ["test"]}
+"""  # Two loggers in non-adjacent slots (logger1 and logger9).
+
+invalid_config_string = """
+[nlp]
+lang = "en"
+pipeline = ["tok2vec"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[training]
+
+[training.logger]
+@loggers = "spacy.ChainLogger.v1"
+"""  # ChainLogger with no loggerN entries at all.
+
+
+def test_load_from_config():
+    """A chain with loggers sets up cleanly; an empty chain raises ValueError."""
+    setup_ok, pipeline = load_logger_from_config(valid_config_string)
+    setup_ok(pipeline)
+    with pytest.raises(ValueError, match="No loggers"):
+        setup_empty, pipeline = load_logger_from_config(invalid_config_string)
+        setup_empty(pipeline)